Пример #1
0
    def __init__(self, site_url, html):
        """Parse a page and record its links, scripts and character statistics.

        Args:
            site_url: URL of the page; forwarded to ``gt.get_links``.
            html: Page markup; handed to ``gt.get_tags`` for parsing.

        Attributes set:
            title, outgoing_links, outgoing_link_count, scripts,
            number_of_scripts: values taken from the ``gt`` helpers.
            langs: maps each label returned by
                ``stats_lib.check_ranges_from_table`` to the number of
                characters of the page text that received it.
            alphabets_on_site: labels whose count exceeds 1/100 of the
                page text length.
            incoming_link_count: initialised to 0.
        """
        ranges = stats_lib.set_up_character_ranges_table()

        parsed = gt.get_tags(html)
        self.title = gt.get_title(parsed)
        self.outgoing_links = gt.get_links(site_url, parsed)
        self.outgoing_link_count = len(self.outgoing_links)
        self.scripts = gt.get_scripts(parsed)
        self.number_of_scripts = len(self.scripts)
        self.langs = {}
        self.alphabets_on_site = []
        self.incoming_link_count = 0
        page_text = parsed.getText()

        # Tally every character of the text under the label the range
        # table assigns to it.
        tally = self.langs
        for symbol in page_text:
            label = stats_lib.check_ranges_from_table(symbol, ranges)
            tally[label] = tally.get(label, 0) + 1

        # A label is kept only when it covers more than 1/100 of the text.
        # Dict keys are unique, so no duplicate check is needed.
        cutoff = len(page_text) / 100
        self.alphabets_on_site.extend(
            label for label, hits in tally.items() if hits > cutoff
        )
Пример #2
0
def all_alphabets(robot):
    """Count, per character-range label, how many stored sites use it.

    Reads the ``url`` and ``body_text`` columns of the ``fish_sites`` table
    through ``robot`` and classifies every character of each body text with
    ``stats_lib.check_ranges_from_table``. A label is counted once for a row
    when it accounts for more than 1/50 of that row's characters.

    Args:
        robot: Object providing ``get_data_from_table(table, columns)``,
            whose result is iterated as ``(url, body_text)`` rows, and
            ``close_connection()``.

    Returns:
        dict mapping each label to the number of rows in which it exceeded
        the 1/50 threshold.

    ``robot.close_connection()`` is always called once the rows have been
    requested, including when processing a row raises.
    """
    from collections import Counter

    import stats_lib

    language_table = stats_lib.set_up_character_ranges_table()
    cursor = robot.get_data_from_table('fish_sites', 'url,body_text')
    alphabets = dict()
    try:
        for row in cursor:
            text = row[1]
            if not text:
                # A missing (None) or empty body has no characters to
                # classify; iterating None would raise TypeError.
                continue
            langs = Counter(
                stats_lib.check_ranges_from_table(ch, language_table)
                for ch in text
            )
            threshold = len(text) / 50.0
            for key, count in langs.items():
                if count > threshold:
                    alphabets[key] = alphabets.get(key, 0) + 1
    finally:
        # Release the connection even if a row could not be processed.
        robot.close_connection()
    return alphabets
Пример #3
0
    def __init__(self, site_url, html):
        """Parse a page and record its links, scripts, text and statistics.

        Args:
            site_url: URL of the page; stored as ``url``, hashed with
                ``make_hash`` and forwarded to ``gt.get_links``.
            html: Page markup; parsed with ``gt.get_tags`` and kept
                unchanged in ``raw_html``.

        Attributes set:
            url, title, outgoing_links, outgoing_link_count, scripts,
            number_of_scripts, hash: values taken from the arguments and
                the ``gt`` / ``make_hash`` helpers.
            body: page text after ``strip_non_space_whitespace``.
            langs: maps each label returned by
                ``stats_lib.check_ranges_from_table`` to the number of
                characters of ``body`` that received it.
            alphabets_on_site: labels whose count exceeds 1/70 of the
                body length.
            n_grams, symbol_freq, symbol_entropy: results of the
                corresponding ``stats_lib`` helpers applied to ``body``.
            raw_html: the ``html`` argument as given.
        """
        ranges = stats_lib.set_up_character_ranges_table()

        parsed = gt.get_tags(html)
        self.url = site_url
        self.title = gt.get_title(parsed)
        self.outgoing_links = gt.get_links(site_url, parsed)
        self.outgoing_link_count = len(self.outgoing_links)
        self.scripts = gt.get_scripts(parsed)
        self.number_of_scripts = len(self.scripts)
        self.langs = {}
        self.hash = make_hash(site_url)
        self.alphabets_on_site = []
        # Scripts were collected above. extract() presumably detaches the
        # script/style nodes so their contents stay out of the text below;
        # confirm against the parser that gt.get_tags returns.
        for node in parsed(['script', 'style']):
            node.extract()
        page_text = strip_non_space_whitespace(parsed.getText())
        self.body = page_text

        # Tally every character of the body under the label the range
        # table assigns to it.
        tally = self.langs
        for symbol in page_text:
            label = stats_lib.check_ranges_from_table(symbol, ranges)
            tally[label] = tally.get(label, 0) + 1

        # A label is kept only when it covers more than 1/70 of the body.
        # Dict keys are unique, so no duplicate check is needed.
        cutoff = len(page_text) / 70
        self.alphabets_on_site.extend(
            label for label, hits in tally.items() if hits > cutoff
        )

        self.n_grams = stats_lib.count_n_grams(self.body, 5)
        self.symbol_freq = stats_lib.count_symbol_frequency(self.body)
        self.symbol_entropy = stats_lib.calculate_symbol_entropy(self.symbol_freq)
        self.raw_html = html