def test_headings(self):
    """Heading markers are stripped and only the trimmed heading text remains."""
    cases = (
        ("==foo==", "foo"),
        ("===Foo===", "Foo"),
        ("=== Foo ===", "Foo"),
        (
            "== Brot úr søguni hjá Klaksvíkar kommunu ==",
            "Brot úr søguni hjá Klaksvíkar kommunu",
        ),
    )

    for wikitext, expected in cases:
        assert clean(wikitext) == expected
def test_complex(self):
    """Combined markup: headings, bold text, piped links and nested infobox templates."""
    # Heading that contains a piped link.
    heading = "=== Á [[Borðoy|Borðoynni]] ==="
    assert clean(heading) == "Á Borðoynni"

    # Bold markup followed by a piped link.
    bold_intro = (
        "''' Klaksvíkar kommuna''' er næststørsta kommuna í [[Føroyar|Føroyum]]."
    )
    assert clean(bold_intro) == "Klaksvíkar kommuna er næststørsta kommuna í Føroyum."

    # Bold markup and two piped links inside one sentence.
    sentence = (
        "'''Fugloy''', sum hevur fingið navn av tí nógva [[Fuglur|fugli]], "
        "ið har búleikast, er tann minsta av [[Norðoyar|Norðoyum]]"
    )
    expected_sentence = (
        "Fugloy, sum hevur fingið navn av tí nógva fugli, "
        "ið har búleikast, er tann minsta av Norðoyum"
    )
    assert clean(sentence) == expected_sentence

    # An infobox (with nested templates) between two words is replaced by a space.
    cyclist_infobox = """ foo{{Infobox cyclist | birth_date = {{birth date and age|1987|7|5|df=yes}} | height = {{convert|1,81|m|ftin|abbr=on}} | weight = {{convert|78|kg|lb|abbr=on}} }}bar """.strip()
    assert clean(cyclist_infobox) == "foo bar"

    # An article fragment that is nothing but an infobox cleans to an empty string.
    person_infobox = """ {{Infobox person | name = Wes Craven | image = Wes Craven 2010.jpg | image_size = | caption = Wes Craven í 2010 | birth_name = Wesley Earl Craven | birth_date = {{birth date|1939|08|02|df=yes}} | birth_place = Cleveland, [[Ohio]], [[USA]] | death_date = {{death date and age|2015|08|30|1939|08|02|df=yes}} | death_place = [[Los Angeles]], [[California]], [[USA]] | death_cause = Heilakrabbi | occupation = Filmsleikstjóri<br />Rithøvundur<br />Framleiðari<br />Sjónleikari | years_active = 1971–2015 | spouse = {{marriage|Bonnie Broecker|1964|1969|reason=skild}}<br />{{marriage|[[Mimi Craven]]|1984|1987|reason=skild}}<br />{{marriage|Iya Labunka|2004|2015|reason=deyða sín}} | website = {{URL|http://www.wescraven.com}} | children = 2, harímillum Jonathan Craven }} """.strip()
    assert clean(person_infobox) == ""
def test_remove_external_links(self):
    """Bare URLs are removed from running text while the surrounding words are kept."""
    # URL in the middle of a sentence (long query string included).
    facebook_line = (
        "Facebook - https://www.facebook.com/fridikarlssonjustesen?fb_dtsg_ag="
        "AdziAVqHQ8WsgyLnRRbFgiD48LLV3ZblI2r6ejYM1ymo-g%3AAdxJZEj_FFlDDrUGSnYE"
        "vseo2SqjBZo3wWNKBausddsffQ link"
    )
    assert clean(facebook_line) == "Facebook - link"

    # URL at the very beginning of the text.
    homepage_line = "http://www.klaksvik.fo Heimasíðan hjá Klaksvíkar kommunu"
    assert clean(homepage_line) == "Heimasíðan hjá Klaksvíkar kommunu"
def get_frequencies(wikia_name, limit=500, min_length=3):
    """
    Count how often each token occurs in the articles of a Wikia dump.

    Every page's content is lower-cased, cleaned and tokenized. Tokens that
    are shorter than ``min_length``, that are stop words for the dump's
    language, or that start with ``#`` are not counted.

    :type wikia_name str|None
    :type limit int
    :type min_length int
    :rtype: list[tuple[str, int]]
    :return: the ``limit`` most common tokens paired with their counts,
        most frequent first
    """
    logger = logging.getLogger('get_frequencies')
    dump = WikiaDump(wiki=wikia_name)

    # read the dump
    stats = Counter()
    reader = DumpReaderArticles()

    # The stop words are resolved lazily on the first page (the reader is only
    # asked for the dump language once iteration has started, as before) and
    # kept as a frozenset so the per-token membership test is a hash lookup.
    # NOTE(review): assumes the dump language does not change between pages.
    stop_words = None

    for page in reader.read(dump):
        if stop_words is None:
            stop_words = frozenset(get_stop_words(reader.get_dump_language()))

        stats.update(
            token
            for token in tokenize(clean(page.content.lower()))
            if len(token) >= min_length
            and token not in stop_words
            and not token.startswith('#')
        )

    # Lazy %-arguments: the message is only formatted if the record is emitted.
    logger.info('Tokens counted: %d', len(stats))

    return stats.most_common(limit)
def test_from_file(self):
    """Clean a whole article fixture and compare it with the stored expected text."""
    # https://fo.wikipedia.org/wiki/Klaksv%C3%ADkar_kommuna
    # The encoding is given explicitly so that decoding does not depend on the
    # platform's locale default.
    # NOTE(review): assumes both fixtures are stored as UTF-8 - confirm.
    with open("test/fixtures/text.txt", "rt", encoding="utf-8") as f:
        text = f.read().strip()

    with open("test/fixtures/expected.txt", "rt", encoding="utf-8") as f:
        expected = f.read().strip()

    assert clean(text) == expected
def test_links(self):
    """Internal links collapse to their visible text; File:/Bólkur: links vanish."""
    cases = [
        # plain link, and a link with a trailing suffix
        ("[[foo]] bar", "foo bar"),
        ("[[foo]]s bar", "foos bar"),
        # piped link: only the label is kept
        ("av [[Norðoyar|Norðuroyggjum]].", "av Norðuroyggjum."),
        # link inside a list item
        ("* [[Svínoy]]", "Svínoy"),
        # namespaced links are removed altogether
        ("[[File:Kommunur í Føroyum]] foo", "foo"),
        ("[[Bólkur:Kommunur í Føroyum]] foo", "foo"),
        ("[[bar]] test [[Bólkur:Kommunur í Føroyum]] foo", "bar test foo"),
    ]

    for wikitext, expected in cases:
        assert clean(wikitext) == expected
def words_from_dump(wiki):
    """
    Print the 50 longest distinct words found in the articles of a Wikipedia dump.

    Redirect pages are skipped. For every other page the title and content
    are cleaned, tokenized and lower-cased; words longer than 10 characters
    that do not contain an ``x`` are collected. The result is printed as
    ``<rank> <word> - <length>`` lines, longest first, with words of equal
    length in alphabetical order so the output is reproducible between runs.

    :type wiki str
    """
    logger = logging.getLogger('words_from_dump')
    logger.info('Processing dump of "%s" wiki...', wiki)

    dump = WikipediaDump(wiki)
    pages = DumpReaderArticles().read(dump)

    redirect_marker = '#REDIRECT'

    # A set keeps the collection free of duplicates while pages are processed.
    long_words = set()

    for page in pages:
        content = page.content

        # Compare case-insensitively: "#redirect" marks a redirect as well.
        if str(content)[:len(redirect_marker)].upper() == redirect_marker:
            logger.debug('%s is a redirect, skipping...', page.title)
            continue

        tokens = tokenize(clean(page.title + ' ' + content))

        # lower-case, keep long words only (and filter out words with X)
        words_from_article = {
            word
            for word in (str(token).lower() for token in tokens)
            if len(word) > 10 and 'x' not in word
        }

        # Diagnostic hook: report (and dump) the page a known bogus word comes from.
        if 'filmsleikstjóririthøvundurframleiðarisjónleikari' in words_from_article:
            logger.info('Word found in %s', page.title)
            print(content)

        long_words |= words_from_article

    # Longest first; ties are broken alphabetically because set iteration
    # order is not stable between interpreter runs.
    ranked = sorted(long_words, key=lambda word: (-len(word), word))

    # show top X
    for position, word in enumerate(ranked[:50], start=1):
        print('%d %s - %d' % (position, word, len(word)))
def test_templates(self):
    """Balanced templates (also nested ones) are removed; unbalanced ones stay as-is."""
    cases = [
        ("{{Kommunur}}", ""),
        ("{{Kommunur}} bar {{test}}", "bar"),
        # space is kept
        ("{{Kommunur}}bar{{test}}test", "bar test"),
        ("{{Kommunur|foo|bar}}", ""),
        ("{{Kommunur|{{foo}}}}", ""),
        ("{{Kommunur|{{foo}}|test}}", ""),
        # unbalanced template wikitext
        ("{{Kommunur|{{foo}}|test", "{{Kommunur|{{foo}}|test"),
        # templates used as separators between piped links
        (
            "[[Theodor W. Adorno|Adorno]]{{·}}[[Roland Barthes|Barthes]]"
            "{{·}}[[Jean Baudrillard|Baudrillard]]{{·}}[[Georges Bataille|Bataille]]",
            "Adorno Barthes Baudrillard Bataille",
        ),
    ]

    for wikitext, expected in cases:
        assert clean(wikitext) == expected
def test_tables(self):
    """A wiki table between two words is removed; the words come back two line breaks apart."""
    # NOTE(review): any line breaks inside this literal are not visible in the
    # reviewed view - confirm the literal's layout against the original file.
    table_wikitext = """ foo {| class="wikitable" |- ! ! Útflutningur ! Innflutningur |- | 9. | {{bar}} | test |- |} bar """.strip()

    cleaned = clean(table_wikitext)

    assert cleaned == "foo\n\nbar"
def test_magic_words(self):
    """Magic words such as __TOC__ are removed; free-standing double underscores are kept."""
    expectations = {
        "foo__NOWYSIWYG__bar": "foobar",
        "foo\n\n__TOC__\nbar": "foo\n\n\nbar",
        # not a magic word: nothing may be removed
        "foo __ foo __ bar": "foo __ foo __ bar",
    }

    for wikitext, expected in expectations.items():
        assert clean(wikitext) == expected
def test_html(self):
    """Line-break tags are dropped at the end of text and act as a space between words."""
    expectations = {
        "foo bar": "foo bar",
        "foo<br>": "foo",
        "foo<br />": "foo",
        "foo<br />bar": "foo bar",
    }

    for markup, expected in expectations.items():
        assert clean(markup) == expected
def test_parser_hooks(self):
    """``<ref>`` and ``<sup>`` elements are removed together with their content."""
    with_reference = "foo<ref>link</ref>"
    with_superscript = "E = mc<sup>2</sup>"

    assert clean(with_reference) == "foo"
    assert clean(with_superscript) == "E = mc"
def test_basic(self):
    """Plain text passes through unchanged; italic and bold quote markers are stripped."""
    cases = (
        ("Foo bar.", "Foo bar."),
        ("foo bar", "foo bar"),
        ("''italic''", "italic"),
        ("'''bold'''", "bold"),
    )

    for wikitext, expected in cases:
        assert clean(wikitext) == expected
def test_external_links(self):
    """A bracketed external link is reduced to its label text."""
    bracketed_link = "[http://www.klaksvik.fo Heimasíðan hjá Klaksvíkar kommunu]"
    label_only = "Heimasíðan hjá Klaksvíkar kommunu"

    assert clean(bracketed_link) == label_only
def test_list(self):
    """List bullets at the start of a line are removed; an asterisk inside a line is kept."""
    cases = (
        # flat list, with and without a space after the bullet
        ("* 123\n*245\n* 346 * 789", "123\n245\n346 * 789"),
        # nested list item
        ("* 123\n** 245", "123\n245"),
    )

    for wikitext, expected in cases:
        assert clean(wikitext) == expected