示例#1
0
def test_json_string_parser_extraction():
    """Check how many documents the escaped json-string parser extracts.

    The dummy resource holds 3 full articles; the remaining entries are empty
    and must not be counted as documents.
    """
    parser = EscapedJSONStringParser()

    # Wrap the dummy escaped json-string resource and let the parser prepare
    # itself with it before any extraction happens.
    dump = AuxiliaryFile(_get_resource_path('dummy.jsonstring.txt'))
    parser.prepare(dump)

    # Exhaust the extraction and count the documents that come out of it.
    document_count = len(list(parser.extract(dump)))

    assert document_count == 3
示例#2
0
 def build(self, afm: AuxiliaryFileManager, *inputs: AuxiliaryFile
           ) -> AuxiliaryFile:
     """Wrap every path in ``self.paths`` with `AuxiliaryFile`.

     Imported files are wrapped with `AuxiliaryFile` directly instead of
     being created through the `AuxiliaryFileManager`. They are simply
     brought in from existing external files, so the manager does not own
     them and they do not need to be removed.

     Neither ``afm`` nor ``inputs`` is used by this method.

     Returns:
         A tuple containing one `AuxiliaryFile` per entry of ``self.paths``,
         in the same order.

     NOTE(review): the return annotation names a single `AuxiliaryFile`
     while a tuple is returned -- confirm and adjust the annotation.
     """
     imported = []
     for source in self.paths:
         # Report each import before wrapping the path.
         message = f'<r>[*]</r> import file from <b>{source}</b>'
         print(colorful.render(message))
         imported.append(AuxiliaryFile(source))
     return tuple(imported)
示例#3
0
def test_wikipedia_parser_extraction():
    """Check how many documents the wikipedia parser extracts.

    The dummy dump holds 3 full articles; the other pages are redirections
    and must not be counted as documents.
    """
    parser = WikipediaParser()

    # Wrap the dummy wikipedia dump and let the parser prepare itself with it
    # before any extraction happens.
    dump = AuxiliaryFile(_get_resource_path('dummy.wiki.xml.bz2'))
    parser.prepare(dump)

    # Exhaust the extraction and count the documents that come out of it.
    document_count = len(list(parser.extract(dump)))

    assert document_count == 3
示例#4
0
def test_wikipedia_parser_preparation():
    """Check the namespaces collected while preparing the wikipedia parser."""
    # Namespaces which the parser is expected to expose after preparation,
    # in this exact order.
    expected_namespaces = [
        'Media', 'Special', 'Talk', 'User', 'User talk', 'Wikipedia',
        'Wikipedia talk', 'File', 'File talk', 'MediaWiki',
        'MediaWiki talk', 'Template', 'Template talk', 'Help',
        'Help talk', 'Category', 'Category talk', 'Portal',
        'Portal talk', 'Book', 'Book talk', 'Draft', 'Draft talk',
        'Education Program', 'Education Program talk', 'TimedText',
        'TimedText talk', 'Module', 'Module talk', 'Gadget',
        'Gadget talk', 'Gadget definition', 'Gadget definition talk',
    ]

    parser = WikipediaParser()

    # Prepare the parser with the dummy wikipedia dump.
    dump = AuxiliaryFile(_get_resource_path('dummy.wiki.xml.bz2'))
    parser.prepare(dump)

    assert parser.namespaces == expected_namespaces
示例#5
0
def test_if_parser_parses_mediawiki_codes_well():
    """Check the plain text produced from the dummy wikipedia dump.

    Every extracted document is parsed and falsy results are dropped. Exactly
    one article with the expected text must remain.
    """
    parser = WikipediaParser()

    # Prepare the parser with the dummy wikipedia dump.
    dump = AuxiliaryFile(_get_resource_path('dummy.wiki.xml.bz2'))
    parser.prepare(dump)

    # Parse the documents lazily, one at a time as they are extracted, and
    # keep only the non-empty results.
    parsed = (parser.parse(document) for document in parser.extract(dump))
    articles = [article for article in parsed if article]

    expected_article = (
        'Archer is a slab serif typeface designed in 2001 by '
        'Tobias Frere-Jones and Jonathan Hoefler for use in '
        'Martha Stewart Living magazine. It was later '
        'released by Hoefler & Frere-Jones for commercial '
        'licensing.\n'
        'The typeface is a geometric slab serif, one with a '
        'geometric design similar to sans-serif fonts. It '
        'takes inspiration from mid-twentieth century '
        'designs such as Rockwell.\n'
        'The face is unique for combining the geometric '
        'structure of twentieth-century European slab-serifs '
        'but imbuing the face with a domestic, less strident '
        'tone of voice. Balls were added to the upper '
        'terminals on letters such as C and G to increase '
        'its charm. Italics are true italic designs, with '
        'flourishes influenced by calligraphy, an unusual '
        'feature for geometric slab serif designs. As with '
        'many Hoefler & Frere-Jones designs, it was released '
        'in a wide range of weights from hairline to bold, '
        'reflecting its design goal as a typeface for '
        'complex magazines.\n'
        'The typeface has been used for, among other things, '
        'branding for Wells Fargo and is a main font for the '
        'San Francisco Chronicle and Wes Anderson\'s film '
        'The Grand Budapest Hotel.'
    )

    assert articles == [expected_article]
示例#6
0
def test_if_parser_parses_escaped_json_string_well():
    """Check the articles decoded from the dummy escaped json-string file.

    Every extracted document is parsed and falsy results are dropped. The
    three remaining articles must match the expected texts exactly.
    """
    parser = EscapedJSONStringParser()

    # Prepare the parser with the dummy escaped json-string resource.
    dump = AuxiliaryFile(_get_resource_path('dummy.jsonstring.txt'))
    parser.prepare(dump)

    # Parse the documents lazily, one at a time as they are extracted, and
    # keep only the non-empty results.
    parsed = (parser.parse(document) for document in parser.extract(dump))
    articles = [article for article in parsed if article]

    # NOTE(review): the first expected article contains 'op en' with a space
    # inside the word -- confirm that this matches the fixture content.
    expected_articles = [
        'Wikipedia is a multilingual online encyclopedia '
        'created and maintained as an op en collaboration '
        'project by a community of volunteer editors using a '
        'wiki-based editing system. It is the largest and '
        'most popular general reference work on the World '
        'Wide Web. It is also one of the 15 most popular '
        'websites as ranked by Alexa, as of August 2020. It '
        'features exclusively free content and has no '
        'advertising. It is hosted by the Wikimedia '
        'Foundation, an American non-profit organization '
        'funded primarily through donations.\nWikipedia was '
        'launched on January 15, 2001, and was created by '
        'Jimmy Wales and Larry Sanger. Sanger coined its '
        'name as a portmanteau of the terms "wiki" and '
        '"encyclopedia". Initially an English-language '
        'encyclopedia, versions of Wikipedia in other '
        'languages were quickly developed. With 6.2 million '
        'articles, the English Wikipedia is the largest of '
        'the more than 300 Wikipedia encyclopedias. Overall, '
        'Wikipedia comprises more than 54 million articles '
        'attracting 1.5 billion unique visitors per month.',
        'In 2005, Nature published a peer review comparing '
        '42 hard science articles from Encyclopædia '
        'Britannica and Wikipedia and found that '
        'Wikipedia\'s level of accuracy approached that of '
        'Britannica, although critics suggested that it '
        'might not have fared so well in a similar study of '
        'a random sampling of all articles or one focused on '
        'social science or contentious social issues. The '
        'following year, Time stated that the open-door '
        'policy of allowing anyone to edit had made '
        'Wikipedia the biggest and possibly the best '
        'encyclopedia in the world, and was a testament to '
        'the vision of Jimmy Wales.\nWikipedia has been '
        'criticized for exhibiting systemic bias and for '
        'being subject to manipulation and spin in '
        'controversial topics; Edwin Black has criticized '
        'Wikipedia for presenting a mixture of "truth, half '
        'truth, and some falsehoods". Wikipedia has also '
        'been criticized for gender bias, particularly on '
        'its English-language version, where the dominant '
        'majority of editors are male. However, edit-a-thons '
        'have been held to encourage female editors and '
        'increase the coverage of women\'s topics. Facebook '
        'announced that by 2017 it would help readers detect '
        'fake news by suggesting links to related Wikipedia '
        'articles. YouTube announced a similar plan in 2018.',
        'Other collaborative online encyclopedias were '
        'attempted before Wikipedia, but none were as '
        'successful. Wikipedia began as a complementary '
        'project for Nupedia, a free online English-language '
        'encyclopedia project whose articles were written by '
        'experts and reviewed under a formal process. It was '
        'founded on March 9, 2000, under the ownership of '
        'Bomis, a web portal company. Its main figures were '
        'Bomis CEO Jimmy Wales and Larry Sanger, '
        'editor-in-chief for Nupedia and later Wikipedia. '
        'Nupedia was initially licensed under its own '
        'Nupedia Open Content License, but even before '
        'Wikipedia was founded, Nupedia switched to the GNU '
        'Free Documentation License at the urging of Richard '
        'Stallman. Wales is credited with defining the goal '
        'of making a publicly editable encyclopedia, while '
        'Sanger is credited with the strategy of using a '
        'wiki to reach that goal. On January 10, 2001, '
        'Sanger proposed on the Nupedia mailing list to '
        'create a wiki as a "feeder" project for Nupedia.\n'
        'The domains wikipedia.com and wikipedia.org were '
        'registered on January 12, 2001 and January 13, 2001 '
        'respectively, and Wikipedia was launched on January '
        '15, 2001, as a single English-language edition at '
        'www.wikipedia.com, and announced by Sanger on the '
        'Nupedia mailing list. Wikipedia\'s policy of '
        '"neutral point-of-view" was codified in its first '
        'few months. Otherwise, there were relatively few '
        'rules initially and Wikipedia operated '
        'independently of Nupedia. Originally, Bomis '
        'intended to make Wikipedia a business for profit.'
    ]

    assert articles == expected_articles