Example #1
def test_opening_multiple_auxiliary_files_at_once():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        files = [afm.create() for _ in range(10)]

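        # Write a distinct text to each of the opened files at once.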
        with AuxiliaryFile.opens(files, 'w') as fps:
            for i, fp in enumerate(fps):
                fp.write(f'{i}th file')

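        # Reopen the files for reading and verify the written contents.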
        with AuxiliaryFile.opens(files, 'r') as fps:
            for i, fp in enumerate(fps):
                assert fp.read() == f'{i}th file'
Example #2
    def extract(self, raw: AuxiliaryFile) -> Iterable[str]:
        with raw.open('r') as fp:
            for line in fp:
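                # Skip blank lines and yield every other line as-is.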
                if not line.strip():
                    continue

                yield line
Example #3
    def build(self, afm: AuxiliaryFileManager,
              corpus: AuxiliaryFile) -> AuxiliaryFile:
        # Calculate the optimum stride and bucket size.
        stride = max(1,
                     self._total_lines_in_file(corpus) // self.best_seek_cnt)
        buckets = [
            afm.create() for _ in range(min(stride * 2, self.max_buckets))
        ]

        # Collect the corresponding seeking positions and shuffle them.
        offsets = self._collect_seek_offsets(corpus, stride)
        random.shuffle(offsets)

        with corpus.open('rb') as src, \
                AuxiliaryFile.opens(buckets, 'wb') as dsts:
            # Create tqdm progress bar with colorful description.
            tqdm_iter = tqdm.tqdm(
                offsets,
                desc=colorful.render('<r>[*]</r> shuffle raw corpus file'))

            for offset in tqdm_iter:
                src.seek(offset)

                for _ in range(stride):
                    line = src.readline()
                    if not line:
                        break

                    # Append a line-break character to the end of the text so
                    # that it is not merged with the next line.
                    line += b'\n' if not line.endswith(b'\n') else b''

                    # Write the line to a randomly chosen bucket to ensure
                    # randomness.
                    dsts[random.randint(0, len(dsts) - 1)].write(line)

        # After splitting to the buckets, merge them into a single file.
        merged = afm.create()
        with merged.open('wb') as dst, \
                AuxiliaryFile.opens(buckets, 'rb') as srcs:
            for src in srcs:
                shutil.copyfileobj(src, dst)

        return merged
Example #4
    def extract(self, raw: AuxiliaryFile) -> Iterable[str]:
        with raw.open('r') as fp:
            for prefix, event, value in ijson.parse(fp):
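                # Only consider parse events for a page's text field.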
                if not prefix.endswith('.text'):
                    continue

                # Skip the redirection pages.
                if value.lower().strip().startswith('#redirect'):
                    continue

                yield value
Example #5
    def _collect_worker(self, parsed: AuxiliaryFile, to_queue: Queue):
        terminated = 0
        with parsed.open('w') as fp:
            while terminated < self.num_workers:
                text = to_queue.get()
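                # A `None` sentinel marks one worker as finished.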
                if text is None:
                    terminated += 1
                    continue

                text += '\n' if not text.endswith('\n') else ''
                fp.write(text)
Example #6
    def build(self, afm: AuxiliaryFileManager, *inputs: AuxiliaryFile
              ) -> Tuple[AuxiliaryFile, ...]:
        # Note that the imported files are wrapped with `AuxiliaryFile`
        # directly. Because the files are not created by
        # `AuxiliaryFileManager` but simply refer to existing external files,
        # they do not need to be removed. That is, the manager does not have
        # ownership of them.
        files = []
        for path in self.paths:
            print(colorful.render(f'<r>[*]</r> import file from '
                                  f'<b>{path}</b>'))
            files.append(AuxiliaryFile(path))
        return tuple(files)
Example #7
def test_wikipedia_parser_extraction():
    parser = WikipediaParser()

    # Load a dummy wikipedia dump file.
    raw = AuxiliaryFile(_get_resource_path('dummy.wiki.xml.bz2'))
    parser.prepare(raw)

    # Extract documents from the parser.
    documents = list(parser.extract(raw))

    # The dummy dump file contains 3 full articles; the remaining pages are
    # redirections.
    assert len(documents) == 3
Example #8
def test_if_builder_adds_break_lines_automatically():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        files = [afm.create() for _ in range(10)]
        with AuxiliaryFile.opens(files, 'w') as fps:
            for i, fp in enumerate(fps):
                fp.write('\n'.join([str(i) for _ in range(100)]))

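        # The builder should add the missing line breaks while merging.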
        with MergeFiles().build(afm, *files).open('r') as fp:
            assert (fp.read().split() == [
                str(i) for i in range(10) for _ in range(100)
            ])
Example #9
def test_merging_files_without_loss_of_contents():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        files = [afm.create() for _ in range(10)]
        with AuxiliaryFile.opens(files, 'w') as fps:
            for i, fp in enumerate(fps):
                fp.write(f'{i}\n' * 100)

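        # Merging must not drop or duplicate any line of the source files.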
        with MergeFiles().build(afm, *files).open('r') as fp:
            assert (fp.read().split() == [
                str(i) for i in range(10) for _ in range(100)
            ])
Example #10
def test_json_string_parser_extraction():
    parser = EscapedJSONStringParser()

    # Load a dummy escaped json-string file.
    raw = AuxiliaryFile(_get_resource_path('dummy.jsonstring.txt'))
    parser.prepare(raw)

    # Extract documents from the parser.
    documents = list(parser.extract(raw))

    # The dummy file contains 3 full articles; the other entries are empty.
    assert len(documents) == 3
Example #11
    def _create_subset_file(self, afm: AuxiliaryFileManager,
                            af: AuxiliaryFile) -> AuxiliaryFile:
        subset = afm.create()
        with af.open('rb') as src, subset.open('wb') as dst:
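            # Copy lines one by one until the subset reaches the target size.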
            while True:
                line = src.readline()
                if not line:
                    break

                dst.write(line)

                # If the total amount of copied data exceeds `subset_size`,
                # stop copying to the subset file.
                if src.tell() > self.subset_size:
                    break
        return subset
Example #12
    def _collect_seek_offsets(self, af: AuxiliaryFile,
                              stride: int) -> List[int]:
        offsets = []
        with af.open('rb') as fp:
            while True:
                current = fp.tell()

                # Read `stride` lines to move to the end of the chunk. If the
                # last line in the chunk is empty, the current chunk is the
                # last one in the file.
                lines = [fp.readline() for _ in range(stride)]
                if not lines[-1]:
                    break

                # Gather the current position to the collection.
                offsets.append(current)
        return offsets
Example #13
def test_wikipedia_parser_preparation():
    parser = WikipediaParser()

    # Load a dummy wikipedia dump file.
    raw = AuxiliaryFile(_get_resource_path('dummy.wiki.xml.bz2'))
    parser.prepare(raw)

    # Check if the parser extracts the namespaces in wikipedia correctly.
    assert (parser.namespaces
            == ['Media', 'Special', 'Talk', 'User', 'User talk', 'Wikipedia',
                'Wikipedia talk', 'File', 'File talk', 'MediaWiki',
                'MediaWiki talk', 'Template', 'Template talk', 'Help',
                'Help talk', 'Category', 'Category talk', 'Portal',
                'Portal talk', 'Book', 'Book talk', 'Draft', 'Draft talk',
                'Education Program', 'Education Program talk', 'TimedText',
                'TimedText talk', 'Module', 'Module talk', 'Gadget',
                'Gadget talk', 'Gadget definition', 'Gadget definition talk'])
Example #14
    def build(self, afm: AuxiliaryFileManager, *inputs: AuxiliaryFile
              ) -> AuxiliaryFile:
        merged = afm.create()

        print(colorful.render(f'<r>[*]</r> merge <m>{len(inputs)}</m> files '
                              f'into one'))
        with merged.open('wb') as dst, \
                AuxiliaryFile.opens(inputs, 'rb') as srcs:
            for src in srcs:
                for line in src:
                    # Append a line-break character to the end of the text so
                    # that it is not merged with the next line.
                    line += b'\n' if not line.endswith(b'\n') else b''

                    dst.write(line)

        return merged
Example #15
    def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile,
              vocab: AuxiliaryFile) -> AuxiliaryFile:
        total_lines = self._total_lines_in_file(corpus)

        # Create WordPiece model and add special tokens. Note that `unk_token`
        # is also a special token.
        tokenizer = Tokenizer(WordPiece(vocab.name, unk_token=self.unk_token))
        tokenizer.add_special_tokens(self.special_tokens + [self.unk_token])

        # Use BERT-specific normalizer, pre-tokenizer and decoder.
        tokenizer.normalizer = BertNormalizer(strip_accents=False)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.decoder = WordPieceDecoder(prefix='##')

        tokenized = afm.create()
        with corpus.open('r') as src, tokenized.open('w') as dst:
            # Create tqdm progress bar with colorful description.
            tqdm_iter = tqdm.tqdm(src,
                                  desc=colorful.render(
                                      '<r>[*]</r> tokenize sentences with '
                                      '<g>WordPiece</g> model'),
                                  total=total_lines)

            batch_lines = []
            for line in tqdm_iter:
                batch_lines.append(line)

                # Encode the grouped batch sentences and write the tokenized
                # sentences to the auxiliary output file.
                if len(batch_lines) > self.batch_size:
                    for t in tokenizer.encode_batch(batch_lines):
                        dst.write(' '.join(t.tokens) + '\n')
                    batch_lines.clear()

            # Encode the remainders and write to the output file.
            if batch_lines:
                for t in tokenizer.encode_batch(batch_lines):
                    dst.write(' '.join(t.tokens) + '\n')

        return tokenized
Example #16
def test_if_parser_parses_mediawiki_codes_well():
    parser = WikipediaParser()

    # Load a dummy wikipedia dump file.
    raw = AuxiliaryFile(_get_resource_path('dummy.wiki.xml.bz2'))
    parser.prepare(raw)

    # Extract documents and parse the mediawiki codes.
    articles = []
    for document in parser.extract(raw):
        article = parser.parse(document)
        if article:
            articles.append(article)

    assert (articles == ['Archer is a slab serif typeface designed in 2001 by '
                         'Tobias Frere-Jones and Jonathan Hoefler for use in '
                         'Martha Stewart Living magazine. It was later '
                         'released by Hoefler & Frere-Jones for commercial '
                         'licensing.\n'
                         'The typeface is a geometric slab serif, one with a '
                         'geometric design similar to sans-serif fonts. It '
                         'takes inspiration from mid-twentieth century '
                         'designs such as Rockwell.\n'
                         'The face is unique for combining the geometric '
                         'structure of twentieth-century European slab-serifs '
                         'but imbuing the face with a domestic, less strident '
                         'tone of voice. Balls were added to the upper '
                         'terminals on letters such as C and G to increase '
                         'its charm. Italics are true italic designs, with '
                         'flourishes influenced by calligraphy, an unusual '
                         'feature for geometric slab serif designs. As with '
                         'many Hoefler & Frere-Jones designs, it was released '
                         'in a wide range of weights from hairline to bold, '
                         'reflecting its design goal as a typeface for '
                         'complex magazines.\n'
                         'The typeface has been used for, among other things, '
                         'branding for Wells Fargo and is a main font for the '
                         'San Francisco Chronicle and Wes Anderson\'s film '
                         'The Grand Budapest Hotel.'])
Example #17
    def build(self, afm: AuxiliaryFileManager,
              corpus: AuxiliaryFile) -> Tuple[AuxiliaryFile, AuxiliaryFile]:
        train_dataset = afm.create()
        val_dataset = afm.create()

        total_lines = self._total_lines_in_file(corpus)
        print(
            colorful.render(f'<r>[*]</r> split validation corpus - '
                            f'<m>{math.ceil(total_lines * self.val_ratio)}'
                            f'</m> of <m>{total_lines}</m> lines'))

        with corpus.open('rb') as src, train_dataset.open('wb') as tdst, \
                val_dataset.open('wb') as vdst:
            # Write the first `val_ratio` fraction of the lines to the
            # validation dataset file.
            for i, line in enumerate(src):
                vdst.write(line)
                if i + 1 >= total_lines * self.val_ratio:
                    break

            # After writing the validation dataset, copy all remaining lines
            # to the train dataset.
            shutil.copyfileobj(src, tdst)

        return train_dataset, val_dataset
Example #18
    def _total_lines_in_file(self, af: AuxiliaryFile) -> int:
        # Count the lines by iterating over the file in binary mode.
        total_lines = 0
        with af.open('rb') as fp:
            for _ in fp:
                total_lines += 1
        return total_lines
Example #19
    def build(self, afm, *inputs):
        # Write each of the predefined texts to its own auxiliary file.
        files = [afm.create() for _ in self.texts]
        with AuxiliaryFile.opens(files, 'w') as fps:
            for i, fp in enumerate(fps):
                fp.write(self.texts[i])
        return tuple(files)
Example #20
def test_if_parser_parses_escaped_json_string_well():
    parser = EscapedJSONStringParser()

    # Load a dummy escaped json-string file.
    raw = AuxiliaryFile(_get_resource_path('dummy.jsonstring.txt'))
    parser.prepare(raw)

    # Extract documents and parse the json-encoded strings.
    articles = []
    for document in parser.extract(raw):
        article = parser.parse(document)
        if article:
            articles.append(article)

    assert (articles == [
        'Wikipedia is a multilingual online encyclopedia '
        'created and maintained as an open collaboration '
        'project by a community of volunteer editors using a '
        'wiki-based editing system. It is the largest and '
        'most popular general reference work on the World '
        'Wide Web. It is also one of the 15 most popular '
        'websites as ranked by Alexa, as of August 2020. It '
        'features exclusively free content and has no '
        'advertising. It is hosted by the Wikimedia '
        'Foundation, an American non-profit organization '
        'funded primarily through donations.\nWikipedia was '
        'launched on January 15, 2001, and was created by '
        'Jimmy Wales and Larry Sanger. Sanger coined its '
        'name as a portmanteau of the terms "wiki" and '
        '"encyclopedia". Initially an English-language '
        'encyclopedia, versions of Wikipedia in other '
        'languages were quickly developed. With 6.2 million '
        'articles, the English Wikipedia is the largest of '
        'the more than 300 Wikipedia encyclopedias. Overall, '
        'Wikipedia comprises more than 54 million articles '
        'attracting 1.5 billion unique visitors per month.',
        'In 2005, Nature published a peer review comparing '
        '42 hard science articles from Encyclopædia '
        'Britannica and Wikipedia and found that '
        'Wikipedia\'s level of accuracy approached that of '
        'Britannica, although critics suggested that it '
        'might not have fared so well in a similar study of '
        'a random sampling of all articles or one focused on '
        'social science or contentious social issues. The '
        'following year, Time stated that the open-door '
        'policy of allowing anyone to edit had made '
        'Wikipedia the biggest and possibly the best '
        'encyclopedia in the world, and was a testament to '
        'the vision of Jimmy Wales.\nWikipedia has been '
        'criticized for exhibiting systemic bias and for '
        'being subject to manipulation and spin in '
        'controversial topics; Edwin Black has criticized '
        'Wikipedia for presenting a mixture of "truth, half '
        'truth, and some falsehoods". Wikipedia has also '
        'been criticized for gender bias, particularly on '
        'its English-language version, where the dominant '
        'majority of editors are male. However, edit-a-thons '
        'have been held to encourage female editors and '
        'increase the coverage of women\'s topics. Facebook '
        'announced that by 2017 it would help readers detect '
        'fake news by suggesting links to related Wikipedia '
        'articles. YouTube announced a similar plan in 2018.',
        'Other collaborative online encyclopedias were '
        'attempted before Wikipedia, but none were as '
        'successful. Wikipedia began as a complementary '
        'project for Nupedia, a free online English-language '
        'encyclopedia project whose articles were written by '
        'experts and reviewed under a formal process. It was '
        'founded on March 9, 2000, under the ownership of '
        'Bomis, a web portal company. Its main figures were '
        'Bomis CEO Jimmy Wales and Larry Sanger, '
        'editor-in-chief for Nupedia and later Wikipedia. '
        'Nupedia was initially licensed under its own '
        'Nupedia Open Content License, but even before '
        'Wikipedia was founded, Nupedia switched to the GNU '
        'Free Documentation License at the urging of Richard '
        'Stallman. Wales is credited with defining the goal '
        'of making a publicly editable encyclopedia, while '
        'Sanger is credited with the strategy of using a '
        'wiki to reach that goal. On January 10, 2001, '
        'Sanger proposed on the Nupedia mailing list to '
        'create a wiki as a "feeder" project for Nupedia.\n'
        'The domains wikipedia.com and wikipedia.org were '
        'registered on January 12, 2001 and January 13, 2001 '
        'respectively, and Wikipedia was launched on January '
        '15, 2001, as a single English-language edition at '
        'www.wikipedia.com, and announced by Sanger on the '
        'Nupedia mailing list. Wikipedia\'s policy of '
        '"neutral point-of-view" was codified in its first '
        'few months. Otherwise, there were relatively few '
        'rules initially and Wikipedia operated '
        'independently of Nupedia. Originally, Bomis '
        'intended to make Wikipedia a business for profit.'
    ])
Example #21
    def extract(self, raw: AuxiliaryFile) -> Iterable[str]:
        with raw.open('r') as fp:
            # Documents are separated by blank lines in the raw corpus file.
            articles = fp.read().split('\n\n')
            yield from articles
Example #22
    def build(self, afm, *inputs):
        assert len(self.texts) == len(inputs)

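        # Every input file should contain exactly its expected text.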
        with AuxiliaryFile.opens(inputs, 'r') as fps:
            for text, fp in zip(self.texts, fps):
                assert fp.read() == text