Example No. 1
    def build(self,
              afm: AuxiliaryFileManager,
              *inputs: AuxiliaryFile
              ) -> Union[None, AuxiliaryFile, Tuple[AuxiliaryFile, ...]]:
        with afm.auxiliary_scope():
            outputs = inputs
            for builder in self.builders:
                outputs = builder.build(afm, *outputs)

                # Lock output auxiliary files so they are not deleted before
                # being passed to the next builder.
                if isinstance(outputs, AuxiliaryFile):
                    outputs.lock()
                    outputs = (outputs,)
                elif isinstance(outputs, tuple):
                    for af in outputs:
                        if not isinstance(af, AuxiliaryFile):
                            raise TypeError(f'element {type(af)} is not an '
                                            f'auxiliary file.')
                        af.lock()
                elif outputs is None:
                    outputs = tuple()
                else:
                    # If the output of the builder is not one of the allowed
                    # types (an auxiliary file, a tuple of auxiliary files, or
                    # None), raise an exception.
                    raise TypeError(f'output type {type(outputs)} from '
                                    f'builder is not allowed.')

                # Delete all unnecessary files except inputs and locked files.
                afm.clear()
        return outputs
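
The snippet above chains builders sequentially: each builder receives the previous builder's outputs, and the outputs are normalised to a tuple of locked auxiliary files before the intermediate clean-up. Below is a minimal, self-contained sketch of that chaining contract using stand-in classes; none of these names (FakeFile, UpperCase, Duplicate, chain) come from langumo.

from typing import Tuple, Union


class FakeFile:
    """Stand-in for AuxiliaryFile: just a name and a lock flag."""

    def __init__(self, name: str):
        self.name = name
        self.locked = False

    def lock(self):
        self.locked = True


class UpperCase:
    """Pretends to transform its single input into one output file."""

    def build(self, *inputs: FakeFile) -> FakeFile:
        return FakeFile(inputs[0].name.upper())


class Duplicate:
    """Pretends to produce two output files from one input."""

    def build(self, *inputs: FakeFile) -> Tuple[FakeFile, ...]:
        return FakeFile(inputs[0].name + '-a'), FakeFile(inputs[0].name + '-b')


def chain(builders, *inputs: FakeFile) -> Tuple[FakeFile, ...]:
    outputs: Union[FakeFile, Tuple[FakeFile, ...], None] = inputs
    for builder in builders:
        outputs = builder.build(*outputs)
        # Normalise to a tuple and lock, mirroring the snippet above.
        if isinstance(outputs, FakeFile):
            outputs = (outputs,)
        elif outputs is None:
            outputs = ()
        for af in outputs:
            af.lock()
    return outputs


print([af.name for af in chain([UpperCase(), Duplicate()], FakeFile('corpus'))])
# ['CORPUS-a', 'CORPUS-b']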
Example No. 2
    def build(self, afm: AuxiliaryFileManager, *inputs: AuxiliaryFile
              ) -> Tuple[AuxiliaryFile, ...]:
        outputs = tuple()
        for builder in self.builders:
            outputs += builder.build(afm, *inputs)
            afm.synchronize(outputs)

            # Lock input and stacked output auxiliary files.
            for af in inputs + outputs:
                af.lock()
            afm.clear()
        return outputs
Example No. 3
def test_shuffling_preserves_contents():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        corpus = afm.create()
        with corpus.open('w') as fp:
            fp.write('\n'.join([str(i) for i in range(1000)]) + '\n')

        # In this case, the size of each chunk is 1, which implies complete
        # random shuffling.
        with (ShuffleLines(best_seek_cnt=1000,
                           max_buckets=512).build(afm,
                                                  corpus).open('r')) as fp:
            assert {int(i) for i in fp.read().split()} == set(range(1000))

        # If `best_seek_cnt` is less than the total number of lines, the
        # shuffling is approximated by using chunks and their buckets.
        with (ShuffleLines(best_seek_cnt=100,
                           max_buckets=512).build(afm,
                                                  corpus).open('r')) as fp:
            assert {int(i) for i in fp.read().split()} == set(range(1000))

        with (ShuffleLines(best_seek_cnt=10,
                           max_buckets=512).build(afm,
                                                  corpus).open('r')) as fp:
            assert {int(i) for i in fp.read().split()} == set(range(1000))

        # However, if `max_buckets` is less than the optimum bucket count
        # (twice the optimum stride), only `max_buckets` buckets are used.
        # Note that this reduces randomness.
        with (ShuffleLines(best_seek_cnt=10,
                           max_buckets=64).build(afm, corpus).open('r')) as fp:
            assert {int(i) for i in fp.read().split()} == set(range(1000))
Example No. 4
def test_builder_splits_corpus_without_loss_of_contents():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        corpus = afm.create()
        with corpus.open('w') as fp:
            fp.write('\n'.join(str(i) for i in range(1000)))

        # Test the splitting builder with various ratios.
        tfile, vfile = SplitValidation(val_ratio=0.1).build(afm, corpus)
        with tfile.open('r') as tfp, vfile.open('r') as vfp:
            assert ([int(s.strip())
                     for s in vfp.readlines()] == list(range(100)))
            assert ([int(s.strip())
                     for s in tfp.readlines()] == list(range(100, 1000)))

        tfile, vfile = SplitValidation(val_ratio=0.27).build(afm, corpus)
        with tfile.open('r') as tfp, vfile.open('r') as vfp:
            assert ([int(s.strip())
                     for s in vfp.readlines()] == list(range(270)))
            assert ([int(s.strip())
                     for s in tfp.readlines()] == list(range(270, 1000)))

        tfile, vfile = SplitValidation(val_ratio=0.1387).build(afm, corpus)
        with tfile.open('r') as tfp, vfile.open('r') as vfp:
            assert ([int(s.strip())
                     for s in vfp.readlines()] == list(range(139)))
            assert ([int(s.strip())
                     for s in tfp.readlines()] == list(range(139, 1000)))
Example No. 5
    def build(self, afm: AuxiliaryFileManager,
              corpus: AuxiliaryFile) -> AuxiliaryFile:
        subset = self._create_subset_file(afm, corpus)

        # Create a WordPiece model with a normalizer and pre-tokenizer. Note
        # that the BERT-specific normalizer and pre-tokenizer are used here.
        tokenizer = Tokenizer(WordPiece())
        tokenizer.normalizer = BertNormalizer(strip_accents=False)
        tokenizer.pre_tokenizer = BertPreTokenizer()

        # Train the tokenizer model on a subset of the corpus.
        trainer = WordPieceTrainer(vocab_size=self.vocab_size,
                                   min_frequency=2,
                                   show_progress=True,
                                   limit_alphabet=self.limit_alphabet,
                                   special_tokens=[self.unk_token] +
                                   self.special_tokens,
                                   continuing_subword_prefix='##')
        tokenizer.train(trainer, [subset.name])

        # Save trained vocabulary to an auxiliary output file.
        vocab = afm.create()
        tokenizer.model.save(os.path.dirname(vocab.name))

        os.rename(os.path.join(os.path.dirname(vocab.name), 'vocab.txt'),
                  vocab.name)

        return vocab
Example No. 6
    def run(self, parent: str):
        with AuxiliaryFileManager(parent) as afm:
            self.build(afm)

            # After running the build pipeline, delete all created dummy
            # files, even though any remainders would be removed in the
            # manager's `__exit__`.
            afm.clear()
Example No. 7
    def build(self, afm: AuxiliaryFileManager,
              corpus: AuxiliaryFile) -> AuxiliaryFile:
        # Calculate the optimum stride and the number of buckets.
        stride = max(1,
                     self._total_lines_in_file(corpus) // self.best_seek_cnt)
        buckets = [
            afm.create() for _ in range(min(stride * 2, self.max_buckets))
        ]

        # Collect the corresponding seeking positions and shuffle them.
        offsets = self._collect_seek_offsets(corpus, stride)
        random.shuffle(offsets)

        with corpus.open('rb') as src, \
                AuxiliaryFile.opens(buckets, 'wb') as dsts:
            # Create a tqdm progress bar with a colorful description.
            tqdm_iter = tqdm.tqdm(
                offsets,
                desc=colorful.render('<r>[*]</r> shuffle raw corpus file'))

            for offset in tqdm_iter:
                src.seek(offset)

                for _ in range(stride):
                    line = src.readline()
                    if not line:
                        break

                    # Add a line-break character to the end of the text so it
                    # is not merged with the next line.
                    line += b'\n' if not line.endswith(b'\n') else b''

                    # Write the line to a randomly chosen bucket to ensure
                    # randomness.
                    dsts[random.randint(0, len(dsts) - 1)].write(line)

        # After splitting into buckets, merge them into a single file.
        merged = afm.create()
        with merged.open('wb') as dst, \
                AuxiliaryFile.opens(buckets, 'rb') as srcs:
            for src in srcs:
                shutil.copyfileobj(src, dst)

        return merged
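
The stride and bucket arithmetic at the top of this method can be checked against the parameters used by the ShuffleLines tests on this page. This is not library code, just a re-derivation of the numbers:

def plan(total_lines: int, best_seek_cnt: int, max_buckets: int):
    # Mirrors the first two statements of `build` above.
    stride = max(1, total_lines // best_seek_cnt)
    return stride, min(stride * 2, max_buckets)


# With a 1000-line corpus, as in the tests:
assert plan(1000, best_seek_cnt=1000, max_buckets=512) == (1, 2)
assert plan(1000, best_seek_cnt=100, max_buckets=512) == (10, 20)
assert plan(1000, best_seek_cnt=10, max_buckets=512) == (100, 200)
assert plan(1000, best_seek_cnt=10, max_buckets=64) == (100, 64)  # capped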
Example No. 8
def test_builder_collects_seeking_positions_correctly():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        # Create an auxiliary file with 1000 dummy lines.
        corpus = afm.create()
        with corpus.open('w') as fp:
            fp.write('hello world!\n' * 1000)

        builder = ShuffleLines()
        for s in range(1, 200):
            assert len(builder._collect_seek_offsets(corpus, s)) == 1000 // s
Example No. 9
def test_opening_multiple_auxiliary_files_at_once():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        files = [afm.create() for _ in range(10)]

        with AuxiliaryFile.opens(files, 'w') as fps:
            for i, fp in enumerate(fps):
                fp.write(f'{i}th file')

        with AuxiliaryFile.opens(files, 'r') as fps:
            for i, fp in enumerate(fps):
                assert fp.read() == f'{i}th file'
Example No. 10
def test_if_builder_adds_break_lines_automatically():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        files = [afm.create() for _ in range(10)]
        with AuxiliaryFile.opens(files, 'w') as fps:
            for i, fp in enumerate(fps):
                fp.write('\n'.join([str(i) for _ in range(100)]))

        with MergeFiles().build(afm, *files).open('r') as fp:
            assert (fp.read().split() == [
                str(i) for i in range(10) for _ in range(100)
            ])
Example No. 11
def test_merging_files_without_loss_of_contents():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        files = [afm.create() for _ in range(10)]
        with AuxiliaryFile.opens(files, 'w') as fps:
            for i, fp in enumerate(fps):
                fp.write(f'{i}\n' * 100)

        with MergeFiles().build(afm, *files).open('r') as fp:
            assert (fp.read().split() == [
                str(i) for i in range(10) for _ in range(100)
            ])
Example No. 12
def test_afm_creates_files_correctly():
    with tempfile.TemporaryDirectory() as tdir:
        with AuxiliaryFileManager(f'{tdir}/workspace') as afm:
            tfile = afm.create()

            with tfile.open('w') as fp:
                fp.write('hello world!')
            with tfile.open('r') as fp:
                assert fp.read() == 'hello world!'

            assert os.path.exists(tfile.name)
            assert _number_of_files_in_directory(f'{tdir}/workspace') == 1
        assert not os.path.exists(f'{tdir}/workspace')
Example No. 13
    def build(self, afm: AuxiliaryFileManager,
              corpus: AuxiliaryFile) -> Tuple[AuxiliaryFile, AuxiliaryFile]:
        train_dataset = afm.create()
        val_dataset = afm.create()

        total_lines = self._total_lines_in_file(corpus)
        print(
            colorful.render(f'<r>[*]</r> split validation corpus - '
                            f'<m>{math.ceil(total_lines * self.val_ratio)}'
                            f'</m> of <m>{total_lines}</m> lines'))

        with corpus.open('rb') as src, train_dataset.open('wb') as tdst, \
                val_dataset.open('wb') as vdst:
            # Write the first `total_lines * val_ratio` lines to the
            # validation dataset file.
            for i, line in enumerate(src):
                vdst.write(line)
                if i + 1 >= total_lines * self.val_ratio:
                    break

            # After writing the validation dataset, copy all remaining lines
            # to the train dataset.
            shutil.copyfileobj(src, tdst)

        return train_dataset, val_dataset
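
Since the loop above keeps writing until `i + 1` reaches `total_lines * val_ratio`, the validation split ends up with `ceil(total_lines * val_ratio)` lines. A quick re-derivation of the counts expected by the SplitValidation test on this page (not library code):

import math

total_lines = 1000
for val_ratio, expected in [(0.1, 100), (0.27, 270), (0.1387, 139)]:
    written = 0
    for i in range(total_lines):
        written += 1
        if i + 1 >= total_lines * val_ratio:
            break
    assert written == expected == math.ceil(total_lines * val_ratio)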
Example No. 14
    def _create_subset_file(self, afm: AuxiliaryFileManager,
                            af: AuxiliaryFile) -> AuxiliaryFile:
        subset = afm.create()
        with af.open('rb') as src, subset.open('wb') as dst:
            while True:
                line = src.readline()
                if not line:
                    break

                dst.write(line)

                # If the total amount of copied data exceeds `subset_size`,
                # stop copying data to the subset file.
                if src.tell() > self.subset_size:
                    break
        return subset
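
The loop above copies whole lines and only stops after the read position passes `subset_size`, so the line that crosses the boundary is still included (unless the corpus runs out first). Re-deriving the counts expected by the subset-file test on this page, where each dummy line 'hello world!\n' is 13 bytes (not library code):

line = b'hello world!\n'  # 13 bytes per line, 100 lines in the test corpus


def expected_lines(subset_size: int, total_lines: int = 100) -> int:
    copied, written = 0, 0
    for _ in range(total_lines):
        copied += len(line)
        written += 1
        if copied > subset_size:  # the crossing line is still written
            break
    return written


assert expected_lines(1024) == 79
assert expected_lines(128) == 10
assert expected_lines(2000) == 100  # the corpus runs out first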
Example No. 15
    def build(self, afm: AuxiliaryFileManager, *inputs: AuxiliaryFile
              ) -> AuxiliaryFile:
        merged = afm.create()

        print(colorful.render(f'<r>[*]</r> merge <m>{len(inputs)}</m> files '
                              f'into one'))
        with merged.open('wb') as dst, \
                AuxiliaryFile.opens(inputs, 'rb') as srcs:
            for src in srcs:
                for line in src:
                    # Add a line-break character to the end of the text so it
                    # is not merged with the next line.
                    line += b'\n' if not line.endswith(b'\n') else b''

                    dst.write(line)

        return merged
Example No. 16
def test_shuffling_without_break_line_in_last():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        corpus = afm.create()
        with corpus.open('w') as fp:
            # Note that we do not add a line-break character to the end of
            # the content.
            fp.write('\n'.join([str(i) for i in range(1000)]))

        with (ShuffleLines(best_seek_cnt=1000,
                           max_buckets=512).build(afm,
                                                  corpus).open('r')) as fp:
            assert {int(i) for i in fp.read().split()} == set(range(1000))

        with (ShuffleLines(best_seek_cnt=10,
                           max_buckets=64).build(afm, corpus).open('r')) as fp:
            assert {int(i) for i in fp.read().split()} == set(range(1000))
Example No. 17
def test_subword_tokenization():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        corpus = afm.create()
        with corpus.open('w') as fp:
            fp.write(_dummy_corpus_content)

        # Train WordPiece vocabulary and tokenize sentences.
        vocab = (TrainTokenizer(vocab_size=128,
                                limit_alphabet=64).build(afm, corpus))
        tokenized = (TokenizeSentences(unk_token='[UNK]').build(
            afm, corpus, vocab))

        # Test if the tokenization is correctly applied to the corpus. Note
        # that the tokenizer model will normalize the sentences.
        with tokenized.open('r') as fp:
            assert (fp.read().strip().replace('##', '').replace(
                ' ', '') == _dummy_corpus_content.lower().replace(' ', ''))
Example No. 18
def test_subset_file_creation():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        corpus = afm.create()
        with corpus.open('w') as fp:
            fp.write('hello world!\n' * 100)

        with (TrainTokenizer(subset_size=1024)._create_subset_file(
                afm, corpus).open('r')) as fp:
            assert len(fp.readlines()) == 79

        with (TrainTokenizer(subset_size=128)._create_subset_file(
                afm, corpus).open('r')) as fp:
            assert len(fp.readlines()) == 10

        with (TrainTokenizer(subset_size=2000)._create_subset_file(
                afm, corpus).open('r')) as fp:
            assert len(fp.readlines()) == 100
Example No. 19
def test_counting_lines_in_file():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        builder = SplitValidation()
        corpus = afm.create()

        # Test for the case of 10 lines.
        with corpus.open('w') as fp:
            fp.write('hello world!\n' * 10)
        assert builder._total_lines_in_file(corpus) == 10

        # Test for the case of 100 lines.
        with corpus.open('w') as fp:
            fp.write('hello world!\n' * 100)
        assert builder._total_lines_in_file(corpus) == 100

        # Test for the case of 1548 lines.
        with corpus.open('w') as fp:
            fp.write('hello world!\n' * 1548)
        assert builder._total_lines_in_file(corpus) == 1548
Example No. 20
def test_training_wordpiece_tokenizer():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        corpus = afm.create()
        with corpus.open('w') as fp:
            fp.write(_dummy_corpus_content)

        # Train WordPiece tokenizer and get vocabulary file.
        vocab = (TrainTokenizer(vocab_size=128,
                                limit_alphabet=64,
                                unk_token='[UNK]').build(afm, corpus))

        # Read subwords from the vocabulary file.
        with vocab.open('r') as fp:
            words = fp.readlines()

        # Check that the total number of words equals the vocabulary size and
        # that the vocabulary contains the unknown token.
        assert len(words) == 128
        assert words[0].strip() == '[UNK]'
Example No. 21
def test_afm_ignores_locked_files_in_clearing():
    with tempfile.TemporaryDirectory() as tdir:
        with AuxiliaryFileManager(f'{tdir}/workspace') as afm:
            for _ in range(10):
                afm.create()
            assert _number_of_files_in_directory(f'{tdir}/workspace') == 10

            for _ in range(5):
                afm.create().lock()
            assert _number_of_files_in_directory(f'{tdir}/workspace') == 15

            # All auxiliary files except the locked ones will be removed. Note
            # that the remaining files are unlocked at this point.
            afm.clear()
            assert _number_of_files_in_directory(f'{tdir}/workspace') == 5

            # As mentioned above, all unlocked files will be removed at this
            # point.
            afm.clear()
            assert _number_of_files_in_directory(f'{tdir}/workspace') == 0
        assert not os.path.exists(f'{tdir}/workspace')
Example No. 22
    def run(self, parent: str):
        """Execute the builder.

        All builders can be executed directly and independently, without any
        input auxiliary files. We recommend executing builders together with
        miscellaneous ones (e.g.
        :class:`ImportFrom <langumo.building.miscellaneous.ImportFrom>` and
        :class:`ExportTo <langumo.building.miscellaneous.ExportTo>`) so that
        build inputs are passed correctly.

        Args:
            parent: parent workspace directory which will contain all
                auxiliary files.
        """
        with AuxiliaryFileManager(parent) as afm:
            self.build(afm)

            # After running the build pipeline, delete all created dummy
            # files, even though any remainders would be removed in the
            # manager's `__exit__`.
            afm.clear()
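
A minimal usage sketch, assuming that a concrete builder such as `MergeFiles` (shown elsewhere on this page) inherits this `run` method; the workspace path is arbitrary, and the `ImportFrom`/`ExportTo` combination mentioned in the docstring is omitted here because their constructors are not shown on this page.

# Hypothetical usage: execute a single builder in its own workspace. With no
# inputs, MergeFiles simply produces an empty merged file, so this only
# demonstrates the workspace lifecycle handled by `run`.
MergeFiles().run('/tmp/langumo-workspace')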
Example No. 23
    def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile,
              vocab: AuxiliaryFile) -> AuxiliaryFile:
        total_lines = self._total_lines_in_file(corpus)

        # Create a WordPiece model and add the special tokens. Note that
        # `unk_token` is also a special token.
        tokenizer = Tokenizer(WordPiece(vocab.name, unk_token=self.unk_token))
        tokenizer.add_special_tokens(self.special_tokens + [self.unk_token])

        # Use BERT-specific normalizer, pre-tokenizer and decoder.
        tokenizer.normalizer = BertNormalizer(strip_accents=False)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.decoder = WordPieceDecoder(prefix='##')

        tokenized = afm.create()
        with corpus.open('r') as src, tokenized.open('w') as dst:
            # Create a tqdm progress bar with a colorful description.
            tqdm_iter = tqdm.tqdm(src,
                                  desc=colorful.render(
                                      '<r>[*]</r> tokenize sentences with '
                                      '<g>WordPiece</g> model'),
                                  total=total_lines)

            batch_lines = []
            for line in tqdm_iter:
                batch_lines.append(line)

                # Encode the batched sentences and write the tokenized
                # sentences to the auxiliary output file.
                if len(batch_lines) > self.batch_size:
                    for t in tokenizer.encode_batch(batch_lines):
                        dst.write(' '.join(t.tokens) + '\n')
                    batch_lines.clear()

            # Encode the remainders and write to the output file.
            if batch_lines:
                for t in tokenizer.encode_batch(batch_lines):
                    dst.write(' '.join(t.tokens) + '\n')

        return tokenized
Example No. 24
def test_afm_handles_files_separately_by_level():
    with tempfile.TemporaryDirectory() as tdir:
        with AuxiliaryFileManager(f'{tdir}/workspace') as afm:
            # Create auxiliary files in level 0.
            for _ in range(10):
                afm.create()
            assert _number_of_files_in_directory(f'{tdir}/workspace') == 10

            with afm.auxiliary_scope():
                # Create auxiliary files in level 1.
                for _ in range(5):
                    afm.create()
                assert _number_of_files_in_directory(f'{tdir}/workspace') == 15

                # Remove the auxiliary files in level 1.
                afm.clear()
                assert _number_of_files_in_directory(f'{tdir}/workspace') == 10

                # Create auxiliary files and lock some of them.
                for _ in range(2):
                    afm.create()
                for _ in range(3):
                    afm.create().lock()
                assert _number_of_files_in_directory(f'{tdir}/workspace') == 15

                with afm.auxiliary_scope():
                    # Create auxiliary files in level 2.
                    for _ in range(5):
                        afm.create()
                    assert (_number_of_files_in_directory(f'{tdir}/workspace')
                            == 20)

                # Not only the unlocked files but also the sub-level auxiliary
                # files will be removed.
                afm.clear()
                assert _number_of_files_in_directory(f'{tdir}/workspace') == 13

            afm.clear()
            assert _number_of_files_in_directory(f'{tdir}/workspace') == 0
Example No. 25
    def build(self, afm: AuxiliaryFileManager,
              raw: AuxiliaryFile) -> AuxiliaryFile:
        parsed = afm.create()
        self.parser.prepare(raw)

        # Create processes for parsing texts in parallel and a process for
        # collecting the parsed texts and saving them to the auxiliary file.
        from_queue, to_queue = Queue(), Queue()
        parsers = [
            Process(target=self._parse_worker,
                    args=(from_queue, to_queue),
                    daemon=True) for _ in range(self.num_workers)
        ]
        collector = Process(target=self._collect_worker,
                            args=(parsed, to_queue),
                            daemon=True)

        # Start the processes.
        print(
            colorful.render(f'<r>[*]</r> parse raw-formatted corpus file '
                            f'with <g>{self.parser.__class__.__name__}</g>'))

        for p in parsers:
            p.start()
        collector.start()

        # Feed the extracted raw-formatted documents to the parser processes.
        for document in self.parser.extract(raw):
            from_queue.put(document)
        for _ in range(self.num_workers):
            from_queue.put(None)

        # Wait for the processes to terminate.
        for p in parsers:
            p.join()
        collector.join()

        return parsed
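
The worker methods `_parse_worker` and `_collect_worker` are not shown on this page. Below is a self-contained sketch of the same fan-out/fan-in queue pattern, independent of langumo; every name in it is illustrative, and the sentinel handling is an assumption rather than langumo's actual implementation.

from multiprocessing import Process, Queue


def parse_worker(from_queue: Queue, to_queue: Queue):
    # Consume documents until the None sentinel, then tell the collector
    # that this worker is done.
    while True:
        document = from_queue.get()
        if document is None:
            to_queue.put(None)
            break
        to_queue.put(document.upper())  # stand-in for real parsing


def collect_worker(to_queue: Queue, num_workers: int, path: str):
    # Write parsed documents until every worker has sent its sentinel.
    finished = 0
    with open(path, 'w') as fp:
        while finished < num_workers:
            parsed = to_queue.get()
            if parsed is None:
                finished += 1
                continue
            fp.write(parsed + '\n')


if __name__ == '__main__':
    from_queue, to_queue = Queue(), Queue()
    num_workers = 2
    parsers = [Process(target=parse_worker, args=(from_queue, to_queue),
                       daemon=True) for _ in range(num_workers)]
    collector = Process(target=collect_worker,
                        args=(to_queue, num_workers, 'parsed.txt'),
                        daemon=True)

    for p in parsers:
        p.start()
    collector.start()

    for document in ['first doc', 'second doc', 'third doc']:
        from_queue.put(document)
    for _ in range(num_workers):
        from_queue.put(None)

    for p in parsers:
        p.join()
    collector.join()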
Example No. 26
def test_formatted_file_parsing():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        corpus = afm.create()
        with corpus.open('w') as fp:
            fp.write(_dummy_corpus_content)

        with (ParseRawFile(simple_parser(),
                           lang='en',
                           min_len=16,
                           max_len=512,
                           newline='[NEWLINE]',
                           num_workers=2).build(afm, corpus).open('r')) as fp:
            assert ({
                s.strip()
                for s in fp
            } == {
                'Wikipedia is a multilingual online encyclopedia created '
                'and maintained as an open collaboration project by a '
                'community of volunteer editors using a wiki-based '
                'editing system. It is the largest and most popular '
                'general reference work on the World Wide Web. It is '
                'also one of the 15 most popular websites ranked by '
                'Alexa, as of August 2020. It features exclusively free '
                'content and no commercial ads. It is hosted by the '
                'Wikimedia Foundation, a non-profit organization funded '
                'primarily through donations. [NEWLINE] Wikipedia was '
                'launched on January 15, 2001, and was created by Jimmy '
                'Wales and Larry Sanger.',
                'Sanger coined its name as a portmanteau of the terms '
                '"wiki" and "encyclopedia". Initially an '
                'English-language encyclopedia, versions of Wikipedia in '
                'other languages were quickly developed. With 6.1 '
                'million articles, the English Wikipedia is the largest '
                'of the more than 300 Wikipedia encyclopedias. Overall, '
                'Wikipedia comprises more than 54 million articles '
                'attracting 1.5 billion unique visitors per month.',
                'In 2005, Nature published a peer review comparing 42 '
                'hard science articles from Encyclopædia Britannica and '
                'Wikipedia and found that Wikipedia\'s level of accuracy '
                'approached that of Britannica, although critics '
                'suggested that it might not have fared so well in a '
                'similar study of a random sampling of all articles or '
                'one focused on social science or contentious social '
                'issues. The following year, Time stated that the '
                'open-door policy of allowing anyone to edit had made '
                'Wikipedia the biggest and possibly the best '
                'encyclopedia in the world, and was a testament to the '
                'vision of Jimmy Wales.',
                'Wikipedia has been criticized for exhibiting systemic '
                'bias and for being subject to manipulation and spin in '
                'controversial topics; Edwin Black has criticized '
                'Wikipedia for presenting a mixture of "truth, half '
                'truth, and some falsehoods". Wikipedia has also been '
                'criticized for gender bias, particularly on its '
                'English-language version, where the dominant majority '
                'of editors are male. However, edit-a-thons have been '
                'held to encourage female editors and increase the '
                'coverage of women\'s topics. Facebook announced that by '
                '2017 it would help readers detect fake news by '
                'suggesting links to related Wikipedia articles.',
                'YouTube announced a similar plan in 2018.'
            })
Example No. 27
def test_afm_context_manager():
    with tempfile.TemporaryDirectory() as tdir:
        with AuxiliaryFileManager(f'{tdir}/workspace'):
            assert os.path.exists(f'{tdir}/workspace')
        assert not os.path.exists(f'{tdir}/workspace')