Exemplo n.º 1
0
    def build(self, afm: AuxiliaryFileManager,
              corpus: AuxiliaryFile) -> AuxiliaryFile:
        subset = self._create_subset_file(afm, corpus)

        # Create WordPiece model with a normalizer and pre-tokenizer. Note that
        # BERT-specific normalizer and pre-tokenizer are used in this model.
        tokenizer = Tokenizer(WordPiece())
        tokenizer.normalizer = BertNormalizer(strip_accents=False)
        tokenizer.pre_tokenizer = BertPreTokenizer()

        # Train tokenizer model with subset of corpus.
        trainer = WordPieceTrainer(vocab_size=self.vocab_size,
                                   min_frequency=2,
                                   show_progress=True,
                                   limit_alphabet=self.limit_alphabet,
                                   special_tokens=[self.unk_token] +
                                   self.special_tokens,
                                   continuing_subword_prefix='##')
        tokenizer.train(trainer, [subset.name])

        # Save trained vocabulary to an auxiliary output file.
        vocab = afm.create()
        tokenizer.model.save(os.path.dirname(vocab.name))

        os.rename(os.path.join(os.path.dirname(vocab.name), 'vocab.txt'),
                  vocab.name)

        return vocab
Exemplo n.º 2
0
    def build(self, afm: AuxiliaryFileManager,
              corpus: AuxiliaryFile) -> AuxiliaryFile:
        # Calculate the optimum stride and bucket size.
        stride = max(1,
                     self._total_lines_in_file(corpus) // self.best_seek_cnt)
        buckets = [
            afm.create() for _ in range(min(stride * 2, self.max_buckets))
        ]

        # Collect the corresponding seeking positions and shuffle them.
        offsets = self._collect_seek_offsets(corpus, stride)
        random.shuffle(offsets)

        with corpus.open('rb') as src, \
                AuxiliaryFile.opens(buckets, 'wb') as dsts:
            # Create tqdm progress bar with colorful description.
            tqdm_iter = tqdm.tqdm(
                offsets,
                desc=colorful.render('<r>[*]</r> shuffle raw corpus file'))

            for offset in tqdm_iter:
                src.seek(offset)

                for _ in range(stride):
                    line = src.readline()
                    if not line:
                        break

                    # Add break-line character to the end of text to avoid
                    # being merged with other line.
                    line += b'\n' if not line.endswith(b'\n') else b''

                    # Write the decorated text line to the random bucket for
                    # ensuring randomness.
                    dsts[random.randint(0, len(dsts) - 1)].write(line)

        # After splitting to the buckets, merge them into a single file.
        merged = afm.create()
        with merged.open('wb') as dst, \
                AuxiliaryFile.opens(buckets, 'rb') as srcs:
            for src in srcs:
                shutil.copyfileobj(src, dst)

        return merged
Exemplo n.º 3
0
    def build(self, afm: AuxiliaryFileManager,
              corpus: AuxiliaryFile) -> Tuple[AuxiliaryFile, AuxiliaryFile]:
        train_dataset = afm.create()
        val_dataset = afm.create()

        total_lines = self._total_lines_in_file(corpus)
        print(
            colorful.render(f'<r>[*]</r> split validation corpus - '
                            f'<m>{math.ceil(total_lines * self.val_ratio)}'
                            f'</m> of <m>{total_lines}</m> lines'))

        with corpus.open('rb') as src, train_dataset.open('wb') as tdst, \
                val_dataset.open('wb') as vdst:
            # Write the first `val_ratio` lines to the validation dataset file.
            for i, line in enumerate(src):
                vdst.write(line)
                if i + 1 >= total_lines * self.val_ratio:
                    break

            # After writing the validation dataset, copy the entire rest lines
            # to the train dataset.
            shutil.copyfileobj(src, tdst)

        return train_dataset, val_dataset
Exemplo n.º 4
0
    def _create_subset_file(self, afm: AuxiliaryFileManager,
                            af: AuxiliaryFile) -> AuxiliaryFile:
        subset = afm.create()
        with af.open('rb') as src, subset.open('wb') as dst:
            while True:
                line = src.readline()
                if not line:
                    break

                dst.write(line)

                # If total amount of copied data is more than `subset_size`
                # then stop copying data to the subset file.
                if src.tell() > self.subset_size:
                    break
        return subset
Exemplo n.º 5
0
    def build(self, afm: AuxiliaryFileManager, *inputs: AuxiliaryFile
              ) -> AuxiliaryFile:
        merged = afm.create()

        print(colorful.render(f'<r>[*]</r> merge <m>{len(inputs)}</m> files '
                              f'into one'))
        with merged.open('wb') as dst, \
                AuxiliaryFile.opens(inputs, 'rb') as srcs:
            for src in srcs:
                for line in src:
                    # Add break-line character to the end of text to avoid
                    # being merged with other line.
                    line += b'\n' if not line.endswith(b'\n') else b''

                    dst.write(line)

        return merged
Exemplo n.º 6
0
    def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile,
              vocab: AuxiliaryFile) -> AuxiliaryFile:
        total_lines = self._total_lines_in_file(corpus)

        # Create WordPiece model and add special tokens. Note that `unk_token`
        # is also a special token.
        tokenizer = Tokenizer(WordPiece(vocab.name, unk_token=self.unk_token))
        tokenizer.add_special_tokens(self.special_tokens + [self.unk_token])

        # Use BERT-specific normalizer, pre-tokenizer and decoder.
        tokenizer.normalizer = BertNormalizer(strip_accents=False)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.decoder = WordPieceDecoder(prefix='##')

        tokenized = afm.create()
        with corpus.open('r') as src, tokenized.open('w') as dst:
            # Create tqdm progress bar with colorful description.
            tqdm_iter = tqdm.tqdm(src,
                                  desc=colorful.render(
                                      '<r>[*]</r> tokenize sentences with '
                                      '<g>WordPiece</g> model'),
                                  total=total_lines)

            batch_lines = []
            for line in tqdm_iter:
                batch_lines.append(line)

                # Encode the grouped batch sentences and write the tokenized
                # sentences to the auxiliary output file.
                if len(batch_lines) > self.batch_size:
                    for t in tokenizer.encode_batch(batch_lines):
                        dst.write(' '.join(t.tokens) + '\n')
                    batch_lines.clear()

            # Encode the remainders and write to the output file.
            if batch_lines:
                for t in tokenizer.encode_batch(batch_lines):
                    dst.write(' '.join(t.tokens) + '\n')

        return tokenized
Exemplo n.º 7
0
    def build(self, afm: AuxiliaryFileManager,
              raw: AuxiliaryFile) -> AuxiliaryFile:
        parsed = afm.create()
        self.parser.prepare(raw)

        # Create processes for parsing texts in parallel and a process for
        # collecting the parsed texts and saving to the auxiliary file.
        from_queue, to_queue = Queue(), Queue()
        parsers = [
            Process(target=self._parse_worker,
                    args=(from_queue, to_queue),
                    daemon=True) for _ in range(self.num_workers)
        ]
        collector = Process(target=self._collect_worker,
                            args=(parsed, to_queue),
                            daemon=True)

        # Start the processes.
        print(
            colorful.render(f'<r>[*]</r> parse raw-formatted corpus file '
                            f'with <g>{self.parser.__class__.__name__}</g>'))

        for p in parsers:
            p.start()
        collector.start()

        # Feed the extracted raw-formatted document to each parser process.
        for document in self.parser.extract(raw):
            from_queue.put(document)
        for _ in range(self.num_workers):
            from_queue.put(None)

        # Wait for terminating the processes.
        for p in parsers:
            p.join()
        collector.join()

        return parsed