Example #1
    def extract(self, raw: AuxiliaryFile) -> Iterable[str]:
        with raw.open('r') as fp:
            for line in fp:
                # Skip blank lines; yield everything else unchanged.
                if not line.strip():
                    continue

                yield line
Example #2
    def _collect_worker(self, parsed: AuxiliaryFile, to_queue: Queue):
        terminated = 0
        with parsed.open('w') as fp:
            while terminated < self.num_workers:
                text = to_queue.get()

                # A `None` item is a sentinel meaning that one worker has
                # finished; stop once every worker has sent its sentinel.
                if text is None:
                    terminated += 1
                    continue

                # Ensure the text ends with a line break before writing it
                # to the merged output file.
                text += '\n' if not text.endswith('\n') else ''
                fp.write(text)
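For context, here is a minimal sketch of the producer side this collector expects: each worker pushes its parsed texts onto the queue and then a single `None` sentinel, so the collector can stop after receiving `num_workers` sentinels. The worker function and the sample data below are hypothetical, not part of the original code.

from multiprocessing import Process, Queue

def parse_worker(texts, to_queue: Queue):
    # Hypothetical producer: push each parsed text, then a `None` sentinel
    # so the collector knows this worker has terminated.
    for text in texts:
        to_queue.put(text)
    to_queue.put(None)

if __name__ == '__main__':
    to_queue = Queue()
    chunks = [['first text'], ['second text', 'third text']]

    workers = [Process(target=parse_worker, args=(chunk, to_queue))
               for chunk in chunks]
    for w in workers:
        w.start()

    # Collector loop mirroring `_collect_worker`: stop after receiving one
    # sentinel per worker.
    terminated, collected = 0, []
    while terminated < len(workers):
        text = to_queue.get()
        if text is None:
            terminated += 1
            continue
        collected.append(text)

    for w in workers:
        w.join()
    print(collected)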
Example #3
    def extract(self, raw: AuxiliaryFile) -> Iterable[str]:
        with raw.open('r') as fp:
            for prefix, event, value in ijson.parse(fp):
                # Only values whose JSON prefix ends with `.text` contain
                # article bodies.
                if not prefix.endswith('.text'):
                    continue

                # Skip the redirection pages.
                if value.lower().strip().startswith('#redirect'):
                    continue

                yield value
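A small standalone sketch of the same `ijson` streaming pattern follows. The JSON document below is a made-up illustration, not the actual dump format: `ijson.parse` yields `(prefix, event, value)` tuples, so text fields can be filtered by prefix without loading the whole file.

import io

import ijson

# A tiny in-memory document standing in for a large JSON dump.
doc = io.BytesIO(
    b'{"pages": ['
    b'{"title": "A", "text": "Some article body."},'
    b'{"title": "B", "text": "#REDIRECT Other page"}'
    b']}')

# Stream over the parse events, keep only `.text` string values and skip
# redirection pages, just like the extractor above.
for prefix, event, value in ijson.parse(doc):
    if not prefix.endswith('.text'):
        continue
    if value.lower().strip().startswith('#redirect'):
        continue
    print(value)  # prints: Some article body.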
Example #4
    def _create_subset_file(self, afm: AuxiliaryFileManager,
                            af: AuxiliaryFile) -> AuxiliaryFile:
        subset = afm.create()
        with af.open('rb') as src, subset.open('wb') as dst:
            while True:
                line = src.readline()
                if not line:
                    break

                dst.write(line)

                # If the total amount of copied data exceeds `subset_size`,
                # stop copying data to the subset file.
                if src.tell() > self.subset_size:
                    break
        return subset
Example #5
    def _collect_seek_offsets(self, af: AuxiliaryFile,
                              stride: int) -> List[int]:
        offsets = []
        with af.open('rb') as fp:
            while True:
                current = fp.tell()

                # Read `stride` lines to advance to the end of the chunk.
                # If the last line is empty, the end of the file has been
                # reached and this (possibly partial) chunk is not recorded.
                lines = [fp.readline() for _ in range(stride)]
                if not lines[-1]:
                    break

                # Record the starting position of this chunk.
                offsets.append(current)
        return offsets
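A short usage sketch, assuming a plain file path rather than an `AuxiliaryFile`: once the offsets are collected, any chunk of up to `stride` lines can be read back in isolation by seeking to its recorded position.

from typing import List

def read_chunk(path: str, offset: int, stride: int) -> List[bytes]:
    # Jump straight to the chunk's starting byte and read up to `stride`
    # lines from there.
    lines = []
    with open(path, 'rb') as fp:
        fp.seek(offset)
        for _ in range(stride):
            line = fp.readline()
            if not line:
                break
            lines.append(line)
    return lines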
Example #6
    def build(self, afm: AuxiliaryFileManager,
              corpus: AuxiliaryFile) -> AuxiliaryFile:
        # Calculate the optimum stride and bucket size.
        stride = max(1,
                     self._total_lines_in_file(corpus) // self.best_seek_cnt)
        buckets = [
            afm.create() for _ in range(min(stride * 2, self.max_buckets))
        ]

        # Collect the corresponding seeking positions and shuffle them.
        offsets = self._collect_seek_offsets(corpus, stride)
        random.shuffle(offsets)

        with corpus.open('rb') as src, \
                AuxiliaryFile.opens(buckets, 'wb') as dsts:
            # Create tqdm progress bar with colorful description.
            tqdm_iter = tqdm.tqdm(
                offsets,
                desc=colorful.render('<r>[*]</r> shuffle raw corpus file'))

            for offset in tqdm_iter:
                src.seek(offset)

                for _ in range(stride):
                    line = src.readline()
                    if not line:
                        break

                    # Append a line-break character if it is missing, so the
                    # line does not get merged with the next one.
                    line += b'\n' if not line.endswith(b'\n') else b''

                    # Write the line to a randomly chosen bucket to ensure
                    # randomness.
                    dsts[random.randint(0, len(dsts) - 1)].write(line)

        # After splitting to the buckets, merge them into a single file.
        merged = afm.create()
        with merged.open('wb') as dst, \
                AuxiliaryFile.opens(buckets, 'rb') as srcs:
            for src in srcs:
                shutil.copyfileobj(src, dst)

        return merged
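The same two-stage idea can be written against ordinary files. The sketch below uses plain paths and `tempfile` instead of the `AuxiliaryFile` abstraction (an assumption of this sketch): lines are scattered into random temporary buckets and the buckets are then concatenated, so the whole corpus never has to fit in memory.

import random
import shutil
import tempfile

def shuffle_file(src_path: str, dst_path: str, num_buckets: int = 16):
    # Stage 1: scatter every line into a randomly chosen bucket file.
    buckets = [tempfile.TemporaryFile() for _ in range(num_buckets)]
    with open(src_path, 'rb') as src:
        for line in src:
            buckets[random.randrange(num_buckets)].write(line)

    # Stage 2: concatenate the buckets into the destination file. Lines keep
    # their relative order inside a bucket, so this is an approximate
    # (block-level) shuffle rather than a perfect one.
    with open(dst_path, 'wb') as dst:
        for bucket in buckets:
            bucket.seek(0)
            shutil.copyfileobj(bucket, dst)
            bucket.close()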
Example #7
    def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile,
              vocab: AuxiliaryFile) -> AuxiliaryFile:
        total_lines = self._total_lines_in_file(corpus)

        # Create WordPiece model and add special tokens. Note that `unk_token`
        # is also a special token.
        tokenizer = Tokenizer(WordPiece(vocab.name, unk_token=self.unk_token))
        tokenizer.add_special_tokens(self.special_tokens + [self.unk_token])

        # Use BERT-specific normalizer, pre-tokenizer and decoder.
        tokenizer.normalizer = BertNormalizer(strip_accents=False)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.decoder = WordPieceDecoder(prefix='##')

        tokenized = afm.create()
        with corpus.open('r') as src, tokenized.open('w') as dst:
            # Create tqdm progress bar with colorful description.
            tqdm_iter = tqdm.tqdm(src,
                                  desc=colorful.render(
                                      '<r>[*]</r> tokenize sentences with '
                                      '<g>WordPiece</g> model'),
                                  total=total_lines)

            batch_lines = []
            for line in tqdm_iter:
                batch_lines.append(line)

                # Encode the batched sentences and write the tokenized
                # results to the auxiliary output file.
                if len(batch_lines) > self.batch_size:
                    for t in tokenizer.encode_batch(batch_lines):
                        dst.write(' '.join(t.tokens) + '\n')
                    batch_lines.clear()

            # Encode the remaining sentences and write them to the output file.
            if batch_lines:
                for t in tokenizer.encode_batch(batch_lines):
                    dst.write(' '.join(t.tokens) + '\n')

        return tokenized
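A minimal usage sketch of the same `tokenizers` setup outside the pipeline. The vocabulary path, the special-token list and the sample sentences are assumptions, and depending on the installed `tokenizers` version the model may need to be built with `WordPiece.from_file` rather than the constructor used above.

from tokenizers import Tokenizer
from tokenizers.decoders import WordPiece as WordPieceDecoder
from tokenizers.models import WordPiece
from tokenizers.normalizers import BertNormalizer
from tokenizers.pre_tokenizers import BertPreTokenizer

# Build the tokenizer from an already trained vocabulary file.
tokenizer = Tokenizer(WordPiece.from_file('vocab.txt', unk_token='[UNK]'))
tokenizer.add_special_tokens(['[CLS]', '[SEP]', '[PAD]', '[MASK]', '[UNK]'])

# BERT-specific normalizer, pre-tokenizer and decoder, as in the builder.
tokenizer.normalizer = BertNormalizer(strip_accents=False)
tokenizer.pre_tokenizer = BertPreTokenizer()
tokenizer.decoder = WordPieceDecoder(prefix='##')

# Encode a small batch and join the sub-word tokens with spaces, which is
# exactly what the builder writes to the tokenized corpus file.
sentences = ['Hello world!', 'WordPiece splits rare words into sub-words.']
for encoding in tokenizer.encode_batch(sentences):
    print(' '.join(encoding.tokens))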
Example #8
    def build(self, afm: AuxiliaryFileManager,
              corpus: AuxiliaryFile) -> Tuple[AuxiliaryFile, AuxiliaryFile]:
        train_dataset = afm.create()
        val_dataset = afm.create()

        total_lines = self._total_lines_in_file(corpus)
        print(
            colorful.render(f'<r>[*]</r> split validation corpus - '
                            f'<m>{math.ceil(total_lines * self.val_ratio)}'
                            f'</m> of <m>{total_lines}</m> lines'))

        with corpus.open('rb') as src, train_dataset.open('wb') as tdst, \
                val_dataset.open('wb') as vdst:
            # Write the first `val_ratio` fraction of the lines to the
            # validation dataset file.
            for i, line in enumerate(src):
                vdst.write(line)
                if i + 1 >= total_lines * self.val_ratio:
                    break

            # After writing the validation dataset, copy all the remaining
            # lines to the train dataset.
            shutil.copyfileobj(src, tdst)

        return train_dataset, val_dataset
Example #9
    def _total_lines_in_file(self, af: AuxiliaryFile) -> int:
        # Count the lines by iterating over the file in binary mode, without
        # loading the whole file into memory.
        total_lines = 0
        with af.open('rb') as fp:
            for _ in fp:
                total_lines += 1
        return total_lines
Example #10
    def extract(self, raw: AuxiliaryFile) -> Iterable[str]:
        with raw.open('r') as fp:
            # Articles are separated by blank lines, so split the whole file
            # content on double line breaks.
            articles = fp.read().split('\n\n')
            yield from articles
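Reading the whole file with `fp.read()` keeps every article in memory at once. A streaming variant is sketched below (not part of the original code): it accumulates lines until a blank separator line and yields articles one by one, so only a single article is held in memory at a time.

from typing import Iterable

def extract_streaming(path: str) -> Iterable[str]:
    # Accumulate lines until a blank separator line, then yield the article
    # and start a new one.
    buffer = []
    with open(path, 'r') as fp:
        for line in fp:
            if line.strip():
                buffer.append(line)
            elif buffer:
                yield ''.join(buffer)
                buffer = []
    if buffer:
        yield ''.join(buffer)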