def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile) -> AuxiliaryFile:
    """Train a WordPiece vocabulary on a size-limited subset of the corpus.

    Returns an auxiliary file containing the trained ``vocab.txt``.
    """
    subset = self._create_subset_file(afm, corpus)

    # Assemble a WordPiece tokenizer that mirrors BERT's text pipeline:
    # BERT normalization (accents kept) plus BERT pre-tokenization.
    wp_tokenizer = Tokenizer(WordPiece())
    wp_tokenizer.normalizer = BertNormalizer(strip_accents=False)
    wp_tokenizer.pre_tokenizer = BertPreTokenizer()

    # Fit the model on the subset file only, keeping training tractable.
    wp_trainer = WordPieceTrainer(
        vocab_size=self.vocab_size,
        min_frequency=2,
        show_progress=True,
        limit_alphabet=self.limit_alphabet,
        special_tokens=[self.unk_token] + self.special_tokens,
        continuing_subword_prefix='##')
    wp_tokenizer.train(wp_trainer, [subset.name])

    # The library writes `vocab.txt` into the given directory; move that
    # file onto the managed auxiliary file so the manager can track it.
    vocab = afm.create()
    vocab_dir = os.path.dirname(vocab.name)
    wp_tokenizer.model.save(vocab_dir)
    os.rename(os.path.join(vocab_dir, 'vocab.txt'), vocab.name)

    return vocab
def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile) -> AuxiliaryFile:
    """Approximately shuffle the corpus by scattering line chunks into
    random bucket files and concatenating the buckets afterwards.

    Returns the merged, shuffled auxiliary file.
    """
    # Pick a stride so that roughly `best_seek_cnt` seeks cover the file,
    # and a bucket count proportional to it (capped at `max_buckets`).
    stride = max(1, self._total_lines_in_file(corpus) // self.best_seek_cnt)
    num_buckets = min(stride * 2, self.max_buckets)
    buckets = [afm.create() for _ in range(num_buckets)]

    # Record the byte offset of every `stride`-th line, then visit those
    # chunks in random order.
    offsets = self._collect_seek_offsets(corpus, stride)
    random.shuffle(offsets)

    with corpus.open('rb') as src, \
            AuxiliaryFile.opens(buckets, 'wb') as dsts:
        progress = tqdm.tqdm(
            offsets,
            desc=colorful.render('<r>[*]</r> shuffle raw corpus file'))
        for offset in progress:
            src.seek(offset)
            for _ in range(stride):
                line = src.readline()
                if not line:
                    break
                # Guarantee a trailing newline so adjacent lines cannot
                # merge when the buckets are concatenated later.
                if not line.endswith(b'\n'):
                    line += b'\n'
                # Scatter the line into a uniformly-random bucket.
                dsts[random.randint(0, len(dsts) - 1)].write(line)

    # Concatenate all buckets back into a single shuffled file.
    merged = afm.create()
    with merged.open('wb') as dst, \
            AuxiliaryFile.opens(buckets, 'rb') as srcs:
        for bucket in srcs:
            shutil.copyfileobj(bucket, dst)

    return merged
def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile) -> Tuple[AuxiliaryFile, AuxiliaryFile]:
    """Split the corpus into train and validation auxiliary files.

    The first ``ceil(total_lines * val_ratio)`` lines go to the validation
    file; everything remaining is copied to the train file.

    Returns:
        ``(train_dataset, val_dataset)`` auxiliary files.
    """
    train_dataset = afm.create()
    val_dataset = afm.create()

    total_lines = self._total_lines_in_file(corpus)
    # Number of leading lines routed to the validation split. Computing it
    # up front also fixes the `val_ratio == 0` case: the old break
    # condition (`i + 1 >= total * ratio`) still wrote one line to the
    # validation file even when zero lines were requested.
    val_lines = math.ceil(total_lines * self.val_ratio)

    print(
        colorful.render(f'<r>[*]</r> split validation corpus - '
                        f'<m>{val_lines}'
                        f'</m> of <m>{total_lines}</m> lines'))

    with corpus.open('rb') as src, train_dataset.open('wb') as tdst, \
            val_dataset.open('wb') as vdst:
        # Copy exactly `val_lines` lines to the validation dataset file.
        for _ in range(val_lines):
            line = src.readline()
            if not line:
                break
            vdst.write(line)

        # Copy the entire rest of the corpus to the train dataset.
        shutil.copyfileobj(src, tdst)

    return train_dataset, val_dataset
def _create_subset_file(self, afm: AuxiliaryFileManager, af: AuxiliaryFile) -> AuxiliaryFile:
    """Copy the leading portion of ``af`` (about ``subset_size`` bytes,
    rounded up to a whole line) into a new auxiliary file and return it."""
    subset = afm.create()
    with af.open('rb') as src, subset.open('wb') as dst:
        # `iter` with a sentinel yields lines until EOF (readline -> b'').
        for line in iter(src.readline, b''):
            dst.write(line)
            # Stop once the copied prefix exceeds the configured size.
            if src.tell() > self.subset_size:
                break
    return subset
def build(self, afm: AuxiliaryFileManager, *inputs: AuxiliaryFile
          ) -> AuxiliaryFile:
    """Concatenate every input auxiliary file into a single one.

    Each line is written with a guaranteed trailing newline so lines from
    consecutive files never merge.
    """
    merged = afm.create()
    print(colorful.render(f'<r>[*]</r> merge <m>{len(inputs)}</m> files '
                          f'into one'))
    with merged.open('wb') as dst, \
            AuxiliaryFile.opens(inputs, 'rb') as srcs:
        for src in srcs:
            for line in src:
                # Normalize the terminator so the last line of one file
                # cannot fuse with the first line of the next.
                if not line.endswith(b'\n'):
                    line += b'\n'
                dst.write(line)
    return merged
def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile, vocab: AuxiliaryFile) -> AuxiliaryFile:
    """Tokenize every line of the corpus with a trained WordPiece vocab.

    Writes one space-joined token sequence per input line to a new
    auxiliary file and returns it.
    """
    total_lines = self._total_lines_in_file(corpus)

    # Build the WordPiece tokenizer from the trained vocabulary. Note that
    # `unk_token` is registered as a special token alongside the others.
    tokenizer = Tokenizer(WordPiece(vocab.name, unk_token=self.unk_token))
    tokenizer.add_special_tokens(self.special_tokens + [self.unk_token])

    # Use BERT-specific normalizer, pre-tokenizer and decoder so encoding
    # matches how the vocabulary was trained.
    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.decoder = WordPieceDecoder(prefix='##')

    tokenized = afm.create()
    with corpus.open('r') as src, tokenized.open('w') as dst:
        # Create tqdm progress bar with colorful description.
        tqdm_iter = tqdm.tqdm(src,
                              desc=colorful.render(
                                  '<r>[*]</r> tokenize sentences with '
                                  '<g>WordPiece</g> model'),
                              total=total_lines)

        def _flush(lines):
            # Encode one batch and emit a space-joined token line each.
            for encoded in tokenizer.encode_batch(lines):
                dst.write(' '.join(encoded.tokens) + '\n')
            lines.clear()

        batch_lines = []
        for line in tqdm_iter:
            batch_lines.append(line)
            # Flush once the batch reaches the configured size. The old
            # comparison used `>`, which overshot `batch_size` by one line
            # on every batch.
            if len(batch_lines) >= self.batch_size:
                _flush(batch_lines)

        # Encode the remainders and write to the output file.
        if batch_lines:
            _flush(batch_lines)

    return tokenized
def build(self, afm: AuxiliaryFileManager, raw: AuxiliaryFile) -> AuxiliaryFile:
    """Parse a raw-formatted corpus into an auxiliary file, in parallel.

    Spawns ``num_workers`` parser processes plus one collector process,
    feeds extracted documents to the parsers through a queue, and returns
    the auxiliary file the collector writes to.
    """
    parsed = afm.create()
    self.parser.prepare(raw)

    # Create processes for parsing texts in parallel and a process for
    # collecting the parsed texts and saving to the auxiliary file.
    # `from_queue` carries raw documents to the parsers; `to_queue`
    # carries parsed results to the collector.
    from_queue, to_queue = Queue(), Queue()
    parsers = [
        Process(target=self._parse_worker,
                args=(from_queue, to_queue),
                daemon=True)
        for _ in range(self.num_workers)
    ]
    collector = Process(target=self._collect_worker,
                        args=(parsed, to_queue),
                        daemon=True)

    # Start the processes.
    print(
        colorful.render(f'<r>[*]</r> parse raw-formatted corpus file '
                        f'with <g>{self.parser.__class__.__name__}</g>'))
    for p in parsers:
        p.start()
    collector.start()

    # Feed the extracted raw-formatted document to each parser process.
    for document in self.parser.extract(raw):
        from_queue.put(document)

    # One `None` sentinel per worker tells each parser to shut down.
    for _ in range(self.num_workers):
        from_queue.put(None)

    # Wait for terminating the processes.
    # NOTE(review): no explicit sentinel is sent on `to_queue`, so
    # `collector.join()` relies on `_collect_worker` (not visible here)
    # having its own termination condition — confirm it exits once the
    # parser processes finish, otherwise this join could block forever.
    for p in parsers:
        p.join()
    collector.join()

    return parsed