def _binarize_file_chunk(
    binarizer: Binarizer,
    filename: str,
    offset_start: int,
    offset_end: int,
    output_prefix: str,
    dataset_impl: str,
    vocab_size=None,
) -> tp.Tuple[tp.Any, BinarizeSummary]:  # (dataset builder, BinarizeSummary)
    """
    Creates a dataset builder and appends binarized items to it.

    This function does not finalize the builder; this is useful if you want to
    do other things with your bin file, like appending/merging other files.
    """
    bin_file = indexed_dataset.data_file_path(output_prefix)
    ds = indexed_dataset.make_builder(
        bin_file,
        impl=dataset_impl,
        vocab_size=vocab_size,
    )
    summary = BinarizeSummary()

    with Chunker(
        PathManager.get_local_path(filename), offset_start, offset_end
    ) as line_iterator:
        for line in line_iterator:
            ds.add_item(binarizer.binarize_line(line, summary))

    return ds, summary

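# Hedged usage sketch: how _binarize_file_chunk might be called for a single
# chunk and then finalized by the caller (the function above deliberately does
# not finalize the builder). The VocabularyDatasetBinarizer class comes from
# fairseq.binarizer in recent fairseq versions; the file paths and the
# "train_out" prefix are illustrative assumptions, not part of the code above.
def example_binarize_one_chunk():
    from fairseq.binarizer import VocabularyDatasetBinarizer
    from fairseq.data import Dictionary, indexed_dataset
    from fairseq.file_chunker_utils import find_offsets

    vocab = Dictionary.load("dict.txt")  # hypothetical dictionary file
    binarizer = VocabularyDatasetBinarizer(vocab)

    # treat the whole file as a single chunk: find_offsets returns [0, file_size]
    offsets = find_offsets("train.txt", 1)
    ds, summary = _binarize_file_chunk(
        binarizer,
        "train.txt",
        offset_start=offsets[0],
        offset_end=offsets[1],
        output_prefix="train_out",  # hypothetical output prefix
        dataset_impl="mmap",
        vocab_size=len(vocab),
    )
    # the caller is responsible for writing the .idx file
    ds.finalize(indexed_dataset.index_file_path("train_out"))
    print(summary.num_seq, "sequences,", summary.num_tok, "tokens")
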
def _add_file_to_dictionary_single_worker(
    filename,
    tokenize,
    eos_word,
    start_offset,
    end_offset,
):
    counter = Counter()
    with Chunker(filename, start_offset, end_offset) as line_iterator:
        for line in line_iterator:
            for word in tokenize(line):
                counter.update([word])
            counter.update([eos_word])
    return counter

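# Hedged usage sketch: the single-worker function above is meant to be run once
# per byte chunk and the per-chunk Counters merged into a Dictionary. This
# sequential loop illustrates the pattern; the real preprocessing code usually
# fans the chunks out to a multiprocessing pool. The file path and worker count
# are illustrative assumptions.
def example_build_dictionary(filename="train.txt", num_workers=4):
    from fairseq.data import Dictionary
    from fairseq.file_chunker_utils import find_offsets
    from fairseq.tokenizer import tokenize_line

    d = Dictionary()
    offsets = find_offsets(filename, num_workers)
    for start, end in zip(offsets, offsets[1:]):
        counter = _add_file_to_dictionary_single_worker(
            filename, tokenize_line, d.eos_word, start, end
        )
        # merge this chunk's counts into the dictionary
        for word, count in sorted(counter.items()):
            d.add_symbol(word, count)
    d.finalize()
    return d
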
def test_readchunks(self):
    from fairseq.file_chunker_utils import Chunker, find_offsets

    offsets = find_offsets(self._tmpfile, self._num_splits)
    for start, end in zip(offsets, offsets[1:]):
        with Chunker(self._tmpfile, start, end) as lines:
            all_lines = list(lines)
            num_lines = self._num_lines / self._num_splits
            # because we split on bytes, a chunk may end up with one line more or less
            self.assertAlmostEqual(len(all_lines), num_lines, delta=1)
            self.assertListEqual(
                all_lines, [self._line_content for _ in range(len(all_lines))]
            )

def binarize_alignments(
    filename, alignment_parser, consumer, offset=0, end=-1
) -> Dict[str, int]:
    nseq = 0

    with Chunker(
        PathManager.get_local_path(filename), offset, end
    ) as line_iterator:
        for line in line_iterator:
            ids = alignment_parser(line)
            nseq += 1
            consumer(ids)
    return {"nseq": nseq}

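# Hedged usage sketch: binarize_alignments only needs a per-line parser and a
# consumer callback. fairseq.utils.parse_alignment is the parser used by the
# preprocessing CLI; here the consumer simply collects the parsed tensors in a
# list, and the alignment file path is an illustrative assumption. With the
# default offset=0, end=-1 the whole file is processed as a single chunk.
def example_binarize_alignments(filename="train.align"):
    from fairseq import utils

    alignments = []
    stats = binarize_alignments(filename, utils.parse_alignment, alignments.append)
    print(stats["nseq"], "alignment lines parsed")
    return alignments
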
def binarize(
    filename,
    dict,
    consumer,
    tokenize=tokenize_line,
    append_eos=True,
    reverse_order=False,
    offset=0,
    end=-1,
    already_numberized=False,
) -> Dict[str, int]:
    nseq, ntok = 0, 0
    replaced = Counter()

    def replaced_consumer(word, idx):
        # track words that were mapped to <unk> during binarization
        if idx == dict.unk_index and word != dict.unk_word:
            replaced.update([word])

    with Chunker(
        PathManager.get_local_path(filename), offset, end
    ) as line_iterator:
        for line in line_iterator:
            if already_numberized:
                # the line already contains token ids; parse them directly
                id_strings = line.strip().split()
                id_list = [int(id_string) for id_string in id_strings]
                if reverse_order:
                    id_list.reverse()
                if append_eos:
                    id_list.append(dict.eos())
                ids = torch.IntTensor(id_list)
            else:
                ids = dict.encode_line(
                    line=line,
                    line_tokenizer=tokenize,
                    add_if_not_exist=False,
                    consumer=replaced_consumer,
                    append_eos=append_eos,
                    reverse_order=reverse_order,
                )
            nseq += 1
            ntok += len(ids)
            consumer(ids)
    return {
        "nseq": nseq,
        "nunk": sum(replaced.values()),
        "ntok": ntok,
        "replaced": replaced,
    }

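# Hedged usage sketch: binarize() streams a byte range of a text file through a
# Dictionary and hands each resulting IntTensor to the consumer. The dictionary
# and corpus paths are illustrative assumptions; the defaults offset=0, end=-1
# process the whole file as one chunk.
def example_binarize_whole_file(dict_path="dict.txt", corpus="train.txt"):
    from fairseq.data import Dictionary

    vocab = Dictionary.load(dict_path)
    tensors = []
    stats = binarize(corpus, vocab, tensors.append)
    print(
        f"{stats['nseq']} sequences, {stats['ntok']} tokens, "
        f"{stats['nunk']} unknown tokens"
    )
    return tensors
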
def _consume_file(
    filename: str,
    binarizer: Binarizer,
    consumer: tp.Callable[[torch.IntTensor], None],
    offset_start: int,
    offset_end: int,
) -> tp.Dict[str, int]:
    summary = BinarizeSummary()

    with Chunker(
        PathManager.get_local_path(filename), offset_start, offset_end
    ) as line_iterator:
        for line in line_iterator:
            consumer(binarizer.binarize_line(line, summary))

    return {
        "nseq": summary.num_seq,
        "nunk": summary.num_replaced,
        "ntok": summary.num_tok,
        "replaced": summary.replaced,
    }

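# Hedged usage sketch: _consume_file is intended to be run once per byte chunk
# (normally in worker processes) with the per-chunk stats merged afterwards.
# This sequential version over find_offsets chunks only illustrates that
# pattern; the VocabularyDatasetBinarizer import, the paths, and the reuse of a
# single binarizer across chunks are illustrative assumptions.
def example_consume_in_chunks(filename="train.txt", num_chunks=4):
    from fairseq.binarizer import VocabularyDatasetBinarizer
    from fairseq.data import Dictionary
    from fairseq.file_chunker_utils import find_offsets

    binarizer = VocabularyDatasetBinarizer(Dictionary.load("dict.txt"))
    tensors = []
    totals = {"nseq": 0, "ntok": 0, "nunk": 0}
    offsets = find_offsets(filename, num_chunks)
    for start, end in zip(offsets, offsets[1:]):
        stats = _consume_file(filename, binarizer, tensors.append, start, end)
        for key in totals:
            totals[key] += stats[key]
    return tensors, totals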