def make_binary_dataset(input_prefix, output_prefix, lang, guess):
    """Binarize one language side of a text corpus into an indexed dataset.

    Loads ``dict.<lang>.txt`` from ``args.destdir``, tokenizes the input text
    file with ``Tokenizer.binarize`` and appends every resulting tensor to an
    ``IndexedDatasetBuilder``. ``args`` is not a parameter; it is read from
    the enclosing scope (``destdir``, ``source_lang``, ``target_lang``).

    Args:
        input_prefix: Path prefix of the text input. The file read is
            ``<input_prefix>.<lang>``, or ``<input_prefix>.<lang>.guess``
            when ``guess`` is truthy.
        output_prefix: Name prefix of the output files inside
            ``args.destdir``.
        lang: Language code selecting both the dictionary and the input file.
        guess: When truthy, read the ``.guess`` input and write the data to
            a ``.guess.bin`` file instead of ``.bin``.
    """
    vocab = dictionary.Dictionary.load(
        os.path.join(args.destdir, 'dict.{}.txt'.format(lang)))
    print('| [{}] Dictionary: {} types'.format(lang, len(vocab) - 1))

    # Every output file shares this stem; only the suffix differs.
    stem = '{}/{}.{}-{}.{}'.format(
        args.destdir, output_prefix, args.source_lang, args.target_lang, lang)
    if guess:
        data_path = stem + '.guess.bin'
        input_file = '{}.{}.guess'.format(input_prefix, lang)
    else:
        data_path = stem + '.bin'
        input_file = '{}.{}'.format(input_prefix, lang)
    ds = indexed_dataset.IndexedDatasetBuilder(data_path)

    def consumer(tensor):
        ds.add_item(tensor)

    res = Tokenizer.binarize(input_file, vocab, consumer)

    # An empty input yields zero tokens; report 0.0% instead of raising
    # ZeroDivisionError before the index gets written.
    ntok = res['ntok']
    unk_percent = 100 * res['nunk'] / ntok if ntok else 0.0
    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
        lang, input_file, res['nseq'], ntok, unk_percent, vocab.unk_word))

    # NOTE(review): the index is written to the plain ``.idx`` path even when
    # ``guess`` is set, so it does not pair with ``.guess.bin`` and lands on
    # the same path as the non-guess index. Path kept as-is; confirm against
    # the dataset loader whether ``.guess.idx`` was intended.
    ds.finalize(stem + '.idx')
def binarize_text_file(
    text_file: str,
    dictionary: pytorch_translate_dictionary.Dictionary,
    output_prefix: str,
    append_eos: bool,
    reverse_order: bool,
) -> str:
    """Binarize a text file into an indexed dataset and return its prefix.

    Args:
        text_file: Path of the text file to binarize.
        dictionary: Dictionary passed to ``Tokenizer.binarize``; its
            ``unk_word`` is reported in the summary line.
        output_prefix: Prefix from which the data and index file paths are
            derived. When empty (falsy), a unique temporary file name is
            generated and used instead.
        append_eos: Forwarded unchanged to ``Tokenizer.binarize``.
        reverse_order: Forwarded unchanged to ``Tokenizer.binarize``.

    Returns:
        The output prefix actually used, which is the generated temporary
        one when ``output_prefix`` was empty.
    """
    if not output_prefix:
        fd, output_prefix = tempfile.mkstemp()
        # We only need a unique file name prefix, since the helper functions
        # take care of actually creating the file.
        os.close(fd)

    data_path = indexed_dataset.data_file_path(output_prefix)
    print(f"Outputting binarized version of {text_file} to "
          f"{data_path} and "
          f"{indexed_dataset.index_file_path(output_prefix)}")
    builder = indexed_dataset.IndexedDatasetBuilder(data_path)

    def consumer(tensor):
        builder.add_item(tensor)

    counters = tokenizer.Tokenizer.binarize(
        filename=text_file,
        dict=dictionary,
        consumer=consumer,
        append_eos=append_eos,
        reverse_order=reverse_order,
    )

    # An empty input yields zero tokens; report 0.0% instead of raising
    # ZeroDivisionError before the index gets finalized.
    ntok = counters['ntok']
    unk_percent = 100 * counters['nunk'] / ntok if ntok else 0.0
    print(f"Binarizing {text_file}: {counters['nseq']} sents, "
          f"{ntok} tokens, "
          f"{unk_percent:.3}% replaced by "
          f"{dictionary.unk_word}.")

    builder.finalize(indexed_dataset.index_file_path(output_prefix))
    return output_prefix