def _make_edge_dataset(vocab, input_prefix, output_prefix, lang, num_workers, output_text_file):
    """Binarize a graph-edge file and dump the resulting token tensors as JSON lines.

    Reads ``input_prefix`` (with ``.lang`` appended when *lang* is not None),
    binarizes every line with *vocab* via ``Binarizer.binarize_graph``, writes
    each resulting tensor to *output_text_file* as a JSON list (one per line),
    and prints summary statistics.

    Args:
        vocab: dictionary object providing ``__len__`` and ``unk_word``.
        input_prefix: path prefix of the raw input file.
        output_prefix: unused here; kept for interface parity with sibling
            ``_make_*_dataset`` helpers.  # NOTE(review): confirm callers
        lang: optional language suffix appended to the input path.
        num_workers: unused; binarization runs single-process in this variant.
        output_text_file: destination path for the JSON-lines dump.
    """
    print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
    n_seq_tok = [0, 0]
    replaced = Counter()

    def merge_result(worker_result):
        # Fold one worker's stats into the shared accumulators.
        replaced.update(worker_result["replaced"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    input_file = "{}{}".format(input_prefix, ("." + lang) if lang is not None else "")
    ds = []
    merge_result(
        Binarizer.binarize_graph(input_file, vocab, lambda t: ds.append(t)))

    import json
    with open(output_text_file, 'w') as f:
        for line in ds:
            # Each item is a tensor; serialize its values as a JSON list.
            f.write(json.dumps(line.numpy().tolist()) + '\n')

    # Guard against an empty input file: the original code divided by
    # n_seq_tok[1] unconditionally and raised ZeroDivisionError on 0 tokens.
    pct_replaced = (100 * sum(replaced.values()) / n_seq_tok[1]) if n_seq_tok[1] else 0.0
    print("| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
        lang,
        input_file,
        n_seq_tok[0],
        n_seq_tok[1],
        pct_replaced,
        vocab.unk_word,
    ))
def binarize_graph(args, filename, vocab, output_prefix, lang, offset, end, append_eos=True):
    """Binarize the [offset, end) slice of *filename* into an indexed dataset shard.

    Builds the ``.bin`` file via the dataset builder selected by
    ``args.dataset_impl``, feeds every binarized tensor straight into it,
    then finalizes the matching ``.idx`` file.

    Returns:
        The stats dict produced by ``Binarizer.binarize_graph``.
    """
    bin_path = dataset_dest_file(args, output_prefix, lang, "bin")
    builder = indexed_dataset.make_builder(bin_path, impl=args.dataset_impl)
    # Pass the builder's bound add_item directly as the consumer callback.
    result = Binarizer.binarize_graph(
        filename,
        vocab,
        builder.add_item,
        append_eos=append_eos,
        offset=offset,
        end=end,
    )
    builder.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return result