def load_binarized_dataset(
    train_corpus: ParallelCorpusConfig,
    eval_corpus: ParallelCorpusConfig,
    train_split: str,
    eval_split: str,
    args: argparse.Namespace,
    use_char_source: bool = False,
) -> data.LanguageDatasets:
    source_dict = pytorch_translate_dictionary.Dictionary.load(
        args.source_vocab_file)
    target_dict = pytorch_translate_dictionary.Dictionary.load(
        args.target_vocab_file)
    if use_char_source:
        char_source_dict = pytorch_translate_dictionary.Dictionary.load(
            args.char_source_vocab_file)
        # this attribute is used for CharSourceModel construction
        args.char_source_dict_size = len(char_source_dict)
    dataset = data.LanguageDatasets(
        src=train_corpus.source.dialect,
        dst=train_corpus.target.dialect,
        src_dict=source_dict,
        dst_dict=target_dict,
    )
    for split, corpus in [(train_split, train_corpus),
                          (eval_split, eval_corpus)]:
        if not os.path.exists(corpus.source.data_file):
            raise ValueError(
                f"{corpus.source.data_file} for {split} not found!")
        if not os.path.exists(corpus.target.data_file):
            raise ValueError(
                f"{corpus.target.data_file} for {split} not found!")
        dst_dataset = InMemoryNumpyDataset.create_from_file(
            corpus.target.data_file)
        if use_char_source:
            src_dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
                corpus.source.data_file)
            dataset.splits[split] = char_data.LanguagePairSourceCharDataset(
                src=src_dataset,
                dst=dst_dataset,
                pad_idx=source_dict.pad(),
                eos_idx=source_dict.eos(),
            )
        else:
            src_dataset = InMemoryNumpyDataset.create_from_file(
                corpus.source.data_file)
            dataset.splits[split] = data.LanguagePairDataset(
                src=src_dataset,
                dst=dst_dataset,
                pad_idx=source_dict.pad(),
                eos_idx=source_dict.eos(),
            )
    return dataset

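# A minimal usage sketch for load_binarized_dataset above. The vocab paths and
# split names are hypothetical placeholders, and the ParallelCorpusConfig
# objects are assumed to be built elsewhere (the loader only relies on their
# .source/.target members exposing .dialect and .data_file attributes).
def _example_load_binarized_dataset(train_corpus, eval_corpus):
    args = argparse.Namespace(
        source_vocab_file="/path/to/vocab.src.txt",        # hypothetical path
        target_vocab_file="/path/to/vocab.tgt.txt",        # hypothetical path
        char_source_vocab_file="/path/to/vocab.char.txt",  # hypothetical path
    )
    datasets = load_binarized_dataset(
        train_corpus=train_corpus,
        eval_corpus=eval_corpus,
        train_split="train",
        eval_split="valid",
        args=args,
        use_char_source=True,
    )
    # Each split is now addressable by name, e.g. datasets.splits["train"].
    return datasets
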
def generate(args):
    assert_test_corpus_and_vocab_files_specified(args)
    assert args.path is not None, '--path required for generation!'

    print(args)

    if args.source_lang is None:
        args.source_lang = 'src'
    if args.target_lang is None:
        args.target_lang = 'tgt'

    src_dict = pytorch_translate_dictionary.Dictionary.load(
        args.source_vocab_file,
    )
    dst_dict = pytorch_translate_dictionary.Dictionary.load(
        args.target_vocab_file,
    )
    dataset = data.LanguageDatasets(
        src=args.source_lang,
        dst=args.target_lang,
        src_dict=src_dict,
        dst_dict=dst_dict,
    )
    models, model_args = utils.load_ensemble_for_inference(
        args.path,
        dataset.src_dict,
        dataset.dst_dict,
    )
    dataset.splits[args.gen_subset] = pytorch_translate_data.make_language_pair_dataset(
        source_file=args.source_text_file,
        target_file=args.target_text_file,
        source_dict=src_dict,
        target_dict=dst_dict,
        append_eos=model_args.append_eos_to_source,
        reverse_source=model_args.reverse_source,
    )

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f'| [{dataset.src}] dictionary: {len(dataset.src_dict)} types')
    print(f'| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types')
    print(f'| {args.gen_subset} {len(dataset.splits[args.gen_subset])} examples')
    scorer, num_sentences, gen_timer = _generate_score(
        models=models,
        args=args,
        dataset=dataset,
        dataset_split=args.gen_subset,
    )
    print(f'| Translated {num_sentences} sentences ({gen_timer.n} tokens) '
          f'in {gen_timer.sum:.1f}s ({1. / gen_timer.avg:.2f} tokens/s)')
    print(f'| Generate {args.gen_subset} with beam={args.beam}: '
          f'{scorer.result_string()}')
    return scorer.score()

def generate(args):
    assert_test_corpus_and_vocab_files_specified(args)
    assert args.path is not None, "--path required for generation!"

    print(args)

    if args.source_lang is None:
        args.source_lang = "src"
    if args.target_lang is None:
        args.target_lang = "tgt"

    src_dict = pytorch_translate_dictionary.Dictionary.load(
        args.source_vocab_file)
    dst_dict = pytorch_translate_dictionary.Dictionary.load(
        args.target_vocab_file)
    dataset = data.LanguageDatasets(
        src=args.source_lang,
        dst=args.target_lang,
        src_dict=src_dict,
        dst_dict=dst_dict,
    )
    models, model_args = load_diverse_ensemble_for_inference(
        args.path, dataset.src_dict, dataset.dst_dict)
    append_eos_to_source = model_args[0].append_eos_to_source
    reverse_source = model_args[0].reverse_source
    assert all(a.append_eos_to_source == append_eos_to_source
               and a.reverse_source == reverse_source for a in model_args)
    dataset.splits[args.gen_subset] = (
        pytorch_translate_data.make_language_pair_dataset_from_text(
            source_text_file=args.source_text_file,
            target_text_file=args.target_text_file,
            source_dict=src_dict,
            target_dict=dst_dict,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        ))

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")
    print(f"| {args.gen_subset} {len(dataset.splits[args.gen_subset])} examples")
    scorer, num_sentences, gen_timer = _generate_score(
        models=models,
        args=args,
        dataset=dataset,
        dataset_split=args.gen_subset)
    print(f"| Translated {num_sentences} sentences ({gen_timer.n} tokens) "
          f"in {gen_timer.sum:.1f}s ({1. / gen_timer.avg:.2f} tokens/s)")
    print(f"| Generate {args.gen_subset} with beam={args.beam}: "
          f"{scorer.result_string()}")
    return scorer.score()

def generate(args):
    assert_test_corpus_and_vocab_files_specified(args)
    assert args.path is not None, '--path required for generation!'

    print(args)

    if args.source_lang is None:
        args.source_lang = 'src'
    if args.target_lang is None:
        args.target_lang = 'tgt'

    src_dict = pytorch_translate_dictionary.Dictionary.load(
        args.source_vocab_file,
    )
    dst_dict = pytorch_translate_dictionary.Dictionary.load(
        args.target_vocab_file,
    )
    dataset = data.LanguageDatasets(
        src=args.source_lang,
        dst=args.target_lang,
        src_dict=src_dict,
        dst_dict=dst_dict,
    )
    dataset.splits[args.gen_subset] = pytorch_translate_data.make_language_pair_dataset(
        source_file=args.source_text_file,
        target_file=args.target_text_file,
        source_dict=src_dict,
        target_dict=dst_dict,
        args=args,
    )

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print('| [{}] dictionary: {} types'.format(dataset.src,
                                               len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst,
                                               len(dataset.dst_dict)))
    print('| {} {} examples'.format(
        args.gen_subset,
        len(dataset.splits[args.gen_subset]),
    ))
    scorer, num_sentences, gen_timer = generate_score(
        args=args,
        dataset=dataset,
        dataset_split=args.gen_subset,
    )
    print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} tokens/s)'.
          format(num_sentences, gen_timer.n, gen_timer.sum,
                 1. / gen_timer.avg))
    print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam,
                                                  scorer.result_string()))
    return scorer.score()

def load_raw_text_dataset(
    train_corpus: ParallelCorpusConfig,
    eval_corpus: ParallelCorpusConfig,
    train_split: str,
    eval_split: str,
    args: argparse.Namespace,
) -> data.LanguageDatasets:
    source_dict = pytorch_translate_dictionary.Dictionary.load(args.source_vocab_file)
    target_dict = pytorch_translate_dictionary.Dictionary.load(args.target_vocab_file)
    dataset = data.LanguageDatasets(
        src=train_corpus.source.dialect,
        dst=train_corpus.target.dialect,
        src_dict=source_dict,
        dst_dict=target_dict,
    )
    prev_source_dialect = None
    prev_target_dialect = None
    for split, corpus in [
        (train_split, train_corpus),
        (eval_split, eval_corpus),
    ]:
        # Sanity check that all language directions are consistent until this
        # has been updated to support multilingual corpora.
        if prev_source_dialect is None and prev_target_dialect is None:
            prev_source_dialect = corpus.source.dialect
            prev_target_dialect = corpus.target.dialect
        elif (prev_source_dialect != corpus.source.dialect
              or prev_target_dialect != corpus.target.dialect):
            raise ValueError(
                'We currently only support monolingual directions - expected '
                '{}->{} for all corpora, but found {}->{} for split {}'.format(
                    prev_source_dialect,
                    prev_target_dialect,
                    corpus.source.dialect,
                    corpus.target.dialect,
                    split,
                )
            )
        dataset.splits[split] = make_language_pair_dataset(
            source_file=corpus.source.data_file,
            target_file=corpus.target.data_file,
            source_dict=source_dict,
            target_dict=target_dict,
            args=args,
        )
    return dataset

def load_binarized_dataset(
    train_corpus: ParallelCorpusConfig,
    eval_corpus: ParallelCorpusConfig,
    train_split: str,
    eval_split: str,
    args: argparse.Namespace,
) -> data.LanguageDatasets:
    source_dict = pytorch_translate_dictionary.Dictionary.load(
        args.source_vocab_file)
    target_dict = pytorch_translate_dictionary.Dictionary.load(
        args.target_vocab_file)
    dataset = data.LanguageDatasets(
        src=train_corpus.source.dialect,
        dst=train_corpus.target.dialect,
        src_dict=source_dict,
        dst_dict=target_dict,
    )
    for split, corpus in [(train_split, train_corpus),
                          (eval_split, eval_corpus)]:
        if (not indexed_dataset.IndexedInMemoryDataset.exists(
                corpus.source.data_file)
                or not indexed_dataset.IndexedInMemoryDataset.exists(
                    corpus.target.data_file)):
            raise ValueError(
                f"One or both of source file: {corpus.source.data_file} and "
                f"target file: {corpus.target.data_file} for split {split} "
                f"was not found.")
        dataset.splits[split] = data.LanguagePairDataset(
            src=indexed_dataset.IndexedInMemoryDataset(
                corpus.source.data_file),
            dst=indexed_dataset.IndexedInMemoryDataset(
                corpus.target.data_file),
            pad_idx=source_dict.pad(),
            eos_idx=source_dict.eos(),
        )
    return dataset

def load_binarized_dataset(
    train_corpus: ParallelCorpusConfig,
    eval_corpus: ParallelCorpusConfig,
    train_split: str,
    eval_split: str,
    args: argparse.Namespace,
) -> data.LanguageDatasets:
    source_dict = pytorch_translate_dictionary.Dictionary.load(
        args.source_vocab_file)
    target_dict = pytorch_translate_dictionary.Dictionary.load(
        args.target_vocab_file)
    dataset = data.LanguageDatasets(
        src=train_corpus.source.dialect,
        dst=train_corpus.target.dialect,
        src_dict=source_dict,
        dst_dict=target_dict,
    )
    for split, corpus in [(train_split, train_corpus),
                          (eval_split, eval_corpus)]:
        if not os.path.exists(corpus.source.data_file):
            raise ValueError(
                f"{corpus.source.data_file} for {split} not found!")
        if not os.path.exists(corpus.target.data_file):
            raise ValueError(
                f"{corpus.target.data_file} for {split} not found!")
        dataset.splits[split] = data.LanguagePairDataset(
            src=InMemoryNumpyDataset.create_from_file(corpus.source.data_file),
            dst=InMemoryNumpyDataset.create_from_file(corpus.target.data_file),
            pad_idx=source_dict.pad(),
            eos_idx=source_dict.eos(),
        )
    return dataset

def generate(args):
    assert_test_corpus_and_vocab_files_specified(args)
    assert args.path is not None, "--path required for generation!"

    print(args)

    # Benchmarking should be language-agnostic
    args.source_lang = "src"
    args.target_lang = "tgt"

    src_dict = pytorch_translate_dictionary.Dictionary.load(
        args.source_vocab_file)
    dst_dict = pytorch_translate_dictionary.Dictionary.load(
        args.target_vocab_file)

    # Generate synthetic raw text files (target text is drawn from the
    # target dictionary's symbols)
    source_text_file = generate_synthetic_text(args.source_lang,
                                               src_dict.symbols, args)
    target_text_file = generate_synthetic_text(args.target_lang,
                                               dst_dict.symbols, args)

    dataset = data.LanguageDatasets(
        src=args.source_lang,
        dst=args.target_lang,
        src_dict=src_dict,
        dst_dict=dst_dict,
    )
    models, model_args = pytorch_translate_utils.load_diverse_ensemble_for_inference(
        args.path, dataset.src_dict, dataset.dst_dict)
    append_eos_to_source = model_args[0].append_eos_to_source
    reverse_source = model_args[0].reverse_source
    assert all(a.append_eos_to_source == append_eos_to_source
               and a.reverse_source == reverse_source for a in model_args)
    dataset.splits[args.gen_subset] = (
        pytorch_translate_data.make_language_pair_dataset_from_text(
            source_text_file=source_text_file,
            target_text_file=target_text_file,
            source_dict=src_dict,
            target_dict=dst_dict,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        ))

    # Remove temporary text files
    os.remove(source_text_file)
    os.remove(target_text_file)

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")
    print(f"| {args.gen_subset} {len(dataset.splits[args.gen_subset])} examples")

    args.keep_detailed_timing = True
    scorer, num_sentences, gen_timer = pytorch_translate_generate._generate_score(
        models=models,
        args=args,
        dataset=dataset,
        dataset_split=args.gen_subset)
    # Remove contribution of primer sentence
    gen_timer.reset_bucket(0)
    print(
        f"| Translated {num_sentences} sentences ({sum(gen_timer.n)} tokens) "
        f"in {sum(gen_timer.sum):.3f}s ({1. / gen_timer.avg:.2f} tokens/s)")
    for bucket_id in range(gen_timer.n_buckets):
        if gen_timer.n[bucket_id] != 0:
            print(
                " | Length {}: {} sentences ({} tok) in {:.3f}s "
                "({:.3f} tok/s, avg. latency {:.4f}s)".format(
                    bucket_id * args.increment,
                    gen_timer.count[bucket_id],
                    gen_timer.n[bucket_id],
                    gen_timer.sum[bucket_id],
                    1. / gen_timer.avgs[bucket_id],
                    gen_timer.sum[bucket_id] / gen_timer.count[bucket_id],
                ))
    print(f"| Generate {args.gen_subset} with beam={args.beam}: "
          f"{scorer.result_string()}")
    return scorer.score()

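# The bucketed timer consumed above is assumed to expose per-length-bucket
# arrays (count, n, sum, avgs) plus an aggregate avg and reset_bucket(). The
# real class lives elsewhere in the repo; this is a minimal sketch, inferred
# from the accesses in the benchmark loop, for illustration only.
class _BucketStopwatchMeterSketch:
    def __init__(self, n_buckets, increment):
        self.n_buckets = n_buckets
        self.increment = increment    # bucket width in source tokens
        self.count = [0] * n_buckets  # sentences per bucket
        self.n = [0] * n_buckets      # tokens per bucket
        self.sum = [0.0] * n_buckets  # seconds per bucket

    def add(self, sent_len, n_tokens, elapsed):
        # Assign a sentence to its length bucket and accumulate timing.
        b = min(sent_len // self.increment, self.n_buckets - 1)
        self.count[b] += 1
        self.n[b] += n_tokens
        self.sum[b] += elapsed

    def reset_bucket(self, b):
        # Used above to drop the primer sentence's contribution.
        self.count[b] = self.n[b] = 0
        self.sum[b] = 0.0

    @property
    def avgs(self):
        # Per-bucket seconds/token, so 1 / avgs[b] is tokens/s for bucket b.
        return [s / n if n else 0.0 for s, n in zip(self.sum, self.n)]

    @property
    def avg(self):
        # Aggregate seconds/token across all buckets.
        total_n = sum(self.n)
        return sum(self.sum) / total_n if total_n else 0.0
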
def generate(args):
    pytorch_translate_options.print_args(args)

    src_dict = pytorch_translate_dictionary.Dictionary.load(args.source_vocab_file)
    dst_dict = pytorch_translate_dictionary.Dictionary.load(args.target_vocab_file)
    use_char_source = args.char_source_vocab_file != ""
    if use_char_source:
        char_source_dict = pytorch_translate_dictionary.Dictionary.load(
            args.char_source_vocab_file
        )
        # this attribute is used for CharSourceModel construction
        args.char_source_dict_size = len(char_source_dict)
    else:
        char_source_dict = None

    dataset = data.LanguageDatasets(
        src=args.source_lang, dst=args.target_lang, src_dict=src_dict, dst_dict=dst_dict
    )
    models, model_args = pytorch_translate_utils.load_diverse_ensemble_for_inference(
        args.path, dataset.src_dict, dataset.dst_dict
    )
    append_eos_to_source = model_args[0].append_eos_to_source
    reverse_source = model_args[0].reverse_source
    assert all(
        a.append_eos_to_source == append_eos_to_source
        and a.reverse_source == reverse_source
        for a in model_args
    )
    if args.source_binary_file != "":
        assert args.target_binary_file != ""
        dst_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
            args.target_binary_file
        )
        if use_char_source:
            src_dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
                args.source_binary_file
            )
            gen_split = char_data.LanguagePairSourceCharDataset(
                src=src_dataset,
                dst=dst_dataset,
                pad_idx=src_dict.pad(),
                eos_idx=dst_dict.eos(),
            )
        else:
            src_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                args.source_binary_file
            )
            gen_split = data.LanguagePairDataset(
                src=src_dataset,
                dst=dst_dataset,
                pad_idx=src_dict.pad(),
                eos_idx=dst_dict.eos(),
            )
    elif pytorch_translate_data.is_multilingual(args):
        gen_split = pytorch_translate_data.make_language_pair_dataset_from_text_multilingual(
            source_text_file=args.source_text_file[0],
            target_text_file=args.target_text_file,
            source_lang_id=args.multiling_source_lang_id,
            target_lang_id=args.multiling_target_lang_id,
            source_dict=src_dict,
            target_dict=dst_dict,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )
    elif args.source_ensembling:
        gen_split = multisource_data.make_multisource_language_pair_dataset_from_text(
            source_text_files=args.source_text_file,
            target_text_file=args.target_text_file,
            source_dict=src_dict,
            target_dict=dst_dict,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )
    else:
        gen_split = pytorch_translate_data.make_language_pair_dataset_from_text(
            source_text_file=args.source_text_file[0],
            target_text_file=args.target_text_file,
            source_dict=src_dict,
            target_dict=dst_dict,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
            char_source_dict=char_source_dict,
        )
    dataset.splits[args.gen_subset] = gen_split

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")
    print(f"| {args.gen_subset} {len(dataset.splits[args.gen_subset])} examples")

    scorer, num_sentences, gen_timer, _ = _generate_score(
        models=models, args=args, dataset=dataset, dataset_split=args.gen_subset
    )
    print(
        f"| Translated {num_sentences} sentences ({gen_timer.n} tokens) "
        f"in {gen_timer.sum:.1f}s ({1. / gen_timer.avg:.2f} tokens/s)"
    )
    print(
        f"| Generate {args.gen_subset} with beam={args.beam}: "
        f"{scorer.result_string()}"
    )
    return scorer.score()

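# A hedged driver sketch for the generate() variant above, showing how the
# dataset branch is selected by args: a non-empty source_binary_file takes the
# binarized branch; otherwise multilingual, multisource, or plain-text loading
# applies. All paths and values are hypothetical placeholders; the real
# argument parser (and any further options _generate_score reads) lives
# elsewhere in the repo.
def _example_generate_from_binary():
    args = argparse.Namespace(
        path=["/path/to/checkpoint.pt"],  # hypothetical ensemble checkpoint(s)
        source_vocab_file="/path/to/vocab.src.txt",
        target_vocab_file="/path/to/vocab.tgt.txt",
        char_source_vocab_file="",  # empty string: no char-source model
        source_binary_file="/path/to/test.src.bin",  # non-empty selects the
        target_binary_file="/path/to/test.tgt.bin",  # binarized-data branch
        source_lang="src",
        target_lang="tgt",
        gen_subset="test",
        beam=5,
    )
    # Returns the corpus-level score computed by the scorer.
    return generate(args)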