def setup_task(cls, args, **kwargs):
    """Create the multilingual task from per-language dictionaries.

    Asserts that ``args`` describes a multilingual setup, normalizes
    ``args.left_pad_source`` through ``options.eval_bool``, and, when the
    ``multiling_*_vocab_file`` attributes are absent from ``args``, fills
    them (and the matching encoder/decoder language attributes) from the
    single-vocabulary settings.

    Args:
        args: Parsed command-line arguments; mutated in place.
        **kwargs: Accepted but not used here.

    Returns:
        ``cls(args, src_dicts, tgt_dicts)`` where both dictionary arguments
        are ``OrderedDict`` objects keyed by language.
    """
    assert pytorch_translate_data.is_multilingual(
        args
    ), "Must set `--task pytorch_translate_multilingual` for multilingual training"
    args.left_pad_source = options.eval_bool(args.left_pad_source)

    def load_dicts(langs, paths):
        # One dictionary per (language, vocab path) pair, in input order.
        loaded = OrderedDict()
        for lang, vocab_path in zip(langs, paths):
            d = pytorch_translate_dictionary.Dictionary.load(vocab_path)
            loaded[lang] = d
            print(f"| [{lang}] dictionary: {len(d)} types")
        return loaded

    # Fall back to the single-vocab settings for whichever side has no
    # multilingual vocab list on args.
    for side, role in (("source", "encoder"), ("target", "decoder")):
        vocab_attr = f"multiling_{side}_vocab_file"
        if hasattr(args, vocab_attr):
            continue
        setattr(args, f"multiling_{role}_lang", getattr(args, f"multiling_{side}_lang"))
        setattr(args, vocab_attr, [getattr(args, f"{side}_vocab_file")])

    # Load dictionaries
    src_dicts = load_dicts(
        args.multiling_encoder_lang, args.multiling_source_vocab_file
    )
    tgt_dicts = load_dicts(
        args.multiling_decoder_lang, args.multiling_target_vocab_file
    )
    return cls(args, src_dicts, tgt_dicts)
def setup_task(cls, args, **kwargs):
    """Create the bilingual task from source/target (and optional char) vocabs.

    Normalizes ``args.left_pad_source`` through ``options.eval_bool`` and
    asserts that ``args`` does not describe a multilingual setup. When a
    character-level source vocabulary is in use, its size is stored on
    ``args.char_source_dict_size``.

    Args:
        args: Parsed command-line arguments; mutated in place.
        **kwargs: Accepted but not used here.

    Returns:
        ``cls(args, source_dict, target_dict, char_source_dict)``; the last
        argument is ``None`` when no char source vocabulary is used.
    """
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    assert not pytorch_translate_data.is_multilingual(
        args
    ), "Must set `--task pytorch_translate_multilingual` for multilingual training"

    # Load dictionaries
    load_vocab = pytorch_translate_dictionary.Dictionary.load
    source_dict = load_vocab(args.source_vocab_file)
    target_dict = load_vocab(args.target_vocab_file)

    source_lang = args.source_lang or "src"
    target_lang = args.target_lang or "tgt"
    print(f"| [{source_lang}] dictionary: {len(source_dict)} types")
    print(f"| [{target_lang}] dictionary: {len(target_dict)} types")

    # Char-level source is enabled either by an explicit vocab file or by
    # the "char_source" architecture name.
    char_source_dict = None
    if (args.char_source_vocab_file != "") or (
        getattr(args, "arch", "") == "char_source"
    ):
        char_source_dict = load_vocab(args.char_source_vocab_file)
        # this attribute is used for CharSourceModel construction
        args.char_source_dict_size = len(char_source_dict)

    return cls(args, source_dict, target_dict, char_source_dict)
def generate(args):
    """Load an ensemble, build the evaluation split, translate it and score it.

    The source/target languages and the ``append_eos_to_source`` /
    ``reverse_source`` settings are taken from the first loaded model's
    arguments; all models in the ensemble must agree on the latter two.
    The split named ``args.gen_subset`` is loaded from binary files when
    ``args.source_binary_file`` is non-empty, otherwise from text (with a
    multilingual, multi-source, or plain variant).

    Args:
        args: Parsed command-line arguments; ``source_lang`` and
            ``target_lang`` are overwritten.

    Returns:
        The value of ``scorer.score()``.
    """
    pytorch_translate_options.print_args(args)

    # Setup task
    task = tasks.setup_task(args)

    models, model_args = pytorch_translate_utils.load_diverse_ensemble_for_inference(
        args.path.split(":"), task
    )
    reference_args = model_args[0]
    args.source_lang = reference_args.source_lang
    args.target_lang = reference_args.target_lang

    append_eos_to_source = reference_args.append_eos_to_source
    reverse_source = reference_args.reverse_source
    assert all(
        member.append_eos_to_source == append_eos_to_source
        and member.reverse_source == reverse_source
        for member in model_args
    )

    if args.source_binary_file != "":
        assert args.target_binary_file != ""
        task.load_dataset(
            args.gen_subset, args.source_binary_file, args.target_binary_file
        )
    else:
        # Flags shared by every text-based loader below.
        source_flags = {
            "append_eos": append_eos_to_source,
            "reverse_source": reverse_source,
        }
        if pytorch_translate_data.is_multilingual(args):
            task.set_encoder_langs(reference_args.multiling_encoder_lang)
            task.set_decoder_langs(reference_args.multiling_decoder_lang)
            task.load_dataset_from_text_multilingual(
                args.gen_subset,
                source_text_file=args.source_text_file[0],
                target_text_file=args.target_text_file,
                source_lang_id=task.get_encoder_lang_id(
                    args.multiling_source_lang[0]
                ),
                target_lang_id=task.get_decoder_lang_id(
                    args.multiling_target_lang[0]
                ),
                **source_flags,
            )
        elif args.source_ensembling:
            task.load_multisource_dataset_from_text(
                args.gen_subset,
                source_text_files=args.source_text_file,
                target_text_file=args.target_text_file,
                **source_flags,
            )
        else:
            task.load_dataset_from_text(
                args.gen_subset,
                source_text_file=args.source_text_file[0],
                target_text_file=args.target_text_file,
                **source_flags,
            )

    scorer, num_sentences, gen_timer, _ = _generate_score(
        models=models, args=args, task=task, dataset=task.dataset(args.gen_subset)
    )
    print(
        f"| Translated {num_sentences} sentences ({gen_timer.n} tokens) "
        f"in {gen_timer.sum:.1f}s ({1. / gen_timer.avg:.2f} tokens/s)"
    )
    print(
        f"| Generate {args.gen_subset} with beam={args.beam}: "
        f"{scorer.result_string()}"
    )
    return scorer.score()
def build_model(cls, args, src_dict, dst_dict):
    """Build a new model instance.

    Applies ``base_architecture`` to ``args`` first. Multilingual setups
    are delegated to ``RNNModel.build_model_multilingual``; otherwise an
    encoder and a decoder are built and wrapped in ``cls``.
    """
    base_architecture(args)
    if not pytorch_translate_data.is_multilingual(args):
        # Encoder is constructed before the decoder (argument order).
        return cls(
            RNNModel.build_encoder(args, src_dict),
            RNNModel.build_decoder(args, src_dict, dst_dict),
        )
    return RNNModel.build_model_multilingual(args, src_dict, dst_dict)
def build_model(cls, args, task):
    """Build a new model instance from a task.

    Applies ``base_architecture`` to ``args`` and makes sure
    ``args.left_pad_source`` exists. Multilingual setups are delegated to
    ``RNNModel.build_model_multilingual``; otherwise the encoder and decoder
    are built from the task's source and target dictionaries.
    """
    base_architecture(args)

    # set default value for old checkpoints
    if not hasattr(args, "left_pad_source"):
        args.left_pad_source = True

    if pytorch_translate_data.is_multilingual(args):
        return RNNModel.build_model_multilingual(args, task)

    source_vocab, target_vocab = task.source_dictionary, task.target_dictionary
    rnn_encoder = RNNModel.build_encoder(args, source_vocab)
    rnn_decoder = RNNModel.build_decoder(args, source_vocab, target_vocab)
    return cls(task, rnn_encoder, rnn_decoder)
def preprocess_corpora(args):
    """Resolve the binary output paths on ``args`` and binarize the corpora.

    Each train/eval source/target binary path is passed through
    ``maybe_generate_temp_file_path`` and written back to ``args``; the
    multilingual or bilingual preprocessing routine is then invoked.
    """
    for path_attr in (
        "train_source_binary_path",
        "train_target_binary_path",
        "eval_source_binary_path",
        "eval_target_binary_path",
    ):
        setattr(
            args, path_attr, maybe_generate_temp_file_path(getattr(args, path_attr))
        )

    # Additional text preprocessing options could be added here before
    # binarizing.
    binarize = (
        preprocess_corpora_multilingual
        if pytorch_translate_data.is_multilingual(args)
        else preprocess_corpora_bilingual
    )
    binarize(args)
def preprocess_corpora(args):
    """Resolve the binary output paths on ``args`` and binarize the corpora.

    Multilingual setups are handed to ``preprocess_corpora_multilingual``.
    Otherwise vocabularies are built, the bilingual corpora are binarized,
    and, for the semi-supervised task, the monolingual corpora are
    binarized as well using the same vocabularies.
    """
    for path_attr in (
        "train_source_binary_path",
        "train_target_binary_path",
        "eval_source_binary_path",
        "eval_target_binary_path",
    ):
        setattr(
            args, path_attr, maybe_generate_temp_file_path(getattr(args, path_attr))
        )

    # Additional text preprocessing options could be added here before
    # binarizing.
    if pytorch_translate_data.is_multilingual(args):
        preprocess_corpora_multilingual(args)
        return

    # Vocabs are built before preprocessing because we might need to use
    # both monolingual and bilingual corpora sources to build the vocab
    # (in the case of semisupervised training)
    source_dict, char_source_dict, target_dict = build_vocabs(args=args)
    vocabs = {
        "source_dict": source_dict,
        "char_source_dict": char_source_dict,
        "target_dict": target_dict,
    }
    preprocess_bilingual_corpora(args=args, **vocabs)

    # Binarize additional monolingual corpora for the semisupervised
    # translation task. This runs only on the bilingual path because it
    # reuses the vocabularies built above.
    if args.task != constants.SEMI_SUPERVISED_TASK:
        return
    for mono_attr in (
        "train_mono_source_binary_path",
        "train_mono_target_binary_path",
    ):
        setattr(
            args,
            mono_attr,
            maybe_generate_temp_file_path(output_path=getattr(args, mono_attr, None)),
        )
    preprocess_monolingual_corpora(args, **vocabs)
def _generate_score(models, args, task, dataset, optimize=True):
    """Translate ``dataset`` with ``models`` and accumulate BLEU statistics.

    Args:
        models: Ensemble of models handed to the sequence generator.
        args: Parsed command-line arguments.
        task: Task object providing ``target_dictionary``.
        dataset: Dataset for the split being translated.
        optimize: When true, ``make_generation_fast_`` is called on every
            model before decoding.

    Returns:
        Tuple ``(scorer, num_sentences, gen_timer, translation_samples)``;
        ``translation_samples`` holds one ``OrderedDict`` per translated
        sentence. When ``args.translation_output_file`` or
        ``args.translation_probs_file`` is set, the hypotheses or
        ``np.exp`` of their scores are also written to those files.
    """
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load ensemble
    if not args.quiet:
        model_paths = args.path.split(":")
        print("| loading model(s) from {}".format(", ".join(model_paths)))

    # Optimize ensemble for generation
    if optimize:
        for member in models:
            member.make_generation_fast_(
                beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
                need_attn=True,
            )

    translator = build_sequence_generator(args, task, models)

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Keep track of translations
    # Initialize with empty translations
    # and zero probs scores
    translated_sentences = [""] * len(dataset)
    translated_scores = [0.0] * len(dataset)

    # Generate and compute BLEU score
    target_vocab = task.target_dictionary
    scorer = bleu.Scorer(target_vocab.pad(), target_vocab.eos(), target_vocab.unk())

    itr = get_eval_itr(args, models, task, dataset)

    num_sentences = 0
    translation_samples = []
    with progress_bar.build_progress_bar(args, itr) as progress:
        wps_meter = TimeMeter()
        gen_timer = StopwatchMeter()
        multilingual = pytorch_translate_data.is_multilingual(args)
        translations = translator.generate_batched_itr(
            progress,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=use_cuda,
            timer=gen_timer,
            prefix_size=1 if multilingual else 0,
        )
        first_best_translations = (
            _iter_first_best_multilingual
            if multilingual
            else _iter_first_best_bilingual
        )
        for info in first_best_translations(
            args, task, dataset, translations, align_dict
        ):
            scorer.add(info.target_tokens, info.hypo_tokens)
            translated_sentences[info.sample_id] = info.hypo_str
            translated_scores[info.sample_id] = info.hypo_score
            translation_samples.append(
                collections.OrderedDict(
                    [
                        ("sample_id", info.sample_id.item()),
                        ("src_str", info.src_str),
                        ("target_str", info.target_str),
                        ("hypo_str", info.hypo_str),
                    ]
                )
            )
            wps_meter.update(info.src_tokens.size(0))
            progress.log({"wps": round(wps_meter.avg)})
            num_sentences += 1

    # If applicable, save the translations to the output file
    # For eg. external evaluation
    output_path = getattr(args, "translation_output_file", False)
    if output_path:
        with open(output_path, "w") as out_file:
            for sentence in translated_sentences:
                print(sentence, file=out_file)

    probs_path = getattr(args, "translation_probs_file", False)
    if probs_path:
        with open(probs_path, "w") as out_file:
            for log_score in translated_scores:
                print(np.exp(log_score), file=out_file)

    return scorer, num_sentences, gen_timer, translation_samples
def generate(args):
    """Load vocabularies and an ensemble, build the eval split, and score it.

    The split stored under ``args.gen_subset`` is built from binary files
    when ``args.source_binary_file`` is non-empty, otherwise from text
    (multilingual, multi-source, or plain bilingual). The
    ``append_eos_to_source`` / ``reverse_source`` settings come from the
    first loaded model's arguments and must be shared by the whole ensemble.

    Args:
        args: Parsed command-line arguments; may be mutated
            (``char_source_dict_size``, ``source_lang``, ``target_lang``).

    Returns:
        The value of ``scorer.score()``.
    """
    pytorch_translate_options.print_args(args)

    load_vocab = pytorch_translate_dictionary.Dictionary.load
    src_dict = load_vocab(args.source_vocab_file)
    dst_dict = load_vocab(args.target_vocab_file)

    use_char_source = args.char_source_vocab_file != ""
    char_source_dict = None
    if use_char_source:
        char_source_dict = load_vocab(args.char_source_vocab_file)
        # this attribute is used for CharSourceModel construction
        args.char_source_dict_size = len(char_source_dict)

    dataset = data.LanguageDatasets(
        src=args.source_lang, dst=args.target_lang, src_dict=src_dict, dst_dict=dst_dict
    )
    models, model_args = pytorch_translate_utils.load_diverse_ensemble_for_inference(
        args.path, dataset.src_dict, dataset.dst_dict
    )
    reference_args = model_args[0]
    append_eos_to_source = reference_args.append_eos_to_source
    reverse_source = reference_args.reverse_source
    assert all(
        member.append_eos_to_source == append_eos_to_source
        and member.reverse_source == reverse_source
        for member in model_args
    )

    if args.source_binary_file != "":
        assert args.target_binary_file != ""
        dst_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
            args.target_binary_file
        )
        # Word+char sources need their own in-memory and pair dataset types.
        if use_char_source:
            source_loader = char_data.InMemoryNumpyWordCharDataset
            pair_class = char_data.LanguagePairSourceCharDataset
        else:
            source_loader = pytorch_translate_data.InMemoryNumpyDataset
            pair_class = data.LanguagePairDataset
        src_dataset = source_loader.create_from_file(args.source_binary_file)
        gen_split = pair_class(
            src=src_dataset,
            dst=dst_dataset,
            pad_idx=src_dict.pad(),
            eos_idx=dst_dict.eos(),
        )
    else:
        # Keyword arguments shared by every text-based dataset factory.
        text_options = {
            "source_dict": src_dict,
            "target_dict": dst_dict,
            "append_eos": append_eos_to_source,
            "reverse_source": reverse_source,
        }
        if pytorch_translate_data.is_multilingual(args):
            gen_split = pytorch_translate_data.make_language_pair_dataset_from_text_multilingual(
                source_text_file=args.source_text_file[0],
                target_text_file=args.target_text_file,
                source_lang_id=args.multiling_source_lang_id,
                target_lang_id=args.multiling_target_lang_id,
                **text_options,
            )
        elif args.source_ensembling:
            gen_split = multisource_data.make_multisource_language_pair_dataset_from_text(
                source_text_files=args.source_text_file,
                target_text_file=args.target_text_file,
                **text_options,
            )
        else:
            gen_split = pytorch_translate_data.make_language_pair_dataset_from_text(
                source_text_file=args.source_text_file[0],
                target_text_file=args.target_text_file,
                **text_options,
                char_source_dict=char_source_dict,
            )
    dataset.splits[args.gen_subset] = gen_split

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")
    print(f"| {args.gen_subset} {len(dataset.splits[args.gen_subset])} examples")

    scorer, num_sentences, gen_timer, _ = _generate_score(
        models=models, args=args, dataset=dataset, dataset_split=args.gen_subset
    )
    print(
        f"| Translated {num_sentences} sentences ({gen_timer.n} tokens) "
        f"in {gen_timer.sum:.1f}s ({1. / gen_timer.avg:.2f} tokens/s)"
    )
    print(
        f"| Generate {args.gen_subset} with beam={args.beam}: "
        f"{scorer.result_string()}"
    )
    return scorer.score()
def _generate_score(models, args, dataset, dataset_split, optimize=True):
    """Translate one split of ``dataset`` and accumulate BLEU statistics.

    Args:
        models: Ensemble of models handed to the sequence generator.
        args: Parsed command-line arguments.
        dataset: Object exposing ``splits``, ``dst_dict`` and
            ``eval_dataloader``.
        dataset_split: Key of the split in ``dataset.splits`` to translate.
        optimize: When true, ``make_generation_fast_`` is called on every
            model before decoding.

    Returns:
        Tuple ``(scorer, num_sentences, gen_timer, translation_samples)``.

    Raises:
        ValueError: If sharding is requested (``args.num_shards > 1``) and
            ``args.shard_id`` is outside ``[0, args.num_shards)``.
    """
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load ensemble
    if not args.quiet:
        print("| loading model(s) from {}".format(", ".join(args.path)))

    # Optimize ensemble for generation
    if optimize:
        for member in models:
            member.make_generation_fast_(
                beamable_mm_beam_size=None if args.no_beamable_mm else args.beam
            )

    # Initialize generator
    model_weights = (
        [float(weight.strip()) for weight in args.model_weights.split(",")]
        if args.model_weights
        else None
    )
    use_char_source = isinstance(models[0], char_source_model.CharSourceModel)

    # Use a different sequence generator in the multisource setting
    translator_class = (
        multisource_decode.MultiSourceSequenceGenerator
        if getattr(args, "source_ensembling", False)
        else beam_decode.SequenceGenerator
    )
    translator = translator_class(
        models,
        beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.length_penalty,
        unk_reward=args.unk_reward,
        word_reward=args.word_reward,
        model_weights=model_weights,
        use_char_source=use_char_source,
    )
    if use_cuda:
        translator.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Keep track of translations
    # Initialize with empty translations
    # and zero probs scores
    split_size = len(dataset.splits[dataset_split])
    translated_sentences = [""] * split_size
    translated_scores = [0.0] * split_size

    # Generate and compute BLEU score
    target_vocab = dataset.dst_dict
    scorer = bleu.Scorer(target_vocab.pad(), target_vocab.eos(), target_vocab.unk())

    max_positions = min(member.max_encoder_positions() for member in models)
    itr = dataset.eval_dataloader(
        dataset_split,
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
        skip_invalid_size_inputs_valid_test=(args.skip_invalid_size_inputs_valid_test),
    )
    if args.num_shards > 1:
        if not 0 <= args.shard_id < args.num_shards:
            raise ValueError("--shard-id must be between 0 and num_shards")
        itr = data.sharded_iterator(itr, args.num_shards, args.shard_id)

    num_sentences = 0
    translation_samples = []
    with progress_bar.build_progress_bar(args, itr) as progress:
        wps_meter = TimeMeter()
        # Keep more detailed timing when invoked from benchmark
        if "keep_detailed_timing" in args:
            gen_timer = pytorch_translate_utils.BucketStopwatchMeter(
                args.increment, args.max_length, args.samples_per_length
            )
        else:
            gen_timer = StopwatchMeter()

        multilingual = pytorch_translate_data.is_multilingual(args)
        translations = translator.generate_batched_itr(
            progress,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=use_cuda,
            timer=gen_timer,
            prefix_size=1 if multilingual else 0,
        )
        first_best_translations = (
            _iter_first_best_multilingual
            if multilingual
            else _iter_first_best_bilingual
        )
        for info in first_best_translations(
            args, dataset, dataset_split, translations, align_dict
        ):
            scorer.add(info.target_tokens, info.hypo_tokens)
            translated_sentences[info.sample_id] = info.hypo_str
            translated_scores[info.sample_id] = info.hypo_score
            translation_samples.append(
                collections.OrderedDict(
                    [
                        ("sample_id", info.sample_id),
                        ("src_str", info.src_str),
                        ("target_str", info.target_str),
                        ("hypo_str", info.hypo_str),
                    ]
                )
            )
            wps_meter.update(info.src_tokens.size(0))
            progress.log({"wps": round(wps_meter.avg)})
            num_sentences += 1

    # If applicable, save the translations to the output file
    # For eg. external evaluation
    output_path = getattr(args, "translation_output_file", False)
    if output_path:
        with open(output_path, "w") as out_file:
            for sentence in translated_sentences:
                print(sentence, file=out_file)

    probs_path = getattr(args, "translation_probs_file", False)
    if probs_path:
        with open(probs_path, "w") as out_file:
            for log_score in translated_scores:
                print(np.exp(log_score), file=out_file)

    return scorer, num_sentences, gen_timer, translation_samples
def _iter_translations(args, task, dataset, translations, align_dict, rescoring_model=None):
    """Iterate over translations.

    This is a generator function which wraps the beam-search sequence
    generator, performing such work on the output as converting token
    indices to strings, printing output where applicable (not args.quiet),
    collecting oracle translations where applicable, and removing
    language-ID tokens for multilingual translation.

    Args:
        args: Command-line arguments.
        task: FairseqTask object.
        dataset: Dataset set object for a specific split.
        translations: Batched translation iterator, as returned by
            SequenceGenerator.generate_batched_itr().
        align_dict: Dictionary for UNK replacement.
        rescoring_model: Passed through unchanged to
            ``rescoring.run_rescoring`` together with the n-best list.

    Yields:
        For each sentence in `translations`, yields a TranslationInfo whose
        ``hypo_tokens``, ``hypo_str`` and ``hypo_score`` all describe the
        top-ranked hypothesis.

    NOTE(review): the top-hypothesis values are only assigned inside the
    n-best loop, so a sample with no hypotheses (or ``args.nbest == 0``)
    raises ``UnboundLocalError`` on its first occurrence.
    """
    is_multilingual = pytorch_translate_data.is_multilingual(args)

    for sample_id, src_tokens, target_tokens, hypos in translations:
        # Process input and ground truth
        target_tokens = target_tokens.int().cpu()

        if is_multilingual:
            src_lang_id = (
                src_tokens[-1] - pytorch_translate_data.MULTILING_DIALECT_ID_OFFSET
            )
            target_lang_id = (
                target_tokens[0] - pytorch_translate_data.MULTILING_DIALECT_ID_OFFSET
            )

            # remove language ID tokens
            src_tokens = src_tokens[:-1]
            target_tokens = target_tokens[1:]

            # Select dictionaries
            src_dict = task.source_dictionaries[
                task.get_encoder_lang_code(src_lang_id)
            ]
            target_dict = task.target_dictionaries[
                task.get_decoder_lang_code(target_lang_id)
            ]
        else:
            src_dict = task.source_dictionary
            target_dict = task.target_dictionary

        # Either retrieve the original sentences or regenerate them from tokens.
        if align_dict is not None:
            src_str = dataset.src.get_original_text(sample_id)
            target_str = dataset.tgt.get_original_text(sample_id)
        else:
            src_str = src_dict.string(src_tokens, args.remove_bpe)
            target_str = target_dict.string(
                target_tokens, args.remove_bpe, escape_unk=True
            )

        if not args.quiet:
            print(f"S-{sample_id}\t{src_str}")
            print(f"T-{sample_id}\t{target_str}")

        # used for oracle evaluation (args.report_oracle_bleu)
        best_hypo_tokens = None
        best_hypo_score = 0
        collect_oracle_hypos = args.report_oracle_bleu or (
            args.output_hypos_binary_path and args.nbest > 0
        )

        # Process top predictions. Slicing already clamps to len(hypos), so
        # no explicit min() is needed.
        for i, hypo in enumerate(hypos[: args.nbest]):
            hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                hypo_tokens=hypo["tokens"].int().cpu(),
                src_str=src_str,
                alignment=hypo["alignment"].int().cpu(),
                align_dict=align_dict,
                tgt_dict=task.target_dictionary,
                remove_bpe=args.remove_bpe,
            )

            if not args.quiet:
                print(f"H-{sample_id}\t{hypo['score']}\t{hypo_str}")
                print(
                    "A-{}\t{}".format(
                        sample_id,
                        " ".join(map(lambda x: str(utils.item(x)), alignment)),
                    )
                )

            if collect_oracle_hypos:
                score = smoothed_sentence_bleu(task, target_tokens, hypo_tokens)
                if score > best_hypo_score:
                    best_hypo_tokens = hypo_tokens
                    best_hypo_score = score

            if i == 0:
                if align_dict is not None or args.remove_bpe is not None:
                    # Convert back to tokens for evaluation with unk replacement
                    # and/or without BPE
                    target_tokens = tokenizer.Tokenizer.tokenize(
                        target_str, task.target_dictionary, add_if_not_exist=True
                    )
                # The probs score for the hypo_str; whether it's normalized by
                # sequence length or not depends on normalize_scores, which is
                # set by arg.nonormalize.
                # However, as I tried, whether normalize_scores is set or not,
                # the returned scores are the same (to be investigated).
                # Here, the probs are normalized by hypo length so the value
                # is big enough to be used as weights for backtranslations in
                # dual learning.
                hypo_score = (
                    hypo["score"] / len(hypo_tokens) if len(hypo_tokens) > 0 else 0.0
                )
                top_hypo_tokens = hypo_tokens
                # Capture the string of the top hypothesis here: `hypo_str`
                # is overwritten by every later iteration, so yielding it
                # directly would report the string of the last n-best entry
                # next to the tokens and score of the first one.
                top_hypo_str = hypo_str

        if not collect_oracle_hypos:
            best_hypo_tokens = top_hypo_tokens

        hypo_tokens_after_rescoring = rescoring.run_rescoring(
            args, task, hypos[: args.nbest], src_tokens, rescoring_model
        )

        yield TranslationInfo(
            sample_id=sample_id,
            src_tokens=src_tokens,
            target_tokens=target_tokens,
            hypo_tokens=top_hypo_tokens,
            src_str=src_str,
            target_str=target_str,
            hypo_str=top_hypo_str,
            hypo_score=hypo_score,
            best_hypo_tokens=best_hypo_tokens,
            hypo_tokens_after_rescoring=hypo_tokens_after_rescoring,
        )