def preprocess_corpora(args, dictionary_cls=Dictionary):
    """Fill in binary output paths on ``args`` and binarize the corpora.

    The binary-path attributes on ``args`` are rewritten in place with
    whatever ``maybe_generate_temp_file_path`` returns for them, and then the
    preprocessing routine matching the configuration is invoked.

    Args:
        args: Parsed arguments namespace; mutated in place.
        dictionary_cls: Dictionary class forwarded to the vocab-building and
            many-to-one multilingual preprocessing routines.
    """
    train_src = args.train_source_binary_path
    train_tgt = args.train_target_binary_path
    if train_src is not None and train_tgt is not None:
        # Only touch the training paths when both parse to a plain string
        # (i.e. not to a collection of paths).
        both_plain_strings = all(
            isinstance(utils.maybe_parse_collection_argument(path), str)
            for path in (train_src, train_tgt)
        )
        if both_plain_strings:
            args.train_source_binary_path = maybe_generate_temp_file_path(
                train_src
            )
            args.train_target_binary_path = maybe_generate_temp_file_path(
                train_tgt
            )

    for attr in ("eval_source_binary_path", "eval_target_binary_path"):
        setattr(args, attr, maybe_generate_temp_file_path(getattr(args, attr)))

    # Additional text preprocessing options could be added here before
    # binarizing.
    if pytorch_translate_data.is_multilingual(args):
        preprocess_corpora_multilingual(args)
    elif pytorch_translate_data.is_multilingual_many_to_one(args):
        preprocess_corpora_multilingual_many_to_one(args, dictionary_cls)
    elif pytorch_translate_data.is_latent_variable(args):
        preprocess_corpora_latent_variable(args)
    else:
        # The vocabularies come first: building them may need both the
        # monolingual and the bilingual corpora (semi-supervised training).
        source_dict, char_source_dict, target_dict = build_vocabs(
            args=args, dictionary_cls=dictionary_cls
        )
        preprocess_bilingual_corpora(
            args=args,
            source_dict=source_dict,
            char_source_dict=char_source_dict,
            target_dict=target_dict,
        )

    # Everything below only applies to the tasks that consume additional
    # monolingual corpora.
    monolingual_tasks = (
        constants.SEMI_SUPERVISED_TASK,
        constants.DENOISING_AUTOENCODER_TASK,
    )
    if args.task not in monolingual_tasks:
        return

    args.train_mono_source_binary_path = maybe_generate_temp_file_path(
        output_path=getattr(args, "train_mono_source_binary_path", None)
    )
    args.train_mono_target_binary_path = maybe_generate_temp_file_path(
        output_path=getattr(args, "train_mono_target_binary_path", None)
    )
    # NOTE(review): the three dictionaries are only bound in the final `else`
    # branch above; confirm these tasks never take one of the other branches.
    preprocess_monolingual_corpora(
        args,
        source_dict=source_dict,
        char_source_dict=char_source_dict,
        target_dict=target_dict,
    )
def generate(args):
    """Load an ensemble, load the evaluation data, translate it and score it.

    The source/target languages on ``args`` are overwritten with those stored
    in the first model's arguments. The dataset is loaded from binary files
    when ``args.source_binary_file`` is non-empty, and from text files
    otherwise.

    Args:
        args: Parsed generation arguments; mutated in place.

    Returns:
        The value of ``scorer.score()`` for the generated translations.
    """
    pytorch_translate_options.print_args(args)

    checkpoint_paths = args.path.split(":")
    (
        models,
        model_args,
        task,
    ) = pytorch_translate_utils.load_diverse_ensemble_for_inference(
        checkpoint_paths
    )

    first_model_args = model_args[0]
    args.source_lang = first_model_args.source_lang
    args.target_lang = first_model_args.target_lang
    append_eos_to_source = first_model_args.append_eos_to_source
    reverse_source = first_model_args.reverse_source
    # Every model in the ensemble must agree on how the source is prepared.
    assert all(
        member.append_eos_to_source == append_eos_to_source
        and member.reverse_source == reverse_source
        for member in model_args
    )

    if args.source_binary_file != "":
        assert args.target_binary_file != ""
        task.load_dataset(
            args.gen_subset, args.source_binary_file, args.target_binary_file
        )
    elif pytorch_translate_data.is_multilingual_many_to_one(args):
        task.set_encoder_langs(first_model_args.multiling_encoder_lang)
        task.set_decoder_langs(first_model_args.multiling_decoder_lang)
        task.load_dataset_from_text_multilingual(
            args.gen_subset,
            source_text_file=args.source_text_file[0],
            target_text_file=args.target_text_file,
            source_lang_id=task.get_encoder_lang_id(args.multiling_source_lang[0]),
            target_lang_id=task.get_decoder_lang_id(args.multiling_target_lang[0]),
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )
    elif args.source_ensembling:
        task.load_multisource_dataset_from_text(
            args.gen_subset,
            source_text_files=args.source_text_file,
            target_text_file=args.target_text_file,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )
    else:
        task.load_dataset_from_text(
            args.gen_subset,
            source_text_file=args.source_text_file[0],
            target_text_file=args.target_text_file,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )

    # A language-pair key is only supplied for multi-task setups.
    if not isinstance(task, PyTorchTranslateMultiTask):
        lang_pair = None
    elif args.source_lang and args.target_lang:
        lang_pair = args.source_lang + "-" + args.target_lang
    else:
        lang_pair = "src-tgt"

    scorer, num_sentences, gen_timer, _ = generate_score(
        args=args,
        task=task,
        dataset=task.dataset(args.gen_subset),
        lang_pair=lang_pair,
        models=models,
    )

    token_count = gen_timer.n
    elapsed = gen_timer.sum
    throughput = 1. / gen_timer.avg
    print(
        f"| Translated {num_sentences} sentences ({token_count} tokens) "
        f"in {elapsed:.1f}s ({throughput:.2f} tokens/s)"
    )
    print(
        f"| Generate {args.gen_subset} with beam={args.beam}: "
        f"{scorer.result_string()}"
    )
    return scorer.score()
def _iter_translations(
    args, task, dataset, translations, align_dict, rescorer, modify_target_dict
):
    """Iterate over translations.

    This is a generator function which wraps the beam-search sequence
    generator, performing such work on the output as converting token indices
    to strings, printing output where applicable (not args.quiet), collecting
    oracle translations where applicable, and removing language-ID tokens for
    multilingual translation.

    Args:
        args: Command-line arguments.
        task: FairseqTask object.
        dataset: Dataset object for a specific split.
        translations: Batched translation iterator, as returned by
            SequenceGenerator.generate_batched_itr().
        align_dict: Dictionary for UNK replacement. When it is not None the
            original source/target text is read back from `dataset` instead
            of being regenerated from token indices.
        rescorer: Accepted for interface compatibility; it is not referenced
            anywhere in this function body.
        modify_target_dict: Forwarded as `add_if_not_exist` when the reference
            string is re-encoded with the target dictionary.

    Yields:
        For each sentence in `translations`, yields a TranslationInfo.
    """
    is_multilingual = pytorch_translate_data.is_multilingual_many_to_one(args)

    for sample_id, src_tokens, target_tokens, hypos in translations:
        # Process input and ground truth
        target_tokens = target_tokens.int().cpu()

        if is_multilingual:
            # The language ID is carried by the last source token and the
            # first target token, shifted by MULTILING_DIALECT_ID_OFFSET.
            src_lang_id = (
                src_tokens[-1] - pytorch_translate_data.MULTILING_DIALECT_ID_OFFSET
            )
            target_lang_id = (
                target_tokens[0] - pytorch_translate_data.MULTILING_DIALECT_ID_OFFSET
            )

            # remove language ID tokens
            src_tokens = src_tokens[:-1]
            target_tokens = target_tokens[1:]

            # Select dictionaries
            src_dict = task.source_dictionaries[task.get_encoder_lang_code(src_lang_id)]
            target_dict = task.target_dictionaries[
                task.get_decoder_lang_code(target_lang_id)
            ]
        else:
            src_dict = task.source_dictionary
            target_dict = task.target_dictionary

        # Either retrieve the original sentences or regenerate them from tokens.
        if align_dict is not None:
            src_str = dataset.src.get_original_text(sample_id)
            target_str = dataset.tgt.get_original_text(sample_id)
        else:
            src_str = src_dict.string(src_tokens, args.remove_bpe)
            target_str = target_dict.string(
                target_tokens, args.remove_bpe, escape_unk=True
            )

        if not args.quiet:
            print(f"S-{sample_id}\t{src_str}")
            print(f"T-{sample_id}\t{target_str}")

        # used for oracle evaluation (args.report_oracle_bleu)
        best_hypo_tokens = None
        best_hypo_score = 0
        collect_oracle_hypos = args.report_oracle_bleu or (
            args.output_hypos_binary_path and args.nbest > 0
        )

        # Process top predictions
        # NOTE(review): if this slice is empty (no hypotheses, or
        # args.nbest <= 0) the `i == 0` branch never runs, so hypo_score,
        # top_hypo_tokens and top_hypo_str are either unbound or still hold
        # the previous sample's values when the yield below is reached.
        for i, hypo in enumerate(hypos[: min(len(hypos), args.nbest)]):
            # NOTE(review): the task-level target dictionary is used here,
            # not the per-language `target_dict` selected above; confirm this
            # is intended for the multilingual case.
            hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                hypo_tokens=hypo["tokens"].int().cpu(),
                src_str=src_str,
                alignment=hypo["alignment"].int().cpu()
                if align_dict is not None
                else None,
                align_dict=align_dict,
                tgt_dict=task.target_dictionary,
                remove_bpe=args.remove_bpe,
            )

            if not args.quiet:
                print(f"H-{sample_id}\t{hypo['score']}\t{hypo_str}")
                if alignment is not None:
                    print(
                        "A-{}\t{}".format(
                            sample_id,
                            " ".join(map(lambda x: str(utils.item(x)), alignment)),
                        )
                    )

            if collect_oracle_hypos:
                # Keep the hypothesis with the highest smoothed sentence BLEU.
                # A hypothesis only replaces the current best when its score
                # is strictly greater, so best_hypo_tokens stays None if no
                # score exceeds the initial 0.
                score = smoothed_sentence_bleu(task, target_tokens, hypo_tokens)
                if score > best_hypo_score:
                    best_hypo_tokens = hypo_tokens
                    best_hypo_score = score

            if i == 0:
                if align_dict is not None or args.remove_bpe is not None:
                    # Convert back to tokens for evaluation with unk replacement
                    # and/or without BPE
                    # NOTE(review): this rebinds target_tokens, so the oracle
                    # BLEU above sees the original tokens for i == 0 and the
                    # re-encoded ones for every later hypothesis.
                    target_tokens = task.target_dictionary.encode_line(
                        target_str, add_if_not_exist=modify_target_dict
                    )
                # The probs score for the hypo_str; whether it's normalized by
                # sequence length or not depends on normalize_scores, which is
                # set by arg.nonormalize.
                # However, as I tried, whether normalize_scores is set or not,
                # the returned scores are the same (to be investigated).
                # Here, the probs are normalized by hypo length so the value
                # is big enough to be used as weights for backtranslations in
                # dual learning.
                hypo_score = (
                    hypo["score"] / len(hypo_tokens) if len(hypo_tokens) > 0 else 0.0
                )
                top_hypo_tokens = hypo_tokens
                top_hypo_str = hypo_str

        # Without oracle collection the "best" hypothesis is simply the top one.
        if not collect_oracle_hypos:
            best_hypo_tokens = top_hypo_tokens

        yield TranslationInfo(
            sample_id=sample_id,
            src_tokens=src_tokens,
            target_tokens=target_tokens,
            hypo_tokens=top_hypo_tokens,
            src_str=src_str,
            target_str=target_str,
            hypo_str=top_hypo_str,
            hypo_score=hypo_score,
            best_hypo_tokens=best_hypo_tokens,
            hypos=hypos,
        )
def _generate_score(models, args, task, dataset, modify_target_dict):
    """Translate `dataset` with `models`, score the output and export results.

    Besides scoring, this optionally writes several artifacts, each controlled
    by an attribute of `args`: binarized hypotheses
    (`output_hypos_binary_path`), the binarized source
    (`output_source_binary_path`), a pickled list of per-sentence translation
    info (`translation_info_export_path`), translations plus their scores
    (`translation_output_file` together with `translation_probs_file`) and all
    hypotheses as text (`hypotheses_export_path`).

    Args:
        models: Models of the ensemble; each is switched to its fast
            generation mode in place.
        args: Parsed generation arguments.
        task: Task object providing dictionaries and the batch iterator.
        dataset: Dataset to translate; it is subsampled first when
            `args.max_examples_to_evaluate` is positive.
        modify_target_dict: Forwarded unchanged to `_iter_translations`.

    Returns:
        A tuple `(scorer, num_sentences, gen_timer, translation_samples)`
        where `translation_samples` is a list with one OrderedDict
        (sample_id, src_str, target_str, hypo_str) per translated sentence.
    """
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load ensemble
    if not args.quiet:
        print("| loading model(s) from {}".format(", ".join(args.path.split(":"))))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=True,
        )

    translator = build_sequence_generator(args, task, models)
    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    print("seed number is" + str(args.max_examples_to_evaluate_seed))
    if args.max_examples_to_evaluate > 0:
        pytorch_translate_data.subsample_pair_dataset(
            dataset, args.max_examples_to_evaluate, args.max_examples_to_evaluate_seed
        )

    # Keep track of translations
    # Initialize with empty translations
    # and zero probs scores
    translated_sentences = [""] * len(dataset)
    translated_scores = [0.0] * len(dataset)
    hypos_list = []

    # Optional outputs; these attributes may be absent from `args`, and they
    # do not change while translating, so look them up once.
    collect_output_hypos = getattr(args, "output_hypos_binary_path", False)
    translation_output_file = getattr(args, "translation_output_file", False)
    translation_probs_file = getattr(args, "translation_probs_file", False)
    hypotheses_export_path = getattr(args, "hypotheses_export_path", False)
    if collect_output_hypos:
        output_hypos_token_arrays = [None] * len(dataset)

    # Generate and compute BLEU score
    dst_dict = task.target_dictionary
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(), dst_dict.unk())

    itr = task.get_batch_iterator(
        dataset=dataset,
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(), *[model.max_positions() for model in models]
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    oracle_scorer = None
    if args.report_oracle_bleu:
        oracle_scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(), dst_dict.unk())

    rescorer = None
    num_sentences = 0
    translation_samples = []
    translation_info_list = []
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=use_cuda,
            timer=gen_timer,
            prefix_size=1
            if pytorch_translate_data.is_multilingual_many_to_one(args)
            else 0,
        )

        for trans_info in _iter_translations(
            args, task, dataset, translations, align_dict, rescorer, modify_target_dict
        ):
            # String-based scorers expose add_string; the others take tokens.
            if hasattr(scorer, "add_string"):
                scorer.add_string(trans_info.target_str, trans_info.hypo_str)
            else:
                scorer.add(trans_info.target_tokens, trans_info.hypo_tokens)
            if oracle_scorer is not None:
                oracle_scorer.add(trans_info.target_tokens, trans_info.best_hypo_tokens)

            if translation_output_file:
                translated_sentences[trans_info.sample_id] = trans_info.hypo_str
            if translation_probs_file:
                translated_scores[trans_info.sample_id] = trans_info.hypo_score
            if hypotheses_export_path:
                hypos_list.append(trans_info.hypos)
            if collect_output_hypos:
                output_hypos_token_arrays[
                    trans_info.sample_id
                ] = trans_info.best_hypo_tokens
            if args.translation_info_export_path is not None:
                # Strip expensive data from hypotheses before saving (only
                # score and tokens are kept) and make sure everything is on
                # cpu before exporting.
                hypos = [
                    {"score": hypo["score"], "tokens": hypo["tokens"].cpu()}
                    for hypo in trans_info.hypos
                ]
                translation_info_list.append(
                    {
                        "src_tokens": trans_info.src_tokens.cpu(),
                        "target_tokens": trans_info.target_tokens,
                        "hypos": hypos,
                    }
                )
            translation_samples.append(
                collections.OrderedDict(
                    {
                        "sample_id": trans_info.sample_id.item(),
                        "src_str": trans_info.src_str,
                        "target_str": trans_info.target_str,
                        "hypo_str": trans_info.hypo_str,
                    }
                )
            )
            wps_meter.update(trans_info.src_tokens.size(0))
            t.log({"wps": round(wps_meter.avg)})
            num_sentences += 1

    # If applicable, save collected hypothesis tokens to binary output file
    if collect_output_hypos:
        output_dataset = pytorch_translate_data.InMemoryIndexedDataset()
        output_dataset.load_from_sequences(output_hypos_token_arrays)
        output_dataset.save(args.output_hypos_binary_path)
    if args.output_source_binary_path:
        dataset.src.save(args.output_source_binary_path)
    if args.translation_info_export_path is not None:
        # The context manager closes the file even if pickling fails.
        with open(args.translation_info_export_path, "wb") as f:
            pickle.dump(translation_info_list, f)

    # If applicable, save the translations and scores to the output files
    # These two outputs are used in dual learning for weighted backtranslation
    if translation_output_file and translation_probs_file:
        with open(translation_output_file, "w") as translation_file, open(
            translation_probs_file, "w"
        ) as score_file:
            for hypo_str, hypo_score in zip(translated_sentences, translated_scores):
                # Blank translations are skipped in both files so that the
                # two files stay line-aligned.
                if len(hypo_str.strip()) > 0:
                    print(hypo_str, file=translation_file)
                    print(np.exp(hypo_score), file=score_file)

    # For eg. external evaluation
    if hypotheses_export_path:
        with open(hypotheses_export_path, "w") as out_file:
            for hypos in hypos_list:
                for hypo in hypos:
                    print(
                        task.tgt_dict.string(
                            hypo["tokens"], bpe_symbol=args.remove_bpe
                        ),
                        file=out_file,
                    )

    if oracle_scorer is not None:
        print(f"| Oracle BLEU (best hypo in beam): {oracle_scorer.result_string()}")

    return scorer, num_sentences, gen_timer, translation_samples