def _generate_score(models, args, dataset, dataset_split):
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load ensemble
    if not args.quiet:
        print(f"| loading model(s) from {', '.join(args.path)}")

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam
        )

    # Initialize generator
    translator = beam_decode.SequenceGenerator(
        models,
        beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen,
        unk_penalty=args.unkpen,
        word_reward=args.word_reward,
    )
    if use_cuda:
        translator.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Generate and compute BLEU score
    scorer = bleu.Scorer(
        dataset.dst_dict.pad(),
        dataset.dst_dict.eos(),
        dataset.dst_dict.unk(),
    )
    max_positions = min(model.max_encoder_positions() for model in models)
    itr = dataset.eval_dataloader(
        dataset_split,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
        skip_invalid_size_inputs_valid_test=(
            args.skip_invalid_size_inputs_valid_test
        ),
    )
    if args.num_shards > 1:
        if args.shard_id < 0 or args.shard_id >= args.num_shards:
            raise ValueError('--shard-id must be between 0 and num_shards')
        itr = data.sharded_iterator(itr, args.num_shards, args.shard_id)

    num_sentences = 0
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=use_cuda,
            timer=gen_timer,
        )
        for sample_id, src_tokens, target_tokens, hypos in translations:
            # Process input and ground truth
            target_tokens = target_tokens.int().cpu()
            # Either retrieve the original sentences or regenerate them from tokens.
            if align_dict is not None:
                src_str = dataset.splits[dataset_split].src.get_original_text(sample_id)
                target_str = dataset.splits[dataset_split].dst.get_original_text(sample_id)
            else:
                src_str = dataset.src_dict.string(src_tokens, args.remove_bpe)
                target_str = dataset.dst_dict.string(
                    target_tokens,
                    args.remove_bpe,
                    escape_unk=True,
                )

            if not args.quiet:
                print(f'S-{sample_id}\t{src_str}')
                print(f'T-{sample_id}\t{target_str}')

            # Process top predictions
            for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'].int().cpu(),
                    align_dict=align_dict,
                    dst_dict=dataset.dst_dict,
                    remove_bpe=args.remove_bpe,
                )

                if not args.quiet:
                    print(f"H-{sample_id}\t{hypo['score']}\t{hypo_str}")
                    print(f"A-{sample_id}\t{' '.join(map(lambda x: str(utils.item(x)), alignment))}")

                # Score only the top hypothesis
                if i == 0:
                    if align_dict is not None or args.remove_bpe is not None:
                        # Convert back to tokens for evaluation with unk replacement
                        # and/or without BPE
                        target_tokens = tokenizer.Tokenizer.tokenize(
                            target_str,
                            dataset.dst_dict,
                            add_if_not_exist=True,
                        )
                    scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(src_tokens.size(0))
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += 1

    return scorer, num_sentences, gen_timer
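# A minimal usage sketch of the fairseq BLEU scorer API relied on above,
# assuming a default fairseq Dictionary (its pad/eos/unk indices are what
# bleu.Scorer needs); reference and hypothesis are 1-D torch.IntTensors.
from fairseq import bleu
from fairseq.data import dictionary

d = dictionary.Dictionary()
scorer = bleu.Scorer(d.pad(), d.eos(), d.unk())
ref = d.encode_line("a b c", add_if_not_exist=True).int().cpu()
hyp = d.encode_line("a b d", add_if_not_exist=True).int().cpu()
scorer.add(ref, hyp)
print(scorer.result_string())  # corpus-level BLEU over everything add()-ed so far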
def _generate_score(models, args, task, dataset):
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load ensemble
    if not args.quiet:
        print("| loading model(s) from {}".format(", ".join(args.path.split(":"))))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=True,
        )
    translator = build_sequence_generator(args, task, models)

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    print("seed number is " + str(args.max_examples_to_evaluate_seed))
    if args.max_examples_to_evaluate > 0:
        pytorch_translate_data.subsample_pair_dataset(
            dataset, args.max_examples_to_evaluate, args.max_examples_to_evaluate_seed
        )

    # Keep track of translations: initialize with empty translations
    # and zero-probability scores
    translated_sentences = [""] * len(dataset)
    translated_scores = [0.0] * len(dataset)
    hypos_list = []

    collect_output_hypos = getattr(args, "output_hypos_binary_path", False)
    if collect_output_hypos:
        output_hypos_token_arrays = [None] * len(dataset)

    # Generate and compute BLEU score
    dst_dict = task.target_dictionary
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(), dst_dict.unk())

    itr = task.get_batch_iterator(
        dataset=dataset,
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(), *[model.max_positions() for model in models]
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    oracle_scorer = None
    if args.report_oracle_bleu:
        oracle_scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(), dst_dict.unk())

    rescorer = None
    num_sentences = 0
    translation_samples = []
    translation_info_list = []
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=use_cuda,
            timer=gen_timer,
            prefix_size=1
            if pytorch_translate_data.is_multilingual_many_to_one(args)
            else 0,
        )

        for trans_info in _iter_translations(
            args, task, dataset, translations, align_dict, rescorer
        ):
            if hasattr(scorer, "add_string"):
                scorer.add_string(trans_info.target_str, trans_info.hypo_str)
            else:
                scorer.add(trans_info.target_tokens, trans_info.hypo_tokens)
            if oracle_scorer is not None:
                oracle_scorer.add(trans_info.target_tokens, trans_info.best_hypo_tokens)

            if getattr(args, "translation_output_file", False):
                translated_sentences[trans_info.sample_id] = trans_info.hypo_str
            if getattr(args, "translation_probs_file", False):
                translated_scores[trans_info.sample_id] = trans_info.hypo_score
            if getattr(args, "hypotheses_export_path", False):
                hypos_list.append(trans_info.hypos)
            if collect_output_hypos:
                output_hypos_token_arrays[trans_info.sample_id] = trans_info.best_hypo_tokens
            if args.translation_info_export_path is not None:
                # Strip expensive data from hypotheses before saving
                hypos = [
                    {k: v for k, v in hypo.items() if k in ["tokens", "score"]}
                    for hypo in trans_info.hypos
                ]
                # Make sure everything is on cpu before exporting
                hypos = [
                    {"score": hypo["score"], "tokens": hypo["tokens"].cpu()}
                    for hypo in hypos
                ]
                translation_info_list.append({
                    "src_tokens": trans_info.src_tokens.cpu(),
                    "target_tokens": trans_info.target_tokens,
                    "hypos": hypos,
                })
            translation_samples.append(
                collections.OrderedDict({
                    "sample_id": trans_info.sample_id.item(),
                    "src_str": trans_info.src_str,
                    "target_str": trans_info.target_str,
                    "hypo_str": trans_info.hypo_str,
                }))
            wps_meter.update(trans_info.src_tokens.size(0))
            t.log({"wps": round(wps_meter.avg)})
            num_sentences += 1

    # If applicable, save collected hypothesis tokens to binary output file
    if collect_output_hypos:
        output_dataset = pytorch_translate_data.InMemoryIndexedDataset()
        output_dataset.load_from_sequences(output_hypos_token_arrays)
        output_dataset.save(args.output_hypos_binary_path)
    if args.output_source_binary_path:
        dataset.src.save(args.output_source_binary_path)
    if args.translation_info_export_path is not None:
        with open(args.translation_info_export_path, "wb") as f:
            pickle.dump(translation_info_list, f)

    # If applicable, save the translations and scores to the output files.
    # These two outputs are used in dual learning for weighted backtranslation.
    if getattr(args, "translation_output_file", False):
        with open(args.translation_output_file, "w") as out_file:
            for hypo_str in translated_sentences:
                print(hypo_str, file=out_file)
    if getattr(args, "translation_probs_file", False):
        with open(args.translation_probs_file, "w") as out_file:
            for hypo_score in translated_scores:
                print(np.exp(hypo_score), file=out_file)

    # For e.g. external evaluation
    if getattr(args, "hypotheses_export_path", False):
        with open(args.hypotheses_export_path, "w") as out_file:
            for hypos in hypos_list:
                for hypo in hypos:
                    print(
                        task.tgt_dict.string(hypo["tokens"], bpe_symbol=args.remove_bpe),
                        file=out_file,
                    )

    if oracle_scorer is not None:
        print(f"| Oracle BLEU (best hypo in beam): {oracle_scorer.result_string()}")

    return scorer, num_sentences, gen_timer, translation_samples
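# If --translation-info-export-path was given above, the pickled list can be
# loaded back for offline analysis. A small sketch, with a hypothetical path:
import pickle

with open("translation_info.pkl", "rb") as f:  # hypothetical file name
    translation_info = pickle.load(f)
for info in translation_info[:3]:
    # each entry holds cpu tensors plus the stripped {"score", "tokens"} hypos
    print(info["src_tokens"].size(), info["hypos"][0]["score"])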
def main(args):
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'

    utils.import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(':'),
        arg_overrides=eval(args.model_overrides),
        task=task,
        bert_ratio=args.bert_ratio if args.change_ratio else None,
        encoder_ratio=args.encoder_ratio if args.change_ratio else None,
        geargs=args,
    )

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(), *[model.max_positions() for model in models]
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())

    num_sentences = 0
    has_target = True
    resdict = {}
    results_path = args.results_path
    stamp = str(time.time())
    resfp = results_path + "/" + args.gen_subset + "." + stamp + ".gen_sparql.json"

    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample['target'][:, :args.prefix_size]

            gen_timer.start()
            hypos = task.inference_step(generator, models, sample, prefix_tokens)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            for i, sample_id in enumerate(sample['id'].tolist()):
                has_target = sample['target'] is not None

                # Remove padding
                src_tokens = utils.strip_pad(
                    sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
                target_tokens = None
                if has_target:
                    target_tokens = utils.strip_pad(
                        sample['target'][i, :], tgt_dict.pad()).int().cpu()

                # Either retrieve the original sentences or regenerate them from tokens.
                if align_dict is not None:
                    src_str = task.dataset(args.gen_subset).src.get_original_text(sample_id)
                    target_str = task.dataset(args.gen_subset).tgt.get_original_text(sample_id)
                else:
                    if src_dict is not None:
                        src_str = src_dict.string(src_tokens, args.remove_bpe)
                    else:
                        src_str = ""
                    if has_target:
                        target_str = tgt_dict.string(target_tokens, args.remove_bpe,
                                                     escape_unk=True)

                if not args.quiet:
                    if src_dict is not None:
                        print('S-{}\t{}'.format(sample_id, src_str))
                    if has_target:
                        print('T-{}\t{}'.format(sample_id, target_str))

                # Process top predictions (use j to avoid shadowing the sample index i)
                for j, hypo in enumerate(hypos[i][:min(len(hypos[i]), args.nbest)]):
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str=src_str,
                        alignment=hypo['alignment'].int().cpu()
                        if hypo['alignment'] is not None else None,
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )
                    resdict[str(int(sample_id) + 1)] = {
                        "sparql": interprete(hypo_str),
                        "en": src_str,
                    }

                    if not args.quiet:
                        print('H-{}\t{}\t{}'.format(sample_id, hypo['score'], hypo_str))
                        print('P-{}\t{}'.format(
                            sample_id,
                            ' '.join(map(
                                lambda x: '{:.4f}'.format(x),
                                hypo['positional_scores'].tolist(),
                            ))))
                        if args.print_alignment:
                            print('A-{}\t{}'.format(
                                sample_id,
                                ' '.join(map(lambda x: str(utils.item(x)), alignment))))

                    # Score only the top hypothesis
                    if has_target and j == 0:
                        if align_dict is not None or args.remove_bpe is not None:
                            # Convert back to tokens for evaluation with unk
                            # replacement and/or without BPE
                            target_tokens = tgt_dict.encode_line(
                                target_str, add_if_not_exist=True)
                        if hasattr(scorer, 'add_string'):
                            scorer.add_string(target_str, hypo_str)
                        else:
                            scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(num_generated_tokens)
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += sample['nsentences']

    print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
          .format(num_sentences, gen_timer.n, gen_timer.sum,
                  num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(
            args.gen_subset, args.beam, scorer.result_string()))

    with open(resfp, "w", encoding="UTF-8") as restore:
        json.dump(resdict, restore, ensure_ascii=False, indent=4)

    return scorer
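# For reference, the *.gen_sparql.json file written above maps 1-based sample
# ids to the interpreted SPARQL and the source sentence; one entry looks
# roughly like this (values are illustrative, not real output):
#
# {
#     "1": {
#         "sparql": "SELECT ?x WHERE { ... }",
#         "en": "source sentence for sample 0"
#     }
# }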
def forward(
    self,
    sample,
    forward_model,
    forward_optimizer,
    tgt_dict,
    backward_model,
    backward_optimizer,
    src_dict,
    lm_scorer=None,
    reduce=True,
    **generate_kwargs,
):
    """Compute the reconstruction and LM loss from forward and backward models.

    Args:
        sample: original input.
        forward_model: the model used to generate pseudo labels; they are used
            as an approximation of the target space to do importance sampling.
        backward_model: the model to reconstruct the original input using
            pseudo labels.
        lm_scorer: an LM model in eval mode to score pseudo labels in target space.
    """
    # Generate translations
    nbest_translations = self._generate_translation(
        forward_model, tgt_dict, sample, self.args.beam, **generate_kwargs)

    forward_samples = []
    backward_samples = {}
    # TODO (T36875783): load pretrained lm to score
    lm_score = 0.0
    for sample_id, src_processed, tgt_hypos in nbest_translations:
        # compute each model's reward
        forward_reward = lm_score
        # construct the sample; compute the ce loss
        # backward_samples need to handle EOS
        src = self._maybe_reverse_source(src_processed)
        src = self._maybe_add_eos(src, src_dict.eos())
        assert len(tgt_hypos) == self.args.beam
        for tgt_hypo_i, tgt_hypo_struct in enumerate(tgt_hypos):
            dual_sample_id = sample_id.item() * self.args.beam + tgt_hypo_i
            tgt_hypo = tgt_hypo_struct["tokens"]
            # add EOS to the target, i.e. original source, since it'll be used
            # as target; removing EOS in the src is optional
            if self.remove_eos_at_src:
                tgt_hypo = tgt_hypo[:-1]
            tgt_hypo_processed = self._maybe_reverse_source(tgt_hypo)
            backward_sample = {
                "id": dual_sample_id,
                "source": tgt_hypo_processed.cpu(),
                "target": src.cpu(),
                "weight": 1.0 - self.alpha,
            }
            assert dual_sample_id not in backward_samples
            backward_samples[dual_sample_id] = backward_sample

    bwd_model_input = utils.move_to_cuda(
        WeightedLanguagePairDataset.collate(
            samples=list(backward_samples.values()),
            pad_idx=src_dict.pad(),
            eos_idx=src_dict.eos(),
        ))
    reconstructed_source = self._generate_translation(
        backward_model, src_dict, bwd_model_input, 1, **generate_kwargs)
    for dual_sample_id, tgt_hypo_processed, src_hypos in reconstructed_source:
        backward_sample = backward_samples[dual_sample_id.item()]
        src = backward_sample["target"]
        tgt_hypo = self._maybe_reverse_source(tgt_hypo_processed)

        # use bleu score as reward
        scorer = bleu.Scorer(src_dict.pad(), src_dict.eos(), src_dict.unk())
        assert len(src_hypos) == 1
        src_hypo = src_hypos[0]["tokens"][:-1]
        scorer.add(src.int().cpu(), src_hypo.int().cpu())
        backward_reward = (
            scorer.score(order=self.args.reconstruction_bleu_order) / 100.0)

        original_stc = " ".join(src_dict[tid] for tid in src.tolist())
        translated_stc = " ".join(tgt_dict[tid] for tid in tgt_hypo)
        recon_stc = " ".join(src_dict[tid] for tid in src_hypo.tolist())
        if int(dual_sample_id / self.args.beam) % 100 == 0:
            print("--------")
            print("original sentence:",
                  original_stc.replace(self.args.source_bpe_end_marker, ""))
            print("translated sentence:",
                  translated_stc.replace(self.args.source_bpe_end_marker, ""))
            print("reconstructed sentence:",
                  recon_stc.replace(self.args.source_bpe_end_marker, ""))
            print("reward:", backward_reward)
            print("--------")

        total_reward = (self.alpha * forward_reward
                        + (1.0 - self.alpha) * backward_reward)
        src_processed = self._maybe_reverse_source(src)
        tgt_hypo = self._maybe_add_eos(tgt_hypo, tgt_dict.eos())
        forward_samples.append({
            "id": dual_sample_id,
            "source": src_processed.cpu(),
            "target": tgt_hypo.cpu(),  # first hypo is best hypo
            "weight": total_reward,
        })

    # Now combine pseudo-labelled examples into the corresponding batch, with
    # rewards factored into the weighting of each task's loss
    agg_loss, agg_sample_size, agg_logging_output = 0.0, 0.0, {}
    forward_model.train()
    forward_loss, sample_size, logging_output = self.task.criterion(
        forward_model,
        utils.move_to_cuda(
            WeightedLanguagePairDataset.collate(
                samples=forward_samples,
                pad_idx=tgt_dict.pad(),
                eos_idx=tgt_dict.eos(),
            )),
    )
    agg_loss += forward_loss.detach().item()
    agg_sample_size += sample_size
    agg_logging_output["primal"] = logging_output
    # grad would be further scaled when passed back to trainer,
    # which will do the update
    forward_optimizer.backward(forward_loss)

    backward_model.train()
    backward_loss, sample_size, logging_output = self.task.criterion(
        backward_model, bwd_model_input)
    agg_loss += backward_loss.data.item()
    agg_sample_size += sample_size
    agg_logging_output["dual"] = logging_output
    backward_optimizer.backward(backward_loss)

    return agg_loss, agg_sample_size, agg_logging_output
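# The reward used above is a convex combination of the (currently stubbed)
# forward LM reward and the backward reconstruction-BLEU reward. A standalone
# sketch of just that weighting:
def combine_rewards(alpha, forward_reward, backward_reward):
    # alpha trades off fluency of the pseudo label (forward) against how well
    # the original source is reconstructed from it (backward)
    return alpha * forward_reward + (1.0 - alpha) * backward_reward

assert combine_rewards(0.0, 0.5, 0.8) == 0.8  # alpha=0: reconstruction only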
def _generate_score(models, args, task, dataset, optimize=True):
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load ensemble
    if not args.quiet:
        print("| loading model(s) from {}".format(", ".join(args.path.split(":"))))

    # Optimize ensemble for generation
    if optimize:
        for model in models:
            model.make_generation_fast_(
                beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
                need_attn=True,
            )
    translator = build_sequence_generator(args, task, models)

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Keep track of translations: initialize with empty translations
    # and zero-probability scores
    translated_sentences = [""] * len(dataset)
    translated_scores = [0.0] * len(dataset)

    collect_output_hypos = getattr(args, "output_hypos_binary_path", False)
    if collect_output_hypos:
        output_hypos_token_arrays = [None] * len(dataset)

    # Generate and compute BLEU score
    dst_dict = task.target_dictionary
    scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(), dst_dict.unk())
    itr = get_eval_itr(args, models, task, dataset)

    oracle_scorer = None
    if args.report_oracle_bleu:
        oracle_scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(), dst_dict.unk())

    num_sentences = 0
    translation_samples = []
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=use_cuda,
            timer=gen_timer,
            prefix_size=1 if pytorch_translate_data.is_multilingual(args) else 0,
        )
        for trans_info in _iter_translations(args, task, dataset, translations,
                                             align_dict):
            scorer.add(trans_info.target_tokens, trans_info.hypo_tokens)
            if oracle_scorer is not None:
                oracle_scorer.add(trans_info.target_tokens, trans_info.best_hypo_tokens)
            translated_sentences[trans_info.sample_id] = trans_info.hypo_str
            translated_scores[trans_info.sample_id] = trans_info.hypo_score
            if collect_output_hypos:
                output_hypos_token_arrays[trans_info.sample_id] = trans_info.best_hypo_tokens
            translation_samples.append(
                collections.OrderedDict({
                    "sample_id": trans_info.sample_id.item(),
                    "src_str": trans_info.src_str,
                    "target_str": trans_info.target_str,
                    "hypo_str": trans_info.hypo_str,
                }))
            wps_meter.update(trans_info.src_tokens.size(0))
            t.log({"wps": round(wps_meter.avg)})
            num_sentences += 1

    # If applicable, save collected hypothesis tokens to binary output file
    if collect_output_hypos:
        output_dataset = pytorch_translate_data.InMemoryNumpyDataset()
        output_dataset.load_from_sequences(output_hypos_token_arrays)
        output_dataset.save(args.output_hypos_binary_path)

    # If applicable, save the translations to the output file,
    # e.g. for external evaluation
    if getattr(args, "translation_output_file", False):
        with open(args.translation_output_file, "w") as out_file:
            for hypo_str in translated_sentences:
                print(hypo_str, file=out_file)
    if getattr(args, "translation_probs_file", False):
        with open(args.translation_probs_file, "w") as out_file:
            for hypo_score in translated_scores:
                print(np.exp(hypo_score), file=out_file)

    if oracle_scorer is not None:
        print(f"| Oracle BLEU (best hypo in beam): {oracle_scorer.result_string()}")

    return scorer, num_sentences, gen_timer, translation_samples
def random_search(scores_info_export_path, num_trials, report_oracle_bleu=False):
    with open(scores_info_export_path, "rb") as f:
        scores_info = pickle.load(f)

    dummy_task = DummyTask()

    if report_oracle_bleu:
        oracle_scorer = bleu.Scorer(
            vocab_constants.PAD_ID, vocab_constants.EOS_ID, vocab_constants.UNK_ID)
        for example in scores_info:
            smoothed_bleu = []
            for hypo in example["hypos"]:
                eval_score = smoothed_sentence_bleu(
                    dummy_task,
                    torch.IntTensor(example["target_tokens"]),
                    torch.IntTensor(hypo),
                )
                smoothed_bleu.append(eval_score)
            best_hypo_ind = np.argmax(smoothed_bleu)
            example["best_hypo_ind"] = best_hypo_ind
            oracle_scorer.add(
                torch.IntTensor(example["target_tokens"]),
                torch.IntTensor(example["hypos"][best_hypo_ind]),
            )
        print("oracle BLEU: ", oracle_scorer.score())

    num_features = scores_info[0]["scores"].shape[1]
    assert all(
        example["scores"].shape[1] == num_features for example in scores_info
    ), "All examples must have the same number of scores!"

    feature_weights = np.zeros(num_features)
    feature_weights[0] = 1
    score = evaluate_weights(scores_info, feature_weights, length_penalty=1)
    print("base BLEU: ", score)

    best_score = score
    best_weights = feature_weights
    best_length_penalty = 0
    nonzero_features = identify_nonzero_features(scores_info)
    for i in range(num_trials):
        feature_weights = np.zeros(num_features)
        random_weights = np.random.dirichlet(np.ones(nonzero_features.size))
        feature_weights[nonzero_features] = random_weights
        length_penalty = 1.5 * np.random.random()
        score = evaluate_weights(scores_info, feature_weights, length_penalty)
        if score > best_score:
            best_score = score
            best_weights = feature_weights
            best_length_penalty = length_penalty
        print(f"\r[{i}] best: {best_score}", end="", flush=True)

    print()
    print("best weights: ", best_weights)
    print("best length penalty: ", best_length_penalty)

    return best_weights, best_length_penalty, best_score
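# Example invocation of the search above; the pickle path is hypothetical and
# must point at a scores-info file exported beforehand:
weights, length_penalty, best_bleu = random_search(
    "scores_info.pkl", num_trials=1000, report_oracle_bleu=True)
print(weights, length_penalty, best_bleu)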
def score_target_hypo(args, a, b, c, lenpen, target_outfile, hypo_outfile,
                      write_hypos, normalize):
    print("lenpen", lenpen, "weight1", a, "weight2", b, "weight3", c)
    gen_output_lst, bitext1_lst, bitext2_lst, lm_res_lst = load_score_files(args)
    word_dict = dictionary.Dictionary()
    scorer = bleu.Scorer(word_dict.pad(), word_dict.eos(), word_dict.unk())

    ordered_hypos = {}
    ordered_targets = {}

    for shard_id in range(len(bitext1_lst)):
        bitext1 = bitext1_lst[shard_id]
        bitext2 = bitext2_lst[shard_id]
        gen_output = gen_output_lst[shard_id]
        lm_res = lm_res_lst[shard_id]

        total = len(bitext1.rescore_source.keys())
        source_lst = []
        hypo_lst = []
        score_lst = []
        reference_lst = []
        j = 1
        best_score = -math.inf

        for i in range(total):
            # length is measured in terms of words, not bpe tokens, since
            # models may not share the same bpe
            target_len = len(bitext1.rescore_hypo[i].split())

            if lm_res is not None:
                lm_score = lm_res.score[i]
            else:
                lm_score = 0

            if bitext2 is not None:
                bitext2_score = bitext2.rescore_score[i]
                bitext2_backwards = bitext2.backwards
            else:
                bitext2_score = None
                bitext2_backwards = None

            score = rerank_utils.get_score(
                a, b, c, target_len,
                bitext1.rescore_score[i],
                bitext2_score,
                lm_score=lm_score,
                lenpen=lenpen,
                src_len=bitext1.source_lengths[i],
                tgt_len=bitext1.target_lengths[i],
                bitext1_backwards=bitext1.backwards,
                bitext2_backwards=bitext2_backwards,
                normalize=normalize,
            )

            if score > best_score:
                best_score = score
                best_hypo = bitext1.rescore_hypo[i]

            if j == gen_output.num_hypos[i] or j == args.num_rescore:
                j = 1
                hypo_lst.append(best_hypo)
                score_lst.append(best_score)
                source_lst.append(bitext1.rescore_source[i])
                reference_lst.append(bitext1.rescore_target[i])
                best_score = -math.inf
                best_hypo = ""
            else:
                j += 1

        gen_keys = list(sorted(gen_output.no_bpe_target.keys()))

        for key in range(len(gen_keys)):
            if args.prefix_len is None:
                assert hypo_lst[key] in gen_output.no_bpe_hypo[gen_keys[key]], (
                    "pred and rescore hypo mismatch: i: " + str(key) + ", "
                    + str(hypo_lst[key]) + str(gen_keys[key])
                    + str(gen_output.no_bpe_hypo[key]))
                sys_tok = word_dict.encode_line(hypo_lst[key])
                ref_tok = word_dict.encode_line(gen_output.no_bpe_target[gen_keys[key]])
                scorer.add(ref_tok, sys_tok)
            else:
                full_hypo = rerank_utils.get_full_from_prefix(
                    hypo_lst[key], gen_output.no_bpe_hypo[gen_keys[key]])
                sys_tok = word_dict.encode_line(full_hypo)
                ref_tok = word_dict.encode_line(gen_output.no_bpe_target[gen_keys[key]])
                scorer.add(ref_tok, sys_tok)

        # if only one set of hyperparameters is provided, write the
        # predictions to a file
        if write_hypos:
            # recover the original ids from n-best list generation
            for key in range(len(gen_output.no_bpe_target)):
                if args.prefix_len is None:
                    assert hypo_lst[key] in gen_output.no_bpe_hypo[gen_keys[key]], (
                        "pred and rescore hypo mismatch:" + "i:" + str(key)
                        + str(hypo_lst[key]) + str(gen_output.no_bpe_hypo[key]))
                    ordered_hypos[gen_keys[key]] = hypo_lst[key]
                    ordered_targets[gen_keys[key]] = gen_output.no_bpe_target[gen_keys[key]]
                else:
                    full_hypo = rerank_utils.get_full_from_prefix(
                        hypo_lst[key], gen_output.no_bpe_hypo[gen_keys[key]])
                    ordered_hypos[gen_keys[key]] = full_hypo
                    ordered_targets[gen_keys[key]] = gen_output.no_bpe_target[gen_keys[key]]

    # write the hypos in the original order from n-best list generation
    if args.num_shards == (len(bitext1_lst)):
        with open(target_outfile, 'w') as t:
            with open(hypo_outfile, 'w') as h:
                for key in range(len(ordered_hypos)):
                    t.write(ordered_targets[key])
                    h.write(ordered_hypos[key])

    res = scorer.result_string(4)
    if write_hypos:
        print(res)
    score = rerank_utils.parse_bleu_scoring(res)
    return score
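# rerank_utils.get_score is external to this file; as a rough sketch (an
# assumption, not the actual implementation), it computes something like a
# length-normalized linear combination of the two rescoring model scores and
# the LM score:
def linear_rescore(a, b, c, s1, s2, lm_score, target_len, lenpen):
    s2 = s2 if s2 is not None else 0.0
    return (a * s1 + b * s2 + c * lm_score) / (target_len ** lenpen)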
def main(args):
    utils.import_user_module(args)

    if args.buffer_size < 1:
        args.buffer_size = 1
    if args.max_tokens is None and args.max_sentences is None:
        args.max_sentences = 1

    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not args.max_sentences or args.max_sentences <= args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu
    use_ctc_loss = True if args.criterion == 'ctc_loss' else False

    # Setup task, e.g., image captioning
    task = tasks.setup_task(args)

    # Load dataset split
    task.load_dataset(args.gen_subset, combine=True, epoch=0)

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    model_paths = args.path.split(':')
    models, _model_args = checkpoint_utils.load_model_ensemble(
        model_paths,
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    # Set dictionaries
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())

    stats = collections.OrderedDict()
    num_sentences = 0
    num_correct = 0
    has_target = True

    with progress_bar.build_progress_bar(
        args, itr,
        prefix='inference on \'{}\' subset'.format(args.gen_subset),
        no_progress_bar='simple',
    ) as progress:
        wps_meter = TimeMeter()
        for sample in progress:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            gen_timer.start()
            hypos = task.inference_step(generator, models, sample)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            for i, sample_id in enumerate(sample['id'].tolist()):
                has_target = sample['target'] is not None
                target_tokens = None
                if has_target:
                    if use_ctc_loss:
                        target_tokens = sample['target'][i]
                        target_str = tgt_dict.string(target_tokens, args.remove_bpe,
                                                     escape_unk=True)
                    else:
                        # Remove padding
                        target_tokens = utils.strip_pad(
                            sample['target'][i, :], tgt_dict.pad()).int().cpu()
                        # Regenerate original sentences from tokens.
                        target_str = tgt_dict.string(target_tokens, args.remove_bpe,
                                                     escape_unk=True)

                if not args.quiet:
                    if has_target:
                        print('\nT-{}\t{}'.format(sample_id, target_str))

                # Process top predictions
                hypo = hypos[i][0]
                hypo_tokens = hypo['tokens'] if use_ctc_loss else hypo['tokens'].int().cpu()
                hypo_str = tgt_dict.string(hypo_tokens, args.remove_bpe, escape_unk=True)
                alignment = hypo['alignment'].int().cpu() \
                    if hypo['alignment'] is not None else None

                if hypo_str == target_str:
                    num_correct += 1

                if not args.quiet:
                    print('H-{}\t{}\t{}'.format(sample_id, hypo_str, hypo['score']))
                    print('P-{}\t{}'.format(
                        sample_id,
                        ' '.join(map(
                            lambda x: '{:.4f}'.format(x),
                            hypo['positional_scores'].tolist(),
                        )) if not use_ctc_loss else None))
                    if args.print_alignment:
                        print('A-{}\t{}'.format(
                            sample_id,
                            ' '.join(map(lambda x: str(utils.item(x)), alignment))))

                # Score only the top hypothesis
                if has_target:
                    if hasattr(scorer, 'add_string'):
                        scorer.add_string(target_str, hypo_str)
                    else:
                        scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(num_generated_tokens)
            num_sentences += sample['nsentences']
            stats['wps'] = round(wps_meter.avg)
            stats['acc'] = num_correct / num_sentences
            progress.log(stats, tag='accuracy')

        progress.print(stats, tag='accuracy')

    print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
          .format(num_sentences, gen_timer.n, gen_timer.sum,
                  num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(
            args.gen_subset, args.beam, scorer.result_string()))

    return scorer
def main(args):
    utils.import_user_module(args)

    if args.buffer_size < 1:
        args.buffer_size = 1
    if args.max_tokens is None and args.max_sentences is None:
        args.max_sentences = 1

    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not args.max_sentences or args.max_sentences <= args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    logger.info(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Setup task, e.g., translation
    task = tasks.setup_task(args)

    # Load ensemble
    logger.info('loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(os.pathsep),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Handle tokenization and BPE
    tokenizer = encoders.build_tokenizer(args)
    bpe = encoders.build_bpe(args)

    def encode_fn(x):
        if tokenizer is not None:
            x = tokenizer.encode(x)
        if bpe is not None:
            x = bpe.encode(x)
        return x

    def decode_fn(x):
        if bpe is not None:
            x = bpe.decode(x)
        if tokenizer is not None:
            x = tokenizer.decode(x)
        return x

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    max_positions = utils.resolve_max_positions(
        task.max_positions(), *[model.max_positions() for model in models])

    num_sentences = 0
    if args.buffer_size > 1:
        logger.info('Sentence buffer size: %s', args.buffer_size)
    logger.info('NOTE: hypothesis and token scores are output in base 2')
    logger.info('Type the input sentence and press return:')
    start_id = 0
    for inputs in buffered_read(args.input, args.buffer_size):
        results = []
        for batch in make_batches(inputs, args, task, max_positions, encode_fn):
            src_tokens = batch.src_tokens
            src_lengths = batch.src_lengths
            tgt_tokens = batch.tgt_tokens
            num_sentences += src_tokens[0].size(0)
            if use_cuda:
                if isinstance(src_tokens, list):
                    src_tokens = [tokens.cuda() for tokens in src_tokens]
                    src_lengths = [lengths.cuda() for lengths in src_lengths]
                else:
                    src_tokens = src_tokens.cuda()
                    src_lengths = src_lengths.cuda()

            sample = {
                'net_input': {
                    'src_tokens': src_tokens,
                    'src_lengths': src_lengths,
                },
                'target': tgt_tokens,
            }

            gen_timer.start()
            translations = task.inference_step(generator, models, sample)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in translations)
            gen_timer.stop(num_generated_tokens)

            for i, (id, hypos) in enumerate(zip(batch.ids.tolist(), translations)):
                src_tokens_i = utils.strip_pad(src_tokens[i], tgt_dict.pad())
                tgt_tokens_i = None
                if tgt_tokens is not None:
                    tgt_tokens_i = utils.strip_pad(
                        tgt_tokens[i, :], tgt_dict.pad()).int().cpu()
                results.append((start_id + id, src_tokens_i, hypos, tgt_tokens_i))

        # sort output to match input order
        for id, src_tokens, hypos, tgt_tokens in sorted(results, key=lambda x: x[0]):
            if src_dict is not None:
                src_str = src_dict.string(src_tokens, args.remove_bpe)
                print('S-{}\t{}'.format(id, src_str))
            if tgt_tokens is not None:
                tgt_str = tgt_dict.string(tgt_tokens, args.remove_bpe, escape_unk=True)
                print('T-{}\t{}'.format(id, tgt_str))

            # Process top predictions
            for j, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'],
                    align_dict=align_dict,
                    tgt_dict=tgt_dict,
                    remove_bpe=args.remove_bpe,
                )
                hypo_str = decode_fn(hypo_str)
                score = hypo['score'] / math.log(2)  # convert to base 2
                print('H-{}\t{}\t{}'.format(id, score, hypo_str))
                print('P-{}\t{}'.format(
                    id,
                    ' '.join(map(
                        lambda x: '{:.4f}'.format(x),
                        # convert from base e to base 2
                        hypo['positional_scores'].div_(math.log(2)).tolist(),
                    ))))
                if args.print_alignment:
                    alignment_str = " ".join(
                        ["{}-{}".format(src, tgt) for src, tgt in alignment])
                    print('A-{}\t{}'.format(id, alignment_str))
                if args.print_step:
                    print('I-{}\t{}'.format(id, hypo['steps']))
                    print('O-{}\t{}'.format(id, hypo['num_ops']))
                if getattr(args, 'retain_iter_history', False):
                    for step, h in enumerate(hypo['history']):
                        _, h_str, _ = utils.post_process_prediction(
                            hypo_tokens=h['tokens'].int().cpu(),
                            src_str=src_str,
                            alignment=None,
                            align_dict=None,
                            tgt_dict=tgt_dict,
                            remove_bpe=None,
                        )
                        print('E-{}_{}\t{}'.format(id, step, h_str))

                # Score only the top hypothesis
                if tgt_tokens is not None and j == 0:
                    if align_dict is not None or args.remove_bpe is not None:
                        # Convert back to tokens for evaluation with unk
                        # replacement and/or without BPE
                        tgt_tokens = tgt_dict.encode_line(
                            tgt_str, add_if_not_exist=True)
                    if hasattr(scorer, 'add_string'):
                        scorer.add_string(tgt_str, hypo_str)
                    else:
                        scorer.add(tgt_tokens, hypo_tokens)

        # update running id counter
        start_id += len(inputs)

    logger.info(
        'Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if args.has_target:
        logger.info('Generate with beam={}: {}'.format(
            args.beam, scorer.result_string()))
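# The encode_fn/decode_fn pair above should be approximately inverse: raw text
# -> tokenizer -> BPE on the way in, BPE -> tokenizer on the way out. A
# self-contained sketch of the same wiring with optional stand-ins (the real
# tokenizer/bpe come from encoders.build_tokenizer/build_bpe):
def make_codecs(tokenizer=None, bpe=None):
    def encode_fn(x):
        if tokenizer is not None:
            x = tokenizer.encode(x)
        if bpe is not None:
            x = bpe.encode(x)
        return x

    def decode_fn(x):
        if bpe is not None:
            x = bpe.decode(x)
        if tokenizer is not None:
            x = tokenizer.decode(x)
        return x

    return encode_fn, decode_fn

encode_fn, decode_fn = make_codecs()  # no tokenizer/bpe configured: identity
assert decode_fn(encode_fn("Hello world.")) == "Hello world."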
def main(args):
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'

    import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset, args=args)
    print('| {} {} {} examples'.format(args.data, args.gen_subset,
                                       len(task.dataset(args.gen_subset))))

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    args.unk_idx = task.src_dict.indices['<unk>']
    args.dict_len = len(task.src_dict.indices)
    if '[APPEND]' in task.src_dict.indices.keys():
        args.APPEND_ID = task.src_dict.indices['[APPEND]']
        print("[APPEND] ID: {}".format(args.APPEND_ID))
    else:
        args.APPEND_ID = -1
    if '[SRC]' in task.src_dict.indices.keys():
        args.SRC_ID = task.src_dict.indices['[SRC]']
        print("[SRC] ID: {}".format(args.SRC_ID))
    else:
        args.SRC_ID = -1
    if '[TGT]' in task.src_dict.indices.keys():
        args.TGT_ID = task.src_dict.indices['[TGT]']
        print("[TGT] ID: {}".format(args.TGT_ID))
    else:
        args.TGT_ID = -1
    if '[SEP]' in task.src_dict.indices.keys():
        args.SEP_ID = task.src_dict.indices['[SEP]']
        print("[SEP] ID: {}".format(args.SEP_ID))
    else:
        args.SEP_ID = -1
    if '</s>' in task.src_dict.indices.keys():
        args.EOS_ID = task.src_dict.indices['</s>']
    else:
        args.EOS_ID = -1
    if '<pad>' in task.src_dict.indices.keys():
        args.PAD_ID = task.src_dict.indices['<pad>']
    else:
        args.PAD_ID = -1

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    models, _model_args = utils.load_ensemble_for_inference(
        args.path.split(':'),
        task,
        model_arg_overrides=eval(args.model_overrides),
    )
    _model_args.avgpen = args.avgpen
    task.datasets['test'].args = _model_args

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(), *[model.max_positions() for model in models]
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())

    num_sentences = 0
    has_target = True
    select_retrieve_tokens = []

    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        trans_results = []
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample['target'][:, :args.prefix_size]

            gen_timer.start()
            hypos, encoder_outs = task.inference_step(
                generator, models, sample, prefix_tokens)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            for i, sample_id in enumerate(sample['id'].tolist()):
                has_target = sample['target'] is not None

                # Remove padding
                src_tokens, retrieve_source_tokens, retrieve_target_tokens = \
                    sample['net_input']['src_tokens']
                retrieve_tokens = list(itertools.chain.from_iterable(
                    zip(retrieve_source_tokens, retrieve_target_tokens)))
                retrieve_tokens = torch.cat(retrieve_tokens, dim=1)
                all_tokens = torch.cat([src_tokens, retrieve_tokens], dim=1)
                src_tokens = utils.strip_pad(all_tokens[i, :], tgt_dict.pad())
                target_tokens = None
                if has_target:
                    target_tokens = utils.strip_pad(
                        sample['target'][i, :], tgt_dict.pad()).int().cpu()

                # Either retrieve the original sentences or regenerate them from tokens.
                if align_dict is not None:
                    src_str = task.dataset(args.gen_subset).src.get_original_text(sample_id)
                    target_str = task.dataset(args.gen_subset).tgt.get_original_text(sample_id)
                else:
                    if src_dict is not None:
                        src_str = src_dict.string(src_tokens, args.remove_bpe)
                    else:
                        src_str = ""
                    if has_target:
                        target_str = tgt_dict.string(target_tokens, args.remove_bpe,
                                                     escape_unk=True)

                if not args.quiet:
                    if src_dict is not None:
                        print('S-{}\t{}'.format(sample_id, src_str))
                    if has_target:
                        print('T-{}\t{}'.format(sample_id, target_str))

                # add select tokens
                select_retrieve_tokens.append([
                    sample_id, src_str, target_str,
                    sample['predict_ground_truth'][i, :],
                    retrieve_tokens[i, :],
                    encoder_outs[0]['new_retrieve_tokens'][i, :],
                    utils.strip_pad(retrieve_tokens[i, :], src_dict.pad()).tolist(),
                    utils.strip_pad(encoder_outs[0]['new_retrieve_tokens'][i, :],
                                    src_dict.pad()).tolist(),
                ])

                # Process top predictions (use j to avoid shadowing the sample index i)
                for j, hypo in enumerate(hypos[i][:min(len(hypos[i]), args.nbest)]):
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str=src_str,
                        alignment=hypo['alignment'].int().cpu()
                        if hypo['alignment'] is not None else None,
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )
                    trans_results.append((sample_id, hypo_str))
                    if not args.quiet:
                        print('H-{}\t{}\t{}'.format(sample_id, hypo['score'], hypo_str))
                        print('P-{}\t{}'.format(
                            sample_id,
                            ' '.join(map(
                                lambda x: '{:.4f}'.format(x),
                                hypo['positional_scores'].tolist(),
                            ))))
                        if args.print_alignment:
                            print('A-{}\t{}'.format(
                                sample_id,
                                ' '.join(map(lambda x: str(utils.item(x)), alignment))))

                    # Score only the top hypothesis
                    if has_target and j == 0:
                        if align_dict is not None or args.remove_bpe is not None:
                            # Convert back to tokens for evaluation with unk
                            # replacement and/or without BPE
                            target_tokens = tgt_dict.encode_line(
                                target_str, add_if_not_exist=True)
                        if hasattr(scorer, 'add_string'):
                            scorer.add_string(target_str, hypo_str)
                        else:
                            scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(num_generated_tokens)
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += sample['nsentences']

    print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
          .format(num_sentences, gen_timer.n, gen_timer.sum,
                  num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(
            args.gen_subset, args.beam, scorer.result_string()))

    trans_results.sort(key=lambda key: key[0])
    print("saving translation result to {}...".format(args.output))
    with open(args.output, "w", encoding="utf-8") as w:
        for item in trans_results:
            w.write("{}\n".format(item[1].replace("<<unk>>", "")))

    select_retrieve_tokens.sort(key=lambda key: key[0])
    orig_retrieve_tokens_length = 0
    select_retrieve_tokens_length = 0
    correct_tokens = 0
    with open(args.output + ".select", "w", encoding="utf-8") as w_select:
        for item in select_retrieve_tokens:
            (sample_id, src_str, target_str, sample_predict_ground_truth,
             sample_orig_id, sample_select_retrieve_id,
             sample_orig_retrieve_tokens, sample_select_retrieve_tokens) = item
            retrieve_str = src_dict.string(sample_orig_retrieve_tokens, args.remove_bpe)
            select_str = src_dict.string(sample_select_retrieve_tokens, args.remove_bpe)
            w_select.write("{}\n{}\n{}\n{}\n\n".format(
                src_str, target_str, retrieve_str, select_str))
            orig_retrieve_tokens_length += len(sample_orig_retrieve_tokens)
            select_retrieve_tokens_length += len(sample_select_retrieve_tokens)
            # calculate accuracy
            correct_tokens += (
                (sample_select_retrieve_id != _model_args.PAD_ID).long()
                == sample_predict_ground_truth
            ).masked_fill((sample_orig_id == _model_args.PAD_ID).byte(), 0).sum()

    ratio = select_retrieve_tokens_length / float(orig_retrieve_tokens_length)
    accuracy = correct_tokens.tolist() / float(orig_retrieve_tokens_length)
    print("Selective Tokens: {}".format(ratio))
    print("Correct Tokens: {}".format(accuracy))

    with open("{}.RetrieveNMT.BLEU".format(args.output), "a", encoding="utf-8") as w:
        w.write('{}->{}: Generate {} with beam={} and lenpen={}: {};\tSelection Ratio: {};\tAccuracy:{}\n'
                .format(args.source_lang, args.target_lang, args.gen_subset,
                        args.beam, args.lenpen, scorer.result_string(),
                        ratio, accuracy))
    return scorer
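# The masked accuracy computation above is dense; a small worked example of
# the same comparison with PAD_ID = 0: a position counts as correct when the
# keep/drop decision (non-pad after selection) matches the ground truth, and
# positions that were padding in the original retrieval are excluded.
import torch

PAD_ID = 0
orig = torch.tensor([5, 7, 9, PAD_ID])           # original retrieved tokens
selected = torch.tensor([5, PAD_ID, 9, PAD_ID])  # tokens kept by the selector
ground_truth = torch.tensor([1, 0, 1, 1])        # 1 = token should be kept
correct = ((selected != PAD_ID).long() == ground_truth).masked_fill(
    orig == PAD_ID, 0).sum()
print(correct.item())  # 3: the padded last position is ignored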
def __init__(self, args, src_dict, dst_dict):
    super().__init__(args, src_dict, dst_dict)
    self.translator = None
    self.scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(), dst_dict.unk())
def main():
    parser = options.get_parser('Generation')
    parser.add_argument('--path', metavar='FILE', required=True, action='append',
                        help='path(s) to model file(s)')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('--batch-size', default=32, type=int, metavar='N',
                              help='batch size')
    dataset_args.add_argument('--gen-subset', default='test', metavar='SPLIT',
                              help='data subset to generate (train, valid, test)')
    options.add_generation_args(parser)

    args = parser.parse_args()
    if args.no_progress_bar and args.log_format is None:
        args.log_format = 'none'
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset
    if args.replace_unk is None:
        dataset = data.load_dataset(args.data, [args.gen_subset],
                                    args.source_lang, args.target_lang)
    else:
        dataset = data.load_raw_text_dataset(args.data, [args.gen_subset],
                                             args.source_lang, args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    # Load ensemble
    print('| loading model(s) from {}'.format(', '.join(args.path)))
    models, _ = utils.load_ensemble_for_inference(args.path, dataset.src_dict,
                                                  dataset.dst_dict)

    print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
    print('| {} {} {} examples'.format(args.data, args.gen_subset,
                                       len(dataset.splits[args.gen_subset])))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam)

    # Initialize generator
    translator = SequenceGenerator(
        models,
        beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen,
        unk_penalty=args.unkpen,
    )
    if use_cuda:
        translator.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Generate and compute BLEU score
    scorer = bleu.Scorer(dataset.dst_dict.pad(), dataset.dst_dict.eos(),
                         dataset.dst_dict.unk())
    max_positions = min(model.max_encoder_positions() for model in models)
    itr = dataset.eval_dataloader(
        args.gen_subset,
        max_sentences=args.batch_size,
        max_positions=max_positions,
        skip_invalid_size_inputs_valid_test=args.skip_invalid_size_inputs_valid_test,
    )
    num_sentences = 0
    with utils.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda_device=0 if use_cuda else None,
            timer=gen_timer,
        )
        for sample_id, src_tokens, target_tokens, hypos in translations:
            # Process input and ground truth
            target_tokens = target_tokens.int().cpu()
            # Either retrieve the original sentences or regenerate them from tokens.
            if align_dict is not None:
                src_str = dataset.splits[args.gen_subset].src.get_original_text(sample_id)
                target_str = dataset.splits[args.gen_subset].dst.get_original_text(sample_id)
            else:
                src_str = dataset.src_dict.string(src_tokens, args.remove_bpe)
                target_str = dataset.dst_dict.string(target_tokens, args.remove_bpe,
                                                     escape_unk=True)

            if not args.quiet:
                print('S-{}\t{}'.format(sample_id, src_str))
                print('T-{}\t{}'.format(sample_id, target_str))

            # Process top predictions
            for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'].int().cpu(),
                    align_dict=align_dict,
                    dst_dict=dataset.dst_dict,
                    remove_bpe=args.remove_bpe,
                )

                if not args.quiet:
                    print('H-{}\t{}\t{}'.format(sample_id, hypo['score'], hypo_str))
                    print('A-{}\t{}'.format(sample_id, ' '.join(map(str, alignment))))

                # Score only the top hypothesis
                if i == 0:
                    if align_dict is not None or args.remove_bpe is not None:
                        # Convert back to tokens for evaluation with unk
                        # replacement and/or without BPE
                        target_tokens = tokenizer.Tokenizer.tokenize(
                            target_str, dataset.dst_dict, add_if_not_exist=True)
                    scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(src_tokens.size(0))
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += 1

    print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} tokens/s)'.format(
        num_sentences, gen_timer.n, gen_timer.sum, 1. / gen_timer.avg))
    print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam,
                                                  scorer.result_string()))
def score(args, trainer, task, epoch_itr, subset):
    begin = time.time()

    if subset not in task.datasets.keys():
        task.load_dataset(subset)

    # deepcopy is necessary here: generation of translations alters the target
    # dictionary, messing up the rest of training
    src_dict = deepcopy(task.source_dictionary)
    tgt_dict = deepcopy(task.target_dictionary)
    model = trainer.get_model()

    # Initialize data iterator
    itr = data.EpochBatchIterator(
        dataset=task.dataset(subset),
        max_tokens=None,
        max_sentences=max(8, min(math.ceil(1024 / args.distributed_world_size), 128)),
        max_positions=model.max_positions(),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    translator = SequenceGenerator(
        [model],
        tgt_dict,
        beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen,
        unk_penalty=args.unkpen,
        sampling=args.sampling,
        sampling_topk=args.sampling_topk,
        minlen=args.min_len,
    )

    # Generate and compute BLEU
    word_dict = dictionary.Dictionary()
    scorer = bleu.Scorer(word_dict.pad(), word_dict.eos(), word_dict.unk())
    num_sentences = 0
    has_target = True
    predictions = []

    with progress_bar.build_progress_bar(args, itr) as progress:
        translations = translator.generate_batched_itr(
            progress,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=True,
            timer=gen_timer,
            prefix_size=args.prefix_size,
        )
        wps_meter = TimeMeter()
        for sample_id, src_tokens, target_tokens, hypos in translations:
            # Process input and ground truth
            has_target = target_tokens is not None
            target_tokens = target_tokens.int().cpu() if has_target else None
            src_str = src_dict.string(src_tokens, args.remove_bpe)
            if has_target:
                target_str = tgt_dict.string(target_tokens, args.remove_bpe,
                                             escape_unk=True)

            # Process top predictions
            for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'].int().cpu()
                    if hypo['alignment'] is not None else None,
                    align_dict=None,
                    tgt_dict=tgt_dict,
                    remove_bpe=args.remove_bpe,
                )

                # Score only the top hypothesis
                if has_target and i == 0:
                    if args.sentencepiece:
                        hypo_str = hypo_str.replace(' ', '').replace('▁', ' ')
                        target_str = target_str.replace(' ', '').replace('▁', ' ')
                    sys_tok = tokenizer.Tokenizer.tokenize(
                        (hypo_str.lower() if args.ignore_case else hypo_str),
                        word_dict)
                    ref_tok = tokenizer.Tokenizer.tokenize(
                        (target_str.lower() if args.ignore_case else target_str),
                        word_dict)
                    scorer.add(ref_tok, sys_tok)
                    if not args.sentencepiece:
                        hypo_str = tokenizer.Tokenizer.detokenize(hypo_str, 'de')
                    predictions.append('{}\t{}'.format(sample_id, hypo_str))

            wps_meter.update(src_tokens.size(0))
            progress.log({'wps': round(wps_meter.avg)})
            num_sentences += 1

    if args.distributed_world_size > 1:
        _all_gather_bleu_scorer(scorer)
        predictions = _all_gather_predictions(predictions)

    with open(os.path.join(args.data, 'sacrebleu_reference.de'), 'r') as reference:
        refs = [reference.readlines()]

    # reducing indexed predictions as strings is more memory efficient than
    # reducing tuples
    predictions = [tuple(item.split('\t')) for item in predictions]
    predictions = [(int(item[0]), item[1]) for item in predictions]
    predictions.sort(key=lambda tup: tup[0])
    predictions = [
        hypo[1] + ('\n' if hypo[1][-1] != '\n' else '') for hypo in predictions
    ]
    sacrebleu_score = sacrebleu.corpus_bleu(predictions, refs,
                                            lowercase=args.ignore_case)
    print(f'|Detokenized {sacrebleu_score}')

    if gen_timer.sum != 0:
        print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
              .format(num_sentences, gen_timer.n, gen_timer.sum,
                      num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(subset, args.beam,
                                                      scorer.result_string()))
    print('| Eval completed in: {:.2f}s'.format(time.time() - begin))

    return scorer.score(order=4), sacrebleu_score.score
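# A minimal sketch of the sacrebleu corpus API used above: hypotheses are a
# list of detokenized strings; references are a list of reference streams,
# each stream parallel to the hypotheses.
import sacrebleu

hyps = ["the cat sat on the mat"]
refs = [["the cat sat on the mat"]]  # one reference stream
result = sacrebleu.corpus_bleu(hyps, refs)
print(result.score)  # 100.0 for an exact match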
def _main(args, output_file):
    logging.basicConfig(
        format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=logging.INFO,
        stream=output_file,
    )
    logger = logging.getLogger('fairseq_cli.generate')

    utils.import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    logger.info(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    logger.info('loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args.path),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=args.log_format,
        log_interval=args.log_interval,
        default_log_format=('tqdm' if not args.no_progress_bar else 'none'),
    )

    # Post-training quantization: round every listed weight matrix to an
    # n-bit grid before generation, so the reported BLEU reflects the
    # quantized model.
    def quantize(data, n, max_value=1):
        # adaptive maximum: scale so the largest |weight| maps to the top level
        scale = ((2 ** n - 1) / 2) / torch.max(torch.abs(data))
        # static alternative (predetermined maximum):
        # scale = ((2 ** n - 1) / 2) / max_value
        return torch.round(scale * data) / scale

    # (vestigial) enumerate weight parameters; `layer` is recomputed from the
    # layer-name file below, so this loop does not affect the quantization.
    for name, param in model.named_parameters():
        if param.requires_grad and ('weight' in name):
            layer = 'model.' + name

    # The layer names are read from a file of fully qualified parameter paths
    # (e.g. 'model.encoder.layers.0.fc1.weight'); layer_max_dict.pkl holds
    # per-layer maxima for the static scaling variant.
    fileName = 'model_iwslt14.tokenized.de-en.weights.layers'
    # fileName = 'model_wmt14.weights.layers'  # alternative layer list
    with open(fileName) as f:
        layersList = f.readlines()
    layersNamesList = [layerName.rstrip('\n') for layerName in layersList]

    layer_max_dict = pickle.load(open("layer_max_dict.pkl", "rb"))
    n = 8  # quantization bit-width
    for layer in layersNamesList:
        print('----------')
        print(layer)
        kernel = eval(layer)
        max_value = layer_max_dict[layer].item()
        kernel_q = quantize(kernel, n)  # adaptive (on the fly)
        # kernel_q = quantize(kernel, n, max_value)  # static
        exec(layer + ' = torch.nn.Parameter(kernel_q)')
        print(len(eval(layer).unique()))

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(models, args)

    # Handle tokenization and BPE
    tokenizer = encoders.build_tokenizer(args)
    bpe = encoders.build_bpe(args)

    def decode_fn(x):
        if bpe is not None:
            x = bpe.decode(x)
        if tokenizer is not None:
            x = tokenizer.decode(x)
        return x

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    num_sentences = 0
    has_target = True
    wps_meter = TimeMeter()
    for sample in progress:
        sample = utils.move_to_cuda(sample) if use_cuda else sample
        if 'net_input' not in sample:
            continue

        prefix_tokens = None
        if args.prefix_size > 0:
            prefix_tokens = sample['target'][:, :args.prefix_size]

        gen_timer.start()
        hypos = task.inference_step(generator, models, sample, prefix_tokens)
        num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
        gen_timer.stop(num_generated_tokens)

        for i, sample_id in enumerate(sample['id'].tolist()):
            has_target = sample['target'] is not None

            # Remove padding
            src_tokens = utils.strip_pad(
                sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
            target_tokens = None
            if has_target:
                target_tokens = utils.strip_pad(
                    sample['target'][i, :], tgt_dict.pad()).int().cpu()

            # Either retrieve the original sentences or regenerate them from tokens.
            if align_dict is not None:
                src_str = task.dataset(
                    args.gen_subset).src.get_original_text(sample_id)
                target_str = task.dataset(
                    args.gen_subset).tgt.get_original_text(sample_id)
            else:
                if src_dict is not None:
                    src_str = src_dict.string(src_tokens, args.remove_bpe)
                else:
                    src_str = ""
                if has_target:
                    target_str = tgt_dict.string(
                        target_tokens,
                        args.remove_bpe,
                        escape_unk=True,
                        extra_symbols_to_ignore={generator.eos},
                    )

            src_str = decode_fn(src_str)
            if has_target:
                target_str = decode_fn(target_str)

            if not args.quiet:
                if src_dict is not None:
                    print('S-{}\t{}'.format(sample_id, src_str),
                          file=output_file)
                if has_target:
                    print('T-{}\t{}'.format(sample_id, target_str),
                          file=output_file)

            # Process top predictions
            for j, hypo in enumerate(hypos[i][:args.nbest]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'],
                    align_dict=align_dict,
                    tgt_dict=tgt_dict,
                    remove_bpe=args.remove_bpe,
                    extra_symbols_to_ignore={generator.eos},
                )
                detok_hypo_str = decode_fn(hypo_str)
                if not args.quiet:
                    score = hypo['score'] / math.log(2)  # convert to base 2
                    # original hypothesis (after tokenization and BPE)
                    print('H-{}\t{}\t{}'.format(sample_id, score, hypo_str),
                          file=output_file)
                    # detokenized hypothesis
                    print('D-{}\t{}\t{}'.format(sample_id, score,
                                                detok_hypo_str),
                          file=output_file)
                    print('P-{}\t{}'.format(
                        sample_id,
                        ' '.join(map(
                            lambda x: '{:.4f}'.format(x),
                            # convert from base e to base 2
                            hypo['positional_scores'].div_(math.log(2)).tolist(),
                        ))), file=output_file)
                    if args.print_alignment:
                        print('A-{}\t{}'.format(
                            sample_id,
                            ' '.join(['{}-{}'.format(src_idx, tgt_idx)
                                      for src_idx, tgt_idx in alignment])),
                            file=output_file)
                    if args.print_step:
                        print('I-{}\t{}'.format(sample_id, hypo['steps']),
                              file=output_file)
                    if getattr(args, 'retain_iter_history', False):
                        for step, h in enumerate(hypo['history']):
                            _, h_str, _ = utils.post_process_prediction(
                                hypo_tokens=h['tokens'].int().cpu(),
                                src_str=src_str,
                                alignment=None,
                                align_dict=None,
                                tgt_dict=tgt_dict,
                                remove_bpe=None,
                            )
                            print('E-{}_{}\t{}'.format(sample_id, step, h_str),
                                  file=output_file)

                # Score only the top hypothesis
                if has_target and j == 0:
                    if align_dict is not None or args.remove_bpe is not None:
                        # Convert back to tokens for evaluation with unk
                        # replacement and/or without BPE
                        target_tokens = tgt_dict.encode_line(
                            target_str, add_if_not_exist=True)
                        hypo_tokens = tgt_dict.encode_line(
                            detok_hypo_str, add_if_not_exist=True)
                    if hasattr(scorer, 'add_string'):
                        scorer.add_string(target_str, detok_hypo_str)
                    else:
                        scorer.add(target_tokens, hypo_tokens)

        wps_meter.update(num_generated_tokens)
        progress.log({'wps': round(wps_meter.avg)})
        num_sentences += sample['nsentences']

    logger.info('NOTE: hypothesis and token scores are output in base 2')
    logger.info(
        'Translated {} sentences ({} tokens) in {:.1f}s '
        '({:.2f} sentences/s, {:.2f} tokens/s)'.format(
            num_sentences, gen_timer.n, gen_timer.sum,
            num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        if args.bpe and not args.sacrebleu:
            if args.remove_bpe:
                logger.warning(
                    "BLEU score is being computed by splitting detokenized "
                    "string on spaces, this is probably not what you want. "
                    "Use --sacrebleu for standard 13a BLEU tokenization")
            else:
                logger.warning(
                    "If you are using BPE on the target side, the BLEU score "
                    "is computed on BPE tokens, not on proper words. "
                    "Use --sacrebleu for standard 13a BLEU tokenization")
        logger.info('Generate {} with beam={}: {}'.format(
            args.gen_subset, args.beam, scorer.result_string()))
        # Append the BLEU result string to a log file for later comparison.
        with open("infer_BLEU.txt", "a") as myfile:
            myfile.write(scorer.result_string())
            myfile.write("\n")

    return scorer
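# A standalone demonstration of the n-bit quantizer defined inside _main
# above (re-declared here so the snippet runs on its own): quantize a random
# weight matrix and inspect the number of distinct levels and the rounding
# error, mirroring the per-layer prints in the quantization loop.
import torch

def quantize_demo(data, n):
    scale = ((2 ** n - 1) / 2) / torch.max(torch.abs(data))
    return torch.round(scale * data) / scale

w = torch.randn(512, 512)
w_q = quantize_demo(w, 8)
print(len(w_q.unique()))             # at most ~2**8 distinct values
print((w - w_q).abs().max().item())  # worst-case rounding error, <= 0.5/scale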
def _main(args, output_file): logging.basicConfig( format='%(asctime)s | %(levelname)s | %(name)s | %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO, stream=output_file, ) logger = logging.getLogger('fairseq_cli.generate') utils.import_user_module(args) if args.max_tokens is None and args.max_sentences is None: args.max_tokens = 12000 logger.info(args) use_cuda = torch.cuda.is_available() and not args.cpu # Load dataset splits task = tasks.setup_task(args) task.load_dataset(args.gen_subset) # Set dictionaries try: src_dict = getattr(task, 'source_dictionary', None) except NotImplementedError: src_dict = None tgt_dict = task.target_dictionary # Load ensemble logger.info('loading model(s) from {}'.format(args.path)) models, _model_args = checkpoint_utils.load_model_ensemble( utils.split_paths(args.path), arg_overrides=eval(args.model_overrides), task=task, ) # Optimize ensemble for generation for model in models: model.make_generation_fast_( beamable_mm_beam_size=None if args.no_beamable_mm else args.beam, need_attn=args.print_alignment, ) if args.fp16: model.half() if use_cuda: model.cuda() # Load alignment dictionary for unknown word replacement # (None if no unknown word replacement, empty if no path to align dictionary) align_dict = utils.load_align_dict(args.replace_unk) # Load dataset (possibly sharded) itr = task.get_batch_iterator( dataset=task.dataset(args.gen_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=utils.resolve_max_positions( task.max_positions(), *[model.max_positions() for model in models]), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=args.required_batch_size_multiple, num_shards=args.num_shards, shard_id=args.shard_id, num_workers=args.num_workers, ).next_epoch_itr(shuffle=False) progress = progress_bar.progress_bar( itr, log_format=args.log_format, log_interval=args.log_interval, default_log_format=('tqdm' if not args.no_progress_bar else 'none'), ) # Initialize generator gen_timer = StopwatchMeter() generator = task.build_generator(args) # Generate and compute BLEU score if args.sacrebleu: scorer = bleu.SacrebleuScorer() else: scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk()) num_sentences = 0 has_target = True wps_meter = TimeMeter() for sample in progress: sample = utils.move_to_cuda(sample) if use_cuda else sample if 'net_input' not in sample: continue prefix_tokens = None if args.prefix_size > 0: prefix_tokens = sample['target'][:, :args.prefix_size] gen_timer.start() hypos = task.inference_step(generator, models, sample, prefix_tokens) num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos) gen_timer.stop(num_generated_tokens) for i, sample_id in enumerate(sample['id'].tolist()): has_target = sample['target'] is not None # Remove padding src_tokens = utils.strip_pad( sample['net_input']['src_tokens'][i, :], tgt_dict.pad()) target_tokens = None if has_target: target_tokens = utils.strip_pad(sample['target'][i, :], tgt_dict.pad()).int().cpu() # Either retrieve the original sentences or regenerate them from tokens. 
if align_dict is not None: src_str = task.dataset( args.gen_subset).src.get_original_text(sample_id) target_str = task.dataset( args.gen_subset).tgt.get_original_text(sample_id) else: if src_dict is not None: src_str = src_dict.string(src_tokens, args.remove_bpe) else: src_str = "" if has_target: target_str = tgt_dict.string(target_tokens, args.remove_bpe, escape_unk=True) if not args.quiet: if src_dict is not None: print('S-{}\t{}'.format(sample_id, src_str), file=output_file) if has_target: print('T-{}\t{}'.format(sample_id, target_str), file=output_file) # Process top predictions for j, hypo in enumerate(hypos[i][:args.nbest]): hypo_tokens, hypo_str, alignment = utils.post_process_prediction( hypo_tokens=hypo['tokens'].int().cpu(), src_str=src_str, alignment=hypo['alignment'], align_dict=align_dict, tgt_dict=tgt_dict, remove_bpe=args.remove_bpe, ) if not args.quiet: score = hypo['score'] / math.log(2) # convert to base 2 print('H-{}\t{}\t{}'.format(sample_id, score, hypo_str), file=output_file) print( 'P-{}\t{}'.format( sample_id, ' '.join( map( lambda x: '{:.4f}'.format(x), # convert from base e to base 2 hypo['positional_scores'].div_(math.log(2) ).tolist(), ))), file=output_file) if args.print_alignment: print('A-{}\t{}'.format( sample_id, ' '.join([ '{}-{}'.format(src_idx, tgt_idx) for src_idx, tgt_idx in alignment ])), file=output_file) if args.print_step: print('I-{}\t{}'.format(sample_id, hypo['steps']), file=output_file) print('O-{}\t{}'.format(sample_id, hypo['num_ops']), file=output_file) if getattr(args, 'retain_iter_history', False): for step, h in enumerate(hypo['history']): _, h_str, _ = utils.post_process_prediction( hypo_tokens=h['tokens'].int().cpu(), src_str=src_str, alignment=None, align_dict=None, tgt_dict=tgt_dict, remove_bpe=None, ) print('E-{}_{}\t{}'.format(sample_id, step, h_str), file=output_file) # Score only the top hypothesis if has_target and j == 0: if align_dict is not None or args.remove_bpe is not None: # Convert back to tokens for evaluation with unk replacement and/or without BPE target_tokens = tgt_dict.encode_line( target_str, add_if_not_exist=True) if hasattr(scorer, 'add_string'): scorer.add_string(target_str, hypo_str) else: scorer.add(target_tokens, hypo_tokens) wps_meter.update(num_generated_tokens) progress.log({'wps': round(wps_meter.avg)}) num_sentences += sample['nsentences'] logger.info('NOTE: hypothesis and token scores are output in base 2') logger.info( 'Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)' .format(num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg)) if has_target: logger.info('Generate {} with beam={}: {}'.format( args.gen_subset, args.beam, scorer.result_string())) return scorer
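# Both _main variants print hypothesis and positional scores in base 2; the
# conversion from the model's natural-log score is a plain change of base,
# illustrated here with a made-up value:
import math

logprob_nats = -3.21                       # model log-prob (base e)
logprob_bits = logprob_nats / math.log(2)  # the value printed on H-/P- lines
# Identity check: both encode the same probability.
assert abs(2 ** logprob_bits - math.e ** logprob_nats) < 1e-12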
def _generate_score(models, args, task, dataset_split, optimize=True): use_cuda = torch.cuda.is_available() and not args.cpu # Load ensemble if not args.quiet: print("| loading model(s) from {}".format(", ".join(args.path))) # Optimize ensemble for generation if optimize: for model in models: model.make_generation_fast_(beamable_mm_beam_size=None if args. no_beamable_mm else args.beam) translator = build_sequence_generator(args, task, models) # Load alignment dictionary for unknown word replacement # (None if no unknown word replacement, empty if no path to align dictionary) align_dict = utils.load_align_dict(args.replace_unk) # Keep track of translations # Initialize with empty translations # and zero probs scores translated_sentences = [""] * len(task.dataset(dataset_split)) translated_scores = [0.0] * len(task.dataset(dataset_split)) # Generate and compute BLEU score dst_dict = task.target_dictionary scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(), dst_dict.unk()) itr = get_eval_itr(args, models, task, dataset_split) num_sentences = 0 translation_samples = [] with progress_bar.build_progress_bar(args, itr) as t: wps_meter = TimeMeter() # Keep more detailed timing when invoked from benchmark if "keep_detailed_timing" in args: gen_timer = pytorch_translate_utils.BucketStopwatchMeter( args.increment, args.max_length, args.samples_per_length) else: gen_timer = StopwatchMeter() translations = translator.generate_batched_itr( t, maxlen_a=args.max_len_a, maxlen_b=args.max_len_b, cuda=use_cuda, timer=gen_timer, prefix_size=1 if pytorch_translate_data.is_multilingual(args) else 0, ) if pytorch_translate_data.is_multilingual(args): first_best_translations = _iter_first_best_multilingual else: first_best_translations = _iter_first_best_bilingual for trans_info in first_best_translations(args, task, dataset_split, translations, align_dict): scorer.add(trans_info.target_tokens, trans_info.hypo_tokens) translated_sentences[trans_info.sample_id] = trans_info.hypo_str translated_scores[trans_info.sample_id] = trans_info.hypo_score translation_samples.append( collections.OrderedDict({ "sample_id": trans_info.sample_id, "src_str": trans_info.src_str, "target_str": trans_info.target_str, "hypo_str": trans_info.hypo_str, })) wps_meter.update(trans_info.src_tokens.size(0)) t.log({"wps": round(wps_meter.avg)}) num_sentences += 1 # If applicable, save the translations to the output file # For eg. external evaluation if getattr(args, "translation_output_file", False): with open(args.translation_output_file, "w") as out_file: for hypo_str in translated_sentences: print(hypo_str, file=out_file) if getattr(args, "translation_probs_file", False): with open(args.translation_probs_file, "w") as out_file: for hypo_score in translated_scores: print(np.exp(hypo_score), file=out_file) return scorer, num_sentences, gen_timer, translation_samples
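# _generate_score writes one hypothesis per line to --translation-output-file
# and, via np.exp(hypo_score), one probability per line to
# --translation-probs-file. A sketch of reading the two aligned files back,
# where line i of each file refers to sentence i of the evaluated split
# (file names here are hypothetical):
with open("trans.out") as hyp_f, open("trans.probs") as prob_f:
    pairs = [(h.rstrip("\n"), float(p)) for h, p in zip(hyp_f, prob_f)]
for hypo_str, prob in pairs[:3]:
    print("{:.4f}\t{}".format(prob, hypo_str))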
def generate_main(args, model): models = [model] result_writer = open(args.results_path, "w", encoding="utf-8") #assert args.path is not None, '--path required for generation!' assert not args.sampling or args.nbest == args.beam, \ '--sampling requires --nbest to be equal to --beam' assert args.replace_unk is None or args.raw_text, \ '--replace-unk requires a raw text dataset (--raw-text)' utils.import_user_module(args) if args.max_tokens is None and args.max_sentences is None: args.max_tokens = 12000 #print(args) use_cuda = torch.cuda.is_available() and not args.cpu # Load dataset splits task = tasks.setup_task(args) task.load_dataset(args.gen_subset) # Set dictionaries try: src_dict = getattr(task, 'source_dictionary', None) except NotImplementedError: src_dict = None tgt_dict = task.target_dictionary # Load ensemble #print('| loading model(s) from {}'.format(args.path)) #models, _model_args = checkpoint_utils.load_model_ensemble( # args.path.split(':'), # arg_overrides=eval(args.model_overrides), # task=task, #) # Optimize ensemble for generation #for model in models: # model.make_generation_fast_( # beamable_mm_beam_size=None if args.no_beamable_mm else args.beam, # need_attn=args.print_alignment, # ) # if args.fp16: # model.half() # if use_cuda: # model.cuda() # Load alignment dictionary for unknown word replacement # (None if no unknown word replacement, empty if no path to align dictionary) align_dict = utils.load_align_dict(args.replace_unk) # Load dataset (possibly sharded) itr = task.get_batch_iterator( dataset=task.dataset(args.gen_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=utils.resolve_max_positions( task.max_positions(), *[model.max_positions() for model in models]), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=args.required_batch_size_multiple, num_shards=args.num_shards, shard_id=args.shard_id, num_workers=args.num_workers, ).next_epoch_itr(shuffle=False) # Initialize generator gen_timer = StopwatchMeter() generator = task.build_generator(args) # Generate and compute BLEU score if args.sacrebleu: scorer = bleu.SacrebleuScorer() else: scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk()) num_sentences = 0 has_target = True with progress_bar.build_progress_bar(args, itr) as t: wps_meter = TimeMeter() for sample in t: sample = utils.move_to_cuda(sample) if use_cuda else sample if 'net_input' not in sample: continue prefix_tokens = None if args.prefix_size > 0: prefix_tokens = sample['target'][:, :args.prefix_size] gen_timer.start() hypos = task.inference_step(generator, models, sample, prefix_tokens) num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos) gen_timer.stop(num_generated_tokens) for i, sample_id in enumerate(sample['id'].tolist()): has_target = sample['target'] is not None # Remove padding src_tokens = utils.strip_pad( sample['net_input']['src_tokens'][i, :], tgt_dict.pad()) target_tokens = None if has_target: target_tokens = utils.strip_pad( sample['target'][i, :], tgt_dict.pad()).int().cpu() # Either retrieve the original sentences or regenerate them from tokens. 
if align_dict is not None: src_str = task.dataset( args.gen_subset).src.get_original_text(sample_id) target_str = task.dataset( args.gen_subset).tgt.get_original_text(sample_id) else: if src_dict is not None: src_str = src_dict.string(src_tokens, args.remove_bpe) else: src_str = "" if has_target: target_str = tgt_dict.string(target_tokens, args.remove_bpe, escape_unk=True) if not args.quiet: if src_dict is not None: print('S-{}\t{}'.format(sample_id, src_str), file=result_writer) if has_target: print('T-{}\t{}'.format(sample_id, target_str), file=result_writer) # Process top predictions for j, hypo in enumerate(hypos[i][:args.nbest]): hypo_tokens, hypo_str, alignment = utils.post_process_prediction( hypo_tokens=hypo['tokens'].int().cpu(), src_str=src_str, alignment=hypo['alignment'].int().cpu() if hypo['alignment'] is not None else None, align_dict=align_dict, tgt_dict=tgt_dict, remove_bpe=args.remove_bpe, ) if not args.quiet: print('H-{}\t{}\t{}'.format(sample_id, hypo['score'], hypo_str), file=result_writer) print('P-{}\t{}'.format( sample_id, ' '.join( map( lambda x: '{:.4f}'.format(x), hypo['positional_scores'].tolist(), ))), file=result_writer) if args.print_alignment: print('A-{}\t{}'.format( sample_id, ' '.join( map(lambda x: str(utils.item(x)), alignment))), file=result_writer) # Score only the top hypothesis if has_target and j == 0: if align_dict is not None or args.remove_bpe is not None: # Convert back to tokens for evaluation with unk replacement and/or without BPE target_tokens = tgt_dict.encode_line( target_str, add_if_not_exist=True) if hasattr(scorer, 'add_string'): scorer.add_string(target_str, hypo_str) else: scorer.add(target_tokens, hypo_tokens) wps_meter.update(num_generated_tokens) t.log({'wps': round(wps_meter.avg)}) num_sentences += sample['nsentences'] print( '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)' .format(num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg)) if has_target: print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam, scorer.result_string())) return scorer
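# generate_main opens result_writer at the top and never closes it, so the
# tail of the results file can be lost if the process exits early. A safer
# pattern, assuming the generation loop were factored into a wrapper
# (run_generation is hypothetical, not part of this source):
with open(args.results_path, "w", encoding="utf-8") as result_writer:
    scorer = run_generation(args, models, result_writer)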
def main(args): logging.basicConfig( format='%(asctime)s | %(levelname)s | %(name)s | %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=os.environ.get('LOGLEVEL', 'INFO').upper(), stream=sys.stdout, ) logger = logging.getLogger('fairseq_cli.generate') assert args.path is not None, '--path required for generation!' assert not args.sampling or args.nbest == args.beam, \ '--sampling requires --nbest to be equal to --beam' assert args.replace_unk is None or args.raw_text, \ '--replace-unk requires a raw text dataset (--raw-text)' utils.import_user_module(args) if args.max_tokens is None and args.max_sentences is None: args.max_tokens = 12000 #print(args) use_cuda = torch.cuda.is_available() and not args.cpu # Load dataset splits task = tasks.setup_task(args) task.load_dataset(args.gen_subset) # Set dictionaries try: src_dict = getattr(task, 'source_dictionary', None) except NotImplementedError: src_dict = None tgt_dict = task.target_dictionary # Load ensemble logger.info('loading model(s) from {}'.format(args.path)) models, _model_args = checkpoint_utils.load_model_ensemble( args.path.split(':'), arg_overrides=eval(args.model_overrides), task=task, ) mose_de = MosesDetokenizer(lang='en') if args.bert_model_path: bert_tokenizer = BertTokenizer.from_pretrained(args.bert_model_path) bert_model = BertModel.from_pretrained(args.bert_model_path) bert_model.cuda() bert_model.eval() # Optimize ensemble for generation for model in models: model.make_generation_fast_( beamable_mm_beam_size=None if args.no_beamable_mm else args.beam, need_attn=args.print_alignment, ) if args.fp16: model.half() if use_cuda: model.cuda() # Load alignment dictionary for unknown word replacement # (None if no unknown word replacement, empty if no path to align dictionary) align_dict = utils.load_align_dict(args.replace_unk) # Load dataset (possibly sharded) itr = task.get_batch_iterator( dataset=task.dataset(args.gen_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=utils.resolve_max_positions( task.max_positions(), *[model.max_positions() for model in models]), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=args.required_batch_size_multiple, num_shards=args.num_shards, shard_id=args.shard_id, num_workers=args.num_workers, ).next_epoch_itr(shuffle=False) batch_len = len(task.dataset(args.gen_subset)) # Initialize generator gen_timer = StopwatchMeter() generator = task.build_generator(args) #Bert model #bert_model = bert_as_lm.Bert_score(torch.cuda.current_device()) #mose_de = MosesDetokenizer(lang='en') # Generate and compute BLEU score if args.sacrebleu: scorer = bleu.SacrebleuScorer() else: scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk()) num_sentences = 0 has_target = True total_pearson = [] total_bert_pearson = [] random_list = [] bert_bleu_equal = 0 sents_num = 0 if args.gen_subset == 'train': random_list = [i for i in range(0, batch_len)] random.shuffle(random_list) random_list = random_list[:1000] with progress_bar.build_progress_bar(args, itr) as t: wps_meter = TimeMeter() for sample in t: selected = [] if random_list: for i in sample['id']: selected.append( True) if i in random_list else selected.append(False) selected = torch.nonzero( torch.tensor(selected).ne(0)).squeeze(-1) if len(selected) == 0: continue for item in sample.keys(): if item == 'nsentences' or item == 'ntokens': continue elif item == 'net_input': for input in sample[item].keys(): sample[item][input] = sample[item][ input].index_select(0, selected) 
else: sample[item] = sample[item].index_select(0, selected) sample['nsentences'] = len(selected) sample['ntokens'] = torch.LongTensor([ s.ne(2).long().sum() for s in sample['target'] ]).sum().item() sample = utils.move_to_cuda(sample) if use_cuda else sample if 'net_input' not in sample: continue prefix_tokens = None if args.prefix_size > 0: prefix_tokens = sample['target'][:, :args.prefix_size] gen_timer.start() hypos = task.inference_step(generator, models, sample, prefix_tokens) num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos) gen_timer.stop(num_generated_tokens) for i, sample_id in enumerate(sample['id'].tolist()): sents_num += 1 if random_list and sample_id not in random_list: continue has_target = sample['target'] is not None # Remove padding src_tokens = utils.strip_pad( sample['net_input']['src_tokens'][i, :], tgt_dict.pad()) target_tokens = None if has_target: target_tokens = utils.strip_pad( sample['target'][i, :], tgt_dict.pad()).int().cpu() # Either retrieve the original sentences or regenerate them from tokens. if align_dict is not None: src_str = task.dataset( args.gen_subset).src.get_original_text(sample_id) target_str = task.dataset( args.gen_subset).tgt.get_original_text(sample_id) else: if src_dict is not None: src_str = src_dict.string(src_tokens, args.remove_bpe) else: src_str = "" if has_target: target_str = tgt_dict.string(target_tokens, args.remove_bpe, escape_unk=True) if not args.quiet: print("---------------{}--------------".format(sents_num)) if src_dict is not None: print('S-{}\t{}'.format(sample_id, src_str), file=sys.stdout) if has_target: print('T-{}\t{}'.format(sample_id, target_str), file=sys.stdout) # Process top predictions probs = [] bleu_score = [] cands = [] sents_bert_score = [] detoken_cands = [] temp_cand_tokens = [] for j, hypo in enumerate(hypos[i][:args.nbest]): hypo_tokens, hypo_str, alignment = utils.post_process_prediction( hypo_tokens=hypo['tokens'].int().cpu(), src_str=src_str, alignment=hypo['alignment'], align_dict=align_dict, tgt_dict=tgt_dict, remove_bpe=args.remove_bpe, ) if not args.quiet: score = hypo['score'] / math.log( 2) # convert to base 2 probs.append(score) single_score = sacrebleu.corpus_bleu( [hypo_str], [[target_str]], use_effective_order=True, tokenize="none") bleu_score.append(single_score.score) cands.append(hypo_str) temp_cand_tokens.append(hypo['tokens'].cpu()) hypo_splittokens = hypo_str.split(' ') detoken_hypo = mose_de.detokenize(hypo_splittokens) detoken_cands.append(detoken_hypo) #print (cands) #get the decoded_out current_src_length = sample['net_input']['src_lengths'][i] current_src_tokens = sample['net_input']['src_tokens'][i, :] decoded_out = get_decoded_out(current_src_tokens, current_src_length, temp_cand_tokens, models) #print (decoded_out.shape,"decoded") #get the encoded_out of bert encoded_input = bert_tokenizer(detoken_cands, return_tensors='pt', padding=True) for key in encoded_input: encoded_input[key] = encoded_input[key].cuda() bert_output = bert_model(**encoded_input)[0] #print (bert_output.shape) encoded_out = bert_output.mean(dim=1) #print ("encoded_out",encoded_out.shape) net_out = models[0].decoder.through_ffnet( torch.cat((encoded_out, decoded_out), 1)) #[beam, seq_len, 1] #print ("net_out", net_out.shape) sent_logits = torch.nn.functional.softmax(net_out.view( -1, args.nbest), dim=1) #print (sent_logits) net_pos = torch.argmax(sent_logits).item() #print (net_pos) pearson = 0 #if args.nbest > 20: np_prob = np.array(probs) np_bleu = np.array(bleu_score) pearson = 
np.corrcoef(np_prob, np_bleu)[0][1] if not np.isnan(pearson): total_pearson.append(pearson) #else: # print ("cands:", cands, file=sys.stdout) # print ("probs:", np_prob, file=sys.stdout) # print ("bleus:", np_bleu, file=sys.stdout) bleu_pos = bleu_score.index(max(bleu_score)) print("-----bleu choice: {} bleu:{:.3f} pos: {}".format( cands[bleu_pos], bleu_score[bleu_pos], bleu_pos + 1), file=sys.stdout) pos = probs.index(max(probs)) print("-----prob choice: {} bleu:{:.3f} pos: {}".format( cands[pos], bleu_score[pos], pos + 1), file=sys.stdout) print("-----net choice: {} bleu:{:.3f} pos: {}".format( cands[net_pos], bleu_score[net_pos], net_pos + 1), file=sys.stdout) ''' np_bert = np.array(sents_bert_score) bert_bleu_pearson = np.corrcoef(np_bert, np_bleu)[0][1] if not np.isnan(bert_bleu_pearson): total_bert_pearson.append(bert_bleu_pearson) bert_pos = sents_bert_score.index(min(sents_bert_score)) print('*****{} bert choice: {}\tprob:{:.3f}\tbleu:{:.3f}\tbertscore:{:.3f}\tposition:{}\tprob_bleu_pearson:{:.3f} bert_bleu_p: {:.3f} '. \ format(sample_id, cands[bert_pos], probs[bert_pos], bleu_score[bert_pos], sents_bert_score[bert_pos], bert_pos+1, pearson, bert_bleu_pearson), file=sys.stdout) ''' if args.usebleu: final_hypo = cands[bleu_pos] elif args.usebert: final_hypo = cands[net_pos] else: final_hypo = cands[pos] scorer.add_string(target_str, final_hypo) print('H choice use bleu: {} usebert: {}'.format( args.usebleu, args.usebert)) if has_target and sents_num % 800 == 0: print('Generate {} with beam={}: {}\t{}'.format( args.gen_subset, args.beam, scorer.result_string(), sents_num, file=sys.stdout)) wps_meter.update(num_generated_tokens) #t.log({'wps': round(wps_meter.avg)}) num_sentences += sample['nsentences'] logger.info( 'Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)' .format(num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg)) if has_target: print( 'Generate {} with beam={}: {}\n --- prob&bleu pearson: {:.4f} ---'. format(args.gen_subset, args.beam, scorer.result_string(), sum(total_pearson) / len(total_pearson)), file=sys.stdout) return scorer
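# The prob/BLEU agreement accumulated in total_pearson above is a plain
# Pearson correlation between the n-best model scores and their sentence
# BLEU, with the same NaN guard (np.corrcoef returns NaN when either vector
# is constant). Self-contained example with made-up numbers:
import numpy as np

probs = np.array([-1.2, -1.9, -2.4])  # model scores of a 3-best list
bleus = np.array([41.0, 35.5, 28.7])  # sentence BLEU of the same hypotheses
pearson = np.corrcoef(probs, bleus)[0][1]
if not np.isnan(pearson):
    print("prob/BLEU pearson: {:.4f}".format(pearson))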
def validate_translation(args, trainer, task, epoch_itr, generator):
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary
    models = [trainer.get_model()]

    # One scorer per language pair for multilingual tasks; a single scorer
    # (keyed 0) otherwise. Multilingual batches arrive as a dict keyed by
    # language pair, so the two cases share the same loop below.
    multilingual = hasattr(task, 'eval_lang_pairs')
    keys = task.eval_lang_pairs if multilingual else [0]
    bleu_dict = {key: None for key in keys}
    if args.sacrebleu:
        scorer_dict = {key: bleu.SacrebleuScorer() for key in keys}
    else:
        scorer_dict = {
            key: bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
            for key in keys
        }

    itr = task.get_batch_iterator(
        dataset=task.dataset('valid'),
        max_tokens=args.max_tokens_valid,
        max_sentences=args.max_sentences_valid,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            trainer.get_model().max_positions(),
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
        num_workers=args.num_workers,
        noskip=True,
    )[0].next_epoch_itr(shuffle=False)
    progress = progress_bar.build_progress_bar(args, itr, epoch_itr.epoch,
                                               prefix='translate subset',
                                               no_progress_bar='simple')

    num_sentences = 0
    has_target = True
    for samples in progress:
        if torch.cuda.is_available() and not args.cpu:
            samples = utils.move_to_cuda(samples)
        prefix_tokens = None

        keyed_samples = samples.items() if multilingual else [(0, samples)]
        for key, sample in keyed_samples:
            hypos = task.inference_step(generator, models, sample,
                                        prefix_tokens)
            for i, sample_id in enumerate(sample['id'].tolist()):
                has_target = sample['target'] is not None
                target_tokens = None
                if has_target:
                    # Remove padding
                    target_tokens = utils.strip_pad(
                        sample['target'][i, :], tgt_dict.pad()).int().cpu()
                    # Regenerate the reference from tokens.
                    target_str = tgt_dict.string(target_tokens,
                                                 args.remove_bpe,
                                                 escape_unk=True)

                # Process top predictions
                for j, hypo in enumerate(hypos[i][:args.nbest]):
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str="",
                        alignment=hypo['alignment'].int().cpu()
                        if hypo['alignment'] is not None else None,
                        align_dict=None,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )

                    # Score only the top hypothesis
                    if has_target and j == 0:
                        if args.remove_bpe is not None:
                            # Convert back to tokens for evaluation without BPE
                            target_tokens = tgt_dict.encode_line(
                                target_str, add_if_not_exist=True)
                        if hasattr(scorer_dict[key], 'add_string'):
                            scorer_dict[key].add_string(target_str, hypo_str)
                        else:
                            scorer_dict[key].add(target_tokens, hypo_tokens)

            num_sentences += sample['nsentences']

    print("| valid translated {} sentences".format(num_sentences))
    for key, scorer in scorer_dict.items():
        bleu_dict[key] = scorer.score()
    return bleu_dict
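# utils.strip_pad, used throughout these loops to cut one sentence out of the
# padded batch, reduces to a boolean mask; a self-contained equivalent:
import torch

def strip_pad(tensor, pad_idx):
    # keep only the positions whose token id differs from the pad symbol
    return tensor[tensor.ne(pad_idx)]

tokens = torch.tensor([5, 9, 12, 2, 1, 1])  # 1 = pad index
print(strip_pad(tokens, 1))                 # tensor([ 5,  9, 12,  2])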
def main(args): assert args.path is not None, '--path required for generation!' assert not args.sampling or args.nbest == args.beam, \ '--sampling requires --nbest to be equal to --beam' assert args.replace_unk is None or args.dataset_impl == 'raw', \ '--replace-unk requires a raw text dataset (--dataset-impl=raw)' utils.import_user_module(args) if args.max_tokens is None and args.max_sentences is None: args.max_tokens = 12000 print(args) use_cuda = torch.cuda.is_available() and not args.cpu # Load dataset splits task = tasks.setup_task(args) task.load_dataset(args.gen_subset) # Set dictionaries try: src_dict = getattr(task, 'source_dictionary', None) except NotImplementedError: src_dict = None tgt_dict = task.target_dictionary # Load ensemble print('| loading model(s) from {}'.format(args.path)) models, _model_args = checkpoint_utils.load_model_ensemble( args.path.split(':'), arg_overrides=args.model_overrides, task=task, ) # Optimize ensemble for generation for model in models: model.make_generation_fast_( beamable_mm_beam_size=None if args.no_beamable_mm else args.beam, need_attn=args.print_alignment, ) if args.fp16: model.half() if use_cuda: model.cuda() # Load alignment dictionary for unknown word replacement # (None if no unknown word replacement, empty if no path to align dictionary) align_dict = utils.load_align_dict(args.replace_unk) # Load dataset (possibly sharded) itr = task.get_batch_iterator( dataset=task.dataset(args.gen_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=utils.resolve_max_positions( task.max_positions(), *[model.max_positions() for model in models]), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=args.required_batch_size_multiple, num_shards=args.num_shards, shard_id=args.shard_id, num_workers=args.num_workers, ).next_epoch_itr(shuffle=False) # Initialize generator gen_timer = StopwatchMeter() generator = task.build_generator(args) # Generate and compute BLEU score if args.sacrebleu: scorer = bleu.SacrebleuScorer() else: scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk()) num_sentences = 0 has_target = True with progress_bar.build_progress_bar(args, itr) as t: wps_meter = TimeMeter() if args.decoding_path is not None: src_sents = [[] for _ in range(1000000)] tgt_sents = [[] for _ in range(1000000)] hyp_sents = [[] for _ in range(1000000)] for sample in t: sample = utils.move_to_cuda(sample) if use_cuda else sample if 'net_input' not in sample: continue prefix_tokens = None if args.prefix_size > 0: prefix_tokens = sample['target'][:, :args.prefix_size] gen_timer.start() hypos = task.inference_step(generator, models, sample, prefix_tokens) num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos) gen_timer.stop(num_generated_tokens) for i, sample_id in enumerate(sample['id'].tolist()): has_target = sample['target'] is not None # Remove padding src_tokens = utils.strip_pad( sample['net_input']['src_tokens'][i, :], tgt_dict.pad()) target_tokens = None if has_target: target_tokens = utils.strip_pad( sample['target'][i, :], tgt_dict.pad()).int().cpu() # Either retrieve the original sentences or regenerate them from tokens. 
if align_dict is not None: src_str = task.dataset( args.gen_subset).src.get_original_text(sample_id) target_str = task.dataset( args.gen_subset).tgt.get_original_text(sample_id) else: if src_dict is not None: src_str = src_dict.string(src_tokens, args.remove_bpe) else: src_str = "" if has_target: target_str = tgt_dict.string(target_tokens, args.remove_bpe, escape_unk=True) if not args.quiet: if src_dict is not None: print('S-{}\t{}'.format(sample_id, src_str)) if has_target: print('T-{}\t{}'.format(sample_id, target_str)) # Process top predictions for j, hypo in enumerate(hypos[i][:args.nbest]): hypo_tokens, hypo_str, alignment = utils.post_process_prediction( hypo_tokens=hypo['tokens'].int().cpu(), src_str=src_str, alignment=hypo['alignment'], align_dict=align_dict, tgt_dict=tgt_dict, remove_bpe=args.remove_bpe, ) if not args.quiet: print('H-{}\t{}\t{}'.format(sample_id, hypo['score'], hypo_str)) print('P-{}\t{}'.format( sample_id, ' '.join( map( lambda x: '{:.4f}'.format(x), hypo['positional_scores'].tolist(), )))) if args.print_alignment: print('A-{}\t{}'.format( sample_id, ' '.join([ '{}-{}'.format(src_idx, tgt_idx) for src_idx, tgt_idx in alignment ]))) if args.print_step: print('I-{}\t{}'.format(sample_id, hypo['steps'])) if getattr(args, 'retain_iter_history', False): print("\n".join([ 'E-{}_{}\t{}'.format( sample_id, step, utils.post_process_prediction( h['tokens'].int().cpu(), src_str, None, None, tgt_dict, None)[1]) for step, h in enumerate(hypo['history']) ])) if args.decoding_path is not None: src_sents[int(sample_id)].append(src_str) tgt_sents[int(sample_id)].append(target_str) hyp_sents[int(sample_id)].append(hypo_str) # Score only the top hypothesis if has_target and j == 0: if align_dict is not None or args.remove_bpe is not None: # Convert back to tokens for evaluation with unk replacement and/or without BPE target_tokens = tgt_dict.encode_line( target_str, add_if_not_exist=True) if hasattr(scorer, 'add_string'): scorer.add_string(target_str, hypo_str) else: scorer.add(target_tokens, hypo_tokens) wps_meter.update(num_generated_tokens) t.log({'wps': round(wps_meter.avg)}) num_sentences += sample['nsentences'] print( '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)' .format(num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg)) if has_target: print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam, scorer.result_string())) if args.decoding_path is not None: with open(os.path.join(args.decoding_path, 'source.txt'), 'w', encoding='utf-8') as f: for sents in src_sents: if len(sents) == 0: continue for sent in sents: f.write(sent + '\n') with open(os.path.join(args.decoding_path, 'target.txt'), 'w', encoding='utf-8') as f: for sents in tgt_sents: if len(sents) == 0: continue for sent in sents: f.write(sent + '\n') with open(os.path.join(args.decoding_path, 'decoding.txt'), 'w', encoding='utf-8') as f: for sents in hyp_sents: if len(sents) == 0: continue for sent in sents: f.write(sent + '\n') if len(list(args.num_ref.values())) == 1: num_ref = int(list(args.num_ref.values())[0]) else: raise NotImplementedError ref_path = [] if num_ref == 1: ref_path.append( os.path.join(args.valid_decoding_path, args.gen_subset + '.tok.' + args.target_lang)) else: for i in range(num_ref): ref_path.append( os.path.join( args.valid_decoding_path, args.gen_subset + '.tok.' 
+ args.target_lang + str(i))) decoding_path = os.path.join(args.decoding_path, 'decoding.txt') #with open(decoding_path) as out_file: # out_file.seek(0) # subprocess.call( # 'perl %s/multi-bleu.perl %s' % (args.multi_bleu_path, ' '.join(ref_path)), # stdin=out_file, shell=True) return scorer
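# The external scoring step at the end of main is left commented out; if
# re-enabled, it pipes decoding.txt into Moses multi-bleu.perl with the
# reference paths collected above. Sketch of that call (the same invocation
# as the commented-out lines, not independently verified):
import subprocess

with open(decoding_path) as out_file:
    subprocess.call(
        'perl %s/multi-bleu.perl %s'
        % (args.multi_bleu_path, ' '.join(ref_path)),
        stdin=out_file, shell=True)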
def main(args): assert args.path is not None, '--path required for generation!' assert not args.sampling or args.nbest == args.beam, \ '--sampling requires --nbest to be equal to --beam' assert args.replace_unk is None or args.raw_text, \ '--replace-unk requires a raw text dataset (--raw-text)' if args.max_tokens is None and args.max_sentences is None: args.max_tokens = 12000 print(args) use_cuda = torch.cuda.is_available() and not args.cpu # Load dataset splits task = tasks.setup_task(args) task.load_dataset(args.gen_subset) print('| {} {} {} examples'.format(args.data, args.gen_subset, len(task.dataset(args.gen_subset)))) # Set dictionaries src_dict = task.source_dictionary tgt_dict = task.target_dictionary # Load ensemble print('| loading model(s) from {}'.format(args.path)) models, _ = utils.load_ensemble_for_inference(args.path.split(':'), task, model_arg_overrides=eval( args.model_overrides)) # Optimize ensemble for generation for model in models: model.make_generation_fast_( beamable_mm_beam_size=None if args.no_beamable_mm else args.beam) if args.fp16: model.half() # Load alignment dictionary for unknown word replacement # (None if no unknown word replacement, empty if no path to align dictionary) align_dict = utils.load_align_dict(args.replace_unk) # Load dataset (possibly sharded) itr = data.EpochBatchIterator( dataset=task.dataset(args.gen_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=models[0].max_positions(), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=8, num_shards=args.num_shards, shard_id=args.shard_id, ).next_epoch_itr(shuffle=False) # Initialize generator gen_timer = StopwatchMeter() if args.score_reference: translator = SequenceScorer(models, task.target_dictionary) else: translator = SequenceGenerator( models, task.target_dictionary, beam_size=args.beam, stop_early=(not args.no_early_stop), normalize_scores=(not args.unnormalized), len_penalty=args.lenpen, unk_penalty=args.unkpen, sampling=args.sampling, sampling_topk=args.sampling_topk, minlen=args.min_len, ) if use_cuda: translator.cuda() # Generate and compute BLEU score scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk()) num_sentences = 0 has_target = True with progress_bar.build_progress_bar(args, itr) as t: if args.score_reference: translations = translator.score_batched_itr(t, cuda=use_cuda, timer=gen_timer) else: translations = translator.generate_batched_itr( t, maxlen_a=args.max_len_a, maxlen_b=args.max_len_b, cuda=use_cuda, timer=gen_timer, prefix_size=args.prefix_size, ) wps_meter = TimeMeter() for sample_id, src_tokens, target_tokens, hypos in translations: # Process input and ground truth has_target = target_tokens is not None target_tokens = target_tokens.int().cpu() if has_target else None # Either retrieve the original sentences or regenerate them from tokens. 
if align_dict is not None: src_str = task.dataset( args.gen_subset).src.get_original_text(sample_id) target_str = task.dataset( args.gen_subset).tgt.get_original_text(sample_id) else: src_str = src_dict.string(src_tokens, args.remove_bpe) if has_target: target_str = tgt_dict.string(target_tokens, args.remove_bpe, escape_unk=True) if not args.quiet: print('S-{}\t{}'.format(sample_id, src_str)) if has_target: print('T-{}\t{}'.format(sample_id, target_str)) # Process top predictions for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]): hypo_tokens, hypo_str, alignment = utils.post_process_prediction( hypo_tokens=hypo['tokens'].int().cpu(), src_str=src_str, alignment=hypo['alignment'].int().cpu(), align_dict=align_dict, tgt_dict=tgt_dict, remove_bpe=args.remove_bpe, ) if not args.quiet: print('H-{}\t{}\t{}'.format(sample_id, hypo['score'], hypo_str)) print('P-{}\t{}'.format( sample_id, ' '.join( map( lambda x: '{:.4f}'.format(x), hypo['positional_scores'].tolist(), )))) print('A-{}\t{}'.format( sample_id, ' '.join(map(lambda x: str(utils.item(x)), alignment)))) # Score only the top hypothesis if has_target and i == 0: if align_dict is not None or args.remove_bpe is not None: # Convert back to tokens for evaluation with unk replacement and/or without BPE target_tokens = tokenizer.Tokenizer.tokenize( target_str, tgt_dict, add_if_not_exist=True) scorer.add(target_tokens, hypo_tokens) wps_meter.update(src_tokens.size(0)) t.log({'wps': round(wps_meter.avg)}) num_sentences += 1 print( '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)' .format(num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg)) if has_target: print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam, scorer.result_string()))
def forward(
    self,
    sample,
    forward_model,
    forward_optimizer,
    tgt_dict,
    backward_model,
    backward_optimizer,
    src_dict,
    lm_scorer=None,
    reduce=True,
    **generate_kwargs,
):
    """Compute the reconstruction and LM loss from forward and backward models.

    Args:
        sample: original input.
        forward_model: the model used to generate pseudo labels, which are
            used as an approximation of the target space to do importance
            sampling.
        backward_model: the model that reconstructs the original input from
            the pseudo labels.
        lm_scorer: an LM model in eval mode to score pseudo labels in the
            target space.
    """
    # Generate translations
    nbest_translations = self._generate_translation(
        forward_model, tgt_dict, sample, **generate_kwargs)
    forward_samples = []
    backward_samples = []
    # TODO (T36875783): load pretrained lm to score
    lm_score = 0.5
    eos_index = tgt_dict.eos()
    for id, src, hypos in nbest_translations:
        # compute each model's reward
        forward_reward = lm_score
        # construct the sample and compute the cross-entropy loss;
        # backward_samples need to handle EOS
        original_src = src
        bt_src = hypos[0]["tokens"]
        # add EOS to the target, i.e. the original source, since it will be
        # used as the target
        if original_src[-1] != eos_index:
            original_src = torch.cat(
                [original_src.cpu(), torch.LongTensor([eos_index])])
        # removing EOS from the source is optional
        if self.remove_eos_at_src:
            bt_src = bt_src[:-1]
        backward_sample = {
            "id": id,
            "source": bt_src.cpu(),  # first hypo is best hypo
            "target": original_src.cpu(),
            "weight": 1.0 - self.alpha,
        }
        backward_samples.append(backward_sample)

        # use the BLEU of the round-trip reconstruction as the backward reward
        bwd_model_input = utils.move_to_cuda(
            WeightedLanguagePairDataset.collate(
                samples=[backward_sample],
                pad_idx=src_dict.pad(),
                eos_idx=src_dict.eos(),
            ))
        reconstructed_source = self._generate_translation(
            backward_model, src_dict, bwd_model_input, **generate_kwargs)
        scorer = bleu.Scorer(src_dict.pad(), src_dict.eos(), src_dict.unk())
        for _, _, x_hypos in reconstructed_source:
            x_hat = x_hypos[0]["tokens"][:-1]
            scorer.add(original_src.int().cpu(), x_hat.int().cpu())
        backward_reward = scorer.score(order=4) / 100.0

        total_reward = (self.alpha * forward_reward +
                        (1.0 - self.alpha) * backward_reward)
        assert hypos[0]["tokens"][-1] == eos_index, (
            f"Expected generated translation to have eos (id: "
            f"{eos_index}) at end, but instead found token id "
            f"{hypos[0]['tokens'][-1]} at end.")
        forward_samples.append({
            "id": id,
            "source": src.cpu(),
            "target": hypos[0]["tokens"].cpu(),  # first hypo is best hypo
            "weight": total_reward,
        })

    # Now combine the pseudo-labelled examples into the corresponding batch,
    # with rewards factored into the weighting of each task's loss
    agg_loss, agg_sample_size, agg_logging_output = 0.0, 0.0, {}
    forward_model.train()
    forward_loss, sample_size, logging_output = self.task.criterion(
        forward_model,
        utils.move_to_cuda(
            WeightedLanguagePairDataset.collate(
                samples=forward_samples,
                pad_idx=tgt_dict.pad(),
                eos_idx=tgt_dict.eos(),
            )),
    )
    agg_loss += forward_loss.detach().item()
    agg_sample_size += sample_size
    agg_logging_output["primal"] = logging_output
    # the gradient will be further scaled when passed back to the trainer,
    # which performs the update
    forward_optimizer.backward(forward_loss)

    backward_model.train()
    backward_loss, sample_size, logging_output = self.task.criterion(
        backward_model,
        utils.move_to_cuda(
            WeightedLanguagePairDataset.collate(
                samples=backward_samples,
                pad_idx=src_dict.pad(),
                eos_idx=src_dict.eos(),
            )),
    )
    agg_loss += backward_loss.data.item()
    agg_sample_size += sample_size
    agg_logging_output["dual"] = logging_output
    backward_optimizer.backward(backward_loss)

    return agg_loss, agg_sample_size, agg_logging_output
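# The reward used to weight the forward samples above is a convex combination
# of the (placeholder) LM reward and the reconstruction BLEU. Worked example
# with made-up numbers:
def total_reward_demo(forward_reward, backward_reward, alpha):
    # alpha = 1.0 trusts the LM score only; alpha = 0.0 trusts only the
    # round-trip reconstruction BLEU (already normalized to [0, 1]).
    return alpha * forward_reward + (1.0 - alpha) * backward_reward

print(total_reward_demo(0.5, 0.37, alpha=0.3))  # 0.3*0.5 + 0.7*0.37 = 0.409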