def test_combine_weighted_scores(self):
    """combine_weighted_scores scales each model's score column in place by
    its configured weight; per this test's expectations, the reverse and LM
    columns are additionally divided by the source length (5 here).

    Fixes: removed a duplicated `length_penalty` assignment and corrected
    the arithmetic in the expectation comment.
    """
    test_args = test_utils.ModelParamsDict()
    test_args.enable_rescoring = True
    test_args.length_penalty = 1
    test_args.original_model_weight = 1
    test_args.l2r_model_path = ""
    test_args.l2r_model_weight = 1
    test_args.r2l_model_weight = 0
    test_args.reverse_model_weight = 0.5
    test_args.lm_model_weight = 0.75
    _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
    task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
    model = task.build_model(test_args)
    # Patch ensemble loading so the Rescorer uses the freshly built model
    # instead of reading checkpoints from disk.
    with patch(
        "pytorch_translate.utils.load_diverse_ensemble_for_inference",
        return_value=([model], test_args, task),
    ):
        rescorer = Rescorer(test_args)
        scores = torch.tensor([[10, 20, 30, 40]], dtype=torch.float)
        src_tokens = torch.tensor([1, 2, 3, 4, 5])
        hypos = [{"tokens": torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])}]
        # Mutates `scores` in place.
        rescorer.combine_weighted_scores(scores, src_tokens, hypos)
        # 10*1=10. , 20*0=0. , 30*(0.5/5)=3. , 40*(0.75/5)=6.
        expected = torch.tensor([[10.0, 0.0, 3.0, 6.0]], dtype=torch.float)
        assert torch.equal(scores, expected)
def test_model_passing_as_parameter(self):
    """Rescorer should accept pre-loaded models passed directly as a dict
    argument instead of loading them from checkpoint paths.

    Fixes: removed a duplicated `length_penalty` assignment
    (`= 1` followed later by `= 1.0`); the final value is kept.
    """
    test_args = test_utils.ModelParamsDict("transformer")
    test_args.enable_rescoring = True
    test_args.length_penalty = 1.0
    test_args.l2r_model_weight = 1.0
    test_args.r2l_model_weight = 0.0
    test_args.reverse_model_weight = 0.0
    test_args.lm_model_weight = 1.01
    test_args.cloze_transformer_weight = 1.0
    _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
    task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
    model = task.build_model(test_args)
    src_tokens = torch.tensor([[1, 2, 3, 4, 5]])
    hypos = [{"tokens": torch.tensor([1, 2])}, {"tokens": torch.tensor([1, 2])}]
    # Pass the in-memory model directly — no patching of disk loading needed.
    rescorer = Rescorer(
        test_args, task, {"l2r_model": {"model": model, "task": task}}
    )
    scores = rescorer.score(src_tokens, hypos)
    # One score column per rescoring model type — 5 total here.
    assert scores.size()[1] == 5
def test_batch_computation(self):
    """Scoring a batch of sources/hypotheses must produce the same
    per-hypothesis scores as scoring a single source on its own, i.e.
    batching is a pure optimization.

    Fixes: removed a duplicated `length_penalty` assignment, and added
    cleanup of the model checkpoint this test writes to /tmp.
    """
    import os

    test_args = test_utils.ModelParamsDict("transformer")
    test_args.enable_rescoring = True
    test_args.length_penalty = 1.0
    test_args.l2r_model_path = "/tmp/test_rescorer_model.pt"
    test_args.l2r_model_weight = 1.0
    test_args.r2l_model_weight = 0.0
    test_args.reverse_model_weight = 0.0
    test_args.cloze_transformer_weight = 1.0
    test_args.lm_model_weight = 0.0
    _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
    task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
    model = task.build_model(test_args)
    torch.save(model, test_args.l2r_model_path)
    try:
        with patch(
            "pytorch_translate.utils.load_diverse_ensemble_for_inference",
            return_value=([model], test_args, task),
        ):
            rescorer = Rescorer(test_args)
            # Two sources; hypos are grouped per source (2 per source here).
            src_tokens = torch.tensor([[1, 3, 3, 4, 2], [1, 3, 2, 0, 0]])
            hypos = [
                {"tokens": torch.tensor([1, 5, 2])},
                {"tokens": torch.tensor([6, 3, 5, 2])},
                {"tokens": torch.tensor([1, 2])},
                {"tokens": torch.tensor([1, 5, 6, 2])},
            ]
            scores = rescorer.score(src_tokens, hypos)
            # Re-score only the first source's hypotheses on their own.
            src_tokens = torch.tensor([[1, 3, 3, 4, 2]])
            hypos = [
                {"tokens": torch.tensor([1, 5, 2])},
                {"tokens": torch.tensor([6, 3, 5, 2])},
            ]
            scores_single = rescorer.score(src_tokens, hypos)
            assert torch.equal(scores[0], scores_single[0])
    finally:
        # Don't leave the checkpoint behind for other tests/runs.
        if os.path.exists(test_args.l2r_model_path):
            os.remove(test_args.l2r_model_path)
def forward(self, model, sample, reduce=True):
    """Compute the combined RL + word-level-KL + cross-entropy loss.

    Returns a tuple with three elements:
    1) the loss
    2) the sample size, which is used as the denominator for the gradient
    3) logging outputs to display while training
    """
    src_tokens = sample["net_input"]["src_tokens"]
    # Number of sampled trajectories per source sentence (beam width).
    beam_size = self.args.rl_num_trajectory
    bsz, srclen = src_tokens.size()
    encoder_input = {
        "src_tokens": sample["net_input"]["src_tokens"],
        "src_lengths": sample["net_input"]["src_lengths"],
    }
    # 1) Generate hypos
    translator = generate.build_sequence_generator(self.args, self.task, [model])
    with torch.no_grad():
        seq_hypos = translator.generate(
            encoder_input,
            beam_size,
            maxlen=int(self.args.max_len_a * srclen + self.args.max_len_b),
        )
    # One extra "hypothesis" per source: the reference target itself.
    word_hypos = [[] for j in range(bsz)]
    for k in range(bsz):
        word_hypos[k] = [{"tokens": sample["target"][k]}]
    ## Mix sequence, word-level hypos — each source now has beam_size + 1 hypos,
    ## with the reference target appended as the last one.
    hypos = [seq_hypos[j] + word_hypos[j] for j in range(bsz)]
    hypos = [hypo for _ in hypos for hypo in _]
    hypos_len = (
        torch.tensor([len(hypo["tokens"]) for hypo in hypos])
        .type_as(src_tokens)
        .float()
    )
    # mask index for word-level hypos, e.g., target sentence: positions
    # beam_size, 2*beam_size+1, ... in the flattened hypo list (the appended
    # reference target of each source).
    mask_index = torch.arange(beam_size, (beam_size + 1) * bsz, beam_size + 1).view(
        -1
    )
    # 2) Compute (log)-probs via forward models
    self.self_rescorer.model = model
    self.self_rescorer.task = self.task
    # Train mode so gradients flow through the scoring pass (dropout active).
    model.train()
    assert self.self_rescorer.model.training, "model should be in training phase"
    hypo_encoder_inputs, hypo_tokens = self.self_rescorer.prepare_inputs(
        src_tokens, hypos
    )
    hypo_logprobs, hypo_encoder_outs, forward_logprobs = self.self_rescorer.score_tokens(
        hypo_encoder_inputs, hypo_tokens
    )
    # Length-normalize sequence log-probs.
    hypo_logprobs /= hypos_len ** self.args.rescore_length_penalty
    # 3) Sequence level
    seq_loss = torch.zeros(1).type_as(hypo_logprobs)
    if self.args.rl_weight > 0.0:
        ## 3.1) Compute seq-level rewards (no grad: rewards are constants
        ## w.r.t. the policy being trained)
        with torch.no_grad():
            rescorer = Rescorer(self.args, self.task, self.rescore_models)
            scores = rescorer.score(src_tokens, hypos)
            rewards = self.combine_score(src_tokens, hypos, hypos_len, scores)
        assert not rewards.requires_grad, "no grads flow back to generation"
        ## 3.2) Compute Policy Gradient loss; the reference-target entries
        ## (mask_index) are excluded from the RL term.
        rewards = rewards.type_as(hypo_logprobs)
        seq_mask = hypo_logprobs.new_ones(hypo_logprobs.size())
        seq_mask[mask_index] = 0.0
        seq_loss = -1.0 * (seq_mask * hypo_logprobs * rewards).sum()
    # 4) Word-level
    word_loss = torch.zeros(1).type_as(hypo_logprobs)
    if self.args.word_weight > 0.0:
        ## 4.1) Compute word-level rewards from a left-right rescoring model
        ## (the teacher); no grads into the teacher.
        with torch.no_grad():
            teacher_model = self.rescore_models[self.args.word_model]
            teacher = SimpleModelScorer(self.args, None, teacher_model, self.task)
            _, _, teacher_logprobs = teacher.score_tokens(
                hypo_encoder_inputs, hypo_tokens
            )
        ## 4.2) Compute word-level loss over the student's top-k tokens.
        # NOTE(review): word_mask[mask_index, :, :] implies forward_logprobs
        # is 3-D (hypo, position, vocab-topk) — confirm against score_tokens.
        f_logprob, f_index = forward_logprobs.topk(self.args.topk_words)
        word_mask = f_logprob.new_zeros(f_logprob.size())
        # Word-level term applies ONLY to the reference-target entries.
        word_mask[mask_index, :, :] = 1.0
        ## KL(p_s || p_t) = \sum p_s log p_s - \sum p_s log p_t, aka RL + maxEnt
        word_loss = (
            word_mask
            * f_logprob.exp()
            * (f_logprob - 1.0 * teacher_logprobs.gather(-1, f_index))
        ).sum()
    # 5) Compute Cross-entropy loss on the reference target, prefixed with EOS
    # (decoder start-of-sequence convention used here).
    eos = self.task.target_dictionary.eos()
    target_tokens = torch.cat(
        (
            torch.zeros(bsz, 1).fill_(eos).type_as(sample["target"]),
            sample["target"],
        ),
        dim=1,
    )
    # NOTE(review): only src_lengths[0] is forwarded — presumably valid only
    # when all sources in the batch share a length; confirm with callers.
    target_encoder_inputs = (
        encoder_input["src_tokens"],
        [encoder_input["src_lengths"][0].item()],
    )
    target_logprobs, target_encoder_out, _ = self.self_rescorer.score_tokens(
        target_encoder_inputs, target_tokens
    )
    nll_loss = -1.0 * target_logprobs.sum()
    # 6) Gather losses: weighted RL + weighted word-level KL + NLL.
    loss = (
        self.args.rl_weight * seq_loss
        + self.args.word_weight * word_loss
        + nll_loss
    )
    # Logging
    sample_size = (
        sample["target"].size(0) if self.args.sentence_avg else sample["ntokens"]
    )
    logging_output = {
        "loss": utils.item(loss.data) if reduce else loss.data,
        "nll_loss": utils.item(nll_loss.data) if reduce else nll_loss.data,
        "ntokens": sample["ntokens"],
        "nsentences": sample["target"].size(0),
        "sample_size": sample_size,
    }
    return loss, sample_size, logging_output
def forward(self, model, sample, reduce=True):
    """Compute the combined policy-gradient (RL) + cross-entropy loss.

    Returns a tuple with three elements:
    1) the loss
    2) the sample size, which is used as the denominator for the gradient
    3) logging outputs to display while training
    """
    src_tokens = sample["net_input"]["src_tokens"]
    # Number of sampled trajectories per source sentence (beam width).
    beam_size = self.args.rl_num_trajectory
    bsz, srclen = src_tokens.size()
    encoder_input = {
        "src_tokens": sample["net_input"]["src_tokens"],
        "src_lengths": sample["net_input"]["src_lengths"],
    }
    # 1) Generate hypos
    translator = generate.build_sequence_generator(self.args, self.task, [model])
    with torch.no_grad():
        hypos = translator.generate(
            encoder_input,
            beam_size,
            maxlen=int(self.args.max_len_a * srclen + self.args.max_len_b),
        )
    ## flatten nested list
    hypos = [hypo for _ in hypos for hypo in _]  # with length of bsz * beam_size
    hypos_len = (
        torch.tensor([len(hypo["tokens"]) for hypo in hypos])
        .type_as(src_tokens)
        .float()
    )
    # 2) Compute (log)-probs via forward models
    self.self_rescorer.model = model
    self.self_rescorer.task = self.task
    # Train mode so gradients flow through the scoring pass (dropout active).
    model.train()
    assert self.self_rescorer.model.training, "model should be in training phase"
    hypo_encoder_inputs, hypo_tokens = self.self_rescorer.prepare_inputs(
        src_tokens, hypos
    )
    hypo_logprobs, hypo_encoder_outs, _ = self.self_rescorer.score_tokens(
        hypo_encoder_inputs, hypo_tokens
    )
    # Length-normalize sequence log-probs.
    hypo_logprobs /= hypos_len ** self.args.rescore_length_penalty
    # 3) Compute rewards from rescoring models (no grad: rewards are
    # constants w.r.t. the policy being trained)
    with torch.no_grad():
        rescorer = Rescorer(self.args, self.task, self.rescore_models)
        scores = rescorer.score(src_tokens, hypos)
        rewards = self.combine_score(src_tokens, hypos, hypos_len, scores)
    assert not rewards.requires_grad, "no grads flow back to generation"
    # 4) Compute Policy Gradient loss: -E[logprob * reward]
    rewards = rewards.type_as(hypo_logprobs)
    rl_loss = -1.0 * (hypo_logprobs * rewards).sum()
    # 5) Compute Cross-entropy loss on the reference target, prefixed with
    # EOS (decoder start-of-sequence convention used here).
    eos = self.task.target_dictionary.eos()
    target_tokens = torch.cat(
        (
            torch.zeros(bsz, 1).fill_(eos).type_as(sample["target"]),
            sample["target"],
        ),
        dim=1,
    )
    # NOTE(review): only src_lengths[0] is forwarded — presumably valid only
    # when all sources in the batch share a length; confirm with callers.
    target_encoder_inputs = (
        encoder_input["src_tokens"],
        [encoder_input["src_lengths"][0].item()],
    )
    target_logprobs, target_encoder_out, _ = self.self_rescorer.score_tokens(
        target_encoder_inputs, target_tokens
    )
    nll_loss = -1.0 * target_logprobs.sum()
    # 6) Gather losses: weighted RL term + NLL.
    loss = self.args.rl_weight * rl_loss + nll_loss
    # Logging
    sample_size = (
        sample["target"].size(0) if self.args.sentence_avg else sample["ntokens"]
    )
    logging_output = {
        "loss": utils.item(loss.data) if reduce else loss.data,
        "nll_loss": utils.item(nll_loss.data) if reduce else nll_loss.data,
        "ntokens": sample["ntokens"],
        "nsentences": sample["target"].size(0),
        "sample_size": sample_size,
    }
    return loss, sample_size, logging_output
def _generate_score(models, args, task, dataset, optimize=True):
    """Translate `dataset` with the model ensemble and score against references.

    Side effects: may write hypotheses/probabilities to files named in `args`
    and prints BLEU summaries. Returns (scorer, num_sentences, gen_timer,
    translation_samples).
    """
    use_cuda = torch.cuda.is_available() and not args.cpu
    # Load ensemble
    if not args.quiet:
        print("| loading model(s) from {}".format(", ".join(args.path.split(":"))))
    # Optimize ensemble for generation
    if optimize:
        for model in models:
            model.make_generation_fast_(
                beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
                need_attn=True,
            )
    translator = build_sequence_generator(args, task, models)
    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)
    # Keep track of translations
    # Initialize with empty translations
    # and zero probs scores
    translated_sentences = [""] * len(dataset)
    translated_scores = [0.0] * len(dataset)
    collect_output_hypos = getattr(args, "output_hypos_binary_path", False)
    if collect_output_hypos:
        output_hypos_token_arrays = [None] * len(dataset)
    # Generate and compute BLEU score
    dst_dict = task.target_dictionary
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(), dst_dict.unk())
    itr = get_eval_itr(args, models, task, dataset)
    # Optional extra scorers: oracle BLEU (best hypo in beam) and
    # rescoring BLEU (top hypo after rescoring).
    oracle_scorer = None
    if args.report_oracle_bleu:
        oracle_scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(), dst_dict.unk())
    rescorer = None
    rescoring_bleu_scorer = None
    if args.enable_rescoring:
        rescorer = Rescorer(args)
        rescoring_bleu_scorer = bleu.Scorer(
            dst_dict.pad(), dst_dict.eos(), dst_dict.unk()
        )
    num_sentences = 0
    translation_samples = []
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=use_cuda,
            timer=gen_timer,
            # Multilingual setups reserve the first target token as a
            # language-id prefix.
            prefix_size=1 if pytorch_translate_data.is_multilingual(args) else 0,
        )
        for trans_info in _iter_translations(
            args, task, dataset, translations, align_dict, rescorer
        ):
            scorer.add(trans_info.target_tokens, trans_info.hypo_tokens)
            if oracle_scorer is not None:
                oracle_scorer.add(
                    trans_info.target_tokens, trans_info.best_hypo_tokens
                )
            if rescoring_bleu_scorer is not None:
                rescoring_bleu_scorer.add(
                    trans_info.target_tokens,
                    trans_info.hypo_tokens_after_rescoring,
                )
            # Index by sample_id so output order matches the dataset order
            # even though generation order may differ.
            translated_sentences[trans_info.sample_id] = trans_info.hypo_str
            translated_scores[trans_info.sample_id] = trans_info.hypo_score
            if collect_output_hypos:
                output_hypos_token_arrays[
                    trans_info.sample_id
                ] = trans_info.best_hypo_tokens
            translation_samples.append(
                collections.OrderedDict(
                    {
                        "sample_id": trans_info.sample_id.item(),
                        "src_str": trans_info.src_str,
                        "target_str": trans_info.target_str,
                        "hypo_str": trans_info.hypo_str,
                    }
                )
            )
            wps_meter.update(trans_info.src_tokens.size(0))
            t.log({"wps": round(wps_meter.avg)})
            num_sentences += 1
    # If applicable, save collected hypothesis tokens to binary output file
    if collect_output_hypos:
        output_dataset = pytorch_translate_data.InMemoryNumpyDataset()
        output_dataset.load_from_sequences(output_hypos_token_arrays)
        output_dataset.save(args.output_hypos_binary_path)
    # If applicable, save the translations to the output file
    # For eg. external evaluation
    if getattr(args, "translation_output_file", False):
        with open(args.translation_output_file, "w") as out_file:
            for hypo_str in translated_sentences:
                print(hypo_str, file=out_file)
    if getattr(args, "translation_probs_file", False):
        with open(args.translation_probs_file, "w") as out_file:
            for hypo_score in translated_scores:
                # Scores are log-probs; exponentiate to write probabilities.
                print(np.exp(hypo_score), file=out_file)
    if oracle_scorer is not None:
        print(
            f"| Oracle BLEU (best hypo in beam): {oracle_scorer.result_string()}"
        )
    if rescoring_bleu_scorer is not None:
        print(
            f"| Rescoring BLEU (top hypo in beam after rescoring):{rescoring_bleu_scorer.result_string()}"
        )
    return scorer, num_sentences, gen_timer, translation_samples