def test_with_normalization(self):
    generator = SequenceGenerator(self.tgt_dict, beam_size=2)
    hypos = generator.generate([self.model], self.sample)
    eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2
    # sentence 1, beam 1
    self.assertHypoTokens(hypos[0][0], [w1, eos])
    self.assertHypoScore(hypos[0][0], [0.9, 1.0])
    # sentence 1, beam 2
    self.assertHypoTokens(hypos[0][1], [w2, w1, w2, eos])
    self.assertHypoScore(hypos[0][1], [0.1, 0.9, 0.9, 1.0])
    # sentence 2, beam 1
    self.assertHypoTokens(hypos[1][0], [w1, w2, w1, eos])
    self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.4, 1.0])
    # sentence 2, beam 2
    self.assertHypoTokens(hypos[1][1], [w1, w2, eos])
    self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.6])
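
# NOTE: assertHypoTokens / assertHypoScore come from the test fixture, which
# this excerpt omits. Below is a minimal reference sketch of the scoring rule
# being checked (an assumption about the helper, not fairseq's exact code):
# the generator works in log-space, so a hypothesis score is the sum of the
# per-step log-probs, divided by length ** lenpen when scores are normalized.
import math

def expected_hypo_score(pos_probs, normalized=True, lenpen=1.0):
    pos_scores = [math.log(p) for p in pos_probs]
    score = sum(pos_scores)
    if normalized:
        score /= len(pos_scores) ** lenpen
    return score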

def test_no_stop_early(self):
    generator = SequenceGenerator(self.tgt_dict, stop_early=False, beam_size=2)
    hypos = generator.generate([self.model], self.sample)
    eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2
    # sentence 1, beam 1
    self.assertHypoTokens(hypos[0][0], [w1, eos])
    self.assertHypoScore(hypos[0][0], [0.9, 1.0])
    # sentence 1, beam 2
    self.assertHypoTokens(hypos[0][1], [w2, w1, w2, eos])
    self.assertHypoScore(hypos[0][1], [0.1, 0.9, 0.9, 1.0])
    # sentence 2, beam 1
    self.assertHypoTokens(hypos[1][0], [w2, w2, w2, w2, eos])
    self.assertHypoScore(hypos[1][0], [0.3, 0.9, 0.99, 0.4, 1.0])
    # sentence 2, beam 2
    self.assertHypoTokens(hypos[1][1], [w1, w2, w1, eos])
    self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.4, 1.0])

def test_with_lenpen_favoring_long_hypos(self):
    lenpen = 5.0
    generator = SequenceGenerator(self.tgt_dict, beam_size=2, len_penalty=lenpen)
    hypos = generator.generate([self.model], self.sample)
    eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2
    # sentence 1, beam 1
    self.assertHypoTokens(hypos[0][0], [w2, w1, w2, eos])
    self.assertHypoScore(hypos[0][0], [0.1, 0.9, 0.9, 1.0], lenpen=lenpen)
    # sentence 1, beam 2
    self.assertHypoTokens(hypos[0][1], [w1, eos])
    self.assertHypoScore(hypos[0][1], [0.9, 1.0], lenpen=lenpen)
    # sentence 2, beam 1
    self.assertHypoTokens(hypos[1][0], [w1, w2, w1, eos])
    self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.4, 1.0], lenpen=lenpen)
    # sentence 2, beam 2
    self.assertHypoTokens(hypos[1][1], [w1, w2, eos])
    self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.6], lenpen=lenpen)
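
# Worked check of the ordering above, using the expected_hypo_score sketch
# defined earlier (pure arithmetic, safe to run): with lenpen=5.0 the score is
# divided by len ** 5, which strongly rewards longer hypotheses and flips
# sentence 1's beam order relative to the plain normalized test.
_long = expected_hypo_score([0.1, 0.9, 0.9, 1.0], lenpen=5.0)   # ~ -0.00245
_short = expected_hypo_score([0.9, 1.0], lenpen=5.0)            # ~ -0.00329
assert _long > _short  # the longer hypothesis ranks first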

def test_without_normalization(self):
    # Sentence 1: unchanged from the normalized case
    # Sentence 2: beams swap order
    generator = SequenceGenerator(self.tgt_dict, beam_size=2, normalize_scores=False)
    hypos = generator.generate([self.model], self.sample)
    eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2
    # sentence 1, beam 1
    self.assertHypoTokens(hypos[0][0], [w1, eos])
    self.assertHypoScore(hypos[0][0], [0.9, 1.0], normalized=False)
    # sentence 1, beam 2
    self.assertHypoTokens(hypos[0][1], [w2, w1, w2, eos])
    self.assertHypoScore(hypos[0][1], [0.1, 0.9, 0.9, 1.0], normalized=False)
    # sentence 2, beam 1
    self.assertHypoTokens(hypos[1][0], [w1, w2, eos])
    self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.6], normalized=False)
    # sentence 2, beam 2
    self.assertHypoTokens(hypos[1][1], [w1, w2, w1, eos])
    self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.4, 1.0], normalized=False)
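
# Why the sentence-2 beams swap when normalization is off: without dividing by
# length, the shorter hypothesis keeps its larger raw log-prob (again checked
# with the expected_hypo_score sketch from above):
_short = expected_hypo_score([0.7, 0.4, 0.6], normalized=False)       # ~ -1.784
_long = expected_hypo_score([0.7, 0.4, 0.4, 1.0], normalized=False)   # ~ -2.189
assert _short > _long  # unnormalized: the short hypothesis ranks first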

def test_diverse_beam_search(self):
    generator = SequenceGenerator(
        self.tgt_dict, beam_size=2, diverse_beam_groups=2, diverse_beam_strength=0.,
    )
    sample = {'net_input': {'src_tokens': self.src_tokens, 'src_lengths': self.src_lengths}}
    hypos = generator.generate([self.model], sample)
    eos, w1, w2 = self.eos, self.w1, self.w2
    # sentence 1, beam 1
    self.assertHypoTokens(hypos[0][0], [w1, w1, eos])
    self.assertHypoScore(hypos[0][0], [0.9, 0.6, 1.0])
    # sentence 1, beam 2
    self.assertHypoTokens(hypos[0][1], [w1, w1, eos])
    self.assertHypoScore(hypos[0][1], [0.9, 0.6, 1.0])
    # sentence 2, beam 1
    self.assertHypoTokens(hypos[1][0], [w1, w2, eos])
    self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.9])
    # sentence 2, beam 2
    self.assertHypoTokens(hypos[1][1], [w1, w2, eos])
    self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.9])
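
# With diverse_beam_strength=0.0 the two beam groups apply no diversity
# penalty to each other, which is why both beams above may return identical
# hypotheses. The call pattern all of these tests exercise looks like the
# sketch below; the arguments stand in for the fixture attributes
# (self.model, self.tgt_dict, ...) not shown in this excerpt:
def example_generate(model, tgt_dict, src_tokens, src_lengths):
    generator = SequenceGenerator(tgt_dict, beam_size=2)
    sample = {'net_input': {'src_tokens': src_tokens, 'src_lengths': src_lengths}}
    hypos = generator.generate([model], sample)
    best = hypos[0][0]  # hypotheses are indexed as hypos[sentence][beam]
    return best['tokens'], best['score']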
Example #6
def build_generator(self, args):
    if args.score_reference:
        from fairseq.sequence_scorer import SequenceScorer
        return SequenceScorer(self.target_dictionary)
    else:
        from fairseq.sequence_generator import SequenceGenerator
        return SequenceGenerator(
            self.target_dictionary,
            beam_size=args.beam,
            max_len_a=args.max_len_a,
            max_len_b=args.max_len_b,
            min_len=args.min_len,
            stop_early=(not args.no_early_stop),
            normalize_scores=(not args.unnormalized),
            len_penalty=args.lenpen,
            unk_penalty=args.unkpen,
            sampling=args.sampling,
            sampling_topk=args.sampling_topk,
            sampling_temperature=args.sampling_temperature,
            diverse_beam_groups=args.diverse_beam_groups,
            diverse_beam_strength=args.diverse_beam_strength,
            match_source_len=args.match_source_len,
            no_repeat_ngram_size=args.no_repeat_ngram_size,
        )
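
# Hypothetical driver for build_generator above. This Namespace is an
# assumption standing in for fairseq's parsed CLI args; the values mirror
# common generation defaults but are not taken from this excerpt.
from argparse import Namespace

args = Namespace(
    score_reference=False, beam=5, max_len_a=0, max_len_b=200, min_len=1,
    no_early_stop=False, unnormalized=False, lenpen=1.0, unkpen=0.0,
    sampling=False, sampling_topk=-1, sampling_temperature=1.0,
    diverse_beam_groups=-1, diverse_beam_strength=0.5,
    match_source_len=False, no_repeat_ngram_size=0,
)
# generator = task.build_generator(args)  # `task` is whatever object defines build_generator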

def _backtranslation_dataset_helper(
    self,
    remove_eos_from_input_src,
    remove_eos_from_output_src,
):
    tgt_dataset = LanguagePairDataset(
        src=self.tgt_dataset,
        src_sizes=self.tgt_dataset.sizes,
        src_dict=self.tgt_dict,
        tgt=None,
        tgt_sizes=None,
        tgt_dict=None,
    )

    generator = SequenceGenerator(
        tgt_dict=self.tgt_dict,
        max_len_a=0,
        max_len_b=200,
        beam_size=2,
        unk_penalty=0,
        sampling=False,
    )

    backtranslation_dataset = BacktranslationDataset(
        tgt_dataset=TransformEosDataset(
            dataset=tgt_dataset,
            eos=self.tgt_dict.eos(),
            # remove eos from the input src
            remove_eos_from_src=remove_eos_from_input_src,
        ),
        backtranslation_fn=(lambda net_input: generator.generate(
            [self.model], {'net_input': net_input})),
        output_collater=TransformEosDataset(
            dataset=tgt_dataset,
            eos=self.tgt_dict.eos(),
            # if we remove eos from the input src, then we need to add it
            # back to the output tgt
            append_eos_to_tgt=remove_eos_from_input_src,
            remove_eos_from_src=remove_eos_from_output_src,
        ).collater,
        cuda=self.cuda,
    )
    dataloader = torch.utils.data.DataLoader(
        backtranslation_dataset,
        batch_size=2,
        collate_fn=backtranslation_dataset.collater,
    )
    backtranslation_batch_result = next(iter(dataloader))

    eos, pad, w1, w2 = self.tgt_dict.eos(), self.tgt_dict.pad(), self.w1, self.w2

    # Note that we sort by src_lengths and add left padding, so actually
    # ids will look like: [1, 0]
    expected_src = torch.LongTensor([[w1, w2, w1, eos],
                                     [pad, pad, w1, eos]])
    if remove_eos_from_output_src:
        expected_src = expected_src[:, :-1]
    expected_tgt = torch.LongTensor([[w1, w2, eos], [w1, w2, eos]])
    generated_src = backtranslation_batch_result["net_input"]["src_tokens"]
    tgt_tokens = backtranslation_batch_result["target"]

    self.assertTensorEqual(expected_src, generated_src)
    self.assertTensorEqual(expected_tgt, tgt_tokens)
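
# Hypothetical driver for the helper above (the actual test methods are not
# part of this excerpt); each flag combination exercises one eos-handling path:
def test_backtranslation_dataset(self):
    for remove_in, remove_out in [(False, False), (False, True), (True, False)]:
        self._backtranslation_dataset_helper(
            remove_eos_from_input_src=remove_in,
            remove_eos_from_output_src=remove_out,
        )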