예제 #1
0
    def assert_word_shuffle_matches_expected(
        self,
        x,
        x_len,
        max_shuffle_distance: int,
        vocab: Dictionary,
        expected_shufle_maps: List[Dict[int, int]],
        expect_eos_at_end: bool,
        bpe_end_marker=None,
    ):
        """
        Verify that word-shuffling x produces the expected permutation.

        The shuffle runs inside ``data_utils.numpy_seed(1234)``, presumably
        so that the noise (and therefore the expected maps) is reproducible.

        Args:
            x: Tensor of shape (T x B) = (sequence_length, batch_size)
            x_len: Tensor of length B = batch_size
            max_shuffle_distance: arg to pass to noising
            vocab: dictionary handed to noising.WordShuffle; its eos() index
                is used for the optional end-of-sentence check.
            expected_shufle_maps: List[mapping] where mapping is a
                Dict[old_index, new_index], mapping x's elements from their
                old positions in x to their new positions in x. The i-th map
                is checked against the i-th example (column) of x. (The
                parameter name keeps its historical spelling so existing
                keyword callers continue to work.)
            expect_eos_at_end: if True, check the output to make sure there is
                an EOS at the end.
            bpe_end_marker: str denoting the BPE end token. If this is not
                None, we set the BPE cont token to None in the noising
                classes; otherwise the cont token "@@" is used.
        """
        # Only one of the two BPE markers is ever passed as non-None.
        bpe_cont_marker = "@@" if bpe_end_marker is None else None

        with data_utils.numpy_seed(1234):
            word_shuffle = noising.WordShuffle(
                vocab, bpe_cont_marker=bpe_cont_marker, bpe_end_marker=bpe_end_marker
            )
            x_noised, l_noised = word_shuffle.noising(
                x, x_len, max_shuffle_distance=max_shuffle_distance
            )

        # For every example, we have a different expected shuffle map. We check
        # that each example is shuffled as expected according to each
        # corresponding shuffle map.
        for i, shuffle_map in enumerate(expected_shufle_maps):
            for k, v in shuffle_map.items():
                self.assertEqual(x[k][i], x_noised[v][i])

        # Shuffling should not affect the length of each example. zip() stops
        # at the shorter input, so compare the batch sizes first; otherwise a
        # truncated l_noised would slip through the per-example check.
        self.assertEqual(len(x_len), len(l_noised))
        for pre_shuffle_length, post_shuffle_length in zip(x_len, l_noised):
            self.assertEqual(pre_shuffle_length, post_shuffle_length)
        if expect_eos_at_end:
            self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
예제 #2
0
 def _random_shuffle(target_tokens, p, max_shuffle_distance):
     """Return target_tokens with some rows replaced by a word-shuffled copy.

     A shuffled version of every row is produced with noising.WordShuffle
     (run on CPU, on the transposed tokens). A uniform number in [0, 1) is
     then drawn per row (along dim 0), and rows whose draw is below ``p``
     take the shuffled tokens; all other rows keep the original tokens.

     Relies on ``self.tgt_dict`` from the enclosing scope.
     """
     shuffler = noising.WordShuffle(self.tgt_dict)

     # Per-row token count = row width minus the number of pad positions.
     is_pad = target_tokens.eq(self.tgt_dict.pad())
     pad_count = is_pad.long().sum(1)
     lengths = is_pad.size(1) - pad_count

     # The noiser receives the transposed tokens and the lengths on CPU;
     # the result is moved back to the input's device and transposed again.
     shuffled, _ = shuffler.noising(
         target_tokens.t().cpu(),
         lengths.cpu(),
         max_shuffle_distance,
     )
     shuffled = shuffled.to(target_tokens.device).t()

     # The clone/sum chain only serves to obtain a float tensor with one
     # entry per row, which uniform_ then overwrites in place.
     draws = target_tokens.clone().sum(dim=1, keepdim=True).float()
     use_shuffled = draws.uniform_(0, 1) < p

     # Boolean-mask blend: pick shuffled rows where the mask is set.
     return use_shuffled * shuffled + (~use_shuffled) * target_tokens
예제 #3
0
    def test_word_shuffle_with_eos_nonbpe(self):
        """Word shuffle on non-BPE data with EOS appended keeps EOS last.

        Runs the shuffle at max distance 0 and then 3, applying the matching
        shuffle assertion and an EOS-at-end assertion after each run.
        """
        vocab, x, x_len = self._get_test_data(append_eos=True, bpe=False)

        with data_utils.numpy_seed(1234):
            shuffler = noising.WordShuffle(vocab, bpe_cont_marker=None)

            # (max_shuffle_distance, assertion for that distance), in the
            # order the shuffles must be drawn under the seed.
            scenarios = (
                (0, self.assert_no_shuffle_with_0_distance),
                (3, self.assert_nonbpe_shuffle_with_distance_3),
            )
            for distance, check_shuffle in scenarios:
                noised, noised_len = shuffler.noising(x, x_len, distance)
                check_shuffle(
                    x=x, x_noised=noised, x_len=x_len, l_noised=noised_len
                )
                self.assert_eos_at_end(
                    x=noised, x_len=noised_len, eos=vocab.eos()
                )
예제 #4
0
    def test_word_shuffle(self):
        """Check WordShuffle output at max shuffle distance 0 and 3."""
        vocab, x, x_len = self._get_test_data()

        with data_utils.numpy_seed(1234):
            shuffler = noising.WordShuffle(vocab)

            # Distance 0: every token of every example stays where it was.
            same, same_len = shuffler.noising(x, x_len, 0)
            for col, length in enumerate(x_len):
                for row in range(length):
                    self.assertEqual(x[row][col], same[row][col])
            self.assertEqual(x_len[0], same_len[0])

            shuffled, shuffled_len = shuffler.noising(x, x_len, 3)
            # The first example is expected to come back unchanged.
            for row in range(x_len[0]):
                self.assertEqual(x[row][0], shuffled[row][0])
            # Expect the second example has the last three tokens shuffled
            # 6, 7, 8, 9 => 6, 8, 9, 7, where (8, 9) is a word
            old_to_new = {0: 0, 1: 3, 2: 1, 3: 2}
            for old_pos, new_pos in old_to_new.items():
                self.assertEqual(x[old_pos][1], shuffled[new_pos][1])
            # Lengths of both examples are preserved.
            for col in (0, 1):
                self.assertEqual(x_len[col], shuffled_len[col])
예제 #5
0
    def test_word_shuffle_without_eos(self):
        """Same result as word shuffle with eos except no EOS at end.

        Runs the shuffle at max distance 0 and then 3, applying the matching
        shuffle assertion and a no-EOS-at-end assertion after each run.
        """
        vocab, x, x_len = self._get_test_data(append_eos=False)

        with data_utils.numpy_seed(1234):
            shuffler = noising.WordShuffle(vocab)

            # (max_shuffle_distance, assertion for that distance), in the
            # order the shuffles must be drawn under the seed.
            scenarios = (
                (0, self.assert_no_shuffle_with_0_distance),
                (3, self.assert_word_shuffle_with_distance_3),
            )
            for distance, check_shuffle in scenarios:
                noised, noised_len = shuffler.noising(x, x_len, distance)
                check_shuffle(
                    x=x, x_noised=noised, x_len=x_len, l_noised=noised_len
                )
                self.assert_no_eos_at_end(
                    x=noised, x_len=noised_len, eos=vocab.eos()
                )