Example #1
0
 def test_noise_span_to_unique_sentinel(self):
     """Each masked span collapses to one sentinel; unmasked tokens pass through."""
     vocab = test_utils.MockVocabulary({'foo': [10]}, vocab_size=1000)
     ids = tf.constant([10, 11, 12, 13, 14, 15])
     mask = tf.constant([True, True, False, False, True, False])
     result = self.evaluate(
         prep.noise_span_to_unique_sentinel(ids, mask, vocab))
     # Two noise spans -> two distinct sentinels (999, 998) in the output.
     self.assertAllEqual(result, [999, 12, 13, 998, 15])
Example #2
0
 def test_drop_nonnoise_tokens(self):
     """Only the tokens marked True in the noise mask survive."""
     vocab = test_utils.MockVocabulary({'foo': [10]}, vocab_size=1000)
     ids = tf.constant([10, 11, 12, 13, 14, 15])
     mask = tf.constant([True, True, False, False, True, False])
     result = self.evaluate(
         prep.drop_nonnoise_tokens(ids, mask, vocab))
     # Positions 0, 1, 4 of `ids` are masked -> kept.
     self.assertAllEqual(result, [10, 11, 14])
Example #3
0
 def test_noise_token_to_gathered_token(self):
     """Masked tokens are swapped for other gathered tokens (deterministic seed)."""
     # The fixed seed pins the random gather so the expected output is stable.
     tf.random.set_seed(55)
     vocab = test_utils.MockVocabulary({'foo': [10]}, vocab_size=1000)
     ids = tf.constant([10, 11, 12, 13, 14, 15])
     mask = tf.constant([True, True, False, False, True, False])
     result = self.evaluate(
         prep.noise_token_to_gathered_token(ids, mask, vocab))
     # Unmasked positions (12, 13, 15) are left untouched.
     self.assertAllEqual(result, [11, 11, 12, 13, 15, 15])
Example #4
0
 def test_noise_token_to_random_token_or_sentinel(self):
     """Masked tokens become either a random token or the sentinel (seeded)."""
     # Seed fixes which masked positions get a random token vs. the sentinel.
     tf.random.set_seed(55)
     vocab = test_utils.MockVocabulary({'foo': [10]}, vocab_size=1000)
     ids = tf.constant(list(range(10)))
     mask = tf.constant(
         [True, True, False, False, True, False, True, True, True, True])
     result = self.evaluate(
         prep.noise_token_to_random_token_or_sentinel(
             ids, mask, vocab, random_prob=0.2))
     # With random_prob=0.2 and this seed, only position 0 draws a random
     # token (436); every other masked position becomes the sentinel 999.
     self.assertAllEqual(result, [436, 999, 2, 3, 999, 5, 999, 999, 999, 999])
Example #5
0
    def test_tokenize(self):
        """tokenize() maps features to ids; plaintext copies are optional."""
        dataset = tf.data.Dataset.from_tensors({
            'prefix': 'This is',
            'suffix': 'a test.'
        })
        features = {
            'prefix': Feature(test_utils.MockVocabulary({'This is': [0, 1]})),
            'suffix': Feature(test_utils.MockVocabulary({'a test.': [2, 3]})),
        }

        # Default behavior: original text is kept under '<name>_plaintext'.
        assert_dataset(
            prep.tokenize(dataset, output_features=features), {
                'prefix': [0, 1],
                'prefix_plaintext': 'This is',
                'suffix': [2, 3],
                'suffix_plaintext': 'a test.'
            })
        # With copy_plaintext=False, only the tokenized ids remain.
        assert_dataset(
            prep.tokenize(dataset,
                          output_features=features,
                          copy_plaintext=False), {
                              'prefix': [0, 1],
                              'suffix': [2, 3]
                          })