def test_noise_span_to_unique_sentinel(self):
  vocabulary = test_utils.MockVocabulary({'foo': [10]}, vocab_size=1000)
  tokens = tf.constant([10, 11, 12, 13, 14, 15])
  noise_mask = tf.constant([True, True, False, False, True, False])
  expected_output = [999, 12, 13, 998, 15]
  output = self.evaluate(
      prep.noise_span_to_unique_sentinel(tokens, noise_mask, vocabulary))
  self.assertAllEqual(output, expected_output)
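
# Illustrative sketch (an assumption, not necessarily the library's exact
# implementation) of the behaviour checked above: each contiguous span of
# noise tokens collapses to a single, span-unique sentinel id counting down
# from vocab_size - 1, while non-noise tokens pass through unchanged. With
# vocab_size=1000 and noise spans at positions {0, 1} and {4}, this yields
# the expected [999, 12, 13, 998, 15].
def _noise_span_to_unique_sentinel_sketch(tokens, noise_mask, vocab_size):
  prev_token_is_noise = tf.pad(noise_mask[:-1], [[1, 0]])
  first_noise_tokens = tf.logical_and(noise_mask,
                                      tf.logical_not(prev_token_is_noise))
  subsequent_noise_tokens = tf.logical_and(noise_mask, prev_token_is_noise)
  # Span i (1-indexed) is mapped to sentinel id vocab_size - i.
  sentinels = vocab_size - tf.cumsum(tf.cast(first_noise_tokens, tokens.dtype))
  tokens = tf.where(first_noise_tokens, sentinels, tokens)
  return tf.boolean_mask(tokens, tf.logical_not(subsequent_noise_tokens))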
def test_drop_nonnoise_tokens(self):
  vocabulary = test_utils.MockVocabulary({'foo': [10]}, vocab_size=1000)
  tokens = tf.constant([10, 11, 12, 13, 14, 15])
  noise_mask = tf.constant([True, True, False, False, True, False])
  expected_output = [10, 11, 14]
  output = self.evaluate(
      prep.drop_nonnoise_tokens(tokens, noise_mask, vocabulary))
  self.assertAllEqual(output, expected_output)
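
# Illustrative sketch (an assumption, not the library code): dropping the
# non-noise tokens reduces to a boolean mask over the noise positions, which
# is consistent with the expected [10, 11, 14] above.
def _drop_nonnoise_tokens_sketch(tokens, noise_mask):
  return tf.boolean_mask(tokens, noise_mask)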
def test_noise_token_to_gathered_token(self):
  tf.random.set_seed(55)
  vocabulary = test_utils.MockVocabulary({'foo': [10]}, vocab_size=1000)
  tokens = tf.constant([10, 11, 12, 13, 14, 15])
  noise_mask = tf.constant([True, True, False, False, True, False])
  expected_output = [11, 11, 12, 13, 15, 15]
  output = self.evaluate(
      prep.noise_token_to_gathered_token(tokens, noise_mask, vocabulary))
  self.assertAllEqual(output, expected_output)
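
# Illustrative sketch (an assumption, not the library code) of the
# gathered-token replacement checked above: each noise position is
# overwritten with a token gathered from a random position in the same
# sequence. The concrete values depend on TF's stateful RNG, so the seeded
# expectation above is specific to the real implementation.
def _noise_token_to_gathered_token_sketch(tokens, noise_mask):
  length = tf.size(tokens)
  indices = tf.random.uniform([length], maxval=length, dtype=tf.int32)
  return tf.where(noise_mask, tf.gather(tokens, indices), tokens)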
def test_noise_token_to_random_token_or_sentinel(self):
  tf.random.set_seed(55)
  vocabulary = test_utils.MockVocabulary({'foo': [10]}, vocab_size=1000)
  tokens = tf.constant(list(range(10)))
  noise_mask = tf.constant(
      [True, True, False, False, True, False, True, True, True, True])
  expected_output = [436, 999, 2, 3, 999, 5, 999, 999, 999, 999]
  output = self.evaluate(
      prep.noise_token_to_random_token_or_sentinel(
          tokens, noise_mask, vocabulary, random_prob=0.2))
  self.assertAllEqual(output, expected_output)
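
# Illustrative sketch (an assumption, not the library code) of the behaviour
# checked above: each noise token is replaced either by a uniformly random
# vocabulary id (with probability random_prob) or by the sentinel id
# vocab_size - 1, consistent with the 436 and 999 values in the expected
# output. The exact draws depend on TF's stateful RNG, so this sketch is not
# expected to reproduce the seeded output literally.
def _noise_token_to_random_token_or_sentinel_sketch(
    tokens, noise_mask, vocab_size, random_prob=0.1):
  use_random = tf.random.uniform(tf.shape(tokens)) < random_prob
  random_tokens = tf.random.uniform(
      tf.shape(tokens), maxval=vocab_size, dtype=tokens.dtype)
  sentinels = tf.fill(tf.shape(tokens), vocab_size - 1)
  replacement = tf.where(use_random, random_tokens, sentinels)
  return tf.where(noise_mask, replacement, tokens)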
def test_tokenize(self):
  og_dataset = tf.data.Dataset.from_tensors({
      'prefix': 'This is',
      'suffix': 'a test.'
  })
  output_features = {
      'prefix': Feature(test_utils.MockVocabulary({'This is': [0, 1]})),
      'suffix': Feature(test_utils.MockVocabulary({'a test.': [2, 3]})),
  }
  assert_dataset(
      prep.tokenize(og_dataset, output_features=output_features),
      {
          'prefix': [0, 1],
          'prefix_plaintext': 'This is',
          'suffix': [2, 3],
          'suffix_plaintext': 'a test.'
      })
  assert_dataset(
      prep.tokenize(
          og_dataset, output_features=output_features, copy_plaintext=False),
      {
          'prefix': [0, 1],
          'suffix': [2, 3]
      })
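
# Illustrative sketch (an assumption, not the library code) of the tokenize
# behaviour exercised above: each configured feature is encoded with its
# vocabulary's encode_tf, and the original string is kept under
# '<key>_plaintext' unless copy_plaintext=False.
def _tokenize_sketch(dataset, output_features, copy_plaintext=True):
  def _encode(features):
    ret = {}
    for key, value in features.items():
      if key in output_features:
        ret[key] = output_features[key].vocabulary.encode_tf(value)
        if copy_plaintext:
          ret[key + '_plaintext'] = value
      else:
        ret[key] = value
    return ret
  return dataset.map(_encode)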