def test_to_ids(self):
    """Check the ids produced by the tokenize function for two snippets.

    Two cases are covered:
      * a 3-character snippet with ``split_length=5`` is expected to come
        back as bos, three character ids, eos;
      * a 9-character snippet with ``split_length=12`` is expected to come
        back as bos, nine character ids, eos, and one trailing pad id.
    """
    pad_id, _, bos_id, eos_id = shakespeare_dataset.get_special_tokens()

    # Case 1: snippet plus bos/eos exactly fills the requested length.
    tokenize_short = shakespeare_dataset._build_tokenize_fn(split_length=5)
    short_example = {'snippets': tf.constant('abc')}
    short_ids = self.evaluate(tokenize_short(short_example))
    expected_short = [bos_id, 64, 42, 21, eos_id]
    self.assertAllEqual(short_ids, expected_short)

    # Case 2: snippet plus bos/eos is one short of the requested length,
    # so a single pad id is expected at the end.
    tokenize_long = shakespeare_dataset._build_tokenize_fn(split_length=12)
    long_example = {'snippets': tf.constant('star wars')}
    long_ids = self.evaluate(tokenize_long(long_example))
    expected_long = [
        bos_id, 25, 5, 64, 46, 14, 26, 64, 46, 25, eos_id, pad_id,
    ]
    self.assertAllEqual(long_ids, expected_long)
def test_last_id_not_oov(self):
    """Check that a carriage return gets its own id instead of the oov id.

    The snippet is 'a', a carriage return, then '~'. The expected output
    is bos, 64, 86, oov, eos: the carriage return maps to id 86, while
    '~' maps to the oov id.

    NOTE(review): judging by the test name, 86 is presumably the last
    in-vocabulary id -- confirm against the vocabulary definition.
    """
    _, oov, bos, eos = shakespeare_dataset.get_special_tokens()
    to_tokens = shakespeare_dataset._build_tokenize_fn(split_length=5)
    # Materialize the result with self.evaluate, as test_to_ids does, so the
    # comparison is made on concrete values rather than on a raw tensor.
    tokens = self.evaluate(to_tokens({'snippets': tf.constant('a\r~')}))
    self.assertAllEqual(tokens, [bos, 64, 86, oov, eos])