  def test_saving(self):
    sentencepiece_tokenizer = text_layers.SentencepieceTokenizer(
        model_file_path=self._spm_path, lower_case=True, nbest_size=0)
    inputs = tf.keras.layers.Input([], dtype=tf.string)
    outputs = sentencepiece_tokenizer(inputs)
    model = tf.keras.Model(inputs, outputs)
    export_path = tempfile.mkdtemp(dir=self.get_temp_dir())
    model.save(export_path, signatures={})
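    # Saving with signatures={} skips tracing any serving signatures; the save
    # above mainly exercises exporting the tokenizer's SentencePiece model
    # resource as part of a SavedModel.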
  def test_special_tokens(self):
    sentencepiece_tokenizer = text_layers.SentencepieceTokenizer(
        model_file_path=self._spm_path, lower_case=True, nbest_size=0)
    self.assertDictEqual(
        sentencepiece_tokenizer.get_special_tokens_dict(),
        dict(padding_id=0,
             start_of_sequence_id=2,
             end_of_segment_id=3,
             mask_id=4,
             vocab_size=16))
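    # The expected ids reflect the SentencePiece model at self._spm_path; the
    # same dict is fed to BertPackInputs in input_fn() further down.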
  def test_fail_on_tokenize_with_offsets_and_strip_diacritics(self):
    # Raise an error in init().
    with self.assertRaises(ValueError):
      text_layers.SentencepieceTokenizer(
          model_file_path=self._spm_path,
          tokenize_with_offsets=True,
          lower_case=True,
          nbest_size=0,
          strip_diacritics=True)

    sentencepiece_tokenizer = text_layers.SentencepieceTokenizer(
        model_file_path=self._spm_path,
        lower_case=True,
        nbest_size=0,
        strip_diacritics=True)
    sentencepiece_tokenizer.tokenize_with_offsets = True

    # Raise an error in call():
    inputs = tf.constant(["abc def", "ABC DEF d", "Äffin"])
    with self.assertRaises(ValueError):
      sentencepiece_tokenizer(inputs)
  def test_strip_diacritics(self):
    sentencepiece_tokenizer = text_layers.SentencepieceTokenizer(
        model_file_path=self._spm_path,
        lower_case=True,
        nbest_size=0,
        strip_diacritics=True)
    inputs = tf.constant(["a b c d e", "ă ḅ č ḓ é"])
    token_ids = sentencepiece_tokenizer(inputs)
    self.assertAllEqual(
        token_ids,
        tf.ragged.constant([[7, 9, 10, 11, 13], [7, 9, 10, 11, 13]]))
  def test_serialize_deserialize(self):
    self.skipTest("b/170480226")
    sentencepiece_tokenizer = text_layers.SentencepieceTokenizer(
        model_file_path=self._spm_path,
        lower_case=False,
        nbest_size=0,
        tokenize_with_offsets=False,
        name="sentencepiece_tokenizer_layer")
    config = sentencepiece_tokenizer.get_config()
    new_tokenizer = text_layers.SentencepieceTokenizer.from_config(config)
    self.assertEqual(config, new_tokenizer.get_config())
    inputs = tf.constant(["abc def", "ABC DEF d"])
    token_ids = sentencepiece_tokenizer(inputs)
    token_ids_2 = new_tokenizer(inputs)
    self.assertAllEqual(token_ids, token_ids_2)
  def test_uncased(self):
    sentencepiece_tokenizer = text_layers.SentencepieceTokenizer(
        model_file_path=self._spm_path, lower_case=True, nbest_size=0)
    inputs = tf.constant(["abc def", "ABC DEF d"])
    token_ids = sentencepiece_tokenizer(inputs)
    self.assertAllEqual(token_ids, tf.ragged.constant([[8, 12], [8, 12, 11]]))
    sentencepiece_tokenizer.tokenize_with_offsets = True
    token_ids_2, start_offsets, limit_offsets = sentencepiece_tokenizer(
        inputs)
    self.assertAllEqual(token_ids, token_ids_2)
    self.assertAllEqual(start_offsets, tf.ragged.constant([[0, 3], [0, 3, 7]]))
    self.assertAllEqual(limit_offsets, tf.ragged.constant([[3, 7], [3, 7, 9]]))
    self.assertEqual(sentencepiece_tokenizer.vocab_size.numpy(), 16)
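    # The offsets are byte positions into each input string: e.g. for
    # "abc def", the first token spans bytes [0, 3) ("abc") and the second
    # spans [3, 7) (" def", with the leading space folded into the piece).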
    def input_fn():
      with tf.init_scope():
        self.assertFalse(tf.executing_eagerly())
      # Build a preprocessing Model.
      sentences = tf.keras.layers.Input(shape=[], dtype=tf.string)
      sentencepiece_tokenizer = text_layers.SentencepieceTokenizer(
          model_file_path=self._spm_path, lower_case=True, nbest_size=0)
      special_tokens_dict = sentencepiece_tokenizer.get_special_tokens_dict()
      for k, v in special_tokens_dict.items():
        self.assertIsInstance(v, int, "Unexpected type for {}".format(k))
      tokens = sentencepiece_tokenizer(sentences)
      packed_inputs = text_layers.BertPackInputs(
          4, special_tokens_dict=special_tokens_dict)(tokens)
      preprocessing = tf.keras.Model(sentences, packed_inputs)
      # Map the dataset.
      ds = tf.data.Dataset.from_tensors(
          (tf.constant(["abc", "DEF"]), tf.constant([0, 1])))
      ds = ds.map(lambda features, labels: (preprocessing(features), labels))
      return ds
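    # input_fn() asserts that even its init scope is non-eager, so it is meant
    # to be traced inside a graph-building input pipeline (e.g. passed as a
    # tf.estimator input_fn) rather than called directly in eager mode.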