  def test_special_tokens_partial(self):
    # [UNK] token is required by fast wordpiece tokenizer.
    vocab_file = self._make_vocab_file(
        ["[PAD]", "[CLS]", "[SEP]", "[UNK]"])
    bert_tokenize = text_layers.FastWordpieceBertTokenizer(
        vocab_file=vocab_file, lower_case=True)
    # No mask_id, since [MASK] is not in the vocab.
    self.assertDictEqual(bert_tokenize.get_special_tokens_dict(),
                         dict(padding_id=0,
                              start_of_sequence_id=1,
                              end_of_segment_id=2,
                              vocab_size=4))
  def test_special_tokens_complete(self):
    vocab_file = self._make_vocab_file(
        ["foo", "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "xy"])
    bert_tokenize = text_layers.FastWordpieceBertTokenizer(
        vocab_file=vocab_file, lower_case=True)
    self.assertDictEqual(
        bert_tokenize.get_special_tokens_dict(),
        dict(padding_id=1,
             start_of_sequence_id=3,
             end_of_segment_id=4,
             mask_id=5,
             vocab_size=7))
  def test_cased(self):
    vocab_file = self._make_vocab_file(
        ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "d", "##ef", "abc", "ABC"])
    bert_tokenize = text_layers.FastWordpieceBertTokenizer(
        vocab_file=vocab_file, lower_case=False, tokenize_with_offsets=True)
    inputs = tf.constant(["abc def", "ABC DEF"])
    token_ids, start_offsets, limit_offsets = bert_tokenize(inputs)
    # With lower_case=False, "DEF" is not lowercased; it is absent from the
    # vocab and maps to [UNK] (id 1), while "def" splits into "d", "##ef".
    self.assertAllEqual(token_ids, tf.ragged.constant([[[6], [4, 5]],
                                                       [[7], [1]]]))
    self.assertAllEqual(start_offsets, tf.ragged.constant([[[0], [4, 5]],
                                                           [[0], [4]]]))
    self.assertAllEqual(limit_offsets, tf.ragged.constant([[[3], [5, 7]],
                                                           [[3], [7]]]))
    def input_fn():
      with tf.init_scope():
        self.assertFalse(tf.executing_eagerly())
      # Build a preprocessing Model.
      sentences = tf.keras.layers.Input(shape=[], dtype=tf.string)
      bert_tokenizer = text_layers.FastWordpieceBertTokenizer(
          vocab_file=vocab_file, lower_case=True)
      special_tokens_dict = bert_tokenizer.get_special_tokens_dict()
      for k, v in special_tokens_dict.items():
        self.assertIsInstance(v, int, "Unexpected type for {}".format(k))
      tokens = bert_tokenizer(sentences)
      packed_inputs = text_layers.BertPackInputs(
          4, special_tokens_dict=special_tokens_dict)(tokens)
      preprocessing = tf.keras.Model(sentences, packed_inputs)
      # Map the dataset.
      ds = tf.data.Dataset.from_tensors(
          (tf.constant(["abc", "DEF"]), tf.constant([0, 1])))
      ds = ds.map(lambda features, labels: (preprocessing(features), labels))
      return ds