    def test_forward_backward_explicit_vocab(self):
        vocab_data = [42, 1138, 725, 1729]
        input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
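        # A round trip: the forward layer maps tokens to indices and the
        # inverted layer maps them back; the out-of-vocabulary value 203 comes
        # back as the default oov_token, -1.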
        expected_output = np.array(
            [[42, 1138, 725, 1729], [1729, 725, 42, -1]])

        input_data = keras.Input(shape=(None, ), dtype=tf.int64)
        layer = integer_lookup.IntegerLookup(vocabulary=vocab_data)
        inverse_layer = integer_lookup.IntegerLookup(vocabulary=vocab_data,
                                                     invert=True)
        int_data = layer(input_data)
        inverse_data = inverse_layer(int_data)
        model = keras.Model(inputs=input_data, outputs=inverse_data)
        output_dataset = model.predict(input_array)
        self.assertAllEqual(expected_output, output_dataset)
    def test_non_unique_vocab_from_file_fails(self):
        vocab_list = [42, 1138, 725, 1729, 42]
        vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list)
        with self.assertRaisesRegex(
                tf.errors.FailedPreconditionError,
                ".*HashTable has different value for same key.*42.*"):
            _ = integer_lookup.IntegerLookup(vocabulary=vocab_path)
    def test_layer_with_list_input(self):
        vocab = [12, 36, 1138, 42]
        data = [[12, 1138, 42], [42, 1000, 36]]  # Note OOV tokens
        layer = integer_lookup.IntegerLookup(vocabulary=vocab)
        output = layer(data)
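        # Index 0 is the single OOV bucket, so the out-of-vocabulary value 1000
        # maps to 0 and the vocab tokens map to indices 1-4 in vocabulary order.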
        expected_output = np.array([[1, 3, 4], [4, 0, 2]])
        self.assertEqual(output.numpy().tolist(), expected_output.tolist())
    def test_get_vocab_returns_int(self):
        vocab_data = [42, 1138, 725, 1729]
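        # get_vocabulary() prepends the OOV token (-1) for the single default
        # OOV index, followed by the vocabulary in the order it was given.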
        expected_vocab = [-1, 42, 1138, 725, 1729]
        layer = integer_lookup.IntegerLookup(vocabulary=vocab_data)
        layer_vocab = layer.get_vocabulary()
        self.assertAllEqual(expected_vocab, layer_vocab)
        self.assertIsInstance(layer_vocab[0], np.int64)
    def test_too_long_vocab_fails_in_single_setting(self):
        vocab_data = [42, 1138, 725, 1729]

        layer = integer_lookup.IntegerLookup(max_tokens=4, num_oov_indices=1)
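        # max_tokens counts the OOV index too, so only three real tokens fit;
        # a four-token vocabulary therefore overflows the table.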
        with self.assertRaisesRegex(
                ValueError, "vocabulary larger than the maximum vocab.*"):
            layer.set_vocabulary(vocab_data)
    def test_sparse_int_input_multi_bucket(self):
        vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
        input_array = tf.SparseTensor(
            indices=[[0, 0], [1, 2]],
            values=np.array([13, 133], dtype=np.int64),
            dense_shape=[3, 4],
        )

        expected_indices = [[0, 0], [1, 2]]
        expected_values = [6, 2]
        expected_dense_shape = [3, 4]
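        # Expected layout: index 0 is the mask (token 0), 1-2 are the two OOV
        # hash buckets, and the vocabulary occupies 3-6; the in-vocab value 13
        # maps to 6 while the OOV value 133 hashes into bucket index 2.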

        input_data = keras.Input(shape=(None, ), dtype=tf.int64, sparse=True)
        layer = integer_lookup.IntegerLookup(
            max_tokens=None,
            dtype=tf.int64,
            num_oov_indices=2,
            mask_token=0,
            oov_token=-1,
        )
        layer.set_vocabulary(vocab_data)
        int_data = layer(input_data)
        model = keras.Model(inputs=input_data, outputs=int_data)
        output_data = model.predict(input_array, steps=1)
        self.assertAllEqual(expected_indices, output_data.indices)
        self.assertAllEqual(expected_values, output_data.values)
        self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
    def test_vocabulary_persistence_across_saving(self):
        vocab_data = [42, 1138, 725, 1729]
        input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
        expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]

        # Build and validate a golden model.
        input_data = keras.Input(shape=(None, ), dtype=tf.int64)
        layer = integer_lookup.IntegerLookup(max_tokens=None,
                                             num_oov_indices=1)
        layer.set_vocabulary(vocab_data)
        int_data = layer(input_data)
        model = keras.Model(inputs=input_data, outputs=int_data)
        output_dataset = model.predict(input_array)
        self.assertAllEqual(output_dataset, expected_output)

        # Save the model to disk.
        output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
        model.save(output_path, save_format="tf")

        # Delete the session and graph to ensure that the loaded model is generated
        # from scratch.
        # TODO(b/149526183): Can't clear session when TF2 is disabled.
        if tf.__internal__.tf2.enabled():
            keras.backend.clear_session()

        loaded_model = keras.models.load_model(
            output_path,
            custom_objects={"IntegerLookup": integer_lookup.IntegerLookup})

        # Ensure that the loaded model is unique (so that the save/load is real)
        self.assertIsNot(model, loaded_model)

        # Validate correctness of the new model.
        new_output_dataset = loaded_model.predict(input_array)
        self.assertAllEqual(new_output_dataset, expected_output)
    def test_ragged_adapt(self):
        vocab_data = tf.ragged.constant([[203], [1729, 203]])
        vocab_dataset = tf.data.Dataset.from_tensors(vocab_data)

        layer = integer_lookup.IntegerLookup()
        layer.adapt(vocab_dataset)
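        # adapt() orders tokens by descending frequency (203 appears twice,
        # 1729 once) and prepends the OOV token -1.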
        expected_vocabulary = [-1, 203, 1729]
        self.assertAllEqual(expected_vocabulary, layer.get_vocabulary())
    def test_sparse_output(self):
        vocab_data = [2, 3, 4, 5]

        input_data = keras.Input(shape=(None, ), dtype=tf.int64)
        layer = integer_lookup.IntegerLookup(vocabulary=vocab_data,
                                             output_mode="multi_hot",
                                             sparse=True)
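        # With output_mode="multi_hot" and sparse=True the output is a sparse
        # multi-hot vector with one slot per index: OOV at slot 0 and tokens
        # 2-5 at slots 1-4. For example, an input row [2, 5, 5] would encode
        # (densely) as [0, 1, 0, 0, 1].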
        res = layer(input_data)
        self.assertEqual(res.__class__.__name__, "SparseKerasTensor")
    def test_sparse_adapt(self):
        vocab_data = tf.SparseTensor(indices=[[0, 0], [0, 1], [1, 2]],
                                     values=[203, 1729, 203],
                                     dense_shape=[3, 4])
        vocab_dataset = tf.data.Dataset.from_tensors(vocab_data)

        layer = integer_lookup.IntegerLookup()
        layer.adapt(vocab_dataset)
        expected_vocabulary = [-1, 203, 1729]
        self.assertAllEqual(expected_vocabulary, layer.get_vocabulary())
    def test_tensor_vocab(self):
        vocab_data = [-1, 42, 1138, 725, 1729]
        vocab_tensor = tf.constant(vocab_data, tf.int64)
        layer = integer_lookup.IntegerLookup(vocabulary=vocab_tensor)
        returned_vocab = layer.get_vocabulary()
        self.assertAllEqual(vocab_data, returned_vocab)
        self.assertAllEqual(layer.vocabulary_size(), 5)
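        # Setting a tensor-backed vocabulary has to happen eagerly so the
        # values can be read to build the lookup table; doing it inside a
        # tf.function raises.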
        fn = tf.function(lambda: layer.set_vocabulary(vocab_tensor))
        with self.assertRaisesRegex(RuntimeError,
                                    "Cannot set a tensor vocabulary"):
            fn()
    def test_int_output_explicit_vocab(self):
        vocab_data = [42, 1138, 725, 1729]
        input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
        expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]

        input_data = keras.Input(shape=(None, ), dtype=tf.int64)
        layer = integer_lookup.IntegerLookup(vocabulary=vocab_data)
        int_data = layer(input_data)
        model = keras.Model(inputs=input_data, outputs=int_data)
        output_dataset = model.predict(input_array)
        self.assertAllEqual(expected_output, output_dataset)
    def test_int_output_with_mask(self):
        vocab_data = [42, 1138, 725, 1729]
        input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
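        # With mask_token=0, index 0 is reserved for the mask and index 1 for
        # OOV, so the vocabulary starts at index 2 and the OOV value 203 maps
        # to 1.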
        expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]

        input_data = keras.Input(shape=(None, ), dtype=tf.int64)
        layer = integer_lookup.IntegerLookup(max_tokens=None, mask_token=0)
        layer.set_vocabulary(vocab_data)
        int_data = layer(input_data)
        model = keras.Model(inputs=input_data, outputs=int_data)
        output_dataset = model.predict(input_array)
        self.assertAllEqual(expected_output, output_dataset)
    def test_count_output(self):
        vocab_data = [2, 3, 4, 5]
        input_array = np.array([[2, 2, 3, 4], [0, 1, 5, 6]])
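        # "count" mode returns, per row, how many times each index occurs:
        # slot 0 accumulates OOV hits (0, 1 and 6 in the second row) and slots
        # 1-4 count the vocabulary tokens 2-5.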
        expected_output = [[0, 2, 1, 1, 0], [3, 0, 0, 0, 1]]

        input_data = keras.Input(shape=(None, ), dtype=tf.int64)
        layer = integer_lookup.IntegerLookup(vocabulary=vocab_data,
                                             output_mode="count")
        res = layer(input_data)
        model = keras.Model(inputs=input_data, outputs=res)
        output_data = model.predict(input_array)
        self.assertAllEqual(expected_output, output_data)
    def test_inverse_output(self):
        vocab_data = [0, -1, 42, 1138, 725, 1729]
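        # The supplied vocabulary already includes the mask (0) and OOV (-1)
        # tokens, so index 1 inverts back to -1 and indices 2-5 recover the
        # real tokens.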
        input_array = np.array([[2, 3, 4, 5], [5, 4, 2, 1]])
        expected_output = np.array(
            [[42, 1138, 725, 1729], [1729, 725, 42, -1]])

        input_data = keras.Input(shape=(None, ), dtype=tf.int64)
        layer = integer_lookup.IntegerLookup(invert=True)
        layer.set_vocabulary(vocab_data)
        int_data = layer(input_data)
        model = keras.Model(inputs=input_data, outputs=int_data)
        output_dataset = model.predict(input_array)
        self.assertAllEqual(expected_output, output_dataset)
    def test_int_output_explicit_vocab_from_file(self):
        vocab_list = [42, 1138, 725, 1729]
        vocab_path = self._write_to_temp_file("vocab_file", vocab_list)

        input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
        expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
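        # Note the file-backed tokens land at indices 2-5 here rather than 1-4;
        # this snippet appears to target an API revision that reserved a mask
        # slot at index 0 by default in addition to the OOV slot at index 1.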

        input_data = keras.Input(shape=(None, ), dtype=tf.int64)
        layer = integer_lookup.IntegerLookup(vocabulary=vocab_path)
        int_data = layer(input_data)
        model = keras.Model(inputs=input_data, outputs=int_data)
        output_dataset = model.predict(input_array)
        self.assertAllEqual(expected_output, output_dataset)
    def test_ragged_int_input(self):
        vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
        input_array = tf.ragged.constant([[10, 11, 13], [13, 12, 10, 42]],
                                         dtype=np.int64)
        expected_output = [[1, 2, 4], [4, 3, 1, 0]]

        input_data = keras.Input(shape=(None, ), dtype=tf.int64, ragged=True)
        layer = integer_lookup.IntegerLookup(max_tokens=None)
        layer.set_vocabulary(vocab_data)
        int_data = layer(input_data)
        model = keras.Model(inputs=input_data, outputs=int_data)
        output_dataset = model.predict(input_array)
        self.assertAllEqual(expected_output, output_dataset)
    def test_ragged_int_input_multi_bucket(self):
        vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
        input_array = tf.ragged.constant([[10, 11, 13], [13, 12, 10, 133]],
                                         dtype=np.int64)
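        # Same layout as the sparse multi-bucket test: mask at index 0 (made
        # explicit via mask_token=0 below), two OOV buckets at 1-2, vocabulary
        # at 3-6; the OOV value 133 hashes into bucket index 2.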
        expected_output = [[3, 4, 6], [6, 5, 3, 2]]

        input_data = keras.Input(shape=(None, ), dtype=tf.int64, ragged=True)
        layer = integer_lookup.IntegerLookup(max_tokens=None,
                                             num_oov_indices=2,
                                             mask_token=0)
        layer.set_vocabulary(vocab_data)
        int_data = layer(input_data)
        model = keras.Model(inputs=input_data, outputs=int_data)
        output_dataset = model.predict(input_array)
        self.assertAllEqual(expected_output, output_dataset)
    def test_single_int_generator_dataset(self):
        def word_gen():
            for _ in itertools.count(1):
                yield random.randint(0, 100)

        ds = tf.data.Dataset.from_generator(word_gen, tf.int64,
                                            tf.TensorShape([]))
        batched_ds = ds.take(2)
        input_t = keras.Input(shape=(), dtype=tf.int64)
        layer = integer_lookup.IntegerLookup(max_tokens=10,
                                             num_oov_indices=0,
                                             mask_token=None,
                                             oov_token=None)
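        # With num_oov_indices=0 and no mask or OOV token, the adapted
        # vocabulary holds only the values actually seen in the dataset,
        # capped at max_tokens=10.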
        _ = layer(input_t)
        layer.adapt(batched_ds)
    def test_int_output_no_oov(self):
        vocab_data = [42, 1138, 725, 1729]
        valid_input = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 0]])
        invalid_input = np.array([[42, 1138, 725, 203], [1729, 725, 42, 203]])
        expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]

        input_data = keras.Input(shape=(None, ), dtype=tf.int64)
        layer = integer_lookup.IntegerLookup(vocabulary=vocab_data,
                                             mask_token=0,
                                             num_oov_indices=0)
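        # num_oov_indices=0 disables the OOV bucket entirely, so looking up a
        # value outside the vocabulary (203) raises at predict time.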
        int_data = layer(input_data)
        model = keras.Model(inputs=input_data, outputs=int_data)
        output_data = model.predict(valid_input)
        self.assertAllEqual(expected_output, output_data)
        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
                                    "found OOV values.*203"):
            _ = model.predict(invalid_input)
    def test_zero_max_tokens_fails(self):
        with self.assertRaisesRegex(ValueError, ".*max_tokens.*"):
            _ = integer_lookup.IntegerLookup(max_tokens=0, num_oov_indices=1)
    def test_no_vocab(self):
        with self.assertRaisesRegex(ValueError,
                                    "You must set the layer's vocabulary"):
            layer = integer_lookup.IntegerLookup()
            layer([[1]])
    def test_output_shape(self):
        input_data = keras.Input(shape=(4, ), dtype=tf.int64)
        layer = integer_lookup.IntegerLookup(max_tokens=2, num_oov_indices=1)
        int_data = layer(input_data)
        self.assertAllEqual(int_data.shape[1:], input_data.shape[1:])
    def test_non_unique_vocab_fails(self):
        vocab_data = [42, 1138, 725, 1729, 1729]
        with self.assertRaisesRegex(ValueError, ".*repeated term.*1729.*"):
            _ = integer_lookup.IntegerLookup(vocabulary=vocab_data)
    def test_non_unique_vocab_from_file_fails(self):
        vocab_list = [42, 1138, 725, 1729, 42]
        vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list)
        with self.assertRaisesRegex(ValueError, ".*repeated term.*42.*"):
            _ = integer_lookup.IntegerLookup(vocabulary=vocab_path)
    def test_no_vocab(self):
        with self.assertRaisesRegex(RuntimeError,
                                    "you must set the layer's vocabulary"):
            layer = integer_lookup.IntegerLookup(output_mode="binary")
            layer([[1]])