示例#1
0
    def test_dense_input_sparse_output(self):
        """COUNT mode with `sparse=True` on dense input agrees with dense mode."""
        dense_batch = constant_op.constant([[1, 2, 3], [3, 3, 0]])
        vocab_size = 6

        # Per-row token counts; X marks entries absent from the sparse result:
        #   [[X, 1, 1, 1, X, X],
        #    [1, X, X, 2, X, X]]
        want_indices = [[0, 1], [0, 2], [0, 3], [1, 0], [1, 3]]
        want_values = [1, 1, 1, 1, 2]

        inputs = keras.Input(shape=(None, ), dtype=dtypes.int32)
        sparse_encoder = category_encoding.CategoryEncoding(
            num_tokens=vocab_size,
            output_mode=category_encoding.COUNT,
            sparse=True)
        sparse_model = keras.Model(inputs=inputs,
                                   outputs=sparse_encoder(inputs))
        sparse_result = sparse_model.predict(dense_batch, steps=1)
        self.assertAllEqual(want_values, sparse_result.values)
        self.assertAllEqual(want_indices, sparse_result.indices)

        # An otherwise identical layer with sparse=False must produce the
        # densified form of the sparse prediction.
        dense_encoder = category_encoding.CategoryEncoding(
            num_tokens=vocab_size,
            output_mode=category_encoding.COUNT,
            sparse=False)
        dense_model = keras.Model(inputs=inputs,
                                  outputs=dense_encoder(inputs))
        dense_result = dense_model.predict(dense_batch, steps=1)
        densified = sparse_ops.sparse_tensor_to_dense(sparse_result,
                                                      default_value=0)
        self.assertAllEqual(densified, dense_result)
示例#2
0
 def test_saving_loading(self):
   """An adapted encoder predicts the same values after a save/load round trip.

   The model is written in the "tf" save format and read back with
   `keras.models.load_model`; predictions of both models must be close.
   """
   import os  # Local import: the file's import block is outside this view.

   encoder = category_encoding.CategoryEncoding()
   encoder.adapt([1, 2, 3])
   model = keras.Sequential([encoder])
   # Save under the per-test temporary directory rather than a fixed
   # "/tmp/model" path, which is not portable and lets parallel or repeated
   # runs overwrite each other's artifacts.
   # NOTE(review): assumes the enclosing class derives from the TensorFlow
   # TestCase (it already relies on `assertAllClose`) - confirm.
   save_path = os.path.join(self.get_temp_dir(), "model")
   model.save(save_path, save_format="tf")
   loaded_model = keras.models.load_model(save_path)
   self.assertAllClose(model.predict([[1]]), loaded_model.predict([[1]]))
    def run_dataset_implementation(self, output_mode, batch_size,
                                   sequence_length, max_tokens):
        """Time CategoryEncoding over batched random data and report it.

        Args:
            output_mode: Forwarded to `CategoryEncoding(output_mode=...)`.
            batch_size: Size of each dataset batch fed to the layer.
            sequence_length: Number of token ids per sample.
            max_tokens: Forwarded to `CategoryEncoding(max_tokens=...)`; also
                bounds the random ids (`maxval=max_tokens - 1`).
        """
        symbolic_input = keras.Input(shape=(sequence_length, ),
                                     dtype=dtypes.int32)
        layer = category_encoding.CategoryEncoding(max_tokens=max_tokens,
                                                   output_mode=output_mode)
        # Call the layer once on the symbolic input before timing starts.
        layer(symbolic_input)

        num_repeats = 5
        num_batches = 5
        start_times = []
        end_times = []
        for _ in range(num_repeats):
            samples = random_ops.random_uniform(
                [batch_size * 10, sequence_length],
                minval=0,
                maxval=max_tokens - 1,
                dtype=dtypes.int32)
            ds = (dataset_ops.Dataset.from_tensor_slices(samples)
                  .shuffle(batch_size * 100)
                  .batch(batch_size)
                  .take(num_batches)
                  .prefetch(num_batches))
            start_times.append(time.time())
            # Benchmarked code begins here.
            for batch in ds:
                layer(batch)
            # Benchmarked code ends here.
            end_times.append(time.time())

        # Mean wall time of one repeat, divided by the batches per repeat.
        elapsed = np.array(end_times) - np.array(start_times)
        avg_time = np.mean(elapsed) / num_batches
        name = "category_encoding|batch_%s|seq_length_%s|%s_max_tokens" % (
            batch_size, sequence_length, max_tokens)
        self.report_benchmark(iters=num_repeats, wall_time=avg_time, name=name)
 def test_multi_hot_rank_3_output_fails(self):
   """Multi-hot encoding of rank-3 input is rejected with a ValueError.

   Both a symbolic `keras.Input` and a concrete NumPy array of rank 3 must
   trigger the "only outputs up to rank 2" error.
   """
   # Use MULTI_HOT to match the test name; the layer was previously built
   # with ONE_HOT, so the multi-hot path was never exercised here.
   layer = category_encoding.CategoryEncoding(
       num_tokens=4, output_mode=category_encoding.MULTI_HOT)
   with self.assertRaisesRegex(ValueError, "only outputs up to rank 2"):
     _ = layer(keras.Input(shape=(3, 4,), dtype=dtypes.int32))
   with self.assertRaisesRegex(ValueError, "only outputs up to rank 2"):
     _ = layer(np.array([[[3, 2, 0, 1], [3, 2, 0, 1]]]))
示例#5
0
    def test_sparse_input_sparse_output_with_weights(self):
        """Weighted COUNT on sparse input yields summed weights, sparsely."""
        positions = [[0, 0], [1, 1], [2, 0], [2, 1], [3, 1]]
        sparse_tokens = sparse_tensor.SparseTensor(indices=positions,
                                                   values=[0, 2, 1, 1, 0],
                                                   dense_shape=[4, 2])
        token_input = keras.Input(shape=(None, ),
                                  dtype=dtypes.int64,
                                  sparse=True)
        sparse_weights = sparse_tensor.SparseTensor(
            indices=positions,
            values=[.1, .2, .4, .3, .2],
            dense_shape=[4, 2])
        weight_input = keras.Input(shape=(None, ),
                                   dtype=dtypes.float32,
                                   sparse=True)

        # Summed weight per (row, token); X marks entries absent from the
        # sparse result:
        #   [[.1,  X,  X, X, X, X],
        #    [ X,  X, .2, X, X, X],
        #    [ X, .7,  X, X, X, X],
        #    [.2,  X,  X, X, X, X]]
        want_indices = [[0, 0], [1, 2], [2, 1], [3, 0]]
        want_values = [.1, .2, .7, .2]
        vocab_size = 6

        encoder = category_encoding.CategoryEncoding(
            num_tokens=vocab_size,
            output_mode=category_encoding.COUNT,
            sparse=True)
        encoded = encoder(token_input, count_weights=weight_input)

        model = keras.Model(inputs=[token_input, weight_input],
                            outputs=encoded)
        result = model.predict([sparse_tokens, sparse_weights], steps=1)
        self.assertAllClose(want_values, result.values)
        self.assertAllEqual(want_indices, result.indices)
示例#6
0
    def test_sparse_input_with_weights(self):
        """Weighted COUNT on sparse input sums the weights of each token."""
        tokens = np.array([[1, 2, 3, 4], [4, 3, 1, 4]], dtype=np.int64)
        weights = np.array([[.1, .2, .3, .4], [.2, .1, .4, .3]])
        sparse_tokens = sparse_ops.from_dense(tokens)
        sparse_weights = sparse_ops.from_dense(weights)

        # Row 1 holds token 4 twice, so its weights add up: .2 + .3 = .5.
        # pyformat: disable
        want = [[0, .1, .2, .3, .4, 0], [0, .4, 0, .1, .5, 0]]
        # pyformat: enable
        vocab_size = 6

        token_input = keras.Input(shape=(None, ),
                                  dtype=dtypes.int64,
                                  sparse=True)
        weight_input = keras.Input(shape=(None, ),
                                   dtype=dtypes.float32,
                                   sparse=True)

        encoder = category_encoding.CategoryEncoding(
            num_tokens=vocab_size, output_mode=category_encoding.COUNT)
        encoded = encoder(token_input, count_weights=weight_input)
        self.assertAllEqual([None, vocab_size], encoded.shape.as_list())

        model = keras.Model(inputs=[token_input, weight_input],
                            outputs=encoded)
        got = model.predict([sparse_tokens, sparse_weights], steps=1)
        self.assertAllClose(want, got)
示例#7
0
 def test_dense_negative(self):
     """Predicting on a dense batch containing a negative id raises."""
     batch_with_negative = constant_op.constant([[1, 2, 0], [2, 2, -1]])
     vocab_size = 3
     encoder = category_encoding.CategoryEncoding(vocab_size)
     inputs = keras.Input(shape=(3, ), dtype=dtypes.int32)
     encoded = encoder(inputs)
     self.assertAllEqual([None, vocab_size], encoded.shape.as_list())
     model = keras.Model(inputs=inputs, outputs=encoded)
     range_error = ".*must be in the range 0 <= values < num_tokens.*"
     with self.assertRaisesRegex(errors.InvalidArgumentError, range_error):
         _ = model.predict(batch_with_negative, steps=1)
  def test_sparse_output_and_dense_layer(self):
    """A sparse COUNT encoding can feed a Dense layer and be predicted on."""
    batch = constant_op.constant([[1, 2, 3], [3, 3, 0]])
    vocab_size = 4

    inputs = keras.Input(shape=(None,), dtype=dtypes.int32)
    encoder = category_encoding.CategoryEncoding(
        num_tokens=vocab_size,
        output_mode=category_encoding.COUNT,
        sparse=True)
    encoded = encoder(inputs)
    projected = keras.layers.Dense(units=1)(encoded)

    model = keras.Model(inputs=inputs, outputs=projected)
    # Smoke test only: predict must run without raising.
    model.predict(batch, steps=1)
示例#9
0
    def test_legacy_max_tokens_arg(self):
        """The `max_tokens` keyword is accepted and sets the output width."""
        samples = np.array([[1, 2, 3, 1]])
        want = [[0, 1, 1, 1, 0, 0]]
        vocab_size = 6

        inputs = keras.Input(shape=(None, ), dtype=dtypes.int32)
        encoder = category_encoding.CategoryEncoding(
            max_tokens=vocab_size, output_mode=category_encoding.MULTI_HOT)
        encoded = encoder(inputs)
        self.assertAllEqual([None, vocab_size], encoded.shape.as_list())

        model = keras.Model(inputs=inputs, outputs=encoded)
        self.assertAllEqual(want, model.predict(samples))
示例#10
0
 def test_dense_oov_input(self):
     """An id equal to `num_tokens` in a dense batch raises at predict time."""
     in_range_batch = constant_op.constant([[0, 1, 2], [0, 1, 2]])
     out_of_range_batch = constant_op.constant([[0, 1, 2], [2, 3, 1]])
     vocab_size = 3
     encoder = category_encoding.CategoryEncoding(vocab_size)
     inputs = keras.Input(shape=(3, ), dtype=dtypes.int32)
     encoded = encoder(inputs)
     self.assertAllEqual([None, vocab_size], encoded.shape.as_list())
     model = keras.Model(inputs=inputs, outputs=encoded)
     # Call predict once on valid input to compile a graph and test control
     # flow.
     model.predict(in_range_batch, steps=1)
     range_error = ".*must be in the range 0 <= values < num_tokens.*"
     with self.assertRaisesRegex(errors.InvalidArgumentError, range_error):
         _ = model.predict(out_of_range_batch, steps=1)
示例#11
0
    def test_end_to_end_bagged_modeling(self, output_mode, num_tokens):
        """The encoder output can be cast and fed through a Dense layer."""
        samples = np.array([[1, 2, 3, 1], [0, 3, 1, 0]])

        inputs = keras.Input(shape=(None, ), dtype=dtypes.int32)
        encoder = category_encoding.CategoryEncoding(num_tokens=num_tokens,
                                                     output_mode=output_mode)

        # Without a fixed vocabulary size, set the element count explicitly.
        if num_tokens is None:
            encoder.set_num_elements(5)
        # The weight list handed to the layer is always empty here.
        encoder.set_weights([])

        encoded = encoder(inputs)
        as_float = backend.cast(encoded, dtype="float32")
        projected = core.Dense(64)(as_float)
        model = keras.Model(inputs=inputs, outputs=projected)
        # Smoke test only: predict must run without raising.
        model.predict(samples)
  def test_multi_hot_output(self):
    """MULTI_HOT marks each token present in a row exactly once."""
    samples = np.array([[1, 2, 3, 1], [0, 3, 1, 0]])
    want = [
        [0, 1, 1, 1, 0, 0],
        [1, 1, 0, 1, 0, 0],
    ]
    vocab_size = 6

    encoder = category_encoding.CategoryEncoding(
        num_tokens=vocab_size, output_mode=category_encoding.MULTI_HOT)
    symbolic_in = keras.Input(shape=(None,), dtype=dtypes.int32)
    symbolic_out = encoder(symbolic_in)
    model = keras.Model(inputs=symbolic_in, outputs=symbolic_out)
    got = model.predict(samples)
    self.assertAllEqual([None, vocab_size], symbolic_out.shape.as_list())
    self.assertAllEqual(want, got)
示例#13
0
    def test_count_output(self):
        """COUNT mode reports how often each token occurs in a row."""
        samples = np.array([[1, 2, 3, 1], [0, 3, 1, 0]])

        # pyformat: disable
        want = [[0, 2, 1, 1, 0, 0], [2, 1, 0, 1, 0, 0]]
        # pyformat: enable
        num_tokens = 6

        inputs = keras.Input(shape=(None, ), dtype=dtypes.int32)
        encoder = category_encoding.CategoryEncoding(
            num_tokens=num_tokens, output_mode=category_encoding.COUNT)
        encoded = encoder(inputs)
        self.assertAllEqual([None, num_tokens], encoded.shape.as_list())

        model = keras.Model(inputs=inputs, outputs=encoded)
        self.assertAllEqual(want, model.predict(samples))
def embedding_varlen(batch_size, max_length):
    """Benchmark a variable-length embedding.

    Times a Keras `StringLookup` + count `CategoryEncoding` model against a
    feature-column indicator column built from the same vocabulary.

    Args:
        batch_size: Batch size passed to the benchmark runners and used as
            the leading dimension of the dense input tensor.
        max_length: Sequence length of the Keras input and second dimension
            of the dense input tensor.

    Returns:
        A `(keras_time, fc_time)` tuple holding the values returned by
        `fc_bm.run_keras` and `fc_bm.run_fc` respectively.
    """
    # Data and constants.
    vocab_size = 32768
    vocab = fc_bm.create_vocabulary(vocab_size)
    data = fc_bm.create_string_data(max_length,
                                    batch_size * NUM_REPEATS,
                                    vocab,
                                    pct_oov=0.15)

    def _dense_feed():
        # A fresh feed dict is built for each implementation.
        dense = data.to_tensor(default_value="",
                               shape=(batch_size, max_length))
        return {"data": dense}

    # Keras implementation
    model = keras.Sequential()
    model.add(keras.Input(shape=(max_length, ), name="data", dtype=dt.string))
    model.add(string_lookup.StringLookup(vocabulary=vocab, mask_token=None))
    # One slot more than the vocabulary size.
    # NOTE(review): presumably for the lookup's OOV index - confirm.
    count_encoder = category_encoding.CategoryEncoding(
        num_tokens=vocab_size + 1, output_mode="count")
    model.add(count_encoder)

    # FC implementation
    categorical = fcv2.categorical_column_with_vocabulary_list(
        key="data", vocabulary_list=vocab, num_oov_buckets=1)
    fc = fcv2.indicator_column(categorical)

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
        fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

    # Benchmark runs
    keras_time = fc_bm.run_keras(_dense_feed(), model, batch_size,
                                 NUM_REPEATS)
    fc_time = fc_bm.run_fc(_dense_feed(), fc_fn, batch_size, NUM_REPEATS)

    return keras_time, fc_time
  def test_multi_hot_output_rank_zero_input(self):
    """MULTI_HOT encodes a scalar token id via the layer and via a model."""
    scalar_token = np.array(3)
    want = [0, 0, 0, 1, 0, 0]
    vocab_size = 6

    # Test call on layer directly.
    encoder = category_encoding.CategoryEncoding(
        num_tokens=vocab_size, output_mode=category_encoding.MULTI_HOT)
    self.assertAllEqual(want, encoder(scalar_token))

    # Test call on model.
    symbolic_in = keras.Input(shape=(4,), dtype=dtypes.int32)
    symbolic_out = encoder(symbolic_in)
    model = keras.Model(inputs=symbolic_in, outputs=symbolic_out)
    got = model(scalar_token)
    self.assertAllEqual([None, vocab_size], symbolic_out.shape.as_list())
    self.assertAllEqual(want, got)
  def test_one_hot_output(self):
    """ONE_HOT turns each single-token row into a one-hot vector."""
    samples = np.array([[3], [2], [0], [1]])
    want = [
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0],
    ]
    vocab_size = 4

    encoder = category_encoding.CategoryEncoding(
        num_tokens=vocab_size, output_mode=category_encoding.ONE_HOT)
    symbolic_in = keras.Input(shape=(1,), dtype=dtypes.int32)
    symbolic_out = encoder(symbolic_in)
    model = keras.Model(inputs=symbolic_in, outputs=symbolic_out)
    got = model(samples)
    self.assertAllEqual([None, vocab_size], symbolic_out.shape.as_list())
    self.assertAllEqual(want, got)
示例#17
0
    def test_distribution(self, distribution):
        """BINARY encoding built under a distribution scope predicts correctly."""
        samples = np.array([[1, 2, 3, 1], [0, 3, 1, 0]])
        dataset = batch_wrapper(
            dataset_ops.DatasetV2.from_tensor_slices(samples), 2,
            distribution)

        # pyformat: disable
        want = [[0, 1, 1, 1, 0, 0], [1, 1, 0, 1, 0, 0]]
        # pyformat: enable
        vocab_size = 6
        config.set_soft_device_placement(True)

        # Both the layer and the model are created inside the strategy scope.
        with distribution.scope():
            inputs = keras.Input(shape=(4, ), dtype=dtypes.int32)
            encoder = category_encoding.CategoryEncoding(
                max_tokens=vocab_size, output_mode=category_encoding.BINARY)
            encoded = encoder(inputs)
            model = keras.Model(inputs=inputs, outputs=encoded)
        got = model.predict(dataset)
        self.assertAllEqual(want, got)
  def test_sparse_input(self):
    """MULTI_HOT accepts a sparse int64 input and returns a dense result."""
    dense_tokens = np.array([[1, 2, 3, 0], [0, 3, 1, 0]], dtype=np.int64)
    sparse_tokens = sparse_ops.from_dense(dense_tokens)

    # Column 0 stays 0 in both rows although the dense array contains zeros.
    # pyformat: disable
    want = [[0, 1, 1, 1, 0, 0],
            [0, 1, 0, 1, 0, 0]]
    # pyformat: enable
    vocab_size = 6

    inputs = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)

    encoder = category_encoding.CategoryEncoding(
        num_tokens=vocab_size, output_mode=category_encoding.MULTI_HOT)
    encoded = encoder(inputs)
    self.assertAllEqual([None, vocab_size], encoded.shape.as_list())

    model = keras.Model(inputs=inputs, outputs=encoded)
    got = model.predict(sparse_tokens, steps=1)
    self.assertAllEqual(want, got)
  def test_ragged_input(self):
    """MULTI_HOT accepts ragged rows of differing length."""
    ragged_tokens = ragged_factory_ops.constant([[1, 2, 3], [3, 1]])

    # pyformat: disable
    want = [[0, 1, 1, 1, 0, 0],
            [0, 1, 0, 1, 0, 0]]
    # pyformat: enable
    vocab_size = 6

    inputs = keras.Input(shape=(None,), dtype=dtypes.int32, ragged=True)

    encoder = category_encoding.CategoryEncoding(
        num_tokens=vocab_size, output_mode=category_encoding.MULTI_HOT)
    encoded = encoder(inputs)
    self.assertAllEqual([None, vocab_size], encoded.shape.as_list())

    model = keras.Model(inputs=inputs, outputs=encoded)
    got = model.predict(ragged_tokens, steps=1)
    self.assertAllEqual(want, got)
  def test_one_hot_output_rank_one_input(self):
    """ONE_HOT treats a rank-1 array as one token per sample."""
    tokens = np.array([3, 2, 0, 1])
    want = [
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0],
    ]
    vocab_size = 4

    # Test call on layer directly.
    encoder = category_encoding.CategoryEncoding(
        num_tokens=vocab_size, output_mode=category_encoding.ONE_HOT)
    self.assertAllEqual(want, encoder(tokens))

    # Test call on model.
    symbolic_in = keras.Input(shape=(1,), dtype=dtypes.int32)
    symbolic_out = encoder(symbolic_in)
    model = keras.Model(inputs=symbolic_in, outputs=symbolic_out)
    got = model(tokens)
    self.assertAllEqual([None, vocab_size], symbolic_out.shape.as_list())
    self.assertAllEqual(want, got)
示例#21
0
 def test_serialize(self):
   """A Sequential model wrapping an adapted encoder can be cloned."""
   adapted_encoder = category_encoding.CategoryEncoding()
   adapted_encoder.adapt([1, 2, 3])
   # Smoke test only: cloning must not raise; the clone itself is unused.
   keras.models.clone_model(keras.Sequential([adapted_encoder]))