def test_invalid_inputs(self):
   """Verifies that bad `num_bins` and `salt` arguments raise ValueError."""
   bins_pattern = 'cannot be `None`'
   salt_pattern = 'can only be a tuple of size 2'

   # `num_bins` passed as None or as a negative number.
   with self.assertRaisesRegex(ValueError, bins_pattern):
     hashing.Hashing(num_bins=None)
   with self.assertRaisesRegex(ValueError, bins_pattern):
     hashing.Hashing(num_bins=-1)

   # `salt` passed as a string, a one-element list, or a tensor.
   with self.assertRaisesRegex(ValueError, salt_pattern):
     hashing.Hashing(num_bins=2, salt='string')
   with self.assertRaisesRegex(ValueError, salt_pattern):
     hashing.Hashing(num_bins=2, salt=[1])
   with self.assertRaisesRegex(ValueError, salt_pattern):
     hashing.Hashing(num_bins=1, salt=constant_op.constant([133, 137]))
예제 #2
0
 def test_hash_dense_input_mask_value_farmhash(self):
     """Hashes a dense string column with two different `mask_value` settings.

     One unsalted layer masks the empty string (absent from the input) and
     the other masks 'omar' (the first input row).
     """
     words = ['omar', 'stringer', 'marlo', 'wire', 'skywalker']
     blank_masked = hashing.Hashing(num_bins=3, mask_value='')
     omar_masked = hashing.Hashing(num_bins=3, mask_value='omar')
     column = np.asarray([[word] for word in words])
     blank_result = blank_masked(column)
     omar_result = omar_masked(column)
     # Bin 0 is reserved for the mask, so the expected bins are shifted up by
     # one relative to test_hash_dense_input_farmhash.
     self.assertAllClose([[1], [1], [2], [1], [1]], blank_result)
     # The masked value 'omar' is expected in bin 0.
     self.assertAllClose([[0], [1], [2], [1], [1]], omar_result)
 def test_hash_ragged_input_mask_value(self):
   """Hashes a ragged string tensor with two different `mask_value` settings.

   One layer masks the empty string (absent from the input) and the other
   masks 'omar' (the first element of the first row).
   """
   rows = [['omar', 'stringer', 'marlo', 'wire'],
           ['marlo', 'skywalker', 'wire']]
   blank_masked = hashing.Hashing(num_bins=3, mask_value='')
   omar_masked = hashing.Hashing(num_bins=3, mask_value='omar')
   ragged_words = ragged_factory_ops.constant(rows, dtype=dtypes.string)
   blank_result = blank_masked(ragged_words)
   omar_result = omar_masked(ragged_words)
   # Bin 0 is reserved for the mask, so the expected bins are shifted up by
   # one relative to test_hash_ragged_string_input_farmhash.
   self.assertAllClose([[1, 1, 2, 1], [2, 1, 1]], blank_result)
   # The masked value 'omar' is expected in bin 0.
   self.assertAllClose([[0, 1, 2, 1], [2, 1, 1]], omar_result)
예제 #4
0
 def test_hash_sparse_input_mask_value_farmhash(self):
     """Hashes a sparse string tensor with two different `mask_value` settings.

     Checks that the sparse indices are unchanged in both outputs and that
     only the layer masking 'omar' places that value in bin 0.
     """
     positions = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]]
     words = ['omar', 'stringer', 'marlo', 'wire', 'skywalker']
     blank_masked = hashing.Hashing(num_bins=3, mask_value='')
     omar_masked = hashing.Hashing(num_bins=3, mask_value='omar')
     sparse_words = sparse_tensor.SparseTensor(
         indices=positions, values=words, dense_shape=[3, 2])
     blank_result = blank_masked(sparse_words)
     omar_result = omar_masked(sparse_words)
     # The output indices must equal the input indices.
     self.assertAllClose(positions, omar_result.indices)
     self.assertAllClose(positions, blank_result.indices)
     # Bin 0 is reserved for the mask, so the expected bins are shifted up by
     # one relative to test_hash_sparse_input_farmhash.
     self.assertAllClose([1, 1, 2, 1, 1], blank_result.values)
     # The masked value 'omar' is expected in bin 0.
     self.assertAllClose([0, 1, 2, 1, 1], omar_result.values)
 def test_hash_sparse_int_input_siphash(self):
   """Hashes a sparse integer tensor with a salted layer into 3 bins."""
   positions = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]]
   hasher = hashing.Hashing(num_bins=3, salt=[133, 137])
   sparse_ints = sparse_tensor.SparseTensor(
       indices=positions, values=[0, 1, 2, 3, 4], dense_shape=[3, 2])
   hashed = hasher(sparse_ints)
   # The output indices must equal the input indices; only values are hashed.
   self.assertAllClose(positions, hashed.indices)
   self.assertAllClose([1, 1, 2, 0, 1], hashed.values)
예제 #6
0
  def test_hash_sparse_multi_inputs_siphash(self):
    """Hashes a pair of sparse string tensors with two different salts."""
    positions = [[0, 0], [1, 0], [2, 0]]
    names = sparse_tensor.SparseTensor(
        indices=positions,
        values=['omar', 'stringer', 'marlo'],
        dense_shape=[3, 1])
    letters = sparse_tensor.SparseTensor(
        indices=positions, values=['A', 'B', 'C'], dense_shape=[3, 1])

    first_hasher = hashing.Hashing(num_bins=2, salt=[133, 137])
    hashed = first_hasher([names, letters])
    # Expected to agree with test_hash_dense_input_siphash.
    self.assertAllClose(positions, hashed.indices)
    self.assertAllClose([0, 1, 0], hashed.values)

    # A different salt on the same inputs yields different expected bins.
    second_hasher = hashing.Hashing(num_bins=2, salt=[211, 137])
    hashed = second_hasher([names, letters])
    # Expected to agree with test_hash_dense_input_siphash.
    self.assertAllClose([1, 1, 1], hashed.values)
예제 #7
0
 def test_hash_ragged_string_multi_inputs_siphash(self):
   """Checks that a salted Hashing layer rejects a list of ragged inputs.

   Two ragged string tensors with identical contents are passed as a list;
   the call is expected to raise a ValueError whose message matches
   'not supported yet'.
   """
   layer = hashing.Hashing(num_bins=2, salt=[133, 137])
   inp_data_1 = ragged_factory_ops.constant(
       [['omar', 'stringer', 'marlo', 'wire'], ['marlo', 'skywalker', 'wire']],
       dtype=dtypes.string)
   inp_data_2 = ragged_factory_ops.constant(
       [['omar', 'stringer', 'marlo', 'wire'], ['marlo', 'skywalker', 'wire']],
       dtype=dtypes.string)
   # `assertRaisesRegex` is used instead of the deprecated `assertRaisesRegexp`
   # alias (removed in Python 3.12), consistent with test_invalid_inputs.
   with self.assertRaisesRegex(ValueError, 'not supported yet'):
     _ = layer([inp_data_1, inp_data_2])
예제 #8
0
 def test_hash_sparse_multi_inputs_farmhash(self):
   """Hashes a pair of sparse string tensors with an unsalted 2-bin layer."""
   positions = [[0, 0], [1, 0], [2, 0]]
   hasher = hashing.Hashing(num_bins=2)
   names = sparse_tensor.SparseTensor(
       indices=positions,
       values=['omar', 'stringer', 'marlo'],
       dense_shape=[3, 1])
   letters = sparse_tensor.SparseTensor(
       indices=positions, values=['A', 'B', 'C'], dense_shape=[3, 1])
   hashed = hasher([names, letters])
   # The output indices must equal the shared input indices.
   self.assertAllClose(positions, hashed.indices)
   self.assertAllClose([0, 0, 1], hashed.values)
  def test_hash_ragged_int_input_siphash(self):
    """Hashes ragged int64 input with a salted layer, directly and via a Model."""
    hasher = hashing.Hashing(num_bins=3, salt=[133, 137])
    ragged_ints = ragged_factory_ops.constant(
        [[0, 1, 3, 4], [2, 1, 0]], dtype=dtypes.int64)
    hashed = hasher(ragged_ints)
    # Each integer is expected in the same bin that
    # test_hash_sparse_int_input_siphash asserts for it with the same salt
    # (0->1, 1->1, 2->2, 3->0, 4->1).
    self.assertAllEqual([[1, 1, 0, 1], [2, 1, 1]], hashed)

    # The same layer wired into a functional model must reproduce that output.
    symbolic_in = input_layer.Input(
        shape=(None,), ragged=True, dtype=dtypes.int64)
    symbolic_out = hasher(symbolic_in)
    model = training.Model(inputs=symbolic_in, outputs=symbolic_out)
    self.assertAllClose(hashed, model.predict(ragged_ints))
def embedding_varlen(batch_size, max_length):
    """Benchmark crossing and hashing two string features, Keras vs. FC.

    The Keras path is a CategoryCrossing layer followed by a Hashing layer; the
    feature-column path is a `crossed_column` over the same two features.

    Args:
        batch_size: Batch dimension of the dense tensors fed to both runs.
        max_length: Second dimension of the dense tensors, also passed to
            `fc_bm.create_string_data`.

    Returns:
        A `(k_avg_time, fc_avg_time)` tuple holding the results of
        `fc_bm.run_keras` and `fc_bm.run_fc` respectively.
    """
    # Shared constants and generated string data.
    num_buckets = 10000
    vocab = fc_bm.create_vocabulary(32768)
    num_rows = batch_size * NUM_REPEATS
    dense_shape = (batch_size, max_length)
    data_a = fc_bm.create_string_data(
        max_length, num_rows, vocab, pct_oov=0.0)
    data_b = fc_bm.create_string_data(
        max_length, num_rows, vocab, pct_oov=0.0)

    # Keras path: cross the two string inputs, then hash the crossed result.
    input_1 = keras.Input(shape=(None, ), name="data_a", dtype=dt.string)
    input_2 = keras.Input(shape=(None, ), name="data_b", dtype=dt.string)
    crosser = category_crossing.CategoryCrossing()
    hasher = hashing.Hashing(num_buckets)
    model = keras.Model([input_1, input_2], hasher(crosser([input_1, input_2])))

    # Feature-column path: a crossed column over the same feature names.
    fc = fcv2.crossed_column(["data_a", "data_b"], num_buckets)

    # Wrapped in a tf.function so the comparison with Keras is fair.
    @tf_function()
    def fc_fn(tensors):
        fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

    # Time the Keras model on densified copies of the data.
    keras_data = {
        name: ragged.to_tensor(default_value="", shape=dense_shape)
        for name, ragged in (("data_a", data_a), ("data_b", data_b))
    }
    k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

    # Time the feature column on freshly densified copies of the same data.
    fc_data = {
        name: ragged.to_tensor(default_value="", shape=dense_shape)
        for name, ragged in (("data_a", data_a), ("data_b", data_b))
    }
    fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
예제 #11
0
def embedding_varlen(batch_size, max_length):
    """Benchmark hashing one string feature, Keras vs. feature column.

    The Keras path is a Sequential model containing a single Hashing layer; the
    feature-column path is `sequence_categorical_column_with_hash_bucket`.

    Args:
        batch_size: Batch dimension of the dense tensors fed to both runs.
        max_length: Second dimension of the dense tensors, also passed to
            `fc_bm.create_string_data` and used as the Keras input shape.

    Returns:
        A `(k_avg_time, fc_avg_time)` tuple holding the results of
        `fc_bm.run_keras` and `fc_bm.run_fc` respectively.
    """
    # Shared constants and generated string data.
    num_buckets = 10000
    vocab = fc_bm.create_vocabulary(32768)
    dense_shape = (batch_size, max_length)
    data = fc_bm.create_string_data(
        max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.0)

    # Keras path: a string input followed by a Hashing layer.
    model = keras.Sequential()
    string_input = keras.Input(
        shape=(max_length, ), name="data", dtype=dt.string)
    model.add(string_input)
    model.add(hashing.Hashing(num_buckets))

    # Feature-column path: a hash-bucket sequence column on the same feature.
    fc = sfc.sequence_categorical_column_with_hash_bucket("data", num_buckets)

    # Wrapped in a tf.function so the comparison with Keras is fair.
    @tf_function()
    def fc_fn(tensors):
        fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

    # Time the Keras model on a densified copy of the data.
    keras_data = {"data": data.to_tensor(default_value="", shape=dense_shape)}
    k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

    # Time the feature column on a freshly densified copy of the same data.
    fc_data = {"data": data.to_tensor(default_value="", shape=dense_shape)}
    fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time