def test_invalid_inputs(self):
  with self.assertRaisesRegex(ValueError, 'cannot be `None`'):
    _ = hashing.Hashing(num_bins=None)
  with self.assertRaisesRegex(ValueError, 'cannot be `None`'):
    _ = hashing.Hashing(num_bins=-1)
  with self.assertRaisesRegex(ValueError, 'can only be a tuple of size 2'):
    _ = hashing.Hashing(num_bins=2, salt='string')
  with self.assertRaisesRegex(ValueError, 'can only be a tuple of size 2'):
    _ = hashing.Hashing(num_bins=2, salt=[1])
  with self.assertRaisesRegex(ValueError, 'can only be a tuple of size 2'):
    _ = hashing.Hashing(num_bins=1, salt=constant_op.constant([133, 137]))
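# A hedged companion sketch (not in the original suite): the valid
# counterparts of the failure cases above. `num_bins` must be a positive
# integer, and `salt`, when given, is assumed to be a Python list or tuple
# of exactly two integers (a tensor, as tested above, is rejected).
def test_valid_inputs_sketch(self):
  _ = hashing.Hashing(num_bins=2)                   # unsalted (FarmHash64)
  _ = hashing.Hashing(num_bins=2, salt=[133, 137])  # salted (SipHash64)
  _ = hashing.Hashing(num_bins=3, mask_value='')    # bin 0 reserved for mask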
def test_hash_dense_input_mask_value_farmhash(self):
  empty_mask_layer = hashing.Hashing(num_bins=3, mask_value='')
  omar_mask_layer = hashing.Hashing(num_bins=3, mask_value='omar')
  inp = np.asarray([['omar'], ['stringer'], ['marlo'], ['wire'],
                    ['skywalker']])
  empty_mask_output = empty_mask_layer(inp)
  omar_mask_output = omar_mask_layer(inp)
  # Outputs should be one more than test_hash_dense_input_farmhash (the
  # zeroth bin is now reserved for masks).
  self.assertAllClose([[1], [1], [2], [1], [1]], empty_mask_output)
  # 'omar' should map to 0.
  self.assertAllClose([[0], [1], [2], [1], [1]], omar_mask_output)
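# Hedged sketch of the bin arithmetic the mask tests rely on: with
# `mask_value` set, the layer is assumed to hash into `num_bins - 1`
# buckets and shift the result up by one, reserving bin 0 for the mask
# token. `hashed_value` stands in for the raw (unbounded) hash, and the
# helper name is hypothetical, not part of the layer's API.
def _masked_bin_sketch(hashed_value, num_bins, is_mask):
  return 0 if is_mask else hashed_value % (num_bins - 1) + 1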
def test_hash_ragged_input_mask_value(self):
  empty_mask_layer = hashing.Hashing(num_bins=3, mask_value='')
  omar_mask_layer = hashing.Hashing(num_bins=3, mask_value='omar')
  inp_data = ragged_factory_ops.constant(
      [['omar', 'stringer', 'marlo', 'wire'], ['marlo', 'skywalker', 'wire']],
      dtype=dtypes.string)
  empty_mask_output = empty_mask_layer(inp_data)
  omar_mask_output = omar_mask_layer(inp_data)
  # Outputs should be one more than test_hash_ragged_string_input_farmhash
  # (the zeroth bin is now reserved for masks).
  expected_output = [[1, 1, 2, 1], [2, 1, 1]]
  self.assertAllClose(expected_output, empty_mask_output)
  # 'omar' should map to 0.
  expected_output = [[0, 1, 2, 1], [2, 1, 1]]
  self.assertAllClose(expected_output, omar_mask_output)
def test_hash_sparse_input_mask_value_farmhash(self):
  empty_mask_layer = hashing.Hashing(num_bins=3, mask_value='')
  omar_mask_layer = hashing.Hashing(num_bins=3, mask_value='omar')
  indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]]
  inp = sparse_tensor.SparseTensor(
      indices=indices,
      values=['omar', 'stringer', 'marlo', 'wire', 'skywalker'],
      dense_shape=[3, 2])
  empty_mask_output = empty_mask_layer(inp)
  omar_mask_output = omar_mask_layer(inp)
  self.assertAllClose(indices, omar_mask_output.indices)
  self.assertAllClose(indices, empty_mask_output.indices)
  # Outputs should be one more than test_hash_sparse_input_farmhash (the
  # zeroth bin is now reserved for masks).
  self.assertAllClose([1, 1, 2, 1, 1], empty_mask_output.values)
  # 'omar' should map to 0.
  self.assertAllClose([0, 1, 2, 1, 1], omar_mask_output.values)
def test_hash_sparse_int_input_siphash(self):
  layer = hashing.Hashing(num_bins=3, salt=[133, 137])
  indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]]
  inp = sparse_tensor.SparseTensor(
      indices=indices, values=[0, 1, 2, 3, 4], dense_shape=[3, 2])
  output = layer(inp)
  self.assertAllClose(indices, output.indices)
  self.assertAllClose([1, 1, 2, 0, 1], output.values)
def test_hash_sparse_multi_inputs_siphash(self):
  layer = hashing.Hashing(num_bins=2, salt=[133, 137])
  indices = [[0, 0], [1, 0], [2, 0]]
  inp_1 = sparse_tensor.SparseTensor(
      indices=indices,
      values=['omar', 'stringer', 'marlo'],
      dense_shape=[3, 1])
  inp_2 = sparse_tensor.SparseTensor(
      indices=indices, values=['A', 'B', 'C'], dense_shape=[3, 1])
  output = layer([inp_1, inp_2])
  # The result should be the same as in test_hash_dense_input_siphash.
  self.assertAllClose(indices, output.indices)
  self.assertAllClose([0, 1, 0], output.values)

  layer_2 = hashing.Hashing(num_bins=2, salt=[211, 137])
  output = layer_2([inp_1, inp_2])
  # A different salt should produce a different assignment for the same
  # inputs.
  self.assertAllClose([1, 1, 1], output.values)
def test_hash_ragged_string_multi_inputs_siphash(self):
  layer = hashing.Hashing(num_bins=2, salt=[133, 137])
  inp_data_1 = ragged_factory_ops.constant(
      [['omar', 'stringer', 'marlo', 'wire'], ['marlo', 'skywalker', 'wire']],
      dtype=dtypes.string)
  inp_data_2 = ragged_factory_ops.constant(
      [['omar', 'stringer', 'marlo', 'wire'], ['marlo', 'skywalker', 'wire']],
      dtype=dtypes.string)
  with self.assertRaisesRegex(ValueError, 'not supported yet'):
    _ = layer([inp_data_1, inp_data_2])
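# Hedged workaround sketch for the unsupported case above: RaggedTensor
# exposes to_sparse(), and the sparse multi-input path (exercised in the
# siphash and farmhash multi-input tests) is assumed to accept the
# converted inputs. This test name and the workaround itself are
# assumptions, not part of the original suite.
def test_hash_ragged_multi_inputs_via_sparse_sketch(self):
  layer = hashing.Hashing(num_bins=2, salt=[133, 137])
  inp_1 = ragged_factory_ops.constant([['omar'], ['stringer']],
                                      dtype=dtypes.string)
  inp_2 = ragged_factory_ops.constant([['A'], ['B']], dtype=dtypes.string)
  _ = layer([inp_1.to_sparse(), inp_2.to_sparse()])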
def test_hash_sparse_multi_inputs_farmhash(self):
  layer = hashing.Hashing(num_bins=2)
  indices = [[0, 0], [1, 0], [2, 0]]
  inp_1 = sparse_tensor.SparseTensor(
      indices=indices,
      values=['omar', 'stringer', 'marlo'],
      dense_shape=[3, 1])
  inp_2 = sparse_tensor.SparseTensor(
      indices=indices, values=['A', 'B', 'C'], dense_shape=[3, 1])
  output = layer([inp_1, inp_2])
  self.assertAllClose(indices, output.indices)
  self.assertAllClose([0, 0, 1], output.values)
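# Hedged pure-Python analogue of the multi-input path above: the layer is
# assumed to cross the paired features positionally into one combined key
# and hash that key into `num_bins` buckets. `_hash` (Python's builtin
# here) is a stand-in for the real, optionally salted hash, and the helper
# name is hypothetical.
def _crossed_bin_sketch(values_1, values_2, num_bins, _hash=hash):
  return [_hash((a, b)) % num_bins for a, b in zip(values_1, values_2)]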
def test_hash_ragged_int_input_siphash(self):
  layer = hashing.Hashing(num_bins=3, salt=[133, 137])
  inp_data = ragged_factory_ops.constant([[0, 1, 3, 4], [2, 1, 0]],
                                         dtype=dtypes.int64)
  out_data = layer(inp_data)
  # Same per-value bins as test_hash_sparse_int_input_siphash.
  expected_output = [[1, 1, 0, 1], [2, 1, 1]]
  self.assertAllEqual(expected_output, out_data)

  inp_t = input_layer.Input(shape=(None,), ragged=True, dtype=dtypes.int64)
  out_t = layer(inp_t)
  model = training.Model(inputs=inp_t, outputs=out_t)
  self.assertAllClose(out_data, model.predict(inp_data))
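# Hedged consistency sketch: the same salted layer is assumed to map a
# given scalar to the same bin whether it arrives dense, sparse, or ragged,
# which is what lets the expected outputs above be cross-referenced between
# tests. The expected bins mirror test_hash_sparse_int_input_siphash; the
# test name is hypothetical.
def test_hash_dense_int_input_siphash_sketch(self):
  layer = hashing.Hashing(num_bins=3, salt=[133, 137])
  output = layer(np.asarray([[0], [1], [2], [3], [4]]))
  self.assertAllClose([[1], [1], [2], [0], [1]], output)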
def embedding_varlen(batch_size, max_length):
  """Benchmark a variable-length embedding."""
  # Data and constants.
  num_buckets = 10000
  vocab = fc_bm.create_vocabulary(32768)
  data_a = fc_bm.create_string_data(
      max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.0)
  data_b = fc_bm.create_string_data(
      max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.0)

  # Keras implementation
  input_1 = keras.Input(shape=(None,), name="data_a", dtype=dt.string)
  input_2 = keras.Input(shape=(None,), name="data_b", dtype=dt.string)
  crossed_data = category_crossing.CategoryCrossing()([input_1, input_2])
  hashed_data = hashing.Hashing(num_buckets)(crossed_data)
  model = keras.Model([input_1, input_2], hashed_data)

  # FC implementation
  fc = fcv2.crossed_column(["data_a", "data_b"], num_buckets)

  # Wrap the FC implementation in a tf.function for a fair comparison
  @tf_function()
  def fc_fn(tensors):
    fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

  # Benchmark runs
  keras_data = {
      "data_a":
          data_a.to_tensor(default_value="", shape=(batch_size, max_length)),
      "data_b":
          data_b.to_tensor(default_value="", shape=(batch_size, max_length)),
  }
  k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

  fc_data = {
      "data_a":
          data_a.to_tensor(default_value="", shape=(batch_size, max_length)),
      "data_b":
          data_b.to_tensor(default_value="", shape=(batch_size, max_length)),
  }
  fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

  return k_avg_time, fc_avg_time
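# Hedged smoke-run sketch for the crossing benchmark above; the sizes are
# arbitrary assumptions chosen to finish quickly, and the helper name is
# hypothetical.
def _smoke_run_cross_benchmark():
  k_time, fc_time = embedding_varlen(batch_size=32, max_length=8)
  print("keras: %f sec, feature column: %f sec" % (k_time, fc_time))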
def embedding_varlen(batch_size, max_length):
  """Benchmark a variable-length embedding."""
  # Data and constants.
  num_buckets = 10000
  vocab = fc_bm.create_vocabulary(32768)
  data = fc_bm.create_string_data(
      max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.0)

  # Keras implementation
  model = keras.Sequential()
  model.add(keras.Input(shape=(max_length,), name="data", dtype=dt.string))
  model.add(hashing.Hashing(num_buckets))

  # FC implementation
  fc = sfc.sequence_categorical_column_with_hash_bucket("data", num_buckets)

  # Wrap the FC implementation in a tf.function for a fair comparison
  @tf_function()
  def fc_fn(tensors):
    fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

  # Benchmark runs
  keras_data = {
      "data": data.to_tensor(default_value="", shape=(batch_size, max_length))
  }
  k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

  fc_data = {
      "data": data.to_tensor(default_value="", shape=(batch_size, max_length))
  }
  fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

  return k_avg_time, fc_avg_time
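# Hedged runner sketch: how a harness might sweep batch sizes over the
# benchmark above. The batch-size list, the report name format, and the
# fc_bm.LayerBenchmark base class with a report() hook are assumptions
# about the surrounding harness, not confirmed API.
class BenchmarkLayerSketch(fc_bm.LayerBenchmark):
  """Benchmark the layer forward pass (sketch)."""

  def benchmark_layer(self):
    for batch in [32, 256, 2048]:  # assumed sweep
      k_time, fc_time = embedding_varlen(batch_size=batch, max_length=256)
      self.report("hashing|dense|batch_%d" % batch, k_time, fc_time, batch)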