def test_with_1d_unknown_shape_sparse_tensor(self):
    """Checks DenseFeatures when every input tensor has an unknown static rank.

    All three features come from placeholders created without a shape, so the
    test first asserts that their static rank is `None`. The values fed at run
    time are a rank-1 price array, a rank-1 sparse body style and a rank-2
    country array.
    """
    # One embedding row per country id; the initializer below returns this
    # table verbatim.
    embedding_values = (
        (1., 2.),  # id 0
        (6., 7.),  # id 1
        (11., 12.)  # id 2
    )

    def _initializer(shape, dtype, partition_info=None):
        # The requested shape/dtype are deliberately ignored.
        del shape, dtype, partition_info
        return embedding_values

    # Contributes 1 dimension to the dense output.
    price = fc.numeric_column('price')
    # Contributes 3 dimensions (one per vocabulary entry).
    body_style = fc.categorical_column_with_vocabulary_list(
        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
    one_hot_body_style = fc.indicator_column(body_style)
    # Contributes 2 dimensions (the embedding dimension).
    country = fc.categorical_column_with_vocabulary_list(
        'country', vocabulary_list=['US', 'JP', 'CA'])
    embedded_country = fc.embedding_column(
        country, dimension=2, initializer=_initializer)

    # Placeholders without a shape: the static rank is unknown.
    price_input = array_ops.placeholder(dtypes.float32)
    body_style_input = array_ops.sparse_placeholder(dtypes.string)
    # The categorical column for 'country' receives a dense tensor.
    country_input = array_ops.placeholder(dtypes.string)
    features = {
        'price': price_input,
        'body-style': body_style_input,
        'country': country_input,
    }
    self.assertIsNone(price_input.shape.ndims)
    self.assertIsNone(body_style_input.get_shape().ndims)
    self.assertIsNone(country_input.shape.ndims)

    feed_dict = {
        price_input: np.array([11., 12.]),
        body_style_input: sparse_tensor.SparseTensorValue(
            indices=((0,), (1,)),
            values=('sedan', 'hardtop'),
            dense_shape=(2,)),
        country_input: np.array([['US'], ['CA']]),
    }

    columns = [price, one_hot_body_style, embedded_country]
    net = df.DenseFeatures(columns)(features)
    self.assertEqual(1 + 3 + 2, net.shape[1])

    # Each expected row holds the one-hot body style (3 values), then the
    # country embedding (2 values), then the price (1 value).
    expected_rows = [
        [0., 0., 1., 1., 2., 11.],
        [1., 0., 0., 11., 12., 12.],
    ]
    with _initialized_session() as sess:
        self.assertAllEqual(expected_rows, sess.run(net, feed_dict=feed_dict))
def test_with_1d_sparse_tensor(self):
    """Checks DenseFeatures on rank-1 dense and sparse inputs of known shape."""
    # One embedding row per country id; the initializer below returns this
    # table verbatim.
    embedding_values = (
        (1., 2., 3., 4., 5.),  # id 0
        (6., 7., 8., 9., 10.),  # id 1
        (11., 12., 13., 14., 15.)  # id 2
    )

    def _initializer(shape, dtype, partition_info=None):
        # The requested shape/dtype are deliberately ignored.
        del shape, dtype, partition_info
        return embedding_values

    # Contributes 1 dimension to the dense output.
    price = fc.numeric_column('price')
    # Contributes 3 dimensions (one per vocabulary entry).
    body_style = fc.categorical_column_with_vocabulary_list(
        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
    one_hot_body_style = fc.indicator_column(body_style)
    # Contributes 5 dimensions (the embedding dimension).
    country = fc.categorical_column_with_vocabulary_list(
        'country', vocabulary_list=['US', 'JP', 'CA'])
    embedded_country = fc.embedding_column(
        country, dimension=5, initializer=_initializer)

    with ops.Graph().as_default():
        # Rank-1 inputs: two dense tensors and one sparse tensor.
        price_tensor = constant_op.constant([
            11.,
            12.,
        ])
        body_style_tensor = sparse_tensor.SparseTensor(
            indices=((0,), (1,)),
            values=('sedan', 'hardtop'),
            dense_shape=(2,))
        # The categorical column for 'country' receives a dense tensor.
        country_tensor = constant_op.constant(['CA', 'US'])
        features = {
            'price': price_tensor,
            'body-style': body_style_tensor,
            'country': country_tensor,
        }
        self.assertEqual(1, price_tensor.shape.ndims)
        self.assertEqual(1, body_style_tensor.dense_shape.get_shape()[0])
        self.assertEqual(1, country_tensor.shape.ndims)

        columns = [price, one_hot_body_style, embedded_country]
        net = df.DenseFeatures(columns)(features)
        self.assertEqual(1 + 3 + 5, net.shape[1])

        # Each expected row holds the one-hot body style (3 values), then the
        # country embedding (5 values), then the price (1 value).
        expected_rows = [
            [0., 0., 1., 11., 12., 13., 14., 15., 11.],
            [1., 0., 0., 1., 2., 3., 4., 5., 12.],
        ]
        with _initialized_session() as sess:
            self.assertAllEqual(expected_rows, sess.run(net))
def test_crossed_column(self):
    """Round-trips a DenseFeatures layer wrapping an indicator of a cross.

    The layer is serialised with `get_config` and rebuilt with `from_config`;
    the rebuilt layer must hold exactly one column named 'a_X_b_indicator'.
    """
    a, b = (
        fc.categorical_column_with_vocabulary_list(
            key, vocabulary_list=['1', '2', '3'])
        for key in ('a', 'b')
    )
    crossed = fc.crossed_column([a, b], hash_bucket_size=2)

    original = df.DenseFeatures([fc.indicator_column(crossed)])
    restored = df.DenseFeatures.from_config(original.get_config())

    restored_columns = restored._feature_columns
    self.assertLen(restored_columns, 1)
    self.assertEqual(restored_columns[0].name, 'a_X_b_indicator')
def test_from_config(self, trainable, name):
    """Verifies a DenseFeatures layer survives a get_config/from_config trip.

    Args:
      trainable: `trainable` flag passed to the original layer.
      name: layer name passed to the original layer.
    """
    numeric = fc.numeric_column('a')
    embedding = fc.embedding_column(
        fc.categorical_column_with_vocabulary_list(
            'b', vocabulary_list=['1', '2', '3']),
        dimension=2)
    indicator = fc.indicator_column(
        fc.categorical_column_with_hash_bucket(key='c', hash_bucket_size=3))
    cols = [numeric, embedding, indicator]

    orig_layer = df.DenseFeatures(cols, trainable=trainable, name=name)
    new_layer = df.DenseFeatures.from_config(orig_layer.get_config())

    # Layer-level attributes are preserved.
    self.assertEqual(new_layer.name, orig_layer.name)
    self.assertEqual(new_layer.trainable, trainable)

    # Column-level attributes are preserved.
    restored_cols = new_layer._feature_columns
    self.assertLen(restored_cols, 3)
    self.assertEqual(restored_cols[0].name, 'a')
    self.assertEqual(restored_cols[1].initializer.mean, 0.0)
    self.assertEqual(restored_cols[1].categorical_column.name, 'b')

    # Every rebuilt column has the same class as the column it came from.
    for restored_col, original_col in zip(restored_cols, cols):
        self.assertIsInstance(restored_col, original_col.__class__)
def test_from_config(self, units, sparse_combiner, trainable, name):
    """Verifies a _LinearModelLayer survives a get_config/from_config trip.

    Args:
      units: `units` value passed to the original layer.
      sparse_combiner: `sparse_combiner` value passed to the original layer.
      trainable: `trainable` flag passed to the original layer.
      name: layer name passed to the original layer.
    """
    numeric = fc.numeric_column('a')
    vocab_column = fc.categorical_column_with_vocabulary_list(
        'b', vocabulary_list=('1', '2', '3'))
    hashed_column = fc.categorical_column_with_hash_bucket(
        key='c', hash_bucket_size=3)
    cols = [numeric, vocab_column, hashed_column]

    orig_layer = fc._LinearModelLayer(
        cols,
        units=units,
        sparse_combiner=sparse_combiner,
        trainable=trainable,
        name=name)
    new_layer = fc._LinearModelLayer.from_config(orig_layer.get_config())

    # Layer-level attributes are preserved.
    self.assertEqual(new_layer.name, orig_layer.name)
    self.assertEqual(new_layer._units, units)
    self.assertEqual(new_layer._sparse_combiner, sparse_combiner)
    self.assertEqual(new_layer.trainable, trainable)

    # Column-level attributes are preserved.
    restored_cols = new_layer._feature_columns
    self.assertLen(restored_cols, 3)
    self.assertEqual(restored_cols[0].name, 'a')
    self.assertEqual(restored_cols[1].vocabulary_list, ('1', '2', '3'))
    self.assertEqual(restored_cols[2].num_buckets, 3)
def testWeightedSparseFeaturesOOVWithNoOOVBuckets(self):
    """LinearClassifier with LinearSDCA with OOV features (-1 IDs).

    The third example carries the country 'GB', which is absent from the
    vocabulary list and no OOV buckets are configured. Training must still
    reach an evaluation loss below 0.2.
    """

    def input_fn():
        # All sparse features place a single value in column 0 of each row.
        sparse_indices = [[0, 0], [1, 0], [2, 0]]

        example_ids = constant_op.constant(['1', '2', '3'])
        prices = sparse_tensor.SparseTensor(
            values=[2., 3., 1.],
            indices=sparse_indices,
            dense_shape=[3, 5])
        countries = sparse_tensor.SparseTensor(
            # 'GB' is out of the vocabulary.
            values=['IT', 'US', 'GB'],
            indices=sparse_indices,
            dense_shape=[3, 5])
        labels = constant_op.constant([[1], [0], [1]])

        features = {
            'example_id': example_ids,
            'price': prices,
            'country': countries,
        }
        return features, labels

    country = feature_column_v2.categorical_column_with_vocabulary_list(
        'country', vocabulary_list=['US', 'CA', 'MK', 'IT', 'CN'])
    country_weighted_by_price = feature_column_v2.weighted_categorical_column(
        country, 'price')

    classifier = linear.LinearClassifier(
        feature_columns=[country_weighted_by_price],
        optimizer=linear.LinearSDCA(
            example_id_column='example_id',
            symmetric_l2_regularization=0.01))
    classifier.train(input_fn=input_fn, steps=100)

    metrics = classifier.evaluate(input_fn=input_fn, steps=1)
    self.assertLess(metrics['loss'], 0.2)
def test_linear_model_with_feature_column(self):
    """Fits DenseFeatures + LinearModel and checks the learned kernel.

    Each symbol is assigned a fixed target value plus small uniform noise.
    After training without a bias, the kernel of the first dense layer is
    expected to be within 0.01 of those per-symbol targets.
    """
    with context.eager_mode():
        vocab_list = ['alpha', 'beta', 'gamma']
        vocab_val = [0.4, 0.6, 0.9]

        # Random symbols; the label is the symbol's value plus noise.
        data = np.random.choice(vocab_list, size=256)
        y = np.zeros_like(data, dtype=np.float32)
        for vocab, val in zip(vocab_list, vocab_val):
            (rows,) = np.where(data == vocab)
            noise = np.random.uniform(low=-0.01, high=0.01, size=rows.shape)
            y[rows] = val + noise

        cat_column = fc.categorical_column_with_vocabulary_list(
            key='symbol', vocabulary_list=vocab_list)
        ind_column = fc.indicator_column(cat_column)

        dense_feature_layer = dense_features_v2.DenseFeatures([ind_column])
        linear_model = linear.LinearModel(
            use_bias=False, kernel_initializer='zeros')
        combined = sequential.Sequential([dense_feature_layer, linear_model])

        combined.compile(gradient_descent.SGD(learning_rate=0.1), 'mse', [])
        combined.fit(x={'symbol': data}, y=y, batch_size=32, epochs=10)

        learned_kernel = combined.layers[1].dense_layers[0].kernel.numpy()
        self.assertAllClose([[0.4], [0.6], [0.9]], learned_kernel, atol=0.01)
def test_train_premade_widedeep_model_with_feature_layers(self):
    """Trains a WideDeepModel with feature layers through an Estimator.

    The loss after training must be lower than before training and below 0.1.
    """
    vocab_list = ['alpha', 'beta', 'gamma']
    vocab_val = [0.4, 0.6, 0.9]

    # Random symbols; the label is the symbol's value plus noise.
    data = np.random.choice(vocab_list, size=256)
    y = np.zeros_like(data, dtype=np.float32)
    for vocab, val in zip(vocab_list, vocab_val):
        (rows,) = np.where(data == vocab)
        noise = np.random.uniform(low=-0.01, high=0.01, size=rows.shape)
        y[rows] = val + noise

    cat_column = feature_column.categorical_column_with_vocabulary_list(
        key='symbol', vocabulary_list=vocab_list)
    ind_column = feature_column.indicator_column(cat_column)
    # TODO(tanzheny): use emb column for dense part once b/139667019 is fixed.
    # emb_column = feature_column.embedding_column(cat_column, dimension=5)
    keras_input = keras.layers.Input(
        name='symbol', shape=3, dtype=dtypes.string)

    # Linear (wide) part: feature layer followed by a linear model.
    linear_feature_layer = dense_features.DenseFeatures([ind_column])
    linear_model = linear.LinearModel(
        units=1, name='Linear', kernel_initializer='zeros')
    combined_linear = keras.Sequential([linear_feature_layer, linear_model])

    # DNN (deep) part: feature layer followed by a single dense layer.
    dnn_feature_layer = dense_features.DenseFeatures([ind_column])
    dense_layer = keras.layers.Dense(
        units=1, name='DNNDense', kernel_initializer='zeros')
    combined_dnn = keras.Sequential([dnn_feature_layer, dense_layer])

    # Wide & deep model, compiled with one optimizer per part.
    wide_deep_model = wide_deep.WideDeepModel(combined_linear, combined_dnn)
    wide_deep_model._set_inputs({'symbol': keras_input})
    sgd_opt = gradient_descent.SGD(0.1)
    adam_opt = adam.Adam(0.1)
    wide_deep_model.compile([sgd_opt, adam_opt], 'mse', ['mse'])

    def _make_input_fn():
        # Training and evaluation read the same data with the same settings.
        return numpy_io.numpy_input_fn(
            x={'symbol': data}, y=y, num_epochs=20, shuffle=False)

    # Estimator wrapping the Keras model.
    train_input_fn = _make_input_fn()
    eval_input_fn = _make_input_fn()
    est = keras_lib.model_to_estimator(
        keras_model=wide_deep_model,
        config=self._config,
        checkpoint_format='saver')

    loss_before = est.evaluate(input_fn=eval_input_fn, steps=1)['loss']
    est.train(input_fn=train_input_fn, steps=20)
    loss_after = est.evaluate(input_fn=eval_input_fn, steps=1)['loss']

    self.assertLess(loss_after, loss_before)
    self.assertLess(loss_after, 0.1)
def test_wide_deep_model_with_two_feature_columns(self):
    """Fits a WideDeepModel whose two parts use different feature columns.

    The linear part consumes an indicator column and the DNN part consumes an
    embedding column of dimension 5. After fitting, the input widths seen by
    the two sub-models must be 3 and 5 respectively.
    """
    vocab_list = ['alpha', 'beta', 'gamma']
    vocab_val = [0.4, 0.6, 0.9]

    # Random symbols; the label is the symbol's value plus noise.
    data = np.random.choice(vocab_list, size=256)
    y = np.zeros_like(data, dtype=np.float32)
    for vocab, val in zip(vocab_list, vocab_val):
        (rows,) = np.where(data == vocab)
        noise = np.random.uniform(low=-0.01, high=0.01, size=rows.shape)
        y[rows] = val + noise

    cat_column = fc.categorical_column_with_vocabulary_list(
        key='symbol', vocabulary_list=vocab_list)
    ind_column = fc.indicator_column(cat_column)
    emb_column = fc.embedding_column(cat_column, dimension=5)

    # Linear (wide) part.
    linear_feature_layer = dense_features_v2.DenseFeatures([ind_column])
    linear_model = linear.LinearModel(
        use_bias=False, kernel_initializer='zeros')
    combined_linear = sequential.Sequential(
        [linear_feature_layer, linear_model])

    # DNN (deep) part.
    dnn_model = sequential.Sequential([core.Dense(units=1)])
    dnn_feature_layer = dense_features_v2.DenseFeatures([emb_column])
    combined_dnn = sequential.Sequential([dnn_feature_layer, dnn_model])

    wide_deep_model = wide_deep.WideDeepModel(combined_linear, combined_dnn)
    wide_deep_model.compile(
        gradient_descent.SGD(learning_rate=0.1),
        'mse',
        [],
        run_eagerly=testing_utils.should_run_eagerly(),
        experimental_run_tf_function=testing_utils.should_run_tf_function())
    wide_deep_model.fit(x={'symbol': data}, y=y, batch_size=32, epochs=10)

    linear_input_width = linear_model.inputs[0].shape[1]
    dnn_input_width = dnn_model.inputs[0].shape[1]
    self.assertEqual(3, linear_input_width)
    self.assertEqual(5, dnn_input_width)
def test_saving_with_dense_features(self):
    """Serialises a model containing DenseFeatures to JSON and reloads it.

    The reloaded model must still be able to predict on ten examples.
    """
    numeric_a = feature_column_v2.numeric_column('a')
    indicator_b = feature_column_v2.indicator_column(
        feature_column_v2.categorical_column_with_vocabulary_list(
            'b', ['one', 'two']))
    cols = [numeric_a, indicator_b]

    input_layers = {
        'a': keras.layers.Input(shape=(1,), name='a'),
        'b': keras.layers.Input(shape=(1,), name='b', dtype='string'),
    }

    fc_layer = feature_column_v2.DenseFeatures(cols)(input_layers)
    output = keras.layers.Dense(10)(fc_layer)

    model = keras.models.Model(input_layers, output)
    model.compile(
        loss=keras.losses.MSE,
        optimizer=keras.optimizers.RMSprop(lr=0.0001),
        metrics=[keras.metrics.categorical_accuracy])

    # Round trip through the JSON representation.
    loaded_model = model_config.model_from_json(model.to_json())

    inputs_a = np.arange(10).reshape(10, 1)
    inputs_b = np.arange(10).reshape(10, 1).astype('str')

    # Initialize tables for V1 lookup.
    if not context.executing_eagerly():
        self.evaluate(lookup_ops.tables_initializer())

    predictions = loaded_model.predict({'a': inputs_a, 'b': inputs_b})
    self.assertLen(predictions, 10)
def sequence_categorical_column_with_vocabulary_list(
        key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
    """Creates a sequence categorical column backed by an in-memory vocabulary.

    The result is meant to be passed to `embedding_column` or
    `indicator_column`, which turn sequence categorical data into a dense
    representation suitable as input to a sequence NN such as an RNN.

    Example:

    ```python
    colors = sequence_categorical_column_with_vocabulary_list(
        key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
        num_oov_buckets=2)
    colors_embedding = embedding_column(colors, dimension=3)
    columns = [colors_embedding]

    features = tf.io.parse_example(...,
                                   features=make_parse_example_spec(columns))
    sequence_feature_layer = SequenceFeatures(columns)
    sequence_input, sequence_length = sequence_feature_layer(features)
    sequence_length_mask = tf.sequence_mask(sequence_length)

    rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
    rnn_layer = tf.keras.layers.RNN(rnn_cell)
    outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
    ```

    Args:
      key: A unique string identifying the input feature.
      vocabulary_list: An ordered iterable defining the vocabulary. Each
        feature is mapped to the index of its value (if present) in
        `vocabulary_list`. Must be castable to `dtype`.
      dtype: The type of features. Only string and integer types are
        supported. If `None`, it will be inferred from `vocabulary_list`.
      default_value: The integer ID value to return for out-of-vocabulary
        feature values, defaults to `-1`. This can not be specified with a
        positive `num_oov_buckets`.
      num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
        buckets. All out-of-vocabulary inputs will be assigned IDs in the
        range `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)`
        based on a hash of the input value. A positive `num_oov_buckets` can
        not be specified with `default_value`.

    Returns:
      A `SequenceCategoricalColumn`.

    Raises:
      ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
      ValueError: `num_oov_buckets` is a negative integer.
      ValueError: `num_oov_buckets` and `default_value` are both specified.
      ValueError: if `dtype` is not integer or string.
    """
    # All arguments are forwarded unchanged to the non-sequence column, which
    # is then wrapped to mark it as a sequence column.
    categorical_column = fc.categorical_column_with_vocabulary_list(
        key=key,
        vocabulary_list=vocabulary_list,
        dtype=dtype,
        default_value=default_value,
        num_oov_buckets=num_oov_buckets)
    return fc.SequenceCategoricalColumn(categorical_column)
def sequence_categorical_column_with_vocabulary_list(
        key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
    """Creates a sequence categorical column backed by an in-memory vocabulary.

    The result is meant to be passed to `embedding_column` or
    `indicator_column`, which turn sequence categorical data into a dense
    representation suitable as input to a sequence NN such as an RNN.

    Example:

    ```python
    colors = sequence_categorical_column_with_vocabulary_list(
        key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
        num_oov_buckets=2)
    colors_embedding = embedding_column(colors, dimension=3)
    columns = [colors_embedding]

    features = tf.io.parse_example(...,
                                   features=make_parse_example_spec(columns))
    sequence_feature_layer = SequenceFeatures(columns)
    sequence_input, sequence_length = sequence_feature_layer(features)
    sequence_length_mask = tf.sequence_mask(sequence_length)

    rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
    rnn_layer = tf.keras.layers.RNN(rnn_cell)
    outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
    ```

    Args:
      key: A unique string identifying the input feature.
      vocabulary_list: An ordered iterable defining the vocabulary. Each
        feature is mapped to the index of its value (if present) in
        `vocabulary_list`. Must be castable to `dtype`.
      dtype: The type of features. Only string and integer types are
        supported. If `None`, it will be inferred from `vocabulary_list`.
      default_value: The integer ID value to return for out-of-vocabulary
        feature values, defaults to `-1`. This can not be specified with a
        positive `num_oov_buckets`.
      num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
        buckets. All out-of-vocabulary inputs will be assigned IDs in the
        range `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)`
        based on a hash of the input value. A positive `num_oov_buckets` can
        not be specified with `default_value`.

    Returns:
      A `SequenceCategoricalColumn`.

    Raises:
      ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
      ValueError: `num_oov_buckets` is a negative integer.
      ValueError: `num_oov_buckets` and `default_value` are both specified.
      ValueError: if `dtype` is not integer or string.
    """
    # All arguments are forwarded unchanged to the non-sequence column, which
    # is then wrapped to mark it as a sequence column.
    categorical_column = fc.categorical_column_with_vocabulary_list(
        key=key,
        vocabulary_list=vocabulary_list,
        dtype=dtype,
        default_value=default_value,
        num_oov_buckets=num_oov_buckets)
    return fc.SequenceCategoricalColumn(categorical_column)
def embedding_varlen(self, batch_size, max_length):
    """Benchmarks vocabulary lookup on variable-length string data.

    A Keras `StringLookup` layer (vocabulary read from a temp file) is timed
    against a `categorical_column_with_vocabulary_list` feature column over
    the same generated data.

    Args:
      batch_size: batch size handed to both benchmark runners; also scales the
        amount of generated data (`batch_size * NUM_REPEATS`).
      max_length: width the data is padded to before it is fed to either
        implementation.

    Returns:
      A `(keras_avg_time, fc_avg_time)` tuple as reported by
      `fc_bm.run_keras` and `fc_bm.run_fc`.
    """
    # Data and constants.
    vocab = fc_bm.create_vocabulary(32768)
    path = self._write_to_temp_file("tmp", vocab)
    data = fc_bm.create_string_data(
        max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.15)

    def _padded_batch():
        # Dense view of the generated data, padded with empty strings.
        # NOTE(review): `data` is presumably a ragged tensor - confirm.
        return {
            "data": data.to_tensor(
                default_value="", shape=(batch_size, max_length))
        }

    # Keras implementation
    model = keras.Sequential()
    model.add(keras.Input(shape=(max_length,), name="data", dtype=dt.string))
    model.add(string_lookup.StringLookup(vocabulary=path, mask_token=None))

    # FC implementation
    vocab_column = fcv2.categorical_column_with_vocabulary_list(
        key="data", vocabulary_list=vocab, num_oov_buckets=1)

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
        vocab_column.transform_feature(
            fcv2.FeatureTransformationCache(tensors), None)

    # Benchmark runs
    keras_data = _padded_batch()
    k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

    fc_data = _padded_batch()
    fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
def testFeatureColumns(self):
    """Saves and reloads a model built on DenseFeatures; predictions must match.

    The body only runs when executing eagerly; otherwise the test is a no-op.
    """
    # TODO(b/120099662): Error with table initialization with Keras models in
    # graph mode.
    if not context.executing_eagerly():
        return

    numeric = fc.numeric_column('a')
    bucketized = fc.bucketized_column(numeric, boundaries=[5, 10, 15])
    cat_vocab = fc.categorical_column_with_vocabulary_list(
        'b', ['1', '2', '3'])
    one_hot = fc.indicator_column(cat_vocab)
    embedding = fc.embedding_column(cat_vocab, dimension=8)

    feature_layer = DenseFeatures([bucketized, one_hot, embedding])
    model = keras.models.Sequential(feature_layer)

    features = {'a': np.array([13, 15]), 'b': np.array(['1', '2'])}
    predictions = model.predict(features)

    # Save in the SavedModel format, reload, and predict again.
    saved_model_dir = self._save_model_dir()
    model.save(saved_model_dir, save_format='tf')
    loaded = keras_load.load(saved_model_dir)

    self.assertAllClose(predictions, loaded.predict(features))
def test_train_premade_linear_model_with_dense_features(self):
    """Trains DenseFeatures + LinearModel through an Estimator.

    The loss after training must be lower than before training and below
    0.05.
    """
    vocab_list = ['alpha', 'beta', 'gamma']
    vocab_val = [0.4, 0.6, 0.9]

    # Random symbols; the label is the symbol's value plus noise.
    data = np.random.choice(vocab_list, size=256)
    y = np.zeros_like(data, dtype=np.float32)
    for vocab, val in zip(vocab_list, vocab_val):
        (rows,) = np.where(data == vocab)
        noise = np.random.uniform(low=-0.01, high=0.01, size=rows.shape)
        y[rows] = val + noise

    cat_column = feature_column.categorical_column_with_vocabulary_list(
        key='symbol', vocabulary_list=vocab_list)
    ind_column = feature_column.indicator_column(cat_column)

    # Functional model: input -> feature layer -> linear model.
    keras_input = keras.layers.Input(
        name='symbol', shape=3, dtype=dtypes.string)
    feature_layer = dense_features.DenseFeatures([ind_column])
    dense = feature_layer({'symbol': keras_input})
    linear_model = linear.LinearModel(units=1)
    prediction = linear_model(dense)

    model = keras.Model(inputs=keras_input, outputs=prediction)
    model.compile(gradient_descent.SGD(0.1), 'mse', ['mse'])

    def _make_input_fn():
        # Training and evaluation read the same data with the same settings.
        return numpy_io.numpy_input_fn(
            x={'symbol': data}, y=y, num_epochs=20, shuffle=False)

    train_input_fn = _make_input_fn()
    eval_input_fn = _make_input_fn()
    est = keras_lib.model_to_estimator(
        keras_model=model, config=self._config, checkpoint_format='saver')

    loss_before = est.evaluate(input_fn=eval_input_fn, steps=1)['loss']
    est.train(input_fn=train_input_fn, steps=30)
    loss_after = est.evaluate(input_fn=eval_input_fn, steps=1)['loss']

    self.assertLess(loss_after, loss_before)
    self.assertLess(loss_after, 0.05)
def test_wide_deep_model_with_single_feature_column(self):
    """Fits a WideDeepModel fed by a single shared DenseFeatures layer.

    The test makes no assertion on the result; it passes when compiling and
    fitting complete without raising.
    """
    vocab_list = ['alpha', 'beta', 'gamma']
    vocab_val = [0.4, 0.6, 0.9]

    # Random symbols; the label is the symbol's value plus noise.
    data = np.random.choice(vocab_list, size=256)
    y = np.zeros_like(data, dtype=np.float32)
    for vocab, val in zip(vocab_list, vocab_val):
        (rows,) = np.where(data == vocab)
        noise = np.random.uniform(low=-0.01, high=0.01, size=rows.shape)
        y[rows] = val + noise

    cat_column = feature_column_v2.categorical_column_with_vocabulary_list(
        key='symbol', vocabulary_list=vocab_list)
    ind_column = feature_column_v2.indicator_column(cat_column)

    # One feature layer feeds both the linear and the DNN part.
    dense_feature_layer = dense_features_v2.DenseFeatures([ind_column])
    linear_model = linear.LinearModel(
        use_bias=False, kernel_initializer='zeros')
    dnn_model = keras.Sequential([keras.layers.Dense(units=1)])
    wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model)
    combined = keras.Sequential([dense_feature_layer, wide_deep_model])

    combined.compile(
        gradient_descent.SGD(learning_rate=0.1),
        'mse',
        [],
        run_eagerly=testing_utils.should_run_eagerly())
    combined.fit(x={'symbol': data}, y=y, batch_size=32, epochs=10)
def _get_categorical_column(params: dict) -> fc.CategoricalColumn: if 'vocabulary' in params.keys(): feature = fc.categorical_column_with_vocabulary_list(params['key'], vocabulary_list=_parse_vocabulary( params['vocabulary']), default_value=0) elif 'bucket_size' in params.keys(): feature = fc.categorical_column_with_hash_bucket(params['key'], hash_bucket_size=params['bucket_size']) elif 'file' in params.keys(): feature = fc.categorical_column_with_vocabulary_file(params['key'], vocabulary_file=params['file'], default_value=0) elif 'num_buckets' in params.keys(): feature = fc.categorical_column_with_identity(params['key'], num_buckets=params['num_buckets']) elif 'boundaries' in params.keys(): feature = fc.bucketized_column(fc.numeric_column( params['key']), boundaries=params['boundaries']) else: raise Exception("params error") return feature