def test_train_premade_widedeep_model_with_feature_layers(self): vocab_list = ['alpha', 'beta', 'gamma'] vocab_val = [0.4, 0.6, 0.9] data = np.random.choice(vocab_list, size=256) y = np.zeros_like(data, dtype=np.float32) for vocab, val in zip(vocab_list, vocab_val): indices = np.where(data == vocab) y[indices] = val + np.random.uniform( low=-0.01, high=0.01, size=indices[0].shape) cat_column = tf.feature_column.categorical_column_with_vocabulary_list( key='symbol', vocabulary_list=vocab_list) ind_column = tf.feature_column.indicator_column(cat_column) # TODO(tanzheny): use emb column for dense part once b/139667019 is fixed. # emb_column = feature_column.embedding_column(cat_column, dimension=5) keras_input = keras.layers.Input(name='symbol', shape=3, dtype=tf.dtypes.string) # build linear part with feature layer. linear_feature_layer = dense_features.DenseFeatures([ind_column]) linear_model = linear.LinearModel(units=1, name='Linear', kernel_initializer='zeros') combined_linear = keras.Sequential( [linear_feature_layer, linear_model]) # build dnn part with feature layer. dnn_feature_layer = dense_features.DenseFeatures([ind_column]) dense_layer = keras.layers.Dense(units=1, name='DNNDense', kernel_initializer='zeros') combined_dnn = keras.Sequential([dnn_feature_layer, dense_layer]) # build and compile wide deep. wide_deep_model = wide_deep.WideDeepModel(combined_linear, combined_dnn) wide_deep_model._set_inputs({'symbol': keras_input}) sgd_opt = gradient_descent.SGD(0.1) adam_opt = adam.Adam(0.1) wide_deep_model.compile([sgd_opt, adam_opt], 'mse', ['mse']) # build estimator. train_input_fn = numpy_io.numpy_input_fn(x={'symbol': data}, y=y, num_epochs=20, shuffle=False) eval_input_fn = numpy_io.numpy_input_fn(x={'symbol': data}, y=y, num_epochs=20, shuffle=False) est = keras_lib.model_to_estimator(keras_model=wide_deep_model, config=self._config, checkpoint_format='saver') before_eval_results = est.evaluate(input_fn=eval_input_fn, steps=1) est.train(input_fn=train_input_fn, steps=20) after_eval_results = est.evaluate(input_fn=eval_input_fn, steps=1) self.assertLess(after_eval_results['loss'], before_eval_results['loss']) self.assertLess(after_eval_results['loss'], 0.1)
def test_multiple_layers_with_same_shared_embedding_column(self): categorical_column_a = fc.categorical_column_with_identity( key='aaa', num_buckets=3) categorical_column_b = fc.categorical_column_with_identity( key='bbb', num_buckets=3) embedding_dimension = 2 embedding_column_b, embedding_column_a = fc.shared_embedding_columns_v2( [categorical_column_b, categorical_column_a], dimension=embedding_dimension) with ops.Graph().as_default(): features = { 'aaa': sparse_tensor.SparseTensor(indices=((0, 0), (1, 0), (1, 1)), values=(0, 1, 0), dense_shape=(2, 2)), 'bbb': sparse_tensor.SparseTensor(indices=((0, 0), (1, 0), (1, 1)), values=(1, 2, 1), dense_shape=(2, 2)), } all_cols = [embedding_column_a, embedding_column_b] df.DenseFeatures(all_cols)(features) df.DenseFeatures(all_cols)(features) # Make sure that only 1 variable gets created in this case. self.assertEqual( 1, len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))) self.assertItemsEqual(['aaa_bbb_shared_embedding:0'], [ v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) ])
def test_with_1d_unknown_shape_sparse_tensor(self): embedding_values = ( (1., 2.), # id 0 (6., 7.), # id 1 (11., 12.) # id 2 ) def _initializer(shape, dtype, partition_info=None): del shape, dtype, partition_info return embedding_values # price has 1 dimension in dense_features price = fc.numeric_column('price') # one_hot_body_style has 3 dims in dense_features. body_style = fc.categorical_column_with_vocabulary_list( 'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan']) one_hot_body_style = fc.indicator_column(body_style) # embedded_body_style has 5 dims in dense_features. country = fc.categorical_column_with_vocabulary_list( 'country', vocabulary_list=['US', 'JP', 'CA']) embedded_country = fc.embedding_column(country, dimension=2, initializer=_initializer) # Provides 1-dim tensor and dense tensor. features = { 'price': array_ops.placeholder(dtypes.float32), 'body-style': array_ops.sparse_placeholder(dtypes.string), # This is dense tensor for the categorical_column. 'country': array_ops.placeholder(dtypes.string), } self.assertIsNone(features['price'].shape.ndims) self.assertIsNone(features['body-style'].get_shape().ndims) self.assertIsNone(features['country'].shape.ndims) price_data = np.array([11., 12.]) body_style_data = sparse_tensor.SparseTensorValue(indices=((0, ), (1, )), values=('sedan', 'hardtop'), dense_shape=(2, )) country_data = np.array([['US'], ['CA']]) net = df.DenseFeatures([price, one_hot_body_style, embedded_country])(features) self.assertEqual(1 + 3 + 2, net.shape[1]) with _initialized_session() as sess: # Each row is formed by concatenating `embedded_body_style`, # `one_hot_body_style`, and `price` in order. self.assertAllEqual( [[0., 0., 1., 1., 2., 11.], [1., 0., 0., 11., 12., 12.]], sess.run(net, feed_dict={ features['price']: price_data, features['body-style']: body_style_data, features['country']: country_data }))
def test_shared_sequence_non_sequence_into_input_layer(self): non_seq = fc.categorical_column_with_identity('non_seq', num_buckets=10) seq = sfc.sequence_categorical_column_with_identity('seq', num_buckets=10) shared_non_seq, shared_seq = fc.shared_embedding_columns_v2( [non_seq, seq], dimension=4, combiner='sum', initializer=init_ops_v2.Ones(), shared_embedding_collection_name='shared') seq = sparse_tensor.SparseTensor(indices=[[0, 0], [0, 1], [1, 0]], values=[0, 1, 2], dense_shape=[2, 2]) non_seq = sparse_tensor.SparseTensor(indices=[[0, 0], [0, 1], [1, 0]], values=[0, 1, 2], dense_shape=[2, 2]) features = {'seq': seq, 'non_seq': non_seq} # Tile the context features across the sequence features seq_input, seq_length = ksfc.SequenceFeatures([shared_seq])(features) non_seq_input = dense_features.DenseFeatures([shared_non_seq ])(features) with self.cached_session() as sess: sess.run(variables.global_variables_initializer()) output_seq, output_seq_length, output_non_seq = sess.run( [seq_input, seq_length, non_seq_input]) self.assertAllEqual( output_seq, [[[1, 1, 1, 1], [1, 1, 1, 1]], [[1, 1, 1, 1], [0, 0, 0, 0]]]) self.assertAllEqual(output_seq_length, [2, 1]) self.assertAllEqual(output_non_seq, [[2, 2, 2, 2], [1, 1, 1, 1]])
def test_column_order(self): price_a = fc.numeric_column('price_a') price_b = fc.numeric_column('price_b') with ops.Graph().as_default(): features = { 'price_a': [[1.]], 'price_b': [[3.]], } net1 = df.DenseFeatures([price_a, price_b])(features) net2 = df.DenseFeatures([price_b, price_a])(features) self.evaluate(variables_lib.global_variables_initializer()) self.evaluate(lookup_ops.tables_initializer()) self.assertAllClose([[1., 3.]], self.evaluate(net1)) self.assertAllClose([[1., 3.]], self.evaluate(net2))
def test_train_with_dense_features(self): feature_dict = { 'sex': np.int64([1, 1, 1, 1, 0]), 'cp': np.int64([0, 3, 3, 2, 1]), 'slope': np.int64([3, 2, 0, 3, 1]), } label = np.int64([0, 1, 0, 0, 0]) train_input_fn = numpy_io.numpy_input_fn(x=feature_dict, y=label, num_epochs=1, shuffle=False) feature_columns = list() input_features = dict() for feature_name, data_array in feature_dict.items(): feature_columns.append( tf.feature_column.indicator_column( tf.feature_column.categorical_column_with_identity( key=feature_name, num_buckets=np.size(np.unique(data_array))))) input_features[feature_name] = keras.layers.Input( name=feature_name, shape=(np.size(np.unique(data_array)), ), dtype=tf.dtypes.int64) x = dense_features.DenseFeatures(feature_columns)(input_features) x = keras.layers.Dense(16, activation='relu')(x) logits = keras.layers.Dense(1, activation='linear')(x) model = keras.Model(inputs=input_features, outputs=logits) model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy']) estimator_model = keras_lib.model_to_estimator(keras_model=model) estimator_model.train(input_fn=train_input_fn, steps=5)
def test_does_not_support_dict_columns(self): with self.assertRaisesRegexp( ValueError, 'Expected feature_columns to be iterable, found dict.'): df.DenseFeatures(feature_columns={'a': fc.numeric_column('a')})( features={ 'a': [[0]] })
def test_should_be_dense_column(self): with self.assertRaisesRegexp(ValueError, 'must be a .*DenseColumn'): df.DenseFeatures(feature_columns=[ fc.categorical_column_with_hash_bucket('wire_cast', 4) ])(features={ 'a': [[0]] })
def test_from_config(self, trainable, name): cols = [ fc.numeric_column('a'), fc.embedding_column(fc.categorical_column_with_vocabulary_list( 'b', vocabulary_list=['1', '2', '3']), dimension=2), fc.indicator_column( fc.categorical_column_with_hash_bucket(key='c', hash_bucket_size=3)) ] orig_layer = dense_features.DenseFeatures(cols, trainable=trainable, name=name) config = orig_layer.get_config() new_layer = dense_features.DenseFeatures.from_config(config) self.assertEqual(new_layer.name, orig_layer.name) self.assertEqual(new_layer.trainable, trainable) self.assertLen(new_layer._feature_columns, 3) self.assertEqual(new_layer._feature_columns[0].name, 'a') self.assertEqual(new_layer._feature_columns[1].initializer.mean, 0.0) self.assertEqual(new_layer._feature_columns[1].categorical_column.name, 'b') self.assertIsInstance(new_layer._feature_columns[2], fc.IndicatorColumn)
def test_raises_if_shape_mismatch(self): price = fc.numeric_column('price', shape=2) with ops.Graph().as_default(): features = {'price': [[1.], [5.]]} with self.assertRaisesRegexp( Exception, r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'): df.DenseFeatures([price])(features)
def test_bare_column(self): with ops.Graph().as_default(): features = features = {'a': [0.]} net = df.DenseFeatures(fc.numeric_column('a'))(features) self.evaluate(variables_lib.global_variables_initializer()) self.evaluate(lookup_ops.tables_initializer()) self.assertAllClose([[0.]], self.evaluate(net))
def test_raises_if_duplicate_name(self): with self.assertRaisesRegexp( ValueError, 'Duplicate feature column name found for columns'): df.DenseFeatures(feature_columns=[ fc.numeric_column('a'), fc.numeric_column('a') ])(features={ 'a': [[0]] })
def test_fails_for_categorical_column(self): animal = fc.categorical_column_with_identity('animal', num_buckets=4) with ops.Graph().as_default(): features = { 'animal': sparse_tensor.SparseTensor( indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2]) } with self.assertRaisesRegexp(Exception, 'must be a .*DenseColumn'): df.DenseFeatures([animal])(features)
def test_column_generator(self): with ops.Graph().as_default(): features = features = {'a': [0.], 'b': [1.]} columns = (fc.numeric_column(key) for key in features) net = df.DenseFeatures(columns)(features) self.evaluate(variables_lib.global_variables_initializer()) self.evaluate(lookup_ops.tables_initializer()) self.assertAllClose([[0., 1.]], self.evaluate(net))
def test_reshaping(self): price = fc.numeric_column('price', shape=[1, 2]) with ops.Graph().as_default(): features = {'price': [[[1., 2.]], [[5., 6.]]]} net = df.DenseFeatures([price])(features) self.evaluate(variables_lib.global_variables_initializer()) self.evaluate(lookup_ops.tables_initializer()) self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net))
def test_with_1d_sparse_tensor(self): embedding_values = ( (1., 2., 3., 4., 5.), # id 0 (6., 7., 8., 9., 10.), # id 1 (11., 12., 13., 14., 15.) # id 2 ) def _initializer(shape, dtype, partition_info=None): del shape, dtype, partition_info return embedding_values # price has 1 dimension in dense_features price = fc.numeric_column('price') # one_hot_body_style has 3 dims in dense_features. body_style = fc.categorical_column_with_vocabulary_list( 'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan']) one_hot_body_style = fc.indicator_column(body_style) # embedded_body_style has 5 dims in dense_features. country = fc.categorical_column_with_vocabulary_list( 'country', vocabulary_list=['US', 'JP', 'CA']) embedded_country = fc.embedding_column(country, dimension=5, initializer=_initializer) # Provides 1-dim tensor and dense tensor. features = { 'price': constant_op.constant([ 11., 12., ]), 'body-style': sparse_tensor.SparseTensor(indices=((0, ), (1, )), values=('sedan', 'hardtop'), dense_shape=(2, )), # This is dense tensor for the categorical_column. 'country': constant_op.constant(['CA', 'US']), } self.assertEqual(1, features['price'].shape.ndims) self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0]) self.assertEqual(1, features['country'].shape.ndims) net = df.DenseFeatures([price, one_hot_body_style, embedded_country])(features) self.assertEqual(1 + 3 + 5, net.shape[1]) with _initialized_session() as sess: # Each row is formed by concatenating `embedded_body_style`, # `one_hot_body_style`, and `price` in order. self.assertAllEqual([[0., 0., 1., 11., 12., 13., 14., 15., 11.], [1., 0., 0., 1., 2., 3., 4., 5., 12.]], sess.run(net))
def test_multi_column(self): price1 = fc.numeric_column('price1', shape=2) price2 = fc.numeric_column('price2') with ops.Graph().as_default(): features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]} net = df.DenseFeatures([price1, price2])(features) self.evaluate(variables_lib.global_variables_initializer()) self.evaluate(lookup_ops.tables_initializer()) self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], self.evaluate(net))
def test_dense_feature_with_partitioner(self): with context.eager_mode(): sparse_input = sparse_tensor.SparseTensor(indices=((0, 0), (1, 0), (2, 0), (3, 0)), values=(0, 1, 3, 2), dense_shape=(4, 4)) # Create feature columns (categorical and embedding). categorical_column = fc.categorical_column_with_identity( key='a', num_buckets=4) embedding_dimension = 2 def _embedding_column_initializer(shape, dtype, partition_info=None): offset = partition_info._var_offset[0] del shape # unused del dtype # unused if offset == 0: embedding_values = ( (1, 0), # id 0 (0, 1)) # id 1 else: embedding_values = ( (1, 1), # id 2 (2, 2)) # id 3 return embedding_values embedding_column = fc.embedding_column( categorical_column, dimension=embedding_dimension, initializer=_embedding_column_initializer) dense_features = df.DenseFeatures( [embedding_column], partitioner=partitioned_variables.fixed_size_partitioner(2)) features = {'a': sparse_input} inputs = dense_features(features) variables = dense_features.variables # Sanity check: test that the inputs are correct. self.assertAllEqual([[1, 0], [0, 1], [2, 2], [1, 1]], inputs) # Check that only one variable was created. self.assertEqual(2, len(variables)) # Check that invoking dense_features on the same features does not create # additional variables _ = dense_features(features) self.assertEqual(2, len(variables)) self.assertIs(variables[0], dense_features.variables[0]) self.assertIs(variables[1], dense_features.variables[1])
def test_static_batch_size_mismatch(self): price1 = fc.numeric_column('price1') price2 = fc.numeric_column('price2') with ops.Graph().as_default(): features = { 'price1': [[1.], [5.], [7.]], # batchsize = 3 'price2': [[3.], [4.]] # batchsize = 2 } with self.assertRaisesRegexp( ValueError, r'Batch size \(first dimension\) of each feature must be same.'): # pylint: disable=anomalous-backslash-in-string df.DenseFeatures([price1, price2])(features)
def test_with_rank_0_feature(self): # price has 1 dimension in dense_features price = fc.numeric_column('price') features = { 'price': constant_op.constant(0), } self.assertEqual(0, features['price'].shape.ndims) # Static rank 0 should fail with self.assertRaisesRegexp(ValueError, 'Feature .* cannot have rank 0'): df.DenseFeatures([price])(features) # Dynamic rank 0 should fail features = { 'price': array_ops.placeholder(dtypes.float32), } net = df.DenseFeatures([price])(features) self.assertEqual(1, net.shape[1]) with _initialized_session() as sess: with self.assertRaisesOpError('Feature .* cannot have rank 0'): sess.run(net, feed_dict={features['price']: np.array(1)})
def test_runtime_batch_size_mismatch(self): price1 = fc.numeric_column('price1') price2 = fc.numeric_column('price2') with ops.Graph().as_default(): features = { 'price1': array_ops.placeholder(dtype=dtypes.int64), # batchsize = 3 'price2': [[3.], [4.]] # batchsize = 2 } net = df.DenseFeatures([price1, price2])(features) with _initialized_session() as sess: with self.assertRaisesRegexp(errors.OpError, 'Dimensions of inputs should match'): sess.run(net, feed_dict={features['price1']: [[1.], [5.], [7.]]})
def __init__( self, rnn_layer, units, sequence_feature_columns, context_feature_columns=None, activation=None, return_sequences=False, **kwargs): """Initializes a RNNModel instance. Args: rnn_layer: A Keras RNN layer. units: An int indicating the dimension of the logit layer, and of the model output. sequence_feature_columns: An iterable containing the `FeatureColumn`s that represent sequential input. All items in the set should either be sequence columns (e.g. `sequence_numeric_column`) or constructed from one (e.g. `embedding_column` with `sequence_categorical_column_*` as input). context_feature_columns: An iterable containing the `FeatureColumn`s for contextual input. The data represented by these columns will be replicated and given to the RNN at each timestep. These columns must be instances of classes derived from `DenseColumn` such as `numeric_column`, not the sequential variants. activation: Activation function to apply to the logit layer (for instance `tf.keras.activations.sigmoid`). If you don't specify anything, no activation is applied. return_sequences: A boolean indicating whether to return the last output in the output sequence, or the full sequence. **kwargs: Additional arguments. Raises: ValueError: If `units` is not an int. """ super(RNNModel, self).__init__(**kwargs) if not isinstance(units, int): raise ValueError('units must be an int. Given type: {}'.format( type(units))) self._return_sequences = return_sequences self._sequence_feature_columns = sequence_feature_columns self._context_feature_columns = context_feature_columns self._sequence_features_layer = fc.SequenceFeatures( sequence_feature_columns) self._dense_features_layer = None if context_feature_columns: self._dense_features_layer = dense_features.DenseFeatures( context_feature_columns) self._rnn_layer = rnn_layer self._logits_layer = keras_layers.Dense( units=units, activation=activation, name='logits')
def test_multiple_layers_with_same_embedding_column(self): some_sparse_column = fc.categorical_column_with_hash_bucket( 'sparse_feature', hash_bucket_size=5) some_embedding_column = fc.embedding_column( some_sparse_column, dimension=10) with ops.Graph().as_default(): features = { 'sparse_feature': [['a'], ['x']], } all_cols = [some_embedding_column] df.DenseFeatures(all_cols)(features) df.DenseFeatures(all_cols)(features) # Make sure that 2 variables get created in this case. self.assertEqual(2, len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))) expected_var_names = [ 'dense_features/sparse_feature_embedding/embedding_weights:0', 'dense_features_1/sparse_feature_embedding/embedding_weights:0' ] self.assertItemsEqual( expected_var_names, [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
def test_crossed_column(self): a = fc.categorical_column_with_vocabulary_list( 'a', vocabulary_list=['1', '2', '3']) b = fc.categorical_column_with_vocabulary_list( 'b', vocabulary_list=['1', '2', '3']) ab = fc.crossed_column([a, b], hash_bucket_size=2) cols = [fc.indicator_column(ab)] orig_layer = dense_features.DenseFeatures(cols) config = orig_layer.get_config() new_layer = dense_features.DenseFeatures.from_config(config) self.assertLen(new_layer._feature_columns, 1) self.assertEqual(new_layer._feature_columns[0].name, 'a_X_b_indicator')
def test_runtime_batch_size_matches(self): price1 = fc.numeric_column('price1') price2 = fc.numeric_column('price2') with ops.Graph().as_default(): features = { 'price1': array_ops.placeholder(dtype=dtypes.int64), # batchsize = 2 'price2': array_ops.placeholder(dtype=dtypes.int64), # batchsize = 2 } net = df.DenseFeatures([price1, price2])(features) with _initialized_session() as sess: sess.run( net, feed_dict={ features['price1']: [[1.], [5.]], features['price2']: [[1.], [5.]], })
def test_cols_to_output_tensors(self): price1 = fc.numeric_column('price1', shape=2) price2 = fc.numeric_column('price2') with ops.Graph().as_default(): cols_dict = {} features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]} dense_features = df.DenseFeatures([price1, price2]) net = dense_features(features, cols_dict) self.evaluate(variables_lib.global_variables_initializer()) self.evaluate(lookup_ops.tables_initializer()) self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(cols_dict[price1])) self.assertAllClose([[3.], [4.]], self.evaluate(cols_dict[price2])) self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], self.evaluate(net))
def test_feature_column_dense_features_gradient(self): with context.eager_mode(): sparse_input = sparse_tensor.SparseTensor(indices=((0, 0), (1, 0), (2, 0)), values=(0, 1, 2), dense_shape=(3, 3)) # Create feature columns (categorical and embedding). categorical_column = fc.categorical_column_with_identity( key='a', num_buckets=3) embedding_dimension = 2 def _embedding_column_initializer(shape, dtype, partition_info=None): del shape # unused del dtype # unused del partition_info # unused embedding_values = ( (1, 0), # id 0 (0, 1), # id 1 (1, 1)) # id 2 return embedding_values embedding_column = fc.embedding_column( categorical_column, dimension=embedding_dimension, initializer=_embedding_column_initializer) dense_features = df.DenseFeatures([embedding_column]) features = {'a': sparse_input} def scale_matrix(): matrix = dense_features(features) return 2 * matrix # Sanity check: Verify that scale_matrix returns the correct output. self.assertAllEqual([[2, 0], [0, 2], [2, 2]], scale_matrix()) # Check that the returned gradient is correct. grad_function = backprop.implicit_grad(scale_matrix) grads_and_vars = grad_function() indexed_slice = grads_and_vars[0][0] gradient = grads_and_vars[0][0].values self.assertAllEqual([0, 1, 2], indexed_slice.indices) self.assertAllEqual([[2, 2], [2, 2], [2, 2]], gradient)
def test_compute_output_shape(self): price1 = fc.numeric_column('price1', shape=2) price2 = fc.numeric_column('price2', shape=4) with ops.Graph().as_default(): features = { 'price1': [[1., 2.], [5., 6.]], 'price2': [[3., 4., 5., 6.], [7., 8., 9., 10.]] } dense_features = df.DenseFeatures([price1, price2]) self.assertEqual((None, 6), dense_features.compute_output_shape((None,))) net = dense_features(features) self.evaluate(variables_lib.global_variables_initializer()) self.evaluate(lookup_ops.tables_initializer()) self.assertAllClose([[1., 2., 3., 4., 5., 6.], [5., 6., 7., 8., 9., 10.]], self.evaluate(net))
def test_reuses_variables(self): with context.eager_mode(): sparse_input = sparse_tensor.SparseTensor(indices=((0, 0), (1, 0), (2, 0)), values=(0, 1, 2), dense_shape=(3, 3)) # Create feature columns (categorical and embedding). categorical_column = fc.categorical_column_with_identity( key='a', num_buckets=3) embedding_dimension = 2 def _embedding_column_initializer(shape, dtype, partition_info=None): del shape # unused del dtype # unused del partition_info # unused embedding_values = ( (1, 0), # id 0 (0, 1), # id 1 (1, 1)) # id 2 return embedding_values embedding_column = fc.embedding_column( categorical_column, dimension=embedding_dimension, initializer=_embedding_column_initializer) dense_features = df.DenseFeatures([embedding_column]) features = {'a': sparse_input} inputs = dense_features(features) variables = dense_features.variables # Sanity check: test that the inputs are correct. self.assertAllEqual([[1, 0], [0, 1], [1, 1]], inputs) # Check that only one variable was created. self.assertEqual(1, len(variables)) # Check that invoking dense_features on the same features does not create # additional variables _ = dense_features(features) self.assertEqual(1, len(variables)) self.assertEqual(variables[0], dense_features.variables[0])
def DISABLED_test_train_with_dense_features_embedding(self): feature_dict = { 'sex': np.int64([1, 1, 1, 1, 0]), 'cp': np.int64([0, 3, 3, 2, 1]), 'slope': np.int64([3, 2, 0, 3, 1]), } label = np.int64([0, 1, 0, 0, 0]) train_input_fn = numpy_io.numpy_input_fn(x=feature_dict, y=label, num_epochs=1, shuffle=False) feature_columns = list() input_features = dict() for feature_name, data_array in feature_dict.items(): feature_columns.append( tf.feature_column.embedding_column( tf.feature_column.categorical_column_with_identity( key=feature_name, num_buckets=np.size(np.unique(data_array))), dimension=3)) input_features[feature_name] = keras.layers.Input( name=feature_name, shape=(np.size(np.unique(data_array)), ), dtype=tf.dtypes.int64) df = dense_features.DenseFeatures(feature_columns) x = df(input_features) x = keras.layers.Dense(16, activation='relu')(x) logits = keras.layers.Dense(1, activation='linear')(x) model = keras.Model(inputs=input_features, outputs=logits) model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy']) estimator_model = keras_lib.model_to_estimator(keras_model=model) estimator_model.train(input_fn=train_input_fn, steps=5) # We assert that we find the embedding_weights variables in the dependencies # for the DenseFeatures layer. dependency_names = [x.name for x in df._checkpoint_dependencies] self.assertNotIn('embedding_weights', dependency_names) self.assertIn('cp_embedding/embedding_weights', dependency_names) self.assertIn('sex_embedding/embedding_weights', dependency_names) self.assertIn('slope_embedding/embedding_weights', dependency_names)