def test_does_not_support_dict_columns(self):
    # Passing a dict as `feature_columns` must raise a ValueError.
    expected_message = 'Expected feature_columns to be iterable, found dict.'
    with self.assertRaisesRegexp(ValueError, expected_message):
        dict_columns = {'a': fc.numeric_column('a')}
        layer = df.DenseFeatures(feature_columns=dict_columns)
        layer(features={'a': [[0]]})
def test_raises_if_duplicate_name(self):
    # Two columns that share the name 'a' must raise a ValueError.
    expected_message = 'Duplicate feature column name found for columns'
    with self.assertRaisesRegexp(ValueError, expected_message):
        duplicated_columns = [fc.numeric_column('a'), fc.numeric_column('a')]
        layer = df.DenseFeatures(feature_columns=duplicated_columns)
        layer(features={'a': [[0]]})
def test_bare_column(self):
    # A single column passed directly (not wrapped in a list) is accepted
    # and its value is returned as a [batch, 1] result.
    with ops.Graph().as_default():
        features = {'a': [0.]}
        net = df.DenseFeatures(fc.numeric_column('a'))(features)

        self.evaluate(variables_lib.global_variables_initializer())
        self.evaluate(lookup_ops.tables_initializer())

        self.assertAllClose([[0.]], self.evaluate(net))
def test_raises_if_shape_mismatch(self):
    # The column declares shape=2 but each example supplies one value, so
    # building the layer output is expected to fail with a reshape error.
    price = fc.numeric_column('price', shape=2)
    expected_message = (
        r'Cannot reshape a tensor with 2 elements to shape \[2,2\]')
    with ops.Graph().as_default():
        features = {'price': [[1.], [5.]]}
        with self.assertRaisesRegexp(Exception, expected_message):
            layer = df.DenseFeatures([price])
            layer(features)
def test_column_generator(self):
    # Feature columns may be supplied through a generator expression
    # instead of a list.
    with ops.Graph().as_default():
        features = {'a': [0.], 'b': [1.]}
        columns = (fc.numeric_column(key) for key in features)
        net = df.DenseFeatures(columns)(features)

        self.evaluate(variables_lib.global_variables_initializer())
        self.evaluate(lookup_ops.tables_initializer())

        self.assertAllClose([[0., 1.]], self.evaluate(net))
def test_reshaping(self):
    # A column declared with shape [1, 2] comes out as two values per
    # example in the layer output.
    price = fc.numeric_column('price', shape=[1, 2])
    with ops.Graph().as_default():
        features = {'price': [[[1., 2.]], [[5., 6.]]]}
        dense_layer = df.DenseFeatures([price])
        net = dense_layer(features)

        self.evaluate(variables_lib.global_variables_initializer())
        self.evaluate(lookup_ops.tables_initializer())

        expected = [[1., 2.], [5., 6.]]
        self.assertAllClose(expected, self.evaluate(net))
def test_with_1d_sparse_tensor(self):
    # Rank-1 dense and sparse inputs are accepted by the layer.
    embedding_values = (
        (1., 2., 3., 4., 5.),  # id 0
        (6., 7., 8., 9., 10.),  # id 1
        (11., 12., 13., 14., 15.)  # id 2
    )

    def _initializer(shape, dtype, partition_info=None):
        # The requested shape/dtype are ignored; the fixed table above is
        # always returned.
        del shape, dtype, partition_info
        return embedding_values

    # price has 1 dimension in dense_features
    price = fc.numeric_column('price')

    # one_hot_body_style has 3 dims in dense_features.
    body_style = fc.categorical_column_with_vocabulary_list(
        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
    one_hot_body_style = fc.indicator_column(body_style)

    # embedded_country has 5 dims in dense_features.
    country = fc.categorical_column_with_vocabulary_list(
        'country', vocabulary_list=['US', 'JP', 'CA'])
    embedded_country = fc.embedding_column(
        country, dimension=5, initializer=_initializer)

    # Provides 1-dim tensor and dense tensor.
    price_input = constant_op.constant([
        11.,
        12.,
    ])
    body_style_input = sparse_tensor.SparseTensor(
        indices=((0, ), (1, )),
        values=('sedan', 'hardtop'),
        dense_shape=(2, ))
    # This is dense tensor for the categorical_column.
    country_input = constant_op.constant(['CA', 'US'])
    features = {
        'price': price_input,
        'body-style': body_style_input,
        'country': country_input,
    }
    self.assertEqual(1, features['price'].shape.ndims)
    self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0])
    self.assertEqual(1, features['country'].shape.ndims)

    dense_layer = df.DenseFeatures(
        [price, one_hot_body_style, embedded_country])
    net = dense_layer(features)
    self.assertEqual(1 + 3 + 5, net.shape[1])

    with _initialized_session() as sess:
        # As the expected values show, each row is the concatenation of
        # `one_hot_body_style` (3 values), `embedded_country` (5 values)
        # and `price` (1 value), in that order.
        expected_rows = [[0., 0., 1., 11., 12., 13., 14., 15., 11.],
                         [1., 0., 0., 1., 2., 3., 4., 5., 12.]]
        self.assertAllEqual(expected_rows, sess.run(net))
def test_fails_for_categorical_column(self):
    # A categorical column that is not wrapped in a dense column must be
    # rejected by the layer.
    animal = fc.categorical_column_with_identity('animal', num_buckets=4)
    with ops.Graph().as_default():
        animal_ids = sparse_tensor.SparseTensor(
            indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
        features = {'animal': animal_ids}
        with self.assertRaisesRegexp(Exception, 'must be a .*DenseColumn'):
            layer = df.DenseFeatures([animal])
            layer(features)
def test_multi_column(self):
    # Two numeric columns are concatenated per example: the two `price1`
    # values followed by the single `price2` value.
    price1 = fc.numeric_column('price1', shape=2)
    price2 = fc.numeric_column('price2')
    with ops.Graph().as_default():
        features = {
            'price1': [[1., 2.], [5., 6.]],
            'price2': [[3.], [4.]],
        }
        dense_layer = df.DenseFeatures([price1, price2])
        net = dense_layer(features)

        self.evaluate(variables_lib.global_variables_initializer())
        self.evaluate(lookup_ops.tables_initializer())

        expected = [[1., 2., 3.], [5., 6., 4.]]
        self.assertAllClose(expected, self.evaluate(net))
def test_dense_feature_with_training_arg(self):
    # The `training` argument given to the layer call must reach the
    # column's transform, changing the produced values.
    price1 = fc.numeric_column('price1', shape=2)
    price2 = fc.numeric_column('price2')

    # Monkey patch the second numeric column to simulate a column that has
    # different behavior by mode.
    def training_aware_get_dense_tensor(transformation_cache,
                                        state_manager,
                                        training=None):
        return transformation_cache.get(
            price2, state_manager, training=training)

    def training_aware_transform_feature(transformation_cache,
                                         state_manager,
                                         training=None):
        input_tensor = transformation_cache.get(
            price2.key, state_manager, training=training)
        # Scale by 10 when training, by 20 otherwise.
        multiplier = 10.0 if training else 20.0
        return input_tensor * multiplier

    price2.get_dense_tensor = training_aware_get_dense_tensor
    price2.transform_feature = training_aware_transform_feature

    with ops.Graph().as_default():
        features = {
            'price1': [[1., 2.], [5., 6.]],
            'price2': [[3.], [4.]],
        }
        columns = [price1, price2]
        train_mode = df.DenseFeatures(columns)(features, training=True)
        predict_mode = df.DenseFeatures(columns)(features, training=False)

        self.evaluate(variables_lib.global_variables_initializer())
        self.evaluate(lookup_ops.tables_initializer())

        self.assertAllClose([[1., 2., 30.], [5., 6., 40.]],
                            self.evaluate(train_mode))
        self.assertAllClose([[1., 2., 60.], [5., 6., 80.]],
                            self.evaluate(predict_mode))
def test_static_batch_size_mismatch(self):
    # Features whose statically known batch sizes differ must be rejected
    # when the layer is called.
    price1 = fc.numeric_column('price1')
    price2 = fc.numeric_column('price2')
    expected_message = (  # pylint: disable=anomalous-backslash-in-string
        r'Batch size \(first dimension\) of each feature must be same.')
    with ops.Graph().as_default():
        features = {
            'price1': [[1.], [5.], [7.]],  # batchsize = 3
            'price2': [[3.], [4.]],  # batchsize = 2
        }
        with self.assertRaisesRegexp(ValueError, expected_message):
            layer = df.DenseFeatures([price1, price2])
            layer(features)
def test_with_rank_0_feature(self):
    # Rank-0 features are rejected, both when the rank is known statically
    # and when it is only discovered at run time.

    # price has 1 dimension in dense_features
    price = fc.numeric_column('price')
    scalar_features = {
        'price': constant_op.constant(0),
    }
    self.assertEqual(0, scalar_features['price'].shape.ndims)

    # Static rank 0 should fail
    with self.assertRaisesRegexp(ValueError,
                                 'Feature .* cannot have rank 0'):
        df.DenseFeatures([price])(scalar_features)

    # Dynamic rank 0 should fail
    price_placeholder = array_ops.placeholder(dtypes.float32)
    dynamic_features = {
        'price': price_placeholder,
    }
    net = df.DenseFeatures([price])(dynamic_features)
    self.assertEqual(1, net.shape[1])
    with _initialized_session() as sess:
        with self.assertRaisesOpError('Feature .* cannot have rank 0'):
            sess.run(net, feed_dict={price_placeholder: np.array(1)})
def test_multiple_layers_with_same_embedding_column(self):
    # Two separate layers built from the same embedding column each get
    # their own embedding variable.
    some_sparse_column = fc.categorical_column_with_hash_bucket(
        'sparse_feature', hash_bucket_size=5)
    some_embedding_column = fc.embedding_column(
        some_sparse_column, dimension=10)

    with ops.Graph().as_default():
        features = {
            'sparse_feature': [['a'], ['x']],
        }
        all_cols = [some_embedding_column]
        # Build and call two independent layers over the same columns.
        for _ in range(2):
            df.DenseFeatures(all_cols)(features)

        # Make sure that 2 variables get created in this case.
        global_variables = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
        self.assertEqual(2, len(global_variables))

        expected_var_names = [
            'dense_features/sparse_feature_embedding/embedding_weights:0',
            'dense_features_1/sparse_feature_embedding/embedding_weights:0'
        ]
        actual_var_names = [v.name for v in global_variables]
        self.assertItemsEqual(expected_var_names, actual_var_names)
def test_runtime_batch_size_mismatch(self):
    # A batch-size mismatch that is only visible once the placeholder is
    # fed must surface as an op error when the graph runs.
    price1 = fc.numeric_column('price1')
    price2 = fc.numeric_column('price2')
    with ops.Graph().as_default():
        price1_placeholder = array_ops.placeholder(dtype=dtypes.int64)
        features = {
            'price1': price1_placeholder,  # batchsize = 3
            'price2': [[3.], [4.]],  # batchsize = 2
        }
        net = df.DenseFeatures([price1, price2])(features)
        with _initialized_session() as sess:
            with self.assertRaisesRegexp(
                    errors.OpError, 'Dimensions of inputs should match'):
                fed_values = {price1_placeholder: [[1.], [5.], [7.]]}
                sess.run(net, feed_dict=fed_values)
def test_feature_column_dense_features_gradient(self):
    # In eager mode, gradients of a function of the layer output are
    # checked through their `indices` and `values`.
    with context.eager_mode():
        sparse_input = sparse_tensor.SparseTensor(
            indices=((0, 0), (1, 0), (2, 0)),
            values=(0, 1, 2),
            dense_shape=(3, 3))

        # Create feature columns (categorical and embedding).
        categorical_column = fc.categorical_column_with_identity(
            key='a', num_buckets=3)
        embedding_dimension = 2

        def _embedding_column_initializer(shape,
                                          dtype,
                                          partition_info=None):
            # All arguments are unused; a fixed table is returned.
            del shape, dtype, partition_info
            return (
                (1, 0),  # id 0
                (0, 1),  # id 1
                (1, 1),  # id 2
            )

        embedding_column = fc.embedding_column(
            categorical_column,
            dimension=embedding_dimension,
            initializer=_embedding_column_initializer)

        dense_features = df.DenseFeatures([embedding_column])
        features = {'a': sparse_input}

        def scale_matrix():
            return 2 * dense_features(features)

        # Sanity check: Verify that scale_matrix returns the correct output.
        self.assertAllEqual([[2, 0], [0, 2], [2, 2]], scale_matrix())

        # Check that the returned gradient is correct.
        grad_function = backprop.implicit_grad(scale_matrix)
        grads_and_vars = grad_function()
        # First (gradient, variable) pair; the gradient exposes `indices`
        # and `values`.
        indexed_slice = grads_and_vars[0][0]
        gradient = indexed_slice.values

        self.assertAllEqual([0, 1, 2], indexed_slice.indices)
        self.assertAllEqual([[2, 2], [2, 2], [2, 2]], gradient)
def test_reuses_variables(self):
    # Calling the same layer twice on the same features must not create a
    # second embedding variable.
    with context.eager_mode():
        sparse_input = sparse_tensor.SparseTensor(
            indices=((0, 0), (1, 0), (2, 0)),
            values=(0, 1, 2),
            dense_shape=(3, 3))

        # Create feature columns (categorical and embedding).
        categorical_column = fc.categorical_column_with_identity(
            key='a', num_buckets=3)
        embedding_dimension = 2

        def _embedding_column_initializer(shape,
                                          dtype,
                                          partition_info=None):
            del shape  # unused
            del dtype  # unused
            del partition_info  # unused
            embedding_values = (
                (1, 0),  # id 0
                (0, 1),  # id 1
                (1, 1))  # id 2
            return embedding_values

        embedding_column = fc.embedding_column(
            categorical_column,
            dimension=embedding_dimension,
            initializer=_embedding_column_initializer)

        dense_features = df.DenseFeatures([embedding_column])
        features = {'a': sparse_input}

        inputs = dense_features(features)
        variables = dense_features.variables

        # Sanity check: test that the inputs are correct.
        self.assertAllEqual([[1, 0], [0, 1], [1, 1]], inputs)

        # Check that only one variable was created.
        self.assertEqual(1, len(variables))

        # Check that invoking dense_features on the same features does not
        # create additional variables. Re-read the property after the second
        # call: the list captured above may be a snapshot, in which case
        # asserting on it alone could never reveal a newly created variable.
        _ = dense_features(features)
        variables_after_second_call = dense_features.variables
        self.assertEqual(1, len(variables_after_second_call))
        self.assertEqual(variables[0], variables_after_second_call[0])
def test_runtime_batch_size_matches(self):
    # When both placeholders are fed batches of the same size, running the
    # layer output succeeds.
    price1 = fc.numeric_column('price1')
    price2 = fc.numeric_column('price2')
    with ops.Graph().as_default():
        price1_placeholder = array_ops.placeholder(dtype=dtypes.int64)
        price2_placeholder = array_ops.placeholder(dtype=dtypes.int64)
        features = {
            'price1': price1_placeholder,  # batchsize = 2
            'price2': price2_placeholder,  # batchsize = 2
        }
        net = df.DenseFeatures([price1, price2])(features)
        with _initialized_session() as sess:
            fed_values = {
                price1_placeholder: [[1.], [5.]],
                price2_placeholder: [[1.], [5.]],
            }
            sess.run(net, feed_dict=fed_values)
def test_cols_to_output_tensors(self):
    # The dict passed as the second call argument is filled with one
    # output tensor per column, keyed by the column object.
    price1 = fc.numeric_column('price1', shape=2)
    price2 = fc.numeric_column('price2')
    with ops.Graph().as_default():
        cols_dict = {}
        features = {
            'price1': [[1., 2.], [5., 6.]],
            'price2': [[3.], [4.]],
        }
        dense_features = df.DenseFeatures([price1, price2])
        net = dense_features(features, cols_dict)

        self.evaluate(variables_lib.global_variables_initializer())
        self.evaluate(lookup_ops.tables_initializer())

        price1_output = self.evaluate(cols_dict[price1])
        self.assertAllClose([[1., 2.], [5., 6.]], price1_output)
        price2_output = self.evaluate(cols_dict[price2])
        self.assertAllClose([[3.], [4.]], price2_output)
        self.assertAllClose([[1., 2., 3.], [5., 6., 4.]],
                            self.evaluate(net))
def DISABLED_test_train_with_dense_features_v2(self):
    # Train an estimator converted from a Keras model that starts with a
    # DenseFeatures layer, then inspect the layer's checkpoint dependencies.
    feature_dict = {
        'sex': np.int64([1, 1, 1, 1, 0]),
        'cp': np.int64([0, 3, 3, 2, 1]),
        'slope': np.int64([3, 2, 0, 3, 1]),
    }
    label = np.int64([0, 1, 0, 0, 0])
    train_input_fn = numpy_io.numpy_input_fn(
        x=feature_dict, y=label, num_epochs=1, shuffle=False)

    feature_columns = []
    input_features = {}
    for feature_name, data_array in feature_dict.items():
        # Number of distinct values, used both as the bucket count and as
        # the Keras input width.
        distinct_count = np.size(np.unique(data_array))
        identity_column = tf.feature_column.categorical_column_with_identity(
            key=feature_name, num_buckets=distinct_count)
        feature_columns.append(
            tf.feature_column.embedding_column(identity_column, dimension=3))
        input_features[feature_name] = keras.layers.Input(
            name=feature_name,
            shape=(distinct_count, ),
            dtype=tf.dtypes.int64)

    dense_layer = dense_features_v2.DenseFeatures(feature_columns)
    hidden = dense_layer(input_features)
    hidden = keras.layers.Dense(16, activation='relu')(hidden)
    logits = keras.layers.Dense(1, activation='linear')(hidden)
    model = keras.Model(inputs=input_features, outputs=logits)

    model.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['accuracy'])
    estimator_model = keras_lib.model_to_estimator(keras_model=model)
    estimator_model.train(input_fn=train_input_fn, steps=5)

    # We assert that we find the embedding_weights variables in the
    # dependencies for the DenseFeatures layer.
    dependency_names = [
        dependency.name for dependency in dense_layer._checkpoint_dependencies
    ]
    self.assertNotIn('embedding_weights', dependency_names)
    for expected_name in ('cp_embedding/embedding_weights',
                          'sex_embedding/embedding_weights',
                          'slope_embedding/embedding_weights'):
        self.assertIn(expected_name, dependency_names)
def test_compute_output_shape(self):
    # `compute_output_shape` reports the summed column widths (2 + 4), and
    # the layer output matches that width.
    price1 = fc.numeric_column('price1', shape=2)
    price2 = fc.numeric_column('price2', shape=4)
    with ops.Graph().as_default():
        features = {
            'price1': [[1., 2.], [5., 6.]],
            'price2': [[3., 4., 5., 6.], [7., 8., 9., 10.]],
        }
        dense_features = df.DenseFeatures([price1, price2])
        reported_shape = dense_features.compute_output_shape((None, ))
        self.assertEqual((None, 6), reported_shape)

        net = dense_features(features)

        self.evaluate(variables_lib.global_variables_initializer())
        self.evaluate(lookup_ops.tables_initializer())

        expected = [[1., 2., 3., 4., 5., 6.], [5., 6., 7., 8., 9., 10.]]
        self.assertAllClose(expected, self.evaluate(net))
def test_linear_model_with_feature_column(self):
    # Fit a bias-free linear model on one-hot vocabulary inputs and check
    # that the learned kernel is close to the per-token target values.
    vocab_list = ['alpha', 'beta', 'gamma']
    vocab_val = [0.4, 0.6, 0.9]
    data = np.random.choice(vocab_list, size=256)
    y = np.zeros_like(data, dtype=np.float32)
    for token, target in zip(vocab_list, vocab_val):
        # Positions holding this token get its target plus small noise.
        positions = np.where(data == token)
        noise = np.random.uniform(
            low=-0.01, high=0.01, size=positions[0].shape)
        y[positions] = target + noise

    cat_column = feature_column_v2.categorical_column_with_vocabulary_list(
        key='symbol', vocabulary_list=vocab_list)
    ind_column = feature_column_v2.indicator_column(cat_column)
    dense_feature_layer = dense_features_v2.DenseFeatures([ind_column])
    linear_model = linear.LinearModel(
        use_bias=False, kernel_initializer='zeros')
    combined = keras.Sequential([dense_feature_layer, linear_model])

    opt = gradient_descent.SGD(learning_rate=0.1)
    combined.compile(opt, 'mse', [])
    combined.fit(x={'symbol': data}, y=y, batch_size=32, epochs=10)

    learned_kernel = combined.layers[1].dense_layers[0].kernel.numpy()
    self.assertAllClose([[0.4], [0.6], [0.9]], learned_kernel, atol=0.01)
def test_wide_deep_model_with_single_feature_column(self):
    # Compile and fit a wide-and-deep model fed by one indicator column.
    # Nothing is asserted on the outcome, only that fitting runs.
    vocab_list = ['alpha', 'beta', 'gamma']
    vocab_val = [0.4, 0.6, 0.9]
    data = np.random.choice(vocab_list, size=256)
    y = np.zeros_like(data, dtype=np.float32)
    for token, target in zip(vocab_list, vocab_val):
        # Positions holding this token get its target plus small noise.
        positions = np.where(data == token)
        noise = np.random.uniform(
            low=-0.01, high=0.01, size=positions[0].shape)
        y[positions] = target + noise

    cat_column = fc.categorical_column_with_vocabulary_list(
        key='symbol', vocabulary_list=vocab_list)
    ind_column = fc.indicator_column(cat_column)
    dense_feature_layer = dense_features_v2.DenseFeatures([ind_column])

    linear_model = linear.LinearModel(
        use_bias=False, kernel_initializer='zeros')
    dnn_model = sequential.Sequential([core.Dense(units=1)])
    wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model)
    combined = sequential.Sequential([dense_feature_layer, wide_deep_model])

    opt = gradient_descent.SGD(learning_rate=0.1)
    combined.compile(
        opt, 'mse', [], run_eagerly=testing_utils.should_run_eagerly())
    combined.fit(x={'symbol': data}, y=y, batch_size=32, epochs=10)
def __init__(self,
             units,
             hidden_units,
             feature_columns,
             activation_fn,
             dropout,
             batch_norm,
             name=None,
             **kwargs):
    """Creates the input, hidden, and logits layers of the model.

    Args:
      units: Passed as `units` to the final (logits) Dense layer.
      hidden_units: Iterable of hidden-layer sizes; one Dense layer is
        created per entry, in order.
      feature_columns: Feature columns for the input layer. They must
        satisfy `feature_column_lib.is_feature_column_v2`.
      activation_fn: Activation used by every hidden Dense layer.
      dropout: When not None, the rate of a Dropout layer created for each
        hidden layer.
      batch_norm: When truthy, a batch-normalization layer is created for
        each hidden layer.
      name: Forwarded to the parent constructor.
      **kwargs: Forwarded to the parent constructor.

    Raises:
      ValueError: If `feature_columns` fails the v2 feature-column check.
    """
    super(_DNNModelV2, self).__init__(name=name, **kwargs)

    with ops.name_scope(
            'input_from_feature_columns') as input_feature_column_scope:
        layer_name = input_feature_column_scope + 'input_layer'
        # Guard clause: only v2 feature columns are accepted.
        if not feature_column_lib.is_feature_column_v2(feature_columns):
            raise ValueError(
                'Received a feature column from TensorFlow v1, but this is a '
                'TensorFlow v2 Estimator. Please either use v2 feature columns '
                '(accessible via tf.feature_column.* in TF 2.x) with this '
                'Estimator, or switch to a v1 Estimator for use with v1 feature '
                'columns (accessible via tf.compat.v1.estimator.* and '
                'tf.compat.v1.feature_column.*, respectively.')
        self._input_layer = dense_features_v2.DenseFeatures(
            feature_columns=feature_columns, name=layer_name)

    self._dropout = dropout
    self._batch_norm = batch_norm

    self._hidden_layers = []
    self._dropout_layers = []
    self._batch_norm_layers = []
    self._hidden_layer_scope_names = []
    for layer_id, num_hidden_units in enumerate(hidden_units):
        with ops.name_scope(
                'hiddenlayer_%d' % layer_id) as hidden_layer_scope:
            # Get scope name without the trailing slash.
            hidden_shared_name = _name_from_scope_name(hidden_layer_scope)
            self._hidden_layer_scope_names.append(hidden_shared_name)
            self._hidden_layers.append(
                keras_core.Dense(
                    units=num_hidden_units,
                    activation=activation_fn,
                    kernel_initializer=(
                        tf.compat.v1.glorot_uniform_initializer()),
                    name=hidden_shared_name))

            if self._dropout is not None:
                self._dropout_layers.append(
                    keras_core.Dropout(rate=self._dropout))

            if self._batch_norm:
                batch_norm_name = (
                    hidden_shared_name + '/batchnorm_%d' % layer_id)
                # TODO(scottzhu): Change back to use BatchNormalization when
                # the cleanup is done.
                self._batch_norm_layers.append(
                    keras_norm.BatchNormalizationBase(
                        # The default momentum 0.99 actually crashes on
                        # certain problem, so here we use 0.999, which is the
                        # default of tf.contrib.layers.batch_norm.
                        momentum=0.999,
                        trainable=True,
                        name=batch_norm_name))

    with ops.name_scope('logits') as logits_scope:
        logits_shared_name = _name_from_scope_name(logits_scope)
        self._logits_layer = keras_core.Dense(
            units=units,
            activation=None,
            kernel_initializer=tf.compat.v1.glorot_uniform_initializer(),
            name=logits_shared_name)
        self._logits_scope_name = logits_shared_name
def test_retrieving_input(self):
    # Calling the layer on a plain dict returns the feature value as a
    # [batch, 1] result.
    features = {'a': [0.]}
    column = fc.numeric_column('a')
    dense_features = df.DenseFeatures(column)
    output = dense_features(features)
    self.assertAllClose([[0.]], self.evaluate(output))
def test_raises_if_empty_feature_columns(self):
    # An empty `feature_columns` list must raise a ValueError.
    expected_message = 'feature_columns must not be empty'
    with self.assertRaisesRegexp(ValueError, expected_message):
        layer = df.DenseFeatures(feature_columns=[])
        layer(features={})