def testInitEmbeddingColumnWeightsFromCkpt(self):
  """Embedding weights restored from a checkpoint match the saved values."""
  hashed_col = fc.sparse_column_with_hash_bucket(
      column_name="object_in_image", hash_bucket_size=4)
  # _EmbeddingColumn whose [4, 16] embedding is randomly initialized.
  embedding_col = fc.embedding_column(hashed_col, dimension=16)

  # A SparseTensor carrying every id in the vocabulary.
  ids_tensor = sparse_tensor_lib.SparseTensor(
      indices=[[0, 0], [1, 1], [2, 2], [3, 3]],
      values=[0, 1, 2, 3],
      dense_shape=[4, 4])

  # 'layers.input_from_feature_columns' creates the embedding variable.
  # Scope 'run_1' prevents name conflicts with the pretrained column created
  # further down.
  with variable_scope.variable_scope("run_1"):
    with variable_scope.variable_scope(embedding_col.name):
      # Returns a [4, 16] tensor identical to the embedding variable.
      embeddings = feature_column_ops.input_from_feature_columns(
          {embedding_col: ids_tensor}, [embedding_col])

  save = saver.Saver()
  ckpt_dir_prefix = os.path.join(self.get_temp_dir(),
                                 "init_embedding_col_w_from_ckpt")
  ckpt_dir = tempfile.mkdtemp(prefix=ckpt_dir_prefix)
  checkpoint_path = os.path.join(ckpt_dir, "model.ckpt")

  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    saved_embedding = embeddings.eval()
    save.save(sess, checkpoint_path)

  embedding_col_initialized = fc.embedding_column(
      sparse_id_column=hashed_col,
      dimension=16,
      ckpt_to_load_from=checkpoint_path,
      tensor_name_in_ckpt=("run_1/object_in_image_embedding/"
                           "input_from_feature_columns/object"
                           "_in_image_embedding/weights"))

  with variable_scope.variable_scope("run_2"):
    # Initializes the embedding from the checkpoint and returns a [4, 16]
    # tensor equal to the embedding variable. The embedding was never
    # modified, so this must equal 'saved_embedding'.
    pretrained_embeddings = feature_column_ops.input_from_feature_columns(
        {embedding_col_initialized: ids_tensor}, [embedding_col_initialized])

  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    loaded_embedding = pretrained_embeddings.eval()

  self.assertAllClose(saved_embedding, loaded_embedding)
def testRegression_TensorData(self):
  """Tests regression using tensor data as input."""

  def _input_fn(num_epochs=None):
    feature_dict = {
        'age':
            input_lib.limit_epochs(
                constant_op.constant([[.8], [.15], [0.]]),
                num_epochs=num_epochs),
        'language':
            sparse_tensor.SparseTensor(
                values=input_lib.limit_epochs(
                    ['en', 'fr', 'zh'], num_epochs=num_epochs),
                indices=[[0, 0], [0, 1], [2, 0]],
                dense_shape=[3, 2])
    }
    targets = constant_op.constant([1., 0., 0.2], dtype=dtypes.float32)
    return feature_dict, targets

  lang_col = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=20)
  feature_columns = [
      feature_column.embedding_column(lang_col, dimension=1),
      feature_column.real_valued_column('age'),
  ]

  regressor = dnn.DNNRegressor(
      feature_columns=feature_columns,
      hidden_units=[3, 3],
      config=run_config.RunConfig(tf_random_seed=1))
  regressor.fit(input_fn=_input_fn, steps=200)

  scores = regressor.evaluate(input_fn=_input_fn, steps=1)
  self.assertIn('loss', scores)
def testEmbeddingColumn(self):
  """An embedding column exposes its source column and configuration."""
  source_col = fc.sparse_column_with_hash_bucket(
      "aaa", hash_bucket_size=100, combiner="sum")
  embedded = fc.embedding_column(source_col, dimension=4, combiner="mean")
  self.assertEqual(embedded.sparse_id_column.name, "aaa")
  self.assertEqual(embedded.dimension, 4)
  self.assertEqual(embedded.combiner, "mean")
def testExport(self):
  """Tests export model for servo."""

  def input_fn():
    feature_dict = {
        'age':
            constant_op.constant([1]),
        'language':
            sparse_tensor.SparseTensor(
                values=['english'], indices=[[0, 0]], dense_shape=[1, 1]),
    }
    return feature_dict, constant_op.constant([[1]])

  language = feature_column.sparse_column_with_hash_bucket('language', 100)
  feature_columns = [
      feature_column.real_valued_column('age'),
      feature_column.embedding_column(language, dimension=1),
  ]

  classifier = debug.DebugClassifier(
      config=run_config.RunConfig(tf_random_seed=1))
  classifier.fit(input_fn=input_fn, steps=5)

  def default_input_fn(unused_estimator, examples):
    # Parse serialized Examples using the same feature columns.
    return feature_column_ops.parse_feature_columns_from_examples(
        examples, feature_columns)

  export_dir = tempfile.mkdtemp()
  classifier.export(export_dir, input_fn=default_input_fn)
def testExport(self):
  """Tests export model for servo."""

  def input_fn():
    feature_dict = {
        'age':
            constant_op.constant([1]),
        'language':
            sparse_tensor.SparseTensor(
                values=['english'], indices=[[0, 0]], dense_shape=[1, 1]),
    }
    return feature_dict, constant_op.constant([[1]])

  language = feature_column.sparse_column_with_hash_bucket('language', 100)
  feature_columns = [
      feature_column.real_valued_column('age'),
      feature_column.embedding_column(language, dimension=1),
  ]

  classifier = dnn.DNNClassifier(
      feature_columns=feature_columns, hidden_units=[3, 3])
  classifier.fit(input_fn=input_fn, steps=5)

  export_dir = tempfile.mkdtemp()
  classifier.export(export_dir)
def testMultipliesGradient(self):
  """A 0.0 lr multiplier freezes one embedding while the other trains."""
  embedding_language = feature_column.embedding_column(
      feature_column.sparse_column_with_hash_bucket('language', 10),
      dimension=1,
      initializer=init_ops.constant_initializer(0.1))
  embedding_wire = feature_column.embedding_column(
      feature_column.sparse_column_with_hash_bucket('wire', 10),
      dimension=1,
      initializer=init_ops.constant_initializer(0.1))

  params = {
      'feature_columns': [embedding_language, embedding_wire],
      'head': head_lib._multi_class_head(2),
      'hidden_units': [1],
      # Set lr mult to 0. to keep embeddings constant.
      'embedding_lr_multipliers': {
          embedding_language: 0.0
      },
  }
  features = {
      'language':
          sparse_tensor.SparseTensor(
              values=['en', 'fr', 'zh'],
              indices=[[0, 0], [1, 0], [2, 0]],
              dense_shape=[3, 1]),
      'wire':
          sparse_tensor.SparseTensor(
              values=['omar', 'stringer', 'marlo'],
              indices=[[0, 0], [1, 0], [2, 0]],
              dense_shape=[3, 1]),
  }
  labels = constant_op.constant([[0], [0], [0]], dtype=dtypes.int32)
  model_ops = dnn._dnn_model_fn(features, labels, model_fn.ModeKeys.TRAIN,
                                params)

  with monitored_session.MonitoredSession() as sess:
    language_var = dnn_linear_combined._get_embedding_variable(
        embedding_language, 'dnn', 'dnn/input_from_feature_columns')
    wire_var = dnn_linear_combined._get_embedding_variable(
        embedding_wire, 'dnn', 'dnn/input_from_feature_columns')
    for _ in range(2):
      _, language_value, wire_value = sess.run(
          [model_ops.train_op, language_var, wire_var])

    initial_value = np.full_like(language_value, 0.1)
    # The zero-multiplied embedding stays at its initial value...
    self.assertTrue(np.all(np.isclose(language_value, initial_value)))
    # ...while the unscaled embedding moves away from it.
    self.assertFalse(np.all(np.isclose(wire_value, initial_value)))
def testEmbeddingColumnDeepCopy(self):
  """copy.deepcopy preserves an embedding column's configuration."""
  source = fc.sparse_column_with_hash_bucket(
      "aaa", hash_bucket_size=100, combiner="sum")
  original = fc.embedding_column(source, dimension=4, combiner="mean")
  clone = copy.deepcopy(original)
  self.assertEqual(clone.name, "aaa_embedding")
  self.assertEqual(clone.sparse_id_column.name, "aaa")
  self.assertEqual(clone.dimension, 4)
  self.assertEqual(clone.combiner, "mean")
def testEmbeddingMultiplier(self):
  """The embedding lr multiplier is forwarded to the estimator params."""
  embedding_language = feature_column.embedding_column(
      feature_column.sparse_column_with_hash_bucket('language', 10),
      dimension=1,
      initializer=init_ops.constant_initializer(0.1))
  classifier = dnn.DNNClassifier(
      feature_columns=[embedding_language],
      hidden_units=[3, 3],
      embedding_lr_multipliers={embedding_language: 0.8})
  self.assertEqual({
      embedding_language: 0.8
  }, classifier._estimator.params['embedding_lr_multipliers'])
def testCreateSequenceFeatureSpec(self):
  """Builds a parsing spec covering each sequence-capable column type."""
  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_col = fc.embedding_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_embedding", hash_bucket_size=10),
      dimension=4)
  sparse_id_col = fc.sparse_column_with_keys("id_column",
                                             ["marlo", "omar", "stringer"])
  weighted_id_col = fc.weighted_sparse_column(sparse_id_col,
                                              "id_weights_column")
  real_valued_col1 = fc.real_valued_column("real_valued_column", dimension=2)
  real_valued_col2 = fc.real_valued_column(
      "real_valued_default_column", dimension=5, default_value=3.0)
  real_valued_col3 = fc._real_valued_var_len_column(
      "real_valued_var_len_column", default_value=3.0, is_sparse=True)
  real_valued_col4 = fc._real_valued_var_len_column(
      "real_valued_var_len_dense_column", default_value=4.0, is_sparse=False)

  feature_columns = set([
      sparse_col, embedding_col, weighted_id_col, real_valued_col1,
      real_valued_col2, real_valued_col3, real_valued_col4
  ])
  feature_spec = fc._create_sequence_feature_spec_for_parsing(feature_columns)

  expected_feature_spec = {
      "sparse_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_embedding":
          parsing_ops.VarLenFeature(dtypes.string),
      "id_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "id_weights_column":
          parsing_ops.VarLenFeature(dtypes.float32),
      "real_valued_column":
          parsing_ops.FixedLenSequenceFeature(
              shape=[2], dtype=dtypes.float32, allow_missing=False),
      "real_valued_default_column":
          parsing_ops.FixedLenSequenceFeature(
              shape=[5], dtype=dtypes.float32, allow_missing=True),
      "real_valued_var_len_column":
          parsing_ops.VarLenFeature(dtype=dtypes.float32),
      "real_valued_var_len_dense_column":
          parsing_ops.FixedLenSequenceFeature(
              shape=[],
              dtype=dtypes.float32,
              allow_missing=True,
              default_value=4.0),
  }

  self.assertDictEqual(expected_feature_spec, feature_spec)
def testTrainWithPartitionedVariables(self):
  """Tests training with partitioned variables."""

  def _input_fn(num_epochs=None):
    feature_dict = {
        'age':
            input_lib.limit_epochs(
                constant_op.constant([[.8], [.2], [.1]]),
                num_epochs=num_epochs),
        'language':
            sparse_tensor.SparseTensor(
                values=input_lib.limit_epochs(
                    ['en', 'fr', 'zh'], num_epochs=num_epochs),
                indices=[[0, 0], [0, 1], [2, 0]],
                dense_shape=[3, 2])
    }
    return feature_dict, constant_op.constant(
        [[1], [0], [0]], dtype=dtypes.int32)

  # The given hash_bucket_size results in variables larger than the
  # default min_slice_size attribute, so the variables are partitioned.
  sparse_column = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=2e7)
  feature_columns = [
      feature_column.embedding_column(sparse_column, dimension=1)
  ]

  tf_config = {
      'cluster': {
          run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1']
      }
  }
  with test.mock.patch.dict('os.environ',
                            {'TF_CONFIG': json.dumps(tf_config)}):
    config = run_config.RunConfig(tf_random_seed=1)
    # Because we did not start a distributed cluster, we need to pass an
    # empty ClusterSpec, otherwise the device_setter will look for
    # distributed jobs, such as "/job:ps" which are not present.
    config._cluster_spec = server_lib.ClusterSpec({})

  classifier = dnn.DNNClassifier(
      n_classes=3,
      feature_columns=feature_columns,
      hidden_units=[3, 3],
      config=config)
  classifier.fit(input_fn=_input_fn, steps=5)

  scores = classifier.evaluate(input_fn=_input_fn, steps=1)
  self._assertInRange(0.0, 1.0, scores['accuracy'])
  self.assertIn('loss', scores)
def testPrepareInputsForRnnSparseAndDense(self):
  """RNN input prep concatenates embedded sparse and dense features."""
  num_unroll = 2
  embedding_dimension = 8
  dense_dimension = 2

  # Per-unroll-step expected inputs: 8 embedding values followed by the two
  # dense 'seq_feature0' values for each of the 3 sequences.
  expected = [
      np.array([[1., 1., 1., 1., 1., 1., 1., 1., 111., 112.],
                [1., 1., 1., 1., 1., 1., 1., 1., 211., 212.],
                [1., 1., 1., 1., 1., 1., 1., 1., 311., 312.]]),
      np.array([[1., 1., 1., 1., 1., 1., 1., 1., 121., 122.],
                [2., 2., 2., 2., 2., 2., 2., 2., 221., 222.],
                [1., 1., 1., 1., 1., 1., 1., 1., 321., 322.]])
  ]

  sequence_features = {
      'wire_cast':
          sparse_tensor.SparseTensor(
              indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0],
                       [1, 1, 1], [2, 0, 0], [2, 1, 1]],
              values=[
                  b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                  b'marlo', b'omar'
              ],
              dense_shape=[3, 2, 2]),
      'seq_feature0':
          constant_op.constant([[[111., 112.], [121., 122.]],
                                [[211., 212.], [221., 222.]],
                                [[311., 312.], [321., 322.]]])
  }

  wire_cast = feature_column.sparse_column_with_keys(
      'wire_cast', ['marlo', 'omar', 'stringer'])
  wire_cast_embedded = feature_column.embedding_column(
      wire_cast,
      dimension=embedding_dimension,
      combiner='sum',
      initializer=init_ops.ones_initializer())
  seq_feature0_column = feature_column.real_valued_column(
      'seq_feature0', dimension=dense_dimension)
  sequence_feature_columns = [seq_feature0_column, wire_cast_embedded]

  context_features = None
  self._test_prepare_inputs_for_rnn(sequence_features, context_features,
                                    sequence_feature_columns, num_unroll,
                                    expected)
def benchmarkLogisticFloatLabel(self):
  """Benchmarks binary classification trained on float labels."""

  def _input_fn(num_epochs=None):
    feature_dict = {
        'age':
            input_lib.limit_epochs(
                constant_op.constant(((50,), (20,), (10,))),
                num_epochs=num_epochs),
        'language':
            sparse_tensor.SparseTensor(
                values=input_lib.limit_epochs(
                    ('en', 'fr', 'zh'), num_epochs=num_epochs),
                indices=((0, 0), (0, 1), (2, 0)),
                dense_shape=(3, 2))
    }
    return feature_dict, constant_op.constant(
        ((0.8,), (0.,), (0.2,)), dtype=dtypes.float32)

  lang_column = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=20)
  n_classes = 2
  classifier = dnn.DNNClassifier(
      n_classes=n_classes,
      feature_columns=(feature_column.embedding_column(
          lang_column, dimension=1),
                       feature_column.real_valued_column('age')),
      hidden_units=(3, 3),
      config=run_config.RunConfig(tf_random_seed=1))

  steps = 1000
  metrics = classifier.fit(input_fn=_input_fn, steps=steps).evaluate(
      input_fn=_input_fn, steps=1)
  estimator_test_utils.assert_in_range(steps, steps + 5, 'global_step',
                                       metrics)

  # Prediction probabilities mirror the labels column, which proves that the
  # classifier learns from float input.
  self._report_metrics(metrics)
  self._report_predictions(
      classifier=classifier,
      input_fn=functools.partial(_input_fn, num_epochs=1),
      iters=metrics['global_step'],
      n_examples=3,
      n_classes=n_classes,
      expected_probabilities=((0.2, 0.8), (1., 0.), (0.8, 0.2)),
      expected_classes=(1, 0, 0),
      benchmark_name_override=(
          'DNNClassifierBenchmark.benchmarkLogisticFloatLabel_predictions'))
def benchmarkLogisticFloatLabel(self):
  """Benchmarks binary classification trained on float labels."""

  def _input_fn(num_epochs=None):
    feature_dict = {
        'age':
            input_lib.limit_epochs(
                constant_op.constant(((50,), (20,), (10,))),
                num_epochs=num_epochs),
        'language':
            sparse_tensor.SparseTensor(
                values=input_lib.limit_epochs(
                    ('en', 'fr', 'zh'), num_epochs=num_epochs),
                indices=((0, 0), (0, 1), (2, 0)),
                dense_shape=(3, 2))
    }
    labels = constant_op.constant(((0.8,), (0.,), (0.2,)),
                                  dtype=dtypes.float32)
    return feature_dict, labels

  lang_column = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=20)
  n_classes = 2
  classifier = dnn.DNNClassifier(
      n_classes=n_classes,
      feature_columns=(feature_column.embedding_column(
          lang_column, dimension=1),
                       feature_column.real_valued_column('age')),
      hidden_units=(3, 3),
      config=run_config.RunConfig(tf_random_seed=1))

  steps = 1000
  metrics = classifier.fit(input_fn=_input_fn, steps=steps).evaluate(
      input_fn=_input_fn, steps=1)
  estimator_test_utils.assert_in_range(steps, steps + 5, 'global_step',
                                       metrics)

  # Prediction probabilities mirror the labels column, which proves that the
  # classifier learns from float input.
  self._report_metrics(metrics)
  self._report_predictions(
      classifier=classifier,
      input_fn=functools.partial(_input_fn, num_epochs=1),
      iters=metrics['global_step'],
      n_examples=3,
      n_classes=n_classes,
      expected_probabilities=((0.2, 0.8), (1., 0.), (0.8, 0.2)),
      expected_classes=(1, 0, 0),
      benchmark_name_override=(
          'DNNClassifierBenchmark.benchmarkLogisticFloatLabel_predictions'))
def testPrepareInputsForRnnSparseAndDense(self):
  """RNN input prep concatenates embedded sparse and dense features."""
  num_unroll = 2
  embedding_dimension = 8
  dense_dimension = 2

  # One array per unroll step: 8 embedding values then the two dense
  # 'seq_feature0' values for each of the 3 sequences.
  expected = [
      np.array([[1., 1., 1., 1., 1., 1., 1., 1., 111., 112.],
                [1., 1., 1., 1., 1., 1., 1., 1., 211., 212.],
                [1., 1., 1., 1., 1., 1., 1., 1., 311., 312.]]),
      np.array([[1., 1., 1., 1., 1., 1., 1., 1., 121., 122.],
                [2., 2., 2., 2., 2., 2., 2., 2., 221., 222.],
                [1., 1., 1., 1., 1., 1., 1., 1., 321., 322.]])
  ]

  sequence_features = {
      'wire_cast':
          sparse_tensor.SparseTensor(
              indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0],
                       [1, 1, 1], [2, 0, 0], [2, 1, 1]],
              values=[
                  b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                  b'marlo', b'omar'
              ],
              dense_shape=[3, 2, 2]),
      'seq_feature0':
          constant_op.constant([[[111., 112.], [121., 122.]],
                                [[211., 212.], [221., 222.]],
                                [[311., 312.], [321., 322.]]])
  }

  wire_cast = feature_column.sparse_column_with_keys(
      'wire_cast', ['marlo', 'omar', 'stringer'])
  wire_cast_embedded = feature_column.embedding_column(
      wire_cast,
      dimension=embedding_dimension,
      combiner='sum',
      initializer=init_ops.ones_initializer())
  seq_feature0_column = feature_column.real_valued_column(
      'seq_feature0', dimension=dense_dimension)
  sequence_feature_columns = [seq_feature0_column, wire_cast_embedded]

  self._test_prepare_inputs_for_rnn(sequence_features, None,
                                    sequence_feature_columns, num_unroll,
                                    expected)
def testPredict_AsIterable(self):
  """Tests predict and predict_prob methods with as_iterable=True."""

  def _input_fn(num_epochs=None):
    feature_dict = {
        'age':
            input_lib.limit_epochs(
                constant_op.constant([[.8], [.2], [.1]]),
                num_epochs=num_epochs),
        'language':
            sparse_tensor.SparseTensor(
                values=input_lib.limit_epochs(
                    ['en', 'fr', 'zh'], num_epochs=num_epochs),
                indices=[[0, 0], [0, 1], [2, 0]],
                dense_shape=[3, 2])
    }
    return feature_dict, constant_op.constant(
        [[1], [0], [0]], dtype=dtypes.int32)

  language_column = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=20)
  feature_columns = [
      feature_column.embedding_column(language_column, dimension=1),
      feature_column.real_valued_column('age'),
  ]

  classifier = dnn.DNNClassifier(
      n_classes=3,
      feature_columns=feature_columns,
      hidden_units=[3, 3],
      config=run_config.RunConfig(tf_random_seed=1))
  classifier.fit(input_fn=_input_fn, steps=200)

  scores = classifier.evaluate(input_fn=_input_fn, steps=1)
  self._assertInRange(0.0, 1.0, scores['accuracy'])
  self.assertIn('loss', scores)

  predict_input_fn = functools.partial(_input_fn, num_epochs=1)
  predictions = list(
      classifier.predict(input_fn=predict_input_fn, as_iterable=True))
  self.assertListEqual(predictions, [1, 0, 0])

  predictions = list(
      classifier.predict_proba(input_fn=predict_input_fn, as_iterable=True))
  self.assertAllClose(
      predictions, [[0., 1., 0.], [1., 0., 0.], [1., 0., 0.]], atol=0.3)
def setUp(self):
  """Builds the RNN cell, target column, and feature columns for the tests."""
  super(DynamicRnnEstimatorTest, self).setUp()
  self.rnn_cell = rnn_cell.BasicRNNCell(self.NUM_RNN_CELL_UNITS)
  self.mock_target_column = MockTargetColumn(
      num_label_columns=self.NUM_LABEL_COLUMNS)

  # Context (per-sequence) feature: one-hot location.
  location = feature_column.sparse_column_with_keys(
      'location', keys=['west_side', 'east_side', 'nyc'])
  location_onehot = feature_column.one_hot_column(location)
  self.context_feature_columns = [location_onehot]

  # Sequence (per-step) features: embedded wire_cast plus real-valued
  # measurements.
  wire_cast = feature_column.sparse_column_with_keys(
      'wire_cast', ['marlo', 'omar', 'stringer'])
  wire_cast_embedded = feature_column.embedding_column(
      wire_cast, dimension=8)
  measurements = feature_column.real_valued_column(
      'measurements', dimension=2)
  self.sequence_feature_columns = [measurements, wire_cast_embedded]
def setUp(self):
  """Builds the RNN cell, target column, and feature columns for the tests."""
  super(DynamicRnnEstimatorTest, self).setUp()
  self.rnn_cell = core_rnn_cell_impl.BasicRNNCell(self.NUM_RNN_CELL_UNITS)
  self.mock_target_column = MockTargetColumn(
      num_label_columns=self.NUM_LABEL_COLUMNS)

  # Context (per-sequence) feature: one-hot location.
  location = feature_column.sparse_column_with_keys(
      'location', keys=['west_side', 'east_side', 'nyc'])
  location_onehot = feature_column.one_hot_column(location)
  self.context_feature_columns = [location_onehot]

  # Sequence (per-step) features: embedded wire_cast plus real-valued
  # measurements.
  wire_cast = feature_column.sparse_column_with_keys(
      'wire_cast', ['marlo', 'omar', 'stringer'])
  wire_cast_embedded = feature_column.embedding_column(
      wire_cast, dimension=8)
  measurements = feature_column.real_valued_column(
      'measurements', dimension=2)
  self.sequence_feature_columns = [measurements, wire_cast_embedded]
def testTrainSaveLoad(self):
  """Tests that insures you can save and reload a trained model."""

  def _input_fn(num_epochs=None):
    feature_dict = {
        'age':
            input_lib.limit_epochs(
                constant_op.constant([[.8], [.2], [.1]]),
                num_epochs=num_epochs),
        'language':
            sparse_tensor.SparseTensor(
                values=input_lib.limit_epochs(
                    ['en', 'fr', 'zh'], num_epochs=num_epochs),
                indices=[[0, 0], [0, 1], [2, 0]],
                dense_shape=[3, 2])
    }
    return feature_dict, constant_op.constant(
        [[1], [0], [0]], dtype=dtypes.int32)

  sparse_column = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=20)
  feature_columns = [
      feature_column.embedding_column(sparse_column, dimension=1)
  ]

  model_dir = tempfile.mkdtemp()
  classifier = dnn.DNNClassifier(
      model_dir=model_dir,
      n_classes=3,
      feature_columns=feature_columns,
      hidden_units=[3, 3],
      config=run_config.RunConfig(tf_random_seed=1))
  classifier.fit(input_fn=_input_fn, steps=5)

  predict_input_fn = functools.partial(_input_fn, num_epochs=1)
  predictions1 = classifier.predict(input_fn=predict_input_fn)
  del classifier

  # A fresh estimator pointed at the same model_dir must reproduce the
  # predictions of the trained model.
  classifier2 = dnn.DNNClassifier(
      model_dir=model_dir,
      n_classes=3,
      feature_columns=feature_columns,
      hidden_units=[3, 3],
      config=run_config.RunConfig(tf_random_seed=1))
  predictions2 = classifier2.predict(input_fn=predict_input_fn)
  self.assertEqual(list(predictions1), list(predictions2))
def testTrainSaveLoad(self):
  """Tests that insures you can save and reload a trained model."""

  def _input_fn(num_epochs=None):
    feature_dict = {
        'age':
            input_lib.limit_epochs(
                constant_op.constant([[0.8], [0.15], [0.]]),
                num_epochs=num_epochs),
        'language':
            sparse_tensor.SparseTensor(
                values=input_lib.limit_epochs(
                    ['en', 'fr', 'zh'], num_epochs=num_epochs),
                indices=[[0, 0], [0, 1], [2, 0]],
                dense_shape=[3, 2])
    }
    return feature_dict, constant_op.constant(
        [1., 0., 0.2], dtype=dtypes.float32)

  sparse_column = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=20)
  feature_columns = [
      feature_column.embedding_column(sparse_column, dimension=1),
      feature_column.real_valued_column('age'),
  ]

  model_dir = tempfile.mkdtemp()
  regressor = dnn.DNNRegressor(
      model_dir=model_dir,
      feature_columns=feature_columns,
      hidden_units=[3, 3],
      config=run_config.RunConfig(tf_random_seed=1))
  regressor.fit(input_fn=_input_fn, steps=5)

  predict_input_fn = functools.partial(_input_fn, num_epochs=1)
  predictions = list(regressor.predict(input_fn=predict_input_fn))
  del regressor

  # A fresh estimator pointed at the same model_dir must reproduce the
  # predictions of the trained model.
  regressor2 = dnn.DNNRegressor(
      model_dir=model_dir,
      feature_columns=feature_columns,
      hidden_units=[3, 3],
      config=run_config.RunConfig(tf_random_seed=1))
  predictions2 = list(regressor2.predict(input_fn=predict_input_fn))
  self.assertAllClose(predictions, predictions2)
def test_make_parsing_export_strategy(self):
  """Only tests that an ExportStrategy instance is created."""
  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_col = fc.embedding_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_embedding", hash_bucket_size=10),
      dimension=4)
  real_valued_col1 = fc.real_valued_column("real_valued_column1")
  bucketized_col1 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
  feature_columns = [
      sparse_col, embedding_col, real_valued_col1, bucketized_col1
  ]

  export_strategy = saved_model_export_utils.make_parsing_export_strategy(
      feature_columns=feature_columns)
  self.assertTrue(
      isinstance(export_strategy, export_strategy_lib.ExportStrategy))
def testLogisticRegression_FloatLabel(self):
  """Tests binary classification with float labels."""

  def _input_fn_float_label(num_epochs=None):
    feature_dict = {
        'age':
            input_lib.limit_epochs(
                constant_op.constant([[50], [20], [10]]),
                num_epochs=num_epochs),
        'language':
            sparse_tensor.SparseTensor(
                values=input_lib.limit_epochs(
                    ['en', 'fr', 'zh'], num_epochs=num_epochs),
                indices=[[0, 0], [0, 1], [2, 0]],
                dense_shape=[3, 2])
    }
    labels = constant_op.constant([[0.8], [0.], [0.2]], dtype=dtypes.float32)
    return feature_dict, labels

  language_column = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=20)
  feature_columns = [
      feature_column.embedding_column(language_column, dimension=1),
      feature_column.real_valued_column('age'),
  ]

  classifier = dnn.DNNClassifier(
      n_classes=2,
      feature_columns=feature_columns,
      hidden_units=[3, 3],
      config=run_config.RunConfig(tf_random_seed=1))
  classifier.fit(input_fn=_input_fn_float_label, steps=50)

  predict_input_fn = functools.partial(_input_fn_float_label, num_epochs=1)
  predictions = list(
      classifier.predict(input_fn=predict_input_fn, as_iterable=True))
  self._assertBinaryPredictions(3, predictions)
  predictions_proba = list(
      classifier.predict_proba(input_fn=predict_input_fn, as_iterable=True))
  self._assertProbabilities(3, 2, predictions_proba)
def benchmarkLogisticTensorData(self):
  """Benchmarks binary classification on in-memory tensor data."""

  def _input_fn(num_epochs=None):
    feature_dict = {
        'age':
            input_lib.limit_epochs(
                constant_op.constant(((.8,), (0.2,), (.1,))),
                num_epochs=num_epochs),
        'language':
            sparse_tensor.SparseTensor(
                values=input_lib.limit_epochs(
                    ('en', 'fr', 'zh'), num_epochs=num_epochs),
                indices=((0, 0), (0, 1), (2, 0)),
                dense_shape=(3, 2))
    }
    labels = constant_op.constant(((1,), (0,), (0,)), dtype=dtypes.int32)
    return feature_dict, labels

  lang_column = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=20)
  classifier = dnn.DNNClassifier(
      feature_columns=(feature_column.embedding_column(
          lang_column, dimension=1),
                       feature_column.real_valued_column('age')),
      hidden_units=(3, 3),
      config=run_config.RunConfig(tf_random_seed=1))

  steps = 100
  metrics = classifier.fit(input_fn=_input_fn, steps=steps).evaluate(
      input_fn=_input_fn, steps=1)
  estimator_test_utils.assert_in_range(steps, steps + 5, 'global_step',
                                       metrics)
  estimator_test_utils.assert_in_range(0.9, 1.0, 'accuracy', metrics)
  estimator_test_utils.assert_in_range(0.0, 0.3, 'loss', metrics)

  self._report_metrics(metrics)
  self._report_predictions(
      classifier=classifier,
      input_fn=functools.partial(_input_fn, num_epochs=1),
      iters=metrics['global_step'],
      n_examples=3,
      n_classes=2,
      expected_classes=(1, 0, 0),
      benchmark_name_override=(
          'DNNClassifierBenchmark.benchmarkLogisticTensorData_predictions'))
def benchmarkLogisticTensorData(self):
  """Benchmarks binary classification on in-memory tensor data."""

  def _input_fn(num_epochs=None):
    feature_dict = {
        'age':
            input_lib.limit_epochs(
                constant_op.constant(((.8,), (0.2,), (.1,))),
                num_epochs=num_epochs),
        'language':
            sparse_tensor.SparseTensor(
                values=input_lib.limit_epochs(
                    ('en', 'fr', 'zh'), num_epochs=num_epochs),
                indices=((0, 0), (0, 1), (2, 0)),
                dense_shape=(3, 2))
    }
    return feature_dict, constant_op.constant(
        ((1,), (0,), (0,)), dtype=dtypes.int32)

  lang_column = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=20)
  embedded_lang = feature_column.embedding_column(lang_column, dimension=1)
  classifier = dnn.DNNClassifier(
      feature_columns=(embedded_lang,
                       feature_column.real_valued_column('age')),
      hidden_units=(3, 3),
      config=run_config.RunConfig(tf_random_seed=1))

  steps = 100
  metrics = classifier.fit(input_fn=_input_fn, steps=steps).evaluate(
      input_fn=_input_fn, steps=1)
  estimator_test_utils.assert_in_range(steps, steps + 5, 'global_step',
                                       metrics)
  estimator_test_utils.assert_in_range(0.9, 1.0, 'accuracy', metrics)
  estimator_test_utils.assert_in_range(0.0, 0.3, 'loss', metrics)

  self._report_metrics(metrics)
  self._report_predictions(
      classifier=classifier,
      input_fn=functools.partial(_input_fn, num_epochs=1),
      iters=metrics['global_step'],
      n_examples=3,
      n_classes=2,
      expected_classes=(1, 0, 0),
      benchmark_name_override=(
          'DNNClassifierBenchmark.benchmarkLogisticTensorData_predictions'))
def testPredict_AsIterableFalse(self):
  """Tests predict and predict_prob methods with as_iterable=False."""

  def _input_fn(num_epochs=None):
    feature_dict = {
        'age':
            input_lib.limit_epochs(
                constant_op.constant([[.8], [.2], [.1]]),
                num_epochs=num_epochs),
        'language':
            sparse_tensor.SparseTensor(
                values=input_lib.limit_epochs(
                    ['en', 'fr', 'zh'], num_epochs=num_epochs),
                indices=[[0, 0], [0, 1], [2, 0]],
                dense_shape=[3, 2])
    }
    return feature_dict, constant_op.constant(
        [[1], [0], [0]], dtype=dtypes.int32)

  sparse_column = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=20)
  feature_columns = [
      feature_column.embedding_column(sparse_column, dimension=1)
  ]

  n_classes = 3
  classifier = dnn.DNNClassifier(
      n_classes=n_classes,
      feature_columns=feature_columns,
      hidden_units=[10, 10],
      config=run_config.RunConfig(tf_random_seed=1))
  classifier.fit(input_fn=_input_fn, steps=100)

  scores = classifier.evaluate(input_fn=_input_fn, steps=1)
  self._assertInRange(0.0, 1.0, scores['accuracy'])
  self.assertIn('loss', scores)

  predictions = classifier.predict(input_fn=_input_fn, as_iterable=False)
  self._assertBinaryPredictions(3, predictions)
  probabilities = classifier.predict_proba(
      input_fn=_input_fn, as_iterable=False)
  self._assertProbabilities(3, n_classes, probabilities)
def testLogisticRegression_TensorData(self):
  """Tests binary classification using tensor data as input."""

  def _input_fn(num_epochs=None):
    feature_dict = {
        'age':
            input_lib.limit_epochs(
                constant_op.constant([[.8], [0.2], [.1]]),
                num_epochs=num_epochs),
        'language':
            sparse_tensor.SparseTensor(
                values=input_lib.limit_epochs(
                    ['en', 'fr', 'zh'], num_epochs=num_epochs),
                indices=[[0, 0], [0, 1], [2, 0]],
                dense_shape=[3, 2])
    }
    return feature_dict, constant_op.constant(
        [[1], [0], [0]], dtype=dtypes.int32)

  language_column = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=20)
  feature_columns = [
      feature_column.embedding_column(language_column, dimension=1),
      feature_column.real_valued_column('age'),
  ]

  classifier = dnn.DNNClassifier(
      n_classes=2,
      feature_columns=feature_columns,
      hidden_units=[10, 10],
      config=run_config.RunConfig(tf_random_seed=1))
  classifier.fit(input_fn=_input_fn, steps=50)

  scores = classifier.evaluate(input_fn=_input_fn, steps=1)
  self._assertInRange(0.0, 1.0, scores['accuracy'])
  self.assertIn('loss', scores)

  predict_input_fn = functools.partial(_input_fn, num_epochs=1)
  predictions = list(
      classifier.predict(input_fn=predict_input_fn, as_iterable=True))
  self._assertBinaryPredictions(3, predictions)
def benchmarkPartitionedVariables(self):
  """Benchmarks the combined estimator with partitioned embedding vars."""

  def _input_fn():
    feature_dict = {
        'language':
            sparse_tensor.SparseTensor(
                values=('en', 'fr', 'zh'),
                indices=((0, 0), (0, 1), (2, 0)),
                dense_shape=(3, 2))
    }
    labels = constant_op.constant(((1,), (0,), (0,)))
    return feature_dict, labels

  # The given hash_bucket_size results in variables larger than the
  # default min_slice_size attribute, so the variables are partitioned.
  sparse_feature = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=2e7)
  embedding_feature = feature_column.embedding_column(
      sparse_feature, dimension=1)

  tf_config = {
      'cluster': {
          run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1']
      }
  }
  with test.mock.patch.dict('os.environ',
                            {'TF_CONFIG': json.dumps(tf_config)}):
    config = run_config.RunConfig()
    # Because we did not start a distributed cluster, we need to pass an
    # empty ClusterSpec, otherwise the device_setter will look for
    # distributed jobs, such as "/job:ps" which are not present.
    config._cluster_spec = server_lib.ClusterSpec({})

  classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
      linear_feature_columns=(sparse_feature,),
      dnn_feature_columns=(embedding_feature,),
      dnn_hidden_units=(3, 3),
      config=config)

  metrics = classifier.fit(input_fn=_input_fn, steps=_ITERS).evaluate(
      input_fn=_input_fn, steps=100)
  self._assertCommonMetrics(metrics)
def testPrepareInputsForRnnSparse(self):
  """RNN input prep embeds a sparse feature with a ones initializer."""
  num_unroll = 2
  embedding_dimension = 8

  # One array per unroll step; each row is a sequence's summed embedding.
  expected = [
      np.array([[1., 1., 1., 1., 1., 1., 1., 1.],
                [1., 1., 1., 1., 1., 1., 1., 1.],
                [1., 1., 1., 1., 1., 1., 1., 1.]]),
      np.array([[1., 1., 1., 1., 1., 1., 1., 1.],
                [2., 2., 2., 2., 2., 2., 2., 2.],
                [1., 1., 1., 1., 1., 1., 1., 1.]])
  ]

  sequence_features = {
      'wire_cast':
          sparse_tensor.SparseTensor(
              indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0],
                       [1, 1, 1], [2, 0, 0], [2, 1, 1]],
              values=[
                  b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                  b'marlo', b'omar'
              ],
              dense_shape=[3, 2, 2])
  }

  wire_cast = feature_column.sparse_column_with_keys(
      'wire_cast', ['marlo', 'omar', 'stringer'])
  sequence_feature_columns = [
      feature_column.embedding_column(
          wire_cast,
          dimension=embedding_dimension,
          combiner='sum',
          initializer=init_ops.ones_initializer())
  ]

  context_features = None
  self._test_prepare_inputs_for_rnn(sequence_features, context_features,
                                    sequence_feature_columns, num_unroll,
                                    expected)
def benchmarkPartitionedVariables(self):
  """Benchmarks the combined estimator with partitioned embedding vars."""

  def _input_fn():
    feature_dict = {
        'language':
            sparse_tensor.SparseTensor(
                values=('en', 'fr', 'zh'),
                indices=((0, 0), (0, 1), (2, 0)),
                dense_shape=(3, 2))
    }
    return feature_dict, constant_op.constant(((1,), (0,), (0,)))

  # The given hash_bucket_size results in variables larger than the
  # default min_slice_size attribute, so the variables are partitioned.
  sparse_feature = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=2e7)
  embedding_feature = feature_column.embedding_column(
      sparse_feature, dimension=1)

  tf_config = {
      'cluster': {
          run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1']
      }
  }
  with test.mock.patch.dict('os.environ',
                            {'TF_CONFIG': json.dumps(tf_config)}):
    config = run_config.RunConfig()
    # Because we did not start a distributed cluster, we need to pass an
    # empty ClusterSpec, otherwise the device_setter will look for
    # distributed jobs, such as "/job:ps" which are not present.
    config._cluster_spec = server_lib.ClusterSpec({})

  classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
      linear_feature_columns=(sparse_feature,),
      dnn_feature_columns=(embedding_feature,),
      dnn_hidden_units=(3, 3),
      config=config)

  metrics = classifier.fit(input_fn=_input_fn, steps=_ITERS).evaluate(
      input_fn=_input_fn, steps=100)
  self._assertCommonMetrics(metrics)
def testPredict_AsIterable(self):
  """Tests predict method with as_iterable=True."""
  labels = [1., 0., 0.2]

  def _input_fn(num_epochs=None):
    # Epoch-limited inputs so the iterable predict terminates.
    age = input_lib.limit_epochs(
        constant_op.constant([[0.8], [0.15], [0.]]), num_epochs=num_epochs)
    language = sparse_tensor.SparseTensor(
        values=input_lib.limit_epochs(
            ['en', 'fr', 'zh'], num_epochs=num_epochs),
        indices=[[0, 0], [0, 1], [2, 0]],
        dense_shape=[3, 2])
    return {
        'age': age,
        'language': language
    }, constant_op.constant(labels, dtype=dtypes.float32)

  language_column = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=20)
  regressor = dnn.DNNRegressor(
      feature_columns=[
          feature_column.embedding_column(language_column, dimension=1),
          feature_column.real_valued_column('age'),
      ],
      hidden_units=[3, 3],
      config=run_config.RunConfig(tf_random_seed=1))

  regressor.fit(input_fn=_input_fn, steps=200)

  scores = regressor.evaluate(input_fn=_input_fn, steps=1)
  self.assertIn('loss', scores)

  # One epoch of input so the prediction iterable is finite.
  predict_input_fn = functools.partial(_input_fn, num_epochs=1)
  predictions = list(
      regressor.predict(input_fn=predict_input_fn, as_iterable=True))
  self.assertAllClose(labels, predictions, atol=0.2)
def testPrepareInputsForRnnSparse(self):
  """Checks that a sparse sequence feature is embedded and unrolled."""
  num_unroll = 2
  embedding_dimension = 8

  # Sum-combined, ones-initialized embeddings reduce to per-step id counts,
  # so the second unrolled step has a doubled row where two ids coincide.
  expected_first = np.ones((3, embedding_dimension))
  expected_second = np.ones((3, embedding_dimension))
  expected_second[1, :] = 2.
  expected = [expected_first, expected_second]

  sequence_features = {
      'wire_cast':
          sparse_tensor.SparseTensor(
              indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0],
                       [1, 1, 1], [2, 0, 0], [2, 1, 1]],
              values=[
                  b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                  b'marlo', b'omar'
              ],
              dense_shape=[3, 2, 2])
  }

  sequence_feature_columns = [
      feature_column.embedding_column(
          feature_column.sparse_column_with_keys(
              'wire_cast', ['marlo', 'omar', 'stringer']),
          dimension=embedding_dimension,
          combiner='sum',
          initializer=init_ops.ones_initializer())
  ]

  context_features = None
  self._test_prepare_inputs_for_rnn(sequence_features, context_features,
                                    sequence_feature_columns, num_unroll,
                                    expected)
def testDisableCenteredBias(self):
  """Tests that we can disable centered bias."""

  def _input_fn(num_epochs=None):
    labels = constant_op.constant([1., 0., 0.2], dtype=dtypes.float32)
    features = {
        'age':
            input_lib.limit_epochs(
                constant_op.constant([[0.8], [0.15], [0.]]),
                num_epochs=num_epochs),
        'language':
            sparse_tensor.SparseTensor(
                values=input_lib.limit_epochs(
                    ['en', 'fr', 'zh'], num_epochs=num_epochs),
                indices=[[0, 0], [0, 1], [2, 0]],
                dense_shape=[3, 2])
    }
    return features, labels

  language = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=20)
  regressor = dnn.DNNRegressor(
      feature_columns=[
          feature_column.embedding_column(language, dimension=1),
          feature_column.real_valued_column('age'),
      ],
      hidden_units=[3, 3],
      enable_centered_bias=False,
      config=run_config.RunConfig(tf_random_seed=1))

  regressor.fit(input_fn=_input_fn, steps=5)

  # With centered bias disabled, no bias variable should have been created.
  self.assertNotIn('centered_bias_weight', regressor.get_variable_names())
  scores = regressor.evaluate(input_fn=_input_fn, steps=1)
  self.assertIn('loss', scores)
def testCreateFeatureSpec(self):
  """Checks the parsing spec derived from a mix of feature column types."""
  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_col = fc.embedding_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_embedding", hash_bucket_size=10),
      dimension=4)
  weighted_id_col = fc.weighted_sparse_column(
      fc.sparse_column_with_keys("id_column", ["marlo", "omar", "stringer"]),
      "id_weights_column")
  real_valued_col1 = fc.real_valued_column("real_valued_column1")
  real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
  # dimension=None yields a variable-length real-valued column.
  real_valued_col3 = fc.real_valued_column(
      "real_valued_column3", dimension=None)
  bucketized_col1 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
  bucketized_col2 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization2", 4),
      [0, 4])
  cross_col = fc.crossed_column(
      set([
          fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100),
          fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
      ]),
      hash_bucket_size=10000)
  feature_columns = set([
      sparse_col, embedding_col, weighted_id_col, real_valued_col1,
      real_valued_col2, real_valued_col3, bucketized_col1, bucketized_col2,
      cross_col
  ])

  # Build the expected spec: all string-valued sparse columns parse as
  # VarLenFeature(string); dense columns as FixedLenFeature of their shape.
  expected_config = {}
  for name in ("sparse_column", "sparse_column_for_embedding", "id_column",
               "cross_aaa", "cross_bbb"):
    expected_config[name] = parsing_ops.VarLenFeature(dtypes.string)
  expected_config["id_weights_column"] = parsing_ops.VarLenFeature(
      dtypes.float32)
  expected_config["real_valued_column1"] = parsing_ops.FixedLenFeature(
      [1], dtype=dtypes.float32)
  expected_config["real_valued_column2"] = parsing_ops.FixedLenFeature(
      [5], dtype=dtypes.float32)
  expected_config["real_valued_column3"] = parsing_ops.VarLenFeature(
      dtype=dtypes.float32)
  expected_config[
      "real_valued_column_for_bucketization1"] = parsing_ops.FixedLenFeature(
          [1], dtype=dtypes.float32)
  expected_config[
      "real_valued_column_for_bucketization2"] = parsing_ops.FixedLenFeature(
          [4], dtype=dtypes.float32)

  config = fc.create_feature_spec_for_parsing(feature_columns)
  self.assertDictEqual(expected_config, config)

  # Test that the same config is parsed out if we pass a dictionary.
  feature_columns_dict = {
      str(i): val for i, val in enumerate(feature_columns)
  }
  config = fc.create_feature_spec_for_parsing(feature_columns_dict)
  self.assertDictEqual(expected_config, config)
def testPrepareFeaturesForSQSS(self):
  """Checks _prepare_features_for_sqss splits features into sequence/context.

  Feeds one sparse sequence feature, one dense sequence feature and one
  scalar context feature; expects the sequence dict to carry the labels
  plus both sequence features unchanged, and the context dict to carry
  only the scalar context value.
  """
  mode = model_fn_lib.ModeKeys.TRAIN
  seq_feature_name = 'seq_feature'
  sparse_seq_feature_name = 'wire_cast'
  ctx_feature_name = 'ctx_feature'
  sequence_length = 4
  embedding_dimension = 8

  features = {
      sparse_seq_feature_name:
          sparse_tensor.SparseTensor(
              indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
                       [2, 0, 0], [2, 1, 1]],
              values=[
                  b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                  b'marlo', b'omar'
              ],
              dense_shape=[3, 2, 2]),
      seq_feature_name:
          constant_op.constant(1.0, shape=[sequence_length]),
      ctx_feature_name:
          constant_op.constant(2.0)
  }
  labels = constant_op.constant(5.0, shape=[sequence_length])
  wire_cast = feature_column.sparse_column_with_keys(
      'wire_cast', ['marlo', 'omar', 'stringer'])
  sequence_feature_columns = [
      feature_column.real_valued_column(seq_feature_name, dimension=1),
      feature_column.embedding_column(
          wire_cast,
          dimension=embedding_dimension,
          initializer=init_ops.ones_initializer())
  ]
  context_feature_columns = [
      feature_column.real_valued_column(ctx_feature_name, dimension=1)
  ]
  # The sequence dict is expected to include labels under LABELS_KEY and the
  # sparse feature to pass through untouched.
  expected_sequence = {
      rnn_common.RNNKeys.LABELS_KEY:
          np.array([5., 5., 5., 5.]),
      seq_feature_name:
          np.array([1., 1., 1., 1.]),
      sparse_seq_feature_name:
          sparse_tensor.SparseTensor(
              indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
                       [2, 0, 0], [2, 1, 1]],
              values=[
                  b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                  b'marlo', b'omar'
              ],
              dense_shape=[3, 2, 2]),
  }
  expected_context = {ctx_feature_name: 2.}

  sequence, context = ssre._prepare_features_for_sqss(
      features, labels, mode, sequence_feature_columns,
      context_feature_columns)

  def assert_equal(expected, got):
    # Compare key sets first, then values; SparseTensors are compared
    # component-wise (values/indices/dense_shape). The .eval() calls rely on
    # the enclosing test_session being the default session.
    self.assertEqual(sorted(expected), sorted(got))
    for k, v in expected.items():
      if isinstance(v, sparse_tensor.SparseTensor):
        self.assertAllEqual(v.values.eval(), got[k].values)
        self.assertAllEqual(v.indices.eval(), got[k].indices)
        self.assertAllEqual(v.dense_shape.eval(), got[k].dense_shape)
      else:
        self.assertAllEqual(v, got[k])

  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    # The keyed sparse column uses a lookup table, which must be initialized
    # before the features can be evaluated.
    sess.run(lookup_ops.tables_initializer())
    actual_sequence, actual_context = sess.run([sequence, context])
    assert_equal(expected_sequence, actual_sequence)
    assert_equal(expected_context, actual_context)
def testLearnLyrics(self):
  """End-to-end test: a state-saving RNN learns next-word prediction.

  Trains a basic RNN on a fixed 14-word lyric, feeding it as a sparse
  string feature, and asserts eval accuracy clears a modest threshold.
  The input_fn starts at a random offset each call (seeded), wrapping
  around the lyric so every position is a valid training example.
  """
  lyrics = 'if I go there will be trouble and if I stay it will be double'
  lyrics_list = lyrics.split()
  sequence_length = len(lyrics_list)
  vocab = set(lyrics_list)
  batch_size = 16
  num_classes = len(vocab)
  num_unroll = 7  # not a divisor of sequence_length
  train_steps = 350
  eval_steps = 30
  num_units = [4]
  learning_rate = 0.4
  # Deliberately modest bar — presumably chosen to keep the test stable
  # across platforms; the model only needs to beat it, not master the task.
  accuracy_threshold = 0.65

  def get_lyrics_input_fn(seed):

    def input_fn():
      # Random start offset; the seed makes each input_fn deterministic.
      start = random_ops.random_uniform(
          (), minval=0, maxval=sequence_length, dtype=dtypes.int32, seed=seed)
      # Concatenate lyrics_list so inputs and labels wrap when start > 0.
      lyrics_list_concat = lyrics_list + lyrics_list
      inputs_dense = array_ops.slice(lyrics_list_concat, [start],
                                     [sequence_length])
      indices = array_ops.constant(
          [[i, 0] for i in range(sequence_length)], dtype=dtypes.int64)
      dense_shape = [sequence_length, 1]
      inputs = sparse_tensor.SparseTensor(
          indices=indices, values=inputs_dense, dense_shape=dense_shape)
      table = lookup.string_to_index_table_from_tensor(
          mapping=list(vocab), default_value=-1, name='lookup')
      # Labels are the input sequence shifted by one word (next-word task).
      labels = table.lookup(
          array_ops.slice(lyrics_list_concat, [start + 1], [sequence_length]))
      return {'lyrics': inputs}, labels

    return input_fn

  sequence_feature_columns = [
      feature_column.embedding_column(
          feature_column.sparse_column_with_keys('lyrics', vocab),
          dimension=8)
  ]
  config = run_config.RunConfig(tf_random_seed=21212)
  sequence_estimator = ssre.StateSavingRnnEstimator(
      constants.ProblemType.CLASSIFICATION,
      num_units=num_units,
      cell_type='basic_rnn',
      num_unroll=num_unroll,
      batch_size=batch_size,
      sequence_feature_columns=sequence_feature_columns,
      num_classes=num_classes,
      learning_rate=learning_rate,
      config=config,
      predict_probabilities=True,
      queue_capacity=2 + batch_size,
      seed=1234)

  # Distinct seeds give train and eval different (but fixed) offsets.
  train_input_fn = get_lyrics_input_fn(seed=12321)
  eval_input_fn = get_lyrics_input_fn(seed=32123)

  sequence_estimator.fit(input_fn=train_input_fn, steps=train_steps)

  evaluation = sequence_estimator.evaluate(
      input_fn=eval_input_fn, steps=eval_steps)
  accuracy = evaluation['accuracy']
  self.assertGreater(accuracy, accuracy_threshold,
                     'Accuracy should be higher than {}; got {}'.format(
                         accuracy_threshold, accuracy))
def testCreateFeatureSpec(self):
  """Verifies the parse spec generated for a variety of feature columns."""
  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_col = fc.embedding_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_embedding", hash_bucket_size=10),
      dimension=4)
  sparse_id_col = fc.sparse_column_with_keys("id_column",
                                             ["marlo", "omar", "stringer"])
  weighted_id_col = fc.weighted_sparse_column(sparse_id_col,
                                              "id_weights_column")
  dense_col_1d = fc.real_valued_column("real_valued_column1")
  dense_col_5d = fc.real_valued_column("real_valued_column2", 5)
  # dimension=None produces a variable-length real-valued column.
  dense_col_var_len = fc.real_valued_column(
      "real_valued_column3", dimension=None)
  bucketized_1d = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
  bucketized_4d = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization2", 4),
      [0, 4])
  col_a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
  col_b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
  cross_col = fc.crossed_column(set([col_a, col_b]), hash_bucket_size=10000)
  feature_columns = set([
      sparse_col, embedding_col, weighted_id_col, dense_col_1d, dense_col_5d,
      dense_col_var_len, bucketized_1d, bucketized_4d, cross_col
  ])

  def dense_f32(shape):
    # Shorthand for a fixed-length float32 parse entry.
    return parsing_ops.FixedLenFeature(shape, dtype=dtypes.float32)

  var_len_str = parsing_ops.VarLenFeature(dtypes.string)
  expected_config = {
      "sparse_column": var_len_str,
      "sparse_column_for_embedding": var_len_str,
      "id_column": var_len_str,
      "id_weights_column": parsing_ops.VarLenFeature(dtypes.float32),
      "real_valued_column1": dense_f32([1]),
      "real_valued_column2": dense_f32([5]),
      "real_valued_column3": parsing_ops.VarLenFeature(dtype=dtypes.float32),
      "real_valued_column_for_bucketization1": dense_f32([1]),
      "real_valued_column_for_bucketization2": dense_f32([4]),
      "cross_aaa": var_len_str,
      "cross_bbb": var_len_str,
  }

  self.assertDictEqual(expected_config,
                       fc.create_feature_spec_for_parsing(feature_columns))

  # The same spec should come back when the columns arrive as a dict.
  feature_columns_dict = {
      str(i): col for i, col in enumerate(feature_columns)
  }
  self.assertDictEqual(
      expected_config,
      fc.create_feature_spec_for_parsing(feature_columns_dict))
def testCreateFeatureSpec(self):
  """Checks contrib and core parse specs for a broad set of column kinds."""
  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_col = fc.embedding_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_embedding", hash_bucket_size=10),
      dimension=4)
  str_sparse_id_col = fc.sparse_column_with_keys(
      "str_id_column", ["marlo", "omar", "stringer"])
  int32_sparse_id_col = fc.sparse_column_with_keys(
      "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
  int64_sparse_id_col = fc.sparse_column_with_keys(
      "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
  weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col,
                                              "str_id_weights_column")
  real_valued_col1 = fc.real_valued_column("real_valued_column1")
  real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
  bucketized_col1 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
  bucketized_col2 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization2", 4),
      [0, 4])
  a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
  b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
  cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
  one_hot_col = fc.one_hot_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_one_hot", hash_bucket_size=100))
  scattered_embedding_col = fc.scattered_embedding_column(
      "scattered_embedding_column", size=100, dimension=10, hash_key=1)
  feature_columns = set([
      sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
      int64_sparse_id_col, real_valued_col1, real_valued_col2,
      bucketized_col1, bucketized_col2, cross_col, one_hot_col,
      scattered_embedding_col
  ])

  # Start with every string-valued sparse column, then add the typed and
  # dense entries.
  expected_config = {
      name: parsing_ops.VarLenFeature(dtypes.string)
      for name in ("sparse_column", "sparse_column_for_embedding",
                   "str_id_column", "cross_aaa", "cross_bbb",
                   "sparse_column_for_one_hot", "scattered_embedding_column")
  }
  expected_config.update({
      "int32_id_column": parsing_ops.VarLenFeature(dtypes.int32),
      "int64_id_column": parsing_ops.VarLenFeature(dtypes.int64),
      "str_id_weights_column": parsing_ops.VarLenFeature(dtypes.float32),
      "real_valued_column1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column2":
          parsing_ops.FixedLenFeature([5], dtype=dtypes.float32),
      "real_valued_column_for_bucketization1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column_for_bucketization2":
          parsing_ops.FixedLenFeature([4], dtype=dtypes.float32),
  })

  config = fc.create_feature_spec_for_parsing(feature_columns)
  self.assertDictEqual(expected_config, config)

  # Tests that contrib feature columns work with core library:
  config_core = fc_core.make_parse_example_spec(feature_columns)
  self.assertDictEqual(expected_config, config_core)

  # Test that the same config is parsed out if we pass a dictionary.
  feature_columns_dict = {
      str(i): val for i, val in enumerate(feature_columns)
  }
  config = fc.create_feature_spec_for_parsing(feature_columns_dict)
  self.assertDictEqual(expected_config, config)
def testCreateFeatureSpec(self):
  """Checks the parse spec for columns including private var-len variants.

  In addition to the usual sparse/dense/bucketized/crossed columns, this
  exercises fc._real_valued_var_len_column in both its sparse and dense
  (FixedLenSequenceFeature) parse forms.
  """
  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_col = fc.embedding_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_embedding", hash_bucket_size=10),
      dimension=4)
  str_sparse_id_col = fc.sparse_column_with_keys(
      "str_id_column", ["marlo", "omar", "stringer"])
  int32_sparse_id_col = fc.sparse_column_with_keys(
      "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
  int64_sparse_id_col = fc.sparse_column_with_keys(
      "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
  weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col,
                                              "str_id_weights_column")
  real_valued_col1 = fc.real_valued_column("real_valued_column1")
  real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
  # Private variable-length column: is_sparse=True parses as a
  # VarLenFeature (see expected_config below).
  real_valued_col3 = fc._real_valued_var_len_column(
      "real_valued_column3", is_sparse=True)
  # is_sparse=False with a default_value parses as a
  # FixedLenSequenceFeature with allow_missing=True.
  real_valued_col4 = fc._real_valued_var_len_column(
      "real_valued_column4", dtype=dtypes.int64, default_value=0,
      is_sparse=False)
  bucketized_col1 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
  bucketized_col2 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization2", 4),
      [0, 4])
  a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
  b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
  cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
  one_hot_col = fc.one_hot_column(fc.sparse_column_with_hash_bucket(
      "sparse_column_for_one_hot", hash_bucket_size=100))
  scattered_embedding_col = fc.scattered_embedding_column(
      "scattered_embedding_column", size=100, dimension=10, hash_key=1)
  feature_columns = set([
      sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
      int64_sparse_id_col, real_valued_col1, real_valued_col2,
      real_valued_col3, real_valued_col4, bucketized_col1, bucketized_col2,
      cross_col, one_hot_col, scattered_embedding_col
  ])
  expected_config = {
      "sparse_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_embedding":
          parsing_ops.VarLenFeature(dtypes.string),
      "str_id_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "int32_id_column":
          parsing_ops.VarLenFeature(dtypes.int32),
      "int64_id_column":
          parsing_ops.VarLenFeature(dtypes.int64),
      "str_id_weights_column":
          parsing_ops.VarLenFeature(dtypes.float32),
      "real_valued_column1":
          parsing_ops.FixedLenFeature(
              [1], dtype=dtypes.float32),
      "real_valued_column2":
          parsing_ops.FixedLenFeature(
              [5], dtype=dtypes.float32),
      "real_valued_column3":
          parsing_ops.VarLenFeature(dtype=dtypes.float32),
      "real_valued_column4":
          parsing_ops.FixedLenSequenceFeature(
              [], dtype=dtypes.int64, allow_missing=True, default_value=0),
      "real_valued_column_for_bucketization1":
          parsing_ops.FixedLenFeature(
              [1], dtype=dtypes.float32),
      "real_valued_column_for_bucketization2":
          parsing_ops.FixedLenFeature(
              [4], dtype=dtypes.float32),
      "cross_aaa":
          parsing_ops.VarLenFeature(dtypes.string),
      "cross_bbb":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_one_hot":
          parsing_ops.VarLenFeature(dtypes.string),
      "scattered_embedding_column":
          parsing_ops.VarLenFeature(dtypes.string),
  }
  config = fc.create_feature_spec_for_parsing(feature_columns)
  self.assertDictEqual(expected_config, config)

  # Test that the same config is parsed out if we pass a dictionary.
  feature_columns_dict = {
      str(i): val for i, val in enumerate(feature_columns)
  }
  config = fc.create_feature_spec_for_parsing(feature_columns_dict)
  self.assertDictEqual(expected_config, config)
def testPrepareFeaturesForSQSS(self):
  """Checks _prepare_features_for_sqss splits features into sequence/context.

  Feeds one sparse sequence feature, one dense sequence feature and one
  scalar context feature; expects the sequence dict to carry the labels
  plus both sequence features unchanged, and the context dict to carry
  only the scalar context value.
  """
  mode = model_fn_lib.ModeKeys.TRAIN
  seq_feature_name = 'seq_feature'
  sparse_seq_feature_name = 'wire_cast'
  ctx_feature_name = 'ctx_feature'
  sequence_length = 4
  embedding_dimension = 8

  features = {
      sparse_seq_feature_name:
          sparse_tensor.SparseTensor(
              indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
                       [2, 0, 0], [2, 1, 1]],
              values=[
                  b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                  b'marlo', b'omar'
              ],
              dense_shape=[3, 2, 2]),
      seq_feature_name:
          constant_op.constant(1.0, shape=[sequence_length]),
      ctx_feature_name:
          constant_op.constant(2.0)
  }
  labels = constant_op.constant(5.0, shape=[sequence_length])
  wire_cast = feature_column.sparse_column_with_keys(
      'wire_cast', ['marlo', 'omar', 'stringer'])
  sequence_feature_columns = [
      feature_column.real_valued_column(seq_feature_name, dimension=1),
      feature_column.embedding_column(
          wire_cast,
          dimension=embedding_dimension,
          initializer=init_ops.ones_initializer())
  ]
  context_feature_columns = [
      feature_column.real_valued_column(ctx_feature_name, dimension=1)
  ]
  expected_sequence = {
      rnn_common.RNNKeys.LABELS_KEY:
          np.array([5., 5., 5., 5.]),
      seq_feature_name:
          np.array([1., 1., 1., 1.]),
      sparse_seq_feature_name:
          sparse_tensor.SparseTensor(
              indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
                       [2, 0, 0], [2, 1, 1]],
              values=[
                  b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                  b'marlo', b'omar'
              ],
              dense_shape=[3, 2, 2]),
  }
  expected_context = {ctx_feature_name: 2.}

  sequence, context = ssre._prepare_features_for_sqss(
      features, labels, mode, sequence_feature_columns,
      context_feature_columns)

  def assert_equal(expected, got):
    # Compare key sets first, then values; SparseTensors are compared
    # component-wise. The .eval() calls rely on the enclosing test_session
    # being the default session.
    self.assertEqual(sorted(expected), sorted(got))
    for k, v in expected.items():
      if isinstance(v, sparse_tensor.SparseTensor):
        self.assertAllEqual(v.values.eval(), got[k].values)
        self.assertAllEqual(v.indices.eval(), got[k].indices)
        self.assertAllEqual(v.dense_shape.eval(), got[k].dense_shape)
      else:
        self.assertAllEqual(v, got[k])

  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    # Use the non-deprecated table initializer, consistent with the other
    # copy of this test (data_flow_ops.initialize_all_tables is the
    # deprecated alias for the same op collection).
    sess.run(lookup_ops.tables_initializer())
    actual_sequence, actual_context = sess.run([sequence, context])
    assert_equal(expected_sequence, actual_sequence)
    assert_equal(expected_context, actual_context)