def testSharedEmbeddingColumn(self):
  """Tests weight sharing across shared embedding columns.

  Covers both the auto-generated shared_embedding_name and an explicitly
  supplied one, which lets an unrelated (here int32-keyed) column reuse
  the same embedding variable.
  """
  a1 = fc.sparse_column_with_keys("a1", ["marlo", "omar", "stringer"])
  a2 = fc.sparse_column_with_keys("a2", ["marlo", "omar", "stringer"])
  b = fc.shared_embedding_columns([a1, a2], dimension=4, combiner="mean")
  self.assertEqual(len(b), 2)
  self.assertEqual(b[0].shared_embedding_name, "a1_a2_shared_embedding")
  self.assertEqual(b[1].shared_embedding_name, "a1_a2_shared_embedding")
  # Create a sparse id tensor for a1.
  input_tensor_c1 = sparse_tensor_lib.SparseTensor(
      indices=[[0, 0], [1, 1], [2, 2]], values=[0, 1, 2], dense_shape=[3, 3])
  # Create a sparse id tensor for a2.
  input_tensor_c2 = sparse_tensor_lib.SparseTensor(
      indices=[[0, 0], [1, 1], [2, 2]], values=[0, 1, 2], dense_shape=[3, 3])
  with variable_scope.variable_scope("run_1"):
    b1 = feature_column_ops.input_from_feature_columns(
        {b[0]: input_tensor_c1}, [b[0]])
    b2 = feature_column_ops.input_from_feature_columns(
        {b[1]: input_tensor_c2}, [b[1]])
  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    b1_value = b1.eval()
    b2_value = b2.eval()
  # Identical ids fed through the shared embedding must give identical rows.
  for i in range(len(b1_value)):
    self.assertAllClose(b1_value[i], b2_value[i])
  # Test the case when a shared_embedding_name is explicitly specified.
  d = fc.shared_embedding_columns(
      [a1, a2],
      dimension=4,
      combiner="mean",
      shared_embedding_name="my_shared_embedding")
  # a3 is a completely different sparse column with a1 and a2, but since the
  # same shared_embedding_name is passed in, a3 will have the same embedding
  # as a1 and a2
  a3 = fc.sparse_column_with_keys("a3", [42, 1, -1000], dtype=dtypes.int32)
  e = fc.shared_embedding_columns(
      [a3],
      dimension=4,
      combiner="mean",
      shared_embedding_name="my_shared_embedding")
  with variable_scope.variable_scope("run_2"):
    d1 = feature_column_ops.input_from_feature_columns(
        {d[0]: input_tensor_c1}, [d[0]])
    e1 = feature_column_ops.input_from_feature_columns(
        {e[0]: input_tensor_c1}, [e[0]])
  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    d1_value = d1.eval()
    e1_value = e1.eval()
  for i in range(len(d1_value)):
    self.assertAllClose(d1_value[i], e1_value[i])
def testSharedEmbeddingColumn(self):
  """Tests weight sharing across shared embedding columns.

  Covers both the auto-generated shared_embedding_name and an explicitly
  supplied one, which lets an unrelated string-keyed column reuse the
  same embedding variable.
  """
  a1 = fc.sparse_column_with_keys("a1", ["marlo", "omar", "stringer"])
  a2 = fc.sparse_column_with_keys("a2", ["marlo", "omar", "stringer"])
  b = fc.shared_embedding_columns([a1, a2], dimension=4, combiner="mean")
  self.assertEqual(len(b), 2)
  self.assertEqual(b[0].shared_embedding_name, "a1_a2_shared_embedding")
  self.assertEqual(b[1].shared_embedding_name, "a1_a2_shared_embedding")
  # Create a sparse id tensor for a1.
  input_tensor_c1 = sparse_tensor_lib.SparseTensor(
      indices=[[0, 0], [1, 1], [2, 2]], values=[0, 1, 2], dense_shape=[3, 3])
  # Create a sparse id tensor for a2.
  input_tensor_c2 = sparse_tensor_lib.SparseTensor(
      indices=[[0, 0], [1, 1], [2, 2]], values=[0, 1, 2], dense_shape=[3, 3])
  with variable_scope.variable_scope("run_1"):
    b1 = feature_column_ops.input_from_feature_columns({
        b[0]: input_tensor_c1
    }, [b[0]])
    b2 = feature_column_ops.input_from_feature_columns({
        b[1]: input_tensor_c2
    }, [b[1]])
  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    b1_value = b1.eval()
    b2_value = b2.eval()
  # Identical ids fed through the shared embedding must give identical rows.
  for i in range(len(b1_value)):
    self.assertAllClose(b1_value[i], b2_value[i])
  # Test the case when a shared_embedding_name is explicitly specified.
  d = fc.shared_embedding_columns(
      [a1, a2],
      dimension=4,
      combiner="mean",
      shared_embedding_name="my_shared_embedding")
  # a3 is a completely different sparse column with a1 and a2, but since the
  # same shared_embedding_name is passed in, a3 will have the same embedding
  # as a1 and a2
  a3 = fc.sparse_column_with_keys("a3", ["cathy", "tom", "anderson"])
  e = fc.shared_embedding_columns(
      [a3],
      dimension=4,
      combiner="mean",
      shared_embedding_name="my_shared_embedding")
  with variable_scope.variable_scope("run_2"):
    d1 = feature_column_ops.input_from_feature_columns({
        d[0]: input_tensor_c1
    }, [d[0]])
    e1 = feature_column_ops.input_from_feature_columns({
        e[0]: input_tensor_c1
    }, [e[0]])
  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    d1_value = d1.eval()
    e1_value = e1.eval()
  for i in range(len(d1_value)):
    self.assertAllClose(d1_value[i], e1_value[i])
def testSharedEmbeddingColumnDeepCopy(self):
  """Deep-copied shared embedding columns keep their shared_embedding_name."""
  first = fc.sparse_column_with_keys("a1", ["marlo", "omar", "stringer"])
  second = fc.sparse_column_with_keys("a2", ["marlo", "omar", "stringer"])
  shared = fc.shared_embedding_columns(
      [first, second], dimension=4, combiner="mean")
  duplicated = copy.deepcopy(shared)
  self.assertEqual("a1_a2_shared_embedding",
                   duplicated[0].shared_embedding_name)
  self.assertEqual("a1_a2_shared_embedding",
                   duplicated[1].shared_embedding_name)
def testSharedEmbeddingColumnDeepCopy(self):
  """copy.deepcopy of shared embedding columns preserves the shared name."""
  col_a = fc.sparse_column_with_keys("a1", ["marlo", "omar", "stringer"])
  col_b = fc.sparse_column_with_keys("a2", ["marlo", "omar", "stringer"])
  originals = fc.shared_embedding_columns(
      [col_a, col_b], dimension=4, combiner="mean")
  clones = copy.deepcopy(originals)
  for clone in (clones[0], clones[1]):
    self.assertEqual("a1_a2_shared_embedding", clone.shared_embedding_name)
def testSharedEmbeddingColumnErrors(self):
  """shared_embedding_columns rejects non-sequence column containers."""
  # A bare string is not an acceptable container of sparse columns.
  with self.assertRaises(TypeError):
    fc.shared_embedding_columns(
        "Invalid string.", dimension=2, combiner="mean")
  # A set is rejected as well.
  with self.assertRaises(TypeError):
    unordered_columns = {
        fc.sparse_column_with_keys("a", ["foo", "bar"]),
        fc.sparse_column_with_keys("b", ["foo", "bar"]),
    }
    fc.shared_embedding_columns(unordered_columns, dimension=2,
                                combiner="mean")
def testSharedEmbeddingColumnErrors(self):
  """Invalid container types for shared_embedding_columns raise TypeError."""
  bad_inputs = []
  bad_inputs.append("Invalid string.")
  bad_inputs.append(
      set([
          fc.sparse_column_with_keys("a", ["foo", "bar"]),
          fc.sparse_column_with_keys("b", ["foo", "bar"]),
      ]))
  # Neither a string nor a set is a valid list of sparse columns.
  for bad in bad_inputs:
    with self.assertRaises(TypeError):
      fc.shared_embedding_columns(bad, dimension=2, combiner="mean")
def testWeightedSparseColumnDeepCopy(self):
  """Deep copy of a weighted sparse column preserves its configuration."""
  sparse_ids = fc.sparse_column_with_keys("ids",
                                          ["marlo", "omar", "stringer"])
  original = fc.weighted_sparse_column(sparse_ids, "weights")
  duplicate = copy.deepcopy(original)
  self.assertEqual("ids", duplicate.sparse_id_column.name)
  self.assertEqual("weights", duplicate.weight_column_name)
  self.assertEqual("ids_weighted_by_weights", duplicate.name)
def test_exogenous_input(self):
  """Test that no errors are raised when using exogenous features."""
  dtype = dtypes.float64
  times = [1, 2, 3, 4, 5, 6]
  values = [[0.01], [5.10], [5.21], [0.30], [5.41], [0.50]]
  # One categorical exogenous observation per time step.
  feature_a = [["off"], ["on"], ["on"], ["off"], ["on"], ["off"]]
  sparse_column_a = feature_column.sparse_column_with_keys(
      column_name="feature_a", keys=["on", "off"])
  one_hot_a = layers.one_hot_column(sparse_id_column=sparse_column_a)
  regressor = estimators.StructuralEnsembleRegressor(
      periodicities=[],
      num_features=1,
      moving_average_order=0,
      exogenous_feature_columns=[one_hot_a],
      dtype=dtype)
  features = {
      TrainEvalFeatures.TIMES: times,
      TrainEvalFeatures.VALUES: values,
      "feature_a": feature_a
  }
  train_input_fn = input_pipeline.RandomWindowInputFn(
      input_pipeline.NumpyReader(features), window_size=6, batch_size=1)
  regressor.train(input_fn=train_input_fn, steps=1)
  eval_input_fn = input_pipeline.WholeDatasetInputFn(
      input_pipeline.NumpyReader(features))
  evaluation = regressor.evaluate(input_fn=eval_input_fn, steps=1)
  # Predicting a continuation requires future values of the exogenous
  # feature to be supplied explicitly.
  predict_input_fn = input_pipeline.predict_continuation_input_fn(
      evaluation,
      times=[[7, 8, 9]],
      exogenous_features={"feature_a": [[["on"], ["off"], ["on"]]]})
  regressor.predict(input_fn=predict_input_fn)
def testOneHotColumnForWeightedSparseColumn(self):
  """one_hot_column wraps a weighted sparse column and infers its length."""
  sparse_ids = fc.sparse_column_with_keys("ids",
                                          ["marlo", "omar", "stringer"])
  weighted = fc.weighted_sparse_column(sparse_ids, "weights")
  encoded = fc.one_hot_column(weighted)
  self.assertEqual("ids_weighted_by_weights", encoded.sparse_id_column.name)
  self.assertEqual(3, encoded.length)
def testOneHotColumnDeepCopy(self):
  """Deep copy of a one-hot column keeps its source column, name, length."""
  source = fc.sparse_column_with_keys("a", ["a", "b", "c", "d"])
  one_hot = fc.one_hot_column(source)
  duplicate = copy.deepcopy(one_hot)
  self.assertEqual("a", duplicate.sparse_id_column.name)
  self.assertEqual("a_one_hot", one_hot.name)
  self.assertEqual(4, one_hot.length)
def testFloat32WeightedSparseStringColumnDtypes(self):
  """String ids weighted by default float32 weights parse as expected."""
  sparse_ids = fc.sparse_column_with_keys("ids",
                                          ["marlo", "omar", "stringer"])
  weighted = fc.weighted_sparse_column(sparse_ids, "weights")
  expected_spec = {
      "ids": parsing_ops.VarLenFeature(dtypes.string),
      "weights": parsing_ops.VarLenFeature(dtypes.float32),
  }
  self.assertDictEqual(expected_spec, weighted.config)
def testFloat32WeightedSparseInt32ColumnDtypes(self):
  """int32 ids weighted by default float32 weights parse as expected."""
  sparse_ids = fc.sparse_column_with_keys(
      "ids", [42, 1, -1000], dtype=dtypes.int32)
  weighted = fc.weighted_sparse_column(sparse_ids, "weights")
  expected_spec = {
      "ids": parsing_ops.VarLenFeature(dtypes.int32),
      "weights": parsing_ops.VarLenFeature(dtypes.float32),
  }
  self.assertDictEqual(expected_spec, weighted.config)
def testWeightedSparseColumnDeepCopy(self):
  """copy.deepcopy keeps a weighted sparse column's names intact."""
  id_column = fc.sparse_column_with_keys("ids",
                                         ["marlo", "omar", "stringer"])
  weighted_column = fc.weighted_sparse_column(id_column, "weights")
  clone = copy.deepcopy(weighted_column)
  self.assertEqual(clone.sparse_id_column.name, "ids")
  self.assertEqual(clone.weight_column_name, "weights")
  self.assertEqual(clone.name, "ids_weighted_by_weights")
def testFloat32WeightedSparseStringColumnDtypes(self):
  """Default weight dtype for string-keyed weighted columns is float32."""
  id_column = fc.sparse_column_with_keys("ids",
                                         ["marlo", "omar", "stringer"])
  weighted_column = fc.weighted_sparse_column(id_column, "weights")
  self.assertDictEqual(
      {
          "ids": parsing_ops.VarLenFeature(dtypes.string),
          "weights": parsing_ops.VarLenFeature(dtypes.float32),
      }, weighted_column.config)
def testFloat32WeightedSparseInt32ColumnDtypes(self):
  """Default weight dtype for int32-keyed weighted columns is float32."""
  id_column = fc.sparse_column_with_keys(
      "ids", [42, 1, -1000], dtype=dtypes.int32)
  weighted_column = fc.weighted_sparse_column(id_column, "weights")
  self.assertDictEqual(
      {
          "ids": parsing_ops.VarLenFeature(dtypes.int32),
          "weights": parsing_ops.VarLenFeature(dtypes.float32),
      }, weighted_column.config)
def testOneHotColumnDeepCopy(self):
  """copy.deepcopy of a one-hot column preserves its derived attributes."""
  base_column = fc.sparse_column_with_keys("a", ["a", "b", "c", "d"])
  column = fc.one_hot_column(base_column)
  clone = copy.deepcopy(column)
  self.assertEqual(clone.sparse_id_column.name, "a")
  self.assertEqual(column.name, "a_one_hot")
  self.assertEqual(column.length, 4)
def setUp(self):
  """Builds the RNN cell, target column, and feature columns for the tests."""
  super(DynamicRnnEstimatorTest, self).setUp()
  self.rnn_cell = core_rnn_cell_impl.BasicRNNCell(self.NUM_RNN_CELL_UNITS)
  self.mock_target_column = MockTargetColumn(
      num_label_columns=self.NUM_LABEL_COLUMNS)
  # Context (per-sequence) feature: one-hot encoded location.
  location_ids = feature_column.sparse_column_with_keys(
      'location', keys=['west_side', 'east_side', 'nyc'])
  self.context_feature_columns = [feature_column.one_hot_column(location_ids)]
  # Sequence (per-timestep) features: a 2-d real-valued measurement and an
  # 8-d embedding of the wire_cast ids.
  wire_cast_ids = feature_column.sparse_column_with_keys(
      'wire_cast', ['marlo', 'omar', 'stringer'])
  self.sequence_feature_columns = [
      feature_column.real_valued_column('measurements', dimension=2),
      feature_column.embedding_column(wire_cast_ids, dimension=8),
  ]
def setUp(self):
  """Prepares the cell, mock target column, and feature columns."""
  super(DynamicRnnEstimatorTest, self).setUp()
  self.rnn_cell = rnn_cell.BasicRNNCell(self.NUM_RNN_CELL_UNITS)
  self.mock_target_column = MockTargetColumn(
      num_label_columns=self.NUM_LABEL_COLUMNS)
  # Context (per-sequence) feature: one-hot encoded location.
  location_ids = feature_column.sparse_column_with_keys(
      'location', keys=['west_side', 'east_side', 'nyc'])
  self.context_feature_columns = [feature_column.one_hot_column(location_ids)]
  # Sequence (per-timestep) features: a 2-d real-valued measurement and an
  # 8-d embedding of the wire_cast ids.
  wire_cast_ids = feature_column.sparse_column_with_keys(
      'wire_cast', ['marlo', 'omar', 'stringer'])
  self.sequence_feature_columns = [
      feature_column.real_valued_column('measurements', dimension=2),
      feature_column.embedding_column(wire_cast_ids, dimension=8),
  ]
def testMissingValueInOneHotColumnForSparseColumnWithKeys(self):
  """Out-of-vocabulary values activate no bucket in the one-hot output."""
  sparse_ids = fc.sparse_column_with_keys("ids",
                                          ["marlo", "omar", "stringer"])
  one_hot = fc.one_hot_column(sparse_ids)
  features = {"ids": constant_op.constant([["marlo", "unknown", "omar"]])}
  output = feature_column_ops.input_from_feature_columns(features, [one_hot])
  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    sess.run(lookup_ops.tables_initializer())
    # "marlo" and "omar" each set one slot; "unknown" contributes nothing.
    self.assertAllEqual([[1., 1., 0.]], output.eval())
def testOneHotColumn(self):
  """one_hot_column length follows the wrapped column's vocabulary size."""
  keyed = fc.sparse_column_with_keys("a", ["a", "b", "c", "d"])
  onehot_keyed = fc.one_hot_column(keyed)
  self.assertEqual("a", onehot_keyed.sparse_id_column.name)
  self.assertEqual(4, onehot_keyed.length)
  hashed = fc.sparse_column_with_hash_bucket(
      "b", hash_bucket_size=100, combiner="sum")
  onehot_hashed = fc.one_hot_column(hashed)
  self.assertEqual("b", onehot_hashed.sparse_id_column.name)
  self.assertEqual(100, onehot_hashed.length)
def testOneHotColumn(self):
  """Length of a one-hot column equals vocab size or hash bucket size."""
  # Keyed column: length is the number of keys.
  column_a = fc.sparse_column_with_keys("a", ["a", "b", "c", "d"])
  encoded_a = fc.one_hot_column(column_a)
  self.assertEqual(encoded_a.sparse_id_column.name, "a")
  self.assertEqual(encoded_a.length, 4)
  # Hash-bucket column: length is the bucket count.
  column_b = fc.sparse_column_with_hash_bucket(
      "b", hash_bucket_size=100, combiner="sum")
  encoded_b = fc.one_hot_column(column_b)
  self.assertEqual(encoded_b.sparse_id_column.name, "b")
  self.assertEqual(encoded_b.length, 100)
def testInt32WeightedSparseInt64ColumnDtypes(self):
  """int64 ids accept int32 weights but reject non-numeric weight dtypes."""
  sparse_ids = fc.sparse_column_with_keys(
      "ids", [42, 1, -1000], dtype=dtypes.int64)
  weighted = fc.weighted_sparse_column(sparse_ids, "weights",
                                       dtype=dtypes.int32)
  expected_spec = {
      "ids": parsing_ops.VarLenFeature(dtypes.int64),
      "weights": parsing_ops.VarLenFeature(dtypes.int32),
  }
  self.assertDictEqual(expected_spec, weighted.config)
  # String weights cannot be combined numerically, so construction fails.
  with self.assertRaisesRegexp(ValueError,
                               "dtype is not convertible to float"):
    weighted = fc.weighted_sparse_column(
        sparse_ids, "weights", dtype=dtypes.string)
def testSharedEmbeddingColumnDeterminism(self):
  """Auto-generated shared_embedding_name is stable for shuffled inputs."""
  column_names = ["07", "02", "00", "03", "05", "01", "09", "06", "04", "08"]
  sparse_id_columns = tuple(
      fc.sparse_column_with_keys(name, ["foo", "bar"])
      for name in column_names)
  output = fc.shared_embedding_columns(
      sparse_id_columns, dimension=2, combiner="mean")
  self.assertEqual(10, len(output))
  # The generated name sorts the column names, so input order is irrelevant.
  for column in output:
    self.assertEqual("00_01_02_plus_7_others_shared_embedding",
                     column.shared_embedding_name)
def testInt32WeightedSparseInt64ColumnDtypes(self):
  """Weighted column config reflects int32 weights; string weights fail."""
  id_column = fc.sparse_column_with_keys(
      "ids", [42, 1, -1000], dtype=dtypes.int64)
  weighted_column = fc.weighted_sparse_column(
      id_column, "weights", dtype=dtypes.int32)
  self.assertDictEqual(
      {
          "ids": parsing_ops.VarLenFeature(dtypes.int64),
          "weights": parsing_ops.VarLenFeature(dtypes.int32),
      }, weighted_column.config)
  # Non-numeric weights are rejected at construction time.
  with self.assertRaisesRegexp(ValueError,
                               "dtype is not convertible to float"):
    weighted_column = fc.weighted_sparse_column(
        id_column, "weights", dtype=dtypes.string)
def testSharedEmbeddingColumnDeterminism(self):
  """Generated shared_embedding_name does not depend on column order."""
  shuffled = ["07", "02", "00", "03", "05", "01", "09", "06", "04", "08"]
  columns = tuple(
      [fc.sparse_column_with_keys(key, ["foo", "bar"]) for key in shuffled])
  shared = fc.shared_embedding_columns(columns, dimension=2, combiner="mean")
  self.assertEqual(len(shared), 10)
  expected_name = "00_01_02_plus_7_others_shared_embedding"
  for shared_column in shared:
    self.assertEqual(shared_column.shared_embedding_name, expected_name)
def testSparseColumnKeysDeepCopy(self):
  """Tests deepcopy of sparse_column_with_keys."""
  original = fc.sparse_column_with_keys("a", keys=["key0", "key1", "key2"])
  self.assertEqual("a", original.name)
  duplicate = copy.deepcopy(original)
  self.assertEqual("a", duplicate.name)
  expected_lookup = fc._SparseIdLookupConfig(  # pylint: disable=protected-access
      keys=("key0", "key1", "key2"), vocab_size=3, default_value=-1)
  self.assertEqual(expected_lookup, duplicate.lookup_config)
  self.assertFalse(duplicate.is_integerized)
def testSparseColumnKeysDeepCopy(self):
  """A deep copy of sparse_column_with_keys keeps name and lookup config."""
  column = fc.sparse_column_with_keys("a", keys=["key0", "key1", "key2"])
  self.assertEqual("a", column.name)
  clone = copy.deepcopy(column)
  self.assertEqual("a", clone.name)
  self.assertEqual(
      clone.lookup_config,
      fc._SparseIdLookupConfig(  # pylint: disable=protected-access
          keys=("key0", "key1", "key2"),
          vocab_size=3,
          default_value=-1))
  self.assertFalse(clone.is_integerized)
def testSharedEmbeddingColumnWithWeightedSparseColumn(self):
  """Tests creation of shared embeddings containing weighted sparse columns."""
  sparse_col = fc.sparse_column_with_keys("a1", ["marlo", "omar", "stringer"])
  ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
  weighted_sparse_col = fc.weighted_sparse_column(ids, "weights")
  self.assertEqual(weighted_sparse_col.name, "ids_weighted_by_weights")
  # A plain sparse column and a weighted one can share an embedding; the
  # shared name is derived from both column names.
  b = fc.shared_embedding_columns(
      [sparse_col, weighted_sparse_col], dimension=4, combiner="mean")
  self.assertEqual(len(b), 2)
  self.assertEqual(b[0].shared_embedding_name,
                   "a1_ids_weighted_by_weights_shared_embedding")
  self.assertEqual(b[1].shared_embedding_name,
                   "a1_ids_weighted_by_weights_shared_embedding")
  # Tries reversing order to check compatibility condition.
  b = fc.shared_embedding_columns(
      [weighted_sparse_col, sparse_col], dimension=4, combiner="mean")
  self.assertEqual(len(b), 2)
  self.assertEqual(b[0].shared_embedding_name,
                   "a1_ids_weighted_by_weights_shared_embedding")
  self.assertEqual(b[1].shared_embedding_name,
                   "a1_ids_weighted_by_weights_shared_embedding")
  # Tries adding two weighted columns to check compatibility between them.
  weighted_sparse_col_2 = fc.weighted_sparse_column(ids, "weights_2")
  b = fc.shared_embedding_columns(
      [weighted_sparse_col, weighted_sparse_col_2],
      dimension=4,
      combiner="mean")
  self.assertEqual(len(b), 2)
  self.assertEqual(
      b[0].shared_embedding_name,
      "ids_weighted_by_weights_ids_weighted_by_weights_2_shared_embedding")
  self.assertEqual(
      b[1].shared_embedding_name,
      "ids_weighted_by_weights_ids_weighted_by_weights_2_shared_embedding")
def testCreateSequenceFeatureSpec(self):
  """Tests the parsing spec generated for a mix of sequence feature columns."""
  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_col = fc.embedding_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_embedding", hash_bucket_size=10),
      dimension=4)
  sparse_id_col = fc.sparse_column_with_keys(
      "id_column", ["marlo", "omar", "stringer"])
  weighted_id_col = fc.weighted_sparse_column(sparse_id_col,
                                              "id_weights_column")
  real_valued_col1 = fc.real_valued_column("real_valued_column", dimension=2)
  real_valued_col2 = fc.real_valued_column(
      "real_valued_default_column", dimension=5, default_value=3.0)
  real_valued_col3 = fc._real_valued_var_len_column(
      "real_valued_var_len_column", default_value=3.0, is_sparse=True)
  real_valued_col4 = fc._real_valued_var_len_column(
      "real_valued_var_len_dense_column", default_value=4.0, is_sparse=False)
  feature_columns = set([
      sparse_col, embedding_col, weighted_id_col, real_valued_col1,
      real_valued_col2, real_valued_col3, real_valued_col4
  ])
  feature_spec = fc._create_sequence_feature_spec_for_parsing(feature_columns)
  # Categorical/sparse columns parse as VarLenFeature; fixed-size dense
  # columns parse as FixedLenSequenceFeature, with allow_missing=True only
  # when a default value was configured.
  expected_feature_spec = {
      "sparse_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_embedding":
          parsing_ops.VarLenFeature(dtypes.string),
      "id_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "id_weights_column":
          parsing_ops.VarLenFeature(dtypes.float32),
      "real_valued_column":
          parsing_ops.FixedLenSequenceFeature(
              shape=[2], dtype=dtypes.float32, allow_missing=False),
      "real_valued_default_column":
          parsing_ops.FixedLenSequenceFeature(
              shape=[5], dtype=dtypes.float32, allow_missing=True),
      "real_valued_var_len_column":
          parsing_ops.VarLenFeature(dtype=dtypes.float32),
      "real_valued_var_len_dense_column":
          parsing_ops.FixedLenSequenceFeature(
              shape=[],
              dtype=dtypes.float32,
              allow_missing=True,
              default_value=4.0),
  }
  self.assertDictEqual(expected_feature_spec, feature_spec)
def testCreateSequenceFeatureSpec(self):
  """Checks _create_sequence_feature_spec_for_parsing over column types."""
  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_col = fc.embedding_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_embedding", hash_bucket_size=10),
      dimension=4)
  sparse_id_col = fc.sparse_column_with_keys(
      "id_column", ["marlo", "omar", "stringer"])
  weighted_id_col = fc.weighted_sparse_column(sparse_id_col,
                                              "id_weights_column")
  real_valued_col1 = fc.real_valued_column("real_valued_column", dimension=2)
  real_valued_col2 = fc.real_valued_column(
      "real_valued_default_column", dimension=5, default_value=3.0)
  real_valued_col3 = fc._real_valued_var_len_column(
      "real_valued_var_len_column", default_value=3.0, is_sparse=True)
  real_valued_col4 = fc._real_valued_var_len_column(
      "real_valued_var_len_dense_column", default_value=4.0, is_sparse=False)
  feature_columns = set([
      sparse_col, embedding_col, weighted_id_col, real_valued_col1,
      real_valued_col2, real_valued_col3, real_valued_col4
  ])
  feature_spec = fc._create_sequence_feature_spec_for_parsing(feature_columns)
  # Sparse columns map to VarLenFeature; dense fixed-size columns map to
  # FixedLenSequenceFeature (allow_missing=True only with a default value).
  expected_feature_spec = {
      "sparse_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_embedding":
          parsing_ops.VarLenFeature(dtypes.string),
      "id_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "id_weights_column":
          parsing_ops.VarLenFeature(dtypes.float32),
      "real_valued_column":
          parsing_ops.FixedLenSequenceFeature(
              shape=[2], dtype=dtypes.float32, allow_missing=False),
      "real_valued_default_column":
          parsing_ops.FixedLenSequenceFeature(
              shape=[5], dtype=dtypes.float32, allow_missing=True),
      "real_valued_var_len_column":
          parsing_ops.VarLenFeature(dtype=dtypes.float32),
      "real_valued_var_len_dense_column":
          parsing_ops.FixedLenSequenceFeature(
              shape=[],
              dtype=dtypes.float32,
              allow_missing=True,
              default_value=4.0),
  }
  self.assertDictEqual(expected_feature_spec, feature_spec)
def testPrepareInputsForRnnSparseAndDense(self):
  """Tests RNN input prep with an embedded sparse plus a dense feature."""
  num_unroll = 2
  embedding_dimension = 8
  dense_dimension = 2
  # Per time step, each row is the 8 embedding values followed by the 2
  # dense values. With ones-initialized embeddings and combiner='sum', the
  # embedding values equal the number of ids present at that position.
  expected = [
      np.array([[1., 1., 1., 1., 1., 1., 1., 1., 111., 112.],
                [1., 1., 1., 1., 1., 1., 1., 1., 211., 212.],
                [1., 1., 1., 1., 1., 1., 1., 1., 311., 312.]]),
      np.array([[1., 1., 1., 1., 1., 1., 1., 1., 121., 122.],
                [2., 2., 2., 2., 2., 2., 2., 2., 221., 222.],
                [1., 1., 1., 1., 1., 1., 1., 1., 321., 322.]])
  ]
  sequence_features = {
      'wire_cast':
          sparse_tensor.SparseTensor(
              indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
                       [2, 0, 0], [2, 1, 1]],
              values=[
                  b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                  b'marlo', b'omar'
              ],
              dense_shape=[3, 2, 2]),
      'seq_feature0':
          constant_op.constant([[[111., 112.], [121., 122.]],
                                [[211., 212.], [221., 222.]],
                                [[311., 312.], [321., 322.]]])
  }
  wire_cast = feature_column.sparse_column_with_keys(
      'wire_cast', ['marlo', 'omar', 'stringer'])
  wire_cast_embedded = feature_column.embedding_column(
      wire_cast,
      dimension=embedding_dimension,
      combiner='sum',
      initializer=init_ops.ones_initializer())
  seq_feature0_column = feature_column.real_valued_column(
      'seq_feature0', dimension=dense_dimension)
  sequence_feature_columns = [seq_feature0_column, wire_cast_embedded]
  context_features = None
  self._test_prepare_inputs_for_rnn(sequence_features, context_features,
                                    sequence_feature_columns, num_unroll,
                                    expected)
def testMissingValueInOneHotColumnForWeightedSparseColumn(self):
  """One-hot of a weighted column sums weights; OOV ids contribute nothing.

  Regression test for GitHub issue 12583.
  """
  sparse_ids = fc.sparse_column_with_keys("ids",
                                          ["marlo", "omar", "stringer"])
  weighted = fc.weighted_sparse_column(sparse_ids, "weights")
  one_hot = fc.one_hot_column(weighted)
  features = {
      'ids': constant_op.constant([['marlo', 'unknown', 'omar']]),
      'weights': constant_op.constant([[2., 4., 6.]])
  }
  output = feature_column_ops.input_from_feature_columns(features, [one_hot])
  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    sess.run(lookup_ops.tables_initializer())
    # "marlo" carries weight 2, "omar" weight 6; "unknown" (4) is dropped.
    self.assertAllEqual([[2., 6., 0.]], output.eval())
def testPrepareInputsForRnnSparseAndDense(self):
  """RNN input prep: embedded sparse ids concatenated with dense values."""
  num_unroll = 2
  embedding_dimension = 8
  dense_dimension = 2
  # Each expected row: 8 embedding values (ones-initialized, combiner='sum',
  # so equal to the id count at that position) then the 2 dense values.
  expected = [
      np.array([[1., 1., 1., 1., 1., 1., 1., 1., 111., 112.],
                [1., 1., 1., 1., 1., 1., 1., 1., 211., 212.],
                [1., 1., 1., 1., 1., 1., 1., 1., 311., 312.]]),
      np.array([[1., 1., 1., 1., 1., 1., 1., 1., 121., 122.],
                [2., 2., 2., 2., 2., 2., 2., 2., 221., 222.],
                [1., 1., 1., 1., 1., 1., 1., 1., 321., 322.]])
  ]
  sequence_features = {
      'wire_cast':
          sparse_tensor.SparseTensor(
              indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
                       [2, 0, 0], [2, 1, 1]],
              values=[
                  b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                  b'marlo', b'omar'
              ],
              dense_shape=[3, 2, 2]),
      'seq_feature0':
          constant_op.constant([[[111., 112.], [121., 122.]],
                                [[211., 212.], [221., 222.]],
                                [[311., 312.], [321., 322.]]])
  }
  wire_cast = feature_column.sparse_column_with_keys(
      'wire_cast', ['marlo', 'omar', 'stringer'])
  wire_cast_embedded = feature_column.embedding_column(
      wire_cast,
      dimension=embedding_dimension,
      combiner='sum',
      initializer=init_ops.ones_initializer())
  seq_feature0_column = feature_column.real_valued_column(
      'seq_feature0', dimension=dense_dimension)
  sequence_feature_columns = [seq_feature0_column, wire_cast_embedded]
  context_features = None
  self._test_prepare_inputs_for_rnn(sequence_features, context_features,
                                    sequence_feature_columns, num_unroll,
                                    expected)
def testWeightedSparseColumnDtypes(self):
  """Weight dtype defaults to float32, is overridable, and must be numeric."""
  sparse_ids = fc.sparse_column_with_keys("ids",
                                          ["marlo", "omar", "stringer"])
  default_weighted = fc.weighted_sparse_column(sparse_ids, "weights")
  self.assertDictEqual(
      {
          "ids": parsing_ops.VarLenFeature(dtypes.string),
          "weights": parsing_ops.VarLenFeature(dtypes.float32),
      }, default_weighted.config)
  int_weighted = fc.weighted_sparse_column(
      sparse_ids, "weights", dtype=dtypes.int32)
  self.assertDictEqual(
      {
          "ids": parsing_ops.VarLenFeature(dtypes.string),
          "weights": parsing_ops.VarLenFeature(dtypes.int32),
      }, int_weighted.config)
  # Non-numeric weights are rejected.
  with self.assertRaisesRegexp(ValueError,
                               "dtype is not convertible to float"):
    fc.weighted_sparse_column(sparse_ids, "weights", dtype=dtypes.string)
def testOneHotReshaping(self):
  """Tests reshaping behavior of `OneHotColumn`.

  For every admissible output_rank, the one-hot layer must keep the first
  output_rank - 1 dimensions of the id tensor and append the vocabulary
  size as the final dimension.
  """
  id_tensor_shape = [3, 2, 4, 5]
  sparse_column = fc.sparse_column_with_keys(
      "animals", ["squirrel", "moose", "dragon", "octopus"])
  one_hot = fc.one_hot_column(sparse_column)
  vocab_size = len(sparse_column.lookup_config.keys)
  id_tensor = _sparse_id_tensor(id_tensor_shape, vocab_size)
  for output_rank in range(1, len(id_tensor_shape) + 1):
    with variable_scope.variable_scope("output_rank_{}".format(output_rank)):
      one_hot_output = one_hot._to_dnn_input_layer(  # pylint: disable=protected-access
          id_tensor, output_rank=output_rank)
    with self.test_session() as sess:
      one_hot_value = sess.run(one_hot_output)
      expected_shape = (id_tensor_shape[:output_rank - 1] + [vocab_size])
      # Use assertEqual; assertEquals is a deprecated unittest alias.
      self.assertEqual(expected_shape, list(one_hot_value.shape))
def testOneHotReshaping(self):
  """Tests reshaping behavior of `OneHotColumn`.

  For every admissible output_rank, the one-hot layer must keep the first
  output_rank - 1 dimensions of the id tensor and append the vocabulary
  size as the final dimension.
  """
  id_tensor_shape = [3, 2, 4, 5]
  sparse_column = fc.sparse_column_with_keys(
      "animals", ["squirrel", "moose", "dragon", "octopus"])
  one_hot = fc.one_hot_column(sparse_column)
  vocab_size = len(sparse_column.lookup_config.keys)
  id_tensor = _sparse_id_tensor(id_tensor_shape, vocab_size)
  for output_rank in range(1, len(id_tensor_shape) + 1):
    with variable_scope.variable_scope("output_rank_{}".format(output_rank)):
      one_hot_output = one_hot._to_dnn_input_layer(  # pylint: disable=protected-access
          id_tensor, output_rank=output_rank)
    with self.test_session() as sess:
      one_hot_value = sess.run(one_hot_output)
      expected_shape = (id_tensor_shape[:output_rank - 1] + [vocab_size])
      # Use assertEqual; assertEquals is a deprecated unittest alias.
      self.assertEqual(expected_shape, list(one_hot_value.shape))
def testPrepareInputsForRnnSparse(self):
  """Tests RNN input prep with a single embedded sparse sequence feature."""
  num_unroll = 2
  embedding_dimension = 8
  # Ones-initialized embeddings with combiner='sum' make each output value
  # equal the number of ids present at that (batch, step) position.
  expected = [
      np.array([[1., 1., 1., 1., 1., 1., 1., 1.],
                [1., 1., 1., 1., 1., 1., 1., 1.],
                [1., 1., 1., 1., 1., 1., 1., 1.]]),
      np.array([[1., 1., 1., 1., 1., 1., 1., 1.],
                [2., 2., 2., 2., 2., 2., 2., 2.],
                [1., 1., 1., 1., 1., 1., 1., 1.]])
  ]
  sequence_features = {
      'wire_cast':
          sparse_tensor.SparseTensor(
              indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
                       [2, 0, 0], [2, 1, 1]],
              values=[
                  b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                  b'marlo', b'omar'
              ],
              dense_shape=[3, 2, 2])
  }
  wire_cast = feature_column.sparse_column_with_keys(
      'wire_cast', ['marlo', 'omar', 'stringer'])
  sequence_feature_columns = [
      feature_column.embedding_column(
          wire_cast,
          dimension=embedding_dimension,
          combiner='sum',
          initializer=init_ops.ones_initializer())
  ]
  context_features = None
  self._test_prepare_inputs_for_rnn(sequence_features, context_features,
                                    sequence_feature_columns, num_unroll,
                                    expected)
def testPrepareInputsForRnnSparse(self):
  """RNN input prep for one embedded sparse feature, no context features."""
  num_unroll = 2
  embedding_dimension = 8
  # With ones-initialized embeddings and combiner='sum', each output value
  # equals the count of ids at that (batch, step) position.
  expected = [
      np.array([[1., 1., 1., 1., 1., 1., 1., 1.],
                [1., 1., 1., 1., 1., 1., 1., 1.],
                [1., 1., 1., 1., 1., 1., 1., 1.]]),
      np.array([[1., 1., 1., 1., 1., 1., 1., 1.],
                [2., 2., 2., 2., 2., 2., 2., 2.],
                [1., 1., 1., 1., 1., 1., 1., 1.]])
  ]
  sequence_features = {
      'wire_cast':
          sparse_tensor.SparseTensor(
              indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
                       [2, 0, 0], [2, 1, 1]],
              values=[
                  b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                  b'marlo', b'omar'
              ],
              dense_shape=[3, 2, 2])
  }
  wire_cast = feature_column.sparse_column_with_keys(
      'wire_cast', ['marlo', 'omar', 'stringer'])
  sequence_feature_columns = [
      feature_column.embedding_column(
          wire_cast,
          dimension=embedding_dimension,
          combiner='sum',
          initializer=init_ops.ones_initializer())
  ]
  context_features = None
  self._test_prepare_inputs_for_rnn(sequence_features, context_features,
                                    sequence_feature_columns, num_unroll,
                                    expected)
def testWeightedSparseColumnDtypes(self):
  """Default, overridden, and invalid weight dtypes for weighted columns."""
  id_column = fc.sparse_column_with_keys("ids",
                                         ["marlo", "omar", "stringer"])
  # Default weight dtype is float32.
  weighted_column = fc.weighted_sparse_column(id_column, "weights")
  expected_default = {
      "ids": parsing_ops.VarLenFeature(dtypes.string),
      "weights": parsing_ops.VarLenFeature(dtypes.float32),
  }
  self.assertDictEqual(expected_default, weighted_column.config)
  # An explicit numeric dtype is honored.
  weighted_column = fc.weighted_sparse_column(
      id_column, "weights", dtype=dtypes.int32)
  expected_int = {
      "ids": parsing_ops.VarLenFeature(dtypes.string),
      "weights": parsing_ops.VarLenFeature(dtypes.int32),
  }
  self.assertDictEqual(expected_int, weighted_column.config)
  # String weights are not convertible to float and are rejected.
  with self.assertRaisesRegexp(ValueError,
                               "dtype is not convertible to float"):
    weighted_column = fc.weighted_sparse_column(
        id_column, "weights", dtype=dtypes.string)
def gen_feature_column(self, feature_conf):
  """Builds a feature column from a single feature config dict.

  Args:
    feature_conf: dict describing one feature. Keys read here:
      'feature_name' (required), 'comment' (presence disables the feature),
      'vocab_size' or 'hash_bucket_size' (categorical -> embedding),
      'embedding_dimension', 'name' / 'shared_name' (shared embedding name,
      see NOTE below), 'dimension', and 'l2_norm' (presence enables L2
      normalization of the dense column).

  Returns:
    An embedding column for categorical configs, a real-valued column for
    dense configs, or None when the config carries a 'comment' key.
  """
  feature_name = feature_conf['feature_name']
  # Configs carrying a 'comment' key are treated as disabled.
  if "comment" in feature_conf:
    return None
  if "vocab_size" in feature_conf:
    # Vocabulary ids are the stringified integers 0..vocab_size-1.
    id_feature = fc.sparse_column_with_keys(
        column_name=feature_name,
        keys=[str(i) for i in range(feature_conf['vocab_size'])])
    # NOTE(review): this uses the private fc._EmbeddingColumn API, and reads
    # the shared embedding name from 'name' here but from 'shared_name' in
    # the hash-bucket branch below -- confirm whether that asymmetry is
    # intentional.
    return fc._EmbeddingColumn(
        id_feature,
        dimension=feature_conf['embedding_dimension'],
        shared_embedding_name=feature_conf.get('name'),
    )
  elif 'hash_bucket_size' in feature_conf:
    id_feature = tf.contrib.layers.sparse_column_with_hash_bucket(
        column_name=feature_name,
        hash_bucket_size=feature_conf['hash_bucket_size'],
        # use_hashmap=use_hashmap
    )
    return fc._EmbeddingColumn(
        id_feature,
        dimension=feature_conf['embedding_dimension'],
        shared_embedding_name=feature_conf.get('shared_name', None),
        max_norm=None)
  else:
    # Dense feature: zero-filled default of the configured width, with
    # optional L2 normalization along the last axis.
    return tf.contrib.layers.real_valued_column(
        column_name=feature_name,
        dimension=feature_conf.get('dimension', 1),
        default_value=[
            0.0 for _ in range(int(feature_conf.get('dimension', 1)))
        ],
        normalizer=None if 'l2_norm' not in feature_conf else
        lambda x: tf.nn.l2_normalize(x, dim=-1))
def testOneHotColumnForWeightedSparseColumn(self):
  """A one-hot column built on a weighted column reports the right length."""
  id_column = fc.sparse_column_with_keys("ids",
                                         ["marlo", "omar", "stringer"])
  weighted_column = fc.weighted_sparse_column(id_column, "weights")
  one_hot_column = fc.one_hot_column(weighted_column)
  self.assertEqual(one_hot_column.sparse_id_column.name,
                   "ids_weighted_by_weights")
  self.assertEqual(one_hot_column.length, 3)
def testCreateFeatureSpec(self):
  """Checks create_feature_spec_for_parsing over one column of every kind.

  Builds sparse, embedding, weighted, var-len, real-valued, bucketized,
  crossed, one-hot and scattered-embedding columns and verifies the parsing
  spec produced for the whole set, both as a set and as a dict of columns.
  """
  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_col = fc.embedding_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_embedding", hash_bucket_size=10),
      dimension=4)
  str_sparse_id_col = fc.sparse_column_with_keys(
      "str_id_column", ["marlo", "omar", "stringer"])
  int32_sparse_id_col = fc.sparse_column_with_keys(
      "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
  int64_sparse_id_col = fc.sparse_column_with_keys(
      "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
  weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col,
                                              "str_id_weights_column")
  real_valued_col1 = fc.real_valued_column("real_valued_column1")
  real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
  # Var-len real valued columns: sparse -> VarLenFeature,
  # dense -> FixedLenSequenceFeature (see expected_config below).
  real_valued_col3 = fc._real_valued_var_len_column(
      "real_valued_column3", is_sparse=True)
  real_valued_col4 = fc._real_valued_var_len_column(
      "real_valued_column4",
      dtype=dtypes.int64,
      default_value=0,
      is_sparse=False)
  bucketized_col1 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
  bucketized_col2 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization2", 4),
      [0, 4])
  a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
  b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
  cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
  one_hot_col = fc.one_hot_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_one_hot", hash_bucket_size=100))
  scattered_embedding_col = fc.scattered_embedding_column(
      "scattered_embedding_column", size=100, dimension=10, hash_key=1)
  feature_columns = set([
      sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
      int64_sparse_id_col, real_valued_col1, real_valued_col2,
      real_valued_col3, real_valued_col4, bucketized_col1, bucketized_col2,
      cross_col, one_hot_col, scattered_embedding_col
  ])
  expected_config = {
      "sparse_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_embedding":
          parsing_ops.VarLenFeature(dtypes.string),
      "str_id_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "int32_id_column":
          parsing_ops.VarLenFeature(dtypes.int32),
      "int64_id_column":
          parsing_ops.VarLenFeature(dtypes.int64),
      "str_id_weights_column":
          parsing_ops.VarLenFeature(dtypes.float32),
      "real_valued_column1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column2":
          parsing_ops.FixedLenFeature([5], dtype=dtypes.float32),
      "real_valued_column3":
          parsing_ops.VarLenFeature(dtype=dtypes.float32),
      "real_valued_column4":
          parsing_ops.FixedLenSequenceFeature(
              [], dtype=dtypes.int64, allow_missing=True, default_value=0),
      "real_valued_column_for_bucketization1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column_for_bucketization2":
          parsing_ops.FixedLenFeature([4], dtype=dtypes.float32),
      "cross_aaa":
          parsing_ops.VarLenFeature(dtypes.string),
      "cross_bbb":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_one_hot":
          parsing_ops.VarLenFeature(dtypes.string),
      "scattered_embedding_column":
          parsing_ops.VarLenFeature(dtypes.string),
  }
  config = fc.create_feature_spec_for_parsing(feature_columns)
  self.assertDictEqual(expected_config, config)
  # Test that the same config is parsed out if we pass a dictionary.
  feature_columns_dict = {
      str(i): val for i, val in enumerate(feature_columns)
  }
  config = fc.create_feature_spec_for_parsing(feature_columns_dict)
  self.assertDictEqual(expected_config, config)
def testInitCrossedColumnWeightsFromCkpt(self):
  """Crossed-column weights can be restored from a checkpoint.

  Run 1 creates a crossed column over three sparse columns, nudges its
  weights away from the zero initializer, and saves a checkpoint. Run 2
  builds a new crossed column configured with ckpt_to_load_from /
  tensor_name_in_ckpt and verifies the restored weights match the saved
  ones.
  """
  sparse_col_1 = fc.sparse_column_with_hash_bucket(
      column_name="col_1", hash_bucket_size=4)
  sparse_col_2 = fc.sparse_column_with_keys(
      column_name="col_2", keys=("foo", "bar", "baz"))
  sparse_col_3 = fc.sparse_column_with_keys(
      column_name="col_3", keys=(42, 1, -1000), dtype=dtypes.int64)
  crossed_col = fc.crossed_column(
      columns=[sparse_col_1, sparse_col_2, sparse_col_3], hash_bucket_size=4)
  input_tensor = sparse_tensor_lib.SparseTensor(
      indices=[[0, 0], [1, 1], [2, 2], [3, 3]],
      values=[0, 1, 2, 3],
      dense_shape=[4, 4])
  # Invoking 'weighted_sum_from_feature_columns' will create the crossed
  # column weights variable.
  with variable_scope.variable_scope("run_1"):
    with variable_scope.variable_scope(crossed_col.name):
      # Returns looked up column weights which is same as crossed column
      # weights as well as actual references to weights variables.
      _, col_weights, _ = (
          feature_column_ops.weighted_sum_from_feature_columns({
              sparse_col_1.name: input_tensor,
              sparse_col_2.name: input_tensor,
              sparse_col_3.name: input_tensor
          }, [crossed_col], 1))
      # Update the weights since default initializer initializes all weights
      # to 0.0.
      for weight in col_weights.values():
        assign_op = state_ops.assign(weight[0], weight[0] + 0.5)
  save = saver.Saver()
  ckpt_dir_prefix = os.path.join(self.get_temp_dir(),
                                 "init_crossed_col_w_from_ckpt")
  ckpt_dir = tempfile.mkdtemp(prefix=ckpt_dir_prefix)
  checkpoint_path = os.path.join(ckpt_dir, "model.ckpt")
  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    sess.run(assign_op)
    saved_col_weights = col_weights[crossed_col][0].eval()
    save.save(sess, checkpoint_path)
  # The tensor name encodes the scopes used in run_1 above.
  crossed_col_initialized = fc.crossed_column(
      columns=[sparse_col_1, sparse_col_2],
      hash_bucket_size=4,
      ckpt_to_load_from=checkpoint_path,
      tensor_name_in_ckpt=("run_1/col_1_X_col_2_X_col_3/"
                           "weighted_sum_from_feature_columns/"
                           "col_1_X_col_2_X_col_3/weights"))
  with variable_scope.variable_scope("run_2"):
    # This will initialize the crossed column weights from provided checkpoint
    # and return a [4, 1] tensor which is same as weights variable. Since we
    # won't modify weights, this should be same as 'saved_col_weights'.
    _, col_weights, _ = (
        feature_column_ops.weighted_sum_from_feature_columns({
            sparse_col_1.name: input_tensor,
            sparse_col_2.name: input_tensor
        }, [crossed_col_initialized], 1))
    col_weights_from_ckpt = col_weights[crossed_col_initialized][0]
  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    loaded_col_weights = col_weights_from_ckpt.eval()
  self.assertAllClose(saved_col_weights, loaded_col_weights)
def testInitCrossedColumnWeightsFromCkpt(self):
  """Duplicate of the crossed-column checkpoint-restore test.

  NOTE(review): this method is byte-for-byte equivalent to the earlier
  testInitCrossedColumnWeightsFromCkpt in this file; if both live in the
  same class the later definition shadows the earlier one — confirm and
  deduplicate.
  """
  sparse_col_1 = fc.sparse_column_with_hash_bucket(
      column_name="col_1", hash_bucket_size=4)
  sparse_col_2 = fc.sparse_column_with_keys(
      column_name="col_2", keys=("foo", "bar", "baz"))
  sparse_col_3 = fc.sparse_column_with_keys(
      column_name="col_3", keys=(42, 1, -1000), dtype=dtypes.int64)
  crossed_col = fc.crossed_column(
      columns=[sparse_col_1, sparse_col_2, sparse_col_3], hash_bucket_size=4)
  input_tensor = sparse_tensor_lib.SparseTensor(
      indices=[[0, 0], [1, 1], [2, 2], [3, 3]],
      values=[0, 1, 2, 3],
      dense_shape=[4, 4])
  # Invoking 'weighted_sum_from_feature_columns' will create the crossed
  # column weights variable.
  with variable_scope.variable_scope("run_1"):
    with variable_scope.variable_scope(crossed_col.name):
      # Returns looked up column weights which is same as crossed column
      # weights as well as actual references to weights variables.
      _, col_weights, _ = (
          feature_column_ops.weighted_sum_from_feature_columns({
              sparse_col_1.name: input_tensor,
              sparse_col_2.name: input_tensor,
              sparse_col_3.name: input_tensor
          }, [crossed_col], 1))
      # Update the weights since default initializer initializes all weights
      # to 0.0.
      for weight in col_weights.values():
        assign_op = state_ops.assign(weight[0], weight[0] + 0.5)
  save = saver.Saver()
  ckpt_dir_prefix = os.path.join(self.get_temp_dir(),
                                 "init_crossed_col_w_from_ckpt")
  ckpt_dir = tempfile.mkdtemp(prefix=ckpt_dir_prefix)
  checkpoint_path = os.path.join(ckpt_dir, "model.ckpt")
  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    sess.run(assign_op)
    saved_col_weights = col_weights[crossed_col][0].eval()
    save.save(sess, checkpoint_path)
  # The tensor name encodes the scopes used in run_1 above.
  crossed_col_initialized = fc.crossed_column(
      columns=[sparse_col_1, sparse_col_2],
      hash_bucket_size=4,
      ckpt_to_load_from=checkpoint_path,
      tensor_name_in_ckpt=("run_1/col_1_X_col_2_X_col_3/"
                           "weighted_sum_from_feature_columns/"
                           "col_1_X_col_2_X_col_3/weights"))
  with variable_scope.variable_scope("run_2"):
    # This will initialize the crossed column weights from provided checkpoint
    # and return a [4, 1] tensor which is same as weights variable. Since we
    # won't modify weights, this should be same as 'saved_col_weights'.
    _, col_weights, _ = (
        feature_column_ops.weighted_sum_from_feature_columns({
            sparse_col_1.name: input_tensor,
            sparse_col_2.name: input_tensor
        }, [crossed_col_initialized], 1))
    col_weights_from_ckpt = col_weights[crossed_col_initialized][0]
  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    loaded_col_weights = col_weights_from_ckpt.eval()
  self.assertAllClose(saved_col_weights, loaded_col_weights)
def testCreateFeatureSpec(self):
  """Checks create_feature_spec_for_parsing over one column of every kind.

  NOTE(review): this method duplicates an earlier testCreateFeatureSpec in
  this file; if both are in the same class the later definition shadows the
  earlier one — confirm and deduplicate.
  """
  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_col = fc.embedding_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_embedding", hash_bucket_size=10),
      dimension=4)
  str_sparse_id_col = fc.sparse_column_with_keys(
      "str_id_column", ["marlo", "omar", "stringer"])
  int32_sparse_id_col = fc.sparse_column_with_keys(
      "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
  int64_sparse_id_col = fc.sparse_column_with_keys(
      "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
  weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col,
                                              "str_id_weights_column")
  real_valued_col1 = fc.real_valued_column("real_valued_column1")
  real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
  # Var-len real valued columns: sparse -> VarLenFeature,
  # dense -> FixedLenSequenceFeature (see expected_config below).
  real_valued_col3 = fc._real_valued_var_len_column(
      "real_valued_column3", is_sparse=True)
  real_valued_col4 = fc._real_valued_var_len_column(
      "real_valued_column4",
      dtype=dtypes.int64,
      default_value=0,
      is_sparse=False)
  bucketized_col1 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
  bucketized_col2 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization2", 4),
      [0, 4])
  a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
  b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
  cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
  one_hot_col = fc.one_hot_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_one_hot", hash_bucket_size=100))
  scattered_embedding_col = fc.scattered_embedding_column(
      "scattered_embedding_column", size=100, dimension=10, hash_key=1)
  feature_columns = set([
      sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
      int64_sparse_id_col, real_valued_col1, real_valued_col2,
      real_valued_col3, real_valued_col4, bucketized_col1, bucketized_col2,
      cross_col, one_hot_col, scattered_embedding_col
  ])
  expected_config = {
      "sparse_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_embedding":
          parsing_ops.VarLenFeature(dtypes.string),
      "str_id_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "int32_id_column":
          parsing_ops.VarLenFeature(dtypes.int32),
      "int64_id_column":
          parsing_ops.VarLenFeature(dtypes.int64),
      "str_id_weights_column":
          parsing_ops.VarLenFeature(dtypes.float32),
      "real_valued_column1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column2":
          parsing_ops.FixedLenFeature([5], dtype=dtypes.float32),
      "real_valued_column3":
          parsing_ops.VarLenFeature(dtype=dtypes.float32),
      "real_valued_column4":
          parsing_ops.FixedLenSequenceFeature(
              [], dtype=dtypes.int64, allow_missing=True, default_value=0),
      "real_valued_column_for_bucketization1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column_for_bucketization2":
          parsing_ops.FixedLenFeature([4], dtype=dtypes.float32),
      "cross_aaa":
          parsing_ops.VarLenFeature(dtypes.string),
      "cross_bbb":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_one_hot":
          parsing_ops.VarLenFeature(dtypes.string),
      "scattered_embedding_column":
          parsing_ops.VarLenFeature(dtypes.string),
  }
  config = fc.create_feature_spec_for_parsing(feature_columns)
  self.assertDictEqual(expected_config, config)
  # Test that the same config is parsed out if we pass a dictionary.
  feature_columns_dict = {
      str(i): val for i, val in enumerate(feature_columns)
  }
  config = fc.create_feature_spec_for_parsing(feature_columns_dict)
  self.assertDictEqual(expected_config, config)
def gen_feature(feature_conf):
  """Builds one feature column from a config dict (module-level variant).

  Dispatch order (first match wins):
    * 'vocab_size'                                    -> keyed column + embedding
    * hash bucket, no embedding                       -> raw id column
    * hash bucket + embedding, no boundaries/vocab    -> hashed embedding
    * embedding + vocabulary_file, no boundaries      -> vocab-file embedding
    * embedding + boundaries                          -> embedding_bucketized_column
    * boundaries only                                 -> bucketized column
    * otherwise                                       -> real-valued column

  Args:
    feature_conf: dict for one feature; keys read here include
      feature_name_key, value_type_key, 'vocab_size', 'hash_bucket_size',
      'embedding_dimension', 'boundaries', 'vocabulary_file',
      'num_oov_buckets', 'shared_name', 'dimension', 'l2_norm',
      'add_random', 'use_hashmap'.

  Returns:
    A feature column object.

  Raises:
    Exception: for Int-typed features with a vocabulary_file.
  """
  name = feature_conf[feature_name_key]
  value_type = feature_conf[value_type_key]
  if "vocab_size" in feature_conf:
    # NOTE(review): keys=range(...) yields ints while dtype=tf.string —
    # confirm this branch works; the sibling builder in this file
    # stringifies the keys instead.
    id_feature = fc.sparse_column_with_keys(
        column_name=name,
        keys=range(feature_conf['vocab_size']),
        dtype=tf.string)
    # NOTE(review): shared name taken from feature_name_key here but from
    # 'shared_name' in all later branches — verify which is intended.
    return fc._EmbeddingColumn(
        id_feature,
        dimension=feature_conf['embedding_dimension'],
        shared_embedding_name=feature_conf.get(feature_name_key),
    )
  elif "hash_bucket_size" in feature_conf \
      and "embedding_dimension" not in feature_conf:
    # Raw id column (no embedding); integerized for Int-typed values.
    if value_type == "Int":
      id_feature = layers.sparse_column_with_integerized_feature(
          column_name=name,
          bucket_size=feature_conf['hash_bucket_size'],
          combiner=_get_combiner(feature_conf),
          # use_hashmap=use_hashmap
      )
    else:
      id_feature = layers.sparse_column_with_hash_bucket(
          column_name=name,
          hash_bucket_size=feature_conf['hash_bucket_size'],
          combiner=_get_combiner(feature_conf),
          # use_hashmap=use_hashmap
      )
    return id_feature
  elif "embedding_dimension" in feature_conf \
      and "hash_bucket_size" in feature_conf \
      and "boundaries" not in feature_conf \
      and "vocabulary_file" not in feature_conf:
    # Hashed id column wrapped in an embedding.
    # NOTE(review): bare _EmbeddingColumn here vs fc._EmbeddingColumn in the
    # first branch — confirm both names resolve to the same class.
    if value_type == "Int":
      return _EmbeddingColumn(
          sparse_id_column=layers.sparse_column_with_integerized_feature(
              column_name=name,
              bucket_size=feature_conf['hash_bucket_size'],
              combiner=_get_combiner(feature_conf),
              # use_hashmap=use_hashmap
          ),
          dimension=feature_conf['embedding_dimension'],
          combiner=_get_combiner(feature_conf),
          shared_embedding_name=feature_conf.get('shared_name', None))
    else:
      id_feature = layers.sparse_column_with_hash_bucket(
          column_name=name,
          hash_bucket_size=feature_conf['hash_bucket_size'],
          # use_hashmap=use_hashmap
      )
      return _EmbeddingColumn(
          id_feature,
          dimension=feature_conf['embedding_dimension'],
          combiner=_get_combiner(feature_conf),
          shared_embedding_name=feature_conf.get('shared_name', None),
          max_norm=None)
  elif "embedding_dimension" in feature_conf \
      and "boundaries" not in feature_conf and "vocabulary_file" in feature_conf:
    # use_hashmap is read here but not passed on (the kwarg is commented
    # out everywhere) — presumably a disabled feature toggle.
    use_hashmap = feature_conf.get("use_hashmap", False)
    if value_type == "Int":
      raise Exception(
          "embedding with vocabulary_file does not support Int type")
    else:
      id_feature = fc.sparse_column_with_vocabulary_file(
          column_name=name,
          vocabulary_file=feature_conf["vocabulary_file"],
          num_oov_buckets=feature_conf["num_oov_buckets"],
          vocab_size=feature_conf["vocab_size"],
      )
      return _EmbeddingColumn(
          id_feature,
          dimension=feature_conf['embedding_dimension'],
          combiner=_get_combiner(feature_conf),
          shared_embedding_name=feature_conf.get('shared_name', None),
          max_norm=None)
  elif "embedding_dimension" in feature_conf \
      and "boundaries" in feature_conf:
    # Dense value bucketized by comma-separated float boundaries, then
    # embedded.
    return embedding_bucketized_column(
        layers.real_valued_column(
            column_name=name,
            dimension=feature_conf.get('dimension', 1),
            default_value=[
                0.0 for _ in range(int(feature_conf.get('dimension', 1)))
            ]),
        boundaries=[
            float(b) for b in feature_conf['boundaries'].split(',')
        ],
        embedding_dimension=feature_conf["embedding_dimension"],
        max_norm=None,
        shared_name=feature_conf.get('shared_name', None),
        add_random=feature_conf.get('add_random', False))
  elif "embedding_dimension" not in feature_conf \
      and "boundaries" in feature_conf:
    # Plain bucketization, no embedding.
    return layers.bucketized_column(
        layers.real_valued_column(
            column_name=name,
            dimension=feature_conf.get('dimension', 1),
            default_value=[
                0.0 for _ in range(int(feature_conf.get('dimension', 1)))
            ]),
        boundaries=[
            float(b) for b in feature_conf['boundaries'].split(',')
        ])
  else:
    # Dense feature; zero-filled default with optional L2 normalization.
    return layers.real_valued_column(
        column_name=name,
        dimension=feature_conf.get('dimension', 1),
        default_value=[
            0.0 for _ in range(int(feature_conf.get('dimension', 1)))
        ],
        normalizer=None if 'l2_norm' not in feature_conf else
        lambda x: tf.nn.l2_normalize(x, dim=-1))
def testWeightedSparseColumn(self):
  """The weighted column's name combines the id and weight column names."""
  sparse_ids = fc.sparse_column_with_keys("ids",
                                          ["marlo", "omar", "stringer"])
  weighted = fc.weighted_sparse_column(sparse_ids, "weights")
  self.assertEqual("ids_weighted_by_weights", weighted.name)
def testPrepareFeaturesForSQSS(self):
  """_prepare_features_for_sqss splits features into sequence vs context.

  Builds a sparse sequence feature, a dense sequence feature and a scalar
  context feature, then checks that labels and sequence features land in
  the sequence dict and the scalar lands in the context dict.
  """
  mode = model_fn_lib.ModeKeys.TRAIN
  seq_feature_name = 'seq_feature'
  sparse_seq_feature_name = 'wire_cast'
  ctx_feature_name = 'ctx_feature'
  sequence_length = 4
  embedding_dimension = 8
  features = {
      sparse_seq_feature_name:
          sparse_tensor.SparseTensor(
              indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
                       [2, 0, 0], [2, 1, 1]],
              values=[
                  b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                  b'marlo', b'omar'
              ],
              dense_shape=[3, 2, 2]),
      seq_feature_name:
          constant_op.constant(1.0, shape=[sequence_length]),
      ctx_feature_name:
          constant_op.constant(2.0)
  }
  labels = constant_op.constant(5.0, shape=[sequence_length])
  wire_cast = feature_column.sparse_column_with_keys(
      'wire_cast', ['marlo', 'omar', 'stringer'])
  sequence_feature_columns = [
      feature_column.real_valued_column(seq_feature_name, dimension=1),
      feature_column.embedding_column(
          wire_cast,
          dimension=embedding_dimension,
          initializer=init_ops.ones_initializer())
  ]
  context_feature_columns = [
      feature_column.real_valued_column(ctx_feature_name, dimension=1)
  ]
  # The sparse feature passes through unchanged; labels are keyed under
  # RNNKeys.LABELS_KEY.
  expected_sequence = {
      rnn_common.RNNKeys.LABELS_KEY:
          np.array([5., 5., 5., 5.]),
      seq_feature_name:
          np.array([1., 1., 1., 1.]),
      sparse_seq_feature_name:
          sparse_tensor.SparseTensor(
              indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
                       [2, 0, 0], [2, 1, 1]],
              values=[
                  b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                  b'marlo', b'omar'
              ],
              dense_shape=[3, 2, 2]),
  }
  expected_context = {ctx_feature_name: 2.}
  sequence, context = ssre._prepare_features_for_sqss(
      features, labels, mode, sequence_feature_columns,
      context_feature_columns)

  def assert_equal(expected, got):
    # Compares dense values directly and SparseTensors field by field.
    self.assertEqual(sorted(expected), sorted(got))
    for k, v in expected.items():
      if isinstance(v, sparse_tensor.SparseTensor):
        self.assertAllEqual(v.values.eval(), got[k].values)
        self.assertAllEqual(v.indices.eval(), got[k].indices)
        self.assertAllEqual(v.dense_shape.eval(), got[k].dense_shape)
      else:
        self.assertAllEqual(v, got[k])

  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    # The keyed sparse column requires its lookup table to be initialized.
    sess.run(lookup_ops.tables_initializer())
    actual_sequence, actual_context = sess.run([sequence, context])
    assert_equal(expected_sequence, actual_sequence)
    assert_equal(expected_context, actual_context)
def testCreateFeatureSpec(self):
  """Variant of the feature-spec test without int-keyed / one-hot columns.

  Also exercises real_valued_column(dimension=None), which maps to a
  VarLenFeature in the parsing spec.
  """
  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_col = fc.embedding_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_embedding", hash_bucket_size=10),
      dimension=4)
  sparse_id_col = fc.sparse_column_with_keys("id_column",
                                             ["marlo", "omar", "stringer"])
  weighted_id_col = fc.weighted_sparse_column(sparse_id_col,
                                              "id_weights_column")
  real_valued_col1 = fc.real_valued_column("real_valued_column1")
  real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
  # dimension=None means variable length -> VarLenFeature below.
  real_valued_col3 = fc.real_valued_column(
      "real_valued_column3", dimension=None)
  bucketized_col1 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
  bucketized_col2 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization2", 4),
      [0, 4])
  a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
  b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
  cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
  feature_columns = set([
      sparse_col, embedding_col, weighted_id_col, real_valued_col1,
      real_valued_col2, real_valued_col3, bucketized_col1, bucketized_col2,
      cross_col
  ])
  expected_config = {
      "sparse_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_embedding":
          parsing_ops.VarLenFeature(dtypes.string),
      "id_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "id_weights_column":
          parsing_ops.VarLenFeature(dtypes.float32),
      "real_valued_column1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column2":
          parsing_ops.FixedLenFeature([5], dtype=dtypes.float32),
      "real_valued_column3":
          parsing_ops.VarLenFeature(dtype=dtypes.float32),
      "real_valued_column_for_bucketization1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column_for_bucketization2":
          parsing_ops.FixedLenFeature([4], dtype=dtypes.float32),
      "cross_aaa":
          parsing_ops.VarLenFeature(dtypes.string),
      "cross_bbb":
          parsing_ops.VarLenFeature(dtypes.string)
  }
  config = fc.create_feature_spec_for_parsing(feature_columns)
  self.assertDictEqual(expected_config, config)
  # Test that the same config is parsed out if we pass a dictionary.
  feature_columns_dict = {
      str(i): val for i, val in enumerate(feature_columns)
  }
  config = fc.create_feature_spec_for_parsing(feature_columns_dict)
  self.assertDictEqual(expected_config, config)
def testPrepareFeaturesForSQSS(self):
  """_prepare_features_for_sqss splits features into sequence vs context.

  Builds a sparse sequence feature, a dense sequence feature and a scalar
  context feature, then checks that labels and sequence features land in
  the sequence dict and the scalar lands in the context dict.

  Fix: replaced the deprecated data_flow_ops.initialize_all_tables() with
  lookup_ops.tables_initializer(), matching the sibling
  testPrepareFeaturesForSQSS in this file.
  """
  mode = model_fn_lib.ModeKeys.TRAIN
  seq_feature_name = 'seq_feature'
  sparse_seq_feature_name = 'wire_cast'
  ctx_feature_name = 'ctx_feature'
  sequence_length = 4
  embedding_dimension = 8
  features = {
      sparse_seq_feature_name:
          sparse_tensor.SparseTensor(
              indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
                       [2, 0, 0], [2, 1, 1]],
              values=[
                  b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                  b'marlo', b'omar'
              ],
              dense_shape=[3, 2, 2]),
      seq_feature_name:
          constant_op.constant(1.0, shape=[sequence_length]),
      ctx_feature_name:
          constant_op.constant(2.0)
  }
  labels = constant_op.constant(5.0, shape=[sequence_length])
  wire_cast = feature_column.sparse_column_with_keys(
      'wire_cast', ['marlo', 'omar', 'stringer'])
  sequence_feature_columns = [
      feature_column.real_valued_column(seq_feature_name, dimension=1),
      feature_column.embedding_column(
          wire_cast,
          dimension=embedding_dimension,
          initializer=init_ops.ones_initializer())
  ]
  context_feature_columns = [
      feature_column.real_valued_column(ctx_feature_name, dimension=1)
  ]
  # The sparse feature passes through unchanged; labels are keyed under
  # RNNKeys.LABELS_KEY.
  expected_sequence = {
      rnn_common.RNNKeys.LABELS_KEY:
          np.array([5., 5., 5., 5.]),
      seq_feature_name:
          np.array([1., 1., 1., 1.]),
      sparse_seq_feature_name:
          sparse_tensor.SparseTensor(
              indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
                       [2, 0, 0], [2, 1, 1]],
              values=[
                  b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                  b'marlo', b'omar'
              ],
              dense_shape=[3, 2, 2]),
  }
  expected_context = {ctx_feature_name: 2.}
  sequence, context = ssre._prepare_features_for_sqss(
      features, labels, mode, sequence_feature_columns,
      context_feature_columns)

  def assert_equal(expected, got):
    # Compares dense values directly and SparseTensors field by field.
    self.assertEqual(sorted(expected), sorted(got))
    for k, v in expected.items():
      if isinstance(v, sparse_tensor.SparseTensor):
        self.assertAllEqual(v.values.eval(), got[k].values)
        self.assertAllEqual(v.indices.eval(), got[k].indices)
        self.assertAllEqual(v.dense_shape.eval(), got[k].dense_shape)
      else:
        self.assertAllEqual(v, got[k])

  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    # The keyed sparse column requires its lookup table to be initialized.
    # tables_initializer() is the non-deprecated replacement for
    # initialize_all_tables().
    sess.run(lookup_ops.tables_initializer())
    actual_sequence, actual_context = sess.run([sequence, context])
    assert_equal(expected_sequence, actual_sequence)
    assert_equal(expected_context, actual_context)
def testLearnLyrics(self):
  """End-to-end: an RNN estimator learns to predict the next lyric word.

  Trains a basic-RNN StateSavingRnnEstimator on a repeated lyric sequence
  and asserts evaluation accuracy exceeds a fixed threshold.
  """
  lyrics = 'if I go there will be trouble and if I stay it will be double'
  lyrics_list = lyrics.split()
  sequence_length = len(lyrics_list)
  vocab = set(lyrics_list)
  batch_size = 16
  num_classes = len(vocab)
  num_unroll = 7  # not a divisor of sequence_length
  train_steps = 350
  eval_steps = 30
  num_units = [4]
  learning_rate = 0.4
  accuracy_threshold = 0.65

  def get_lyrics_input_fn(seed):
    # Returns an input_fn that starts at a random offset into the lyrics.

    def input_fn():
      start = random_ops.random_uniform(
          (), minval=0, maxval=sequence_length, dtype=dtypes.int32, seed=seed)
      # Concatenate lyrics_list so inputs and labels wrap when start > 0.
      lyrics_list_concat = lyrics_list + lyrics_list
      inputs_dense = array_ops.slice(lyrics_list_concat, [start],
                                     [sequence_length])
      indices = array_ops.constant(
          [[i, 0] for i in range(sequence_length)], dtype=dtypes.int64)
      dense_shape = [sequence_length, 1]
      inputs = sparse_tensor.SparseTensor(
          indices=indices, values=inputs_dense, dense_shape=dense_shape)
      table = lookup.string_to_index_table_from_tensor(
          mapping=list(vocab), default_value=-1, name='lookup')
      # Labels are the inputs shifted by one word (next-word prediction).
      labels = table.lookup(
          array_ops.slice(lyrics_list_concat, [start + 1],
                          [sequence_length]))
      return {'lyrics': inputs}, labels

    return input_fn

  sequence_feature_columns = [
      feature_column.embedding_column(
          feature_column.sparse_column_with_keys('lyrics', vocab),
          dimension=8)
  ]
  config = run_config.RunConfig(tf_random_seed=21212)
  sequence_estimator = ssre.StateSavingRnnEstimator(
      constants.ProblemType.CLASSIFICATION,
      num_units=num_units,
      cell_type='basic_rnn',
      num_unroll=num_unroll,
      batch_size=batch_size,
      sequence_feature_columns=sequence_feature_columns,
      num_classes=num_classes,
      learning_rate=learning_rate,
      config=config,
      predict_probabilities=True,
      queue_capacity=2 + batch_size,
      seed=1234)
  train_input_fn = get_lyrics_input_fn(seed=12321)
  eval_input_fn = get_lyrics_input_fn(seed=32123)
  sequence_estimator.fit(input_fn=train_input_fn, steps=train_steps)
  evaluation = sequence_estimator.evaluate(
      input_fn=eval_input_fn, steps=eval_steps)
  accuracy = evaluation['accuracy']
  self.assertGreater(accuracy, accuracy_threshold,
                     'Accuracy should be higher than {}; got {}'.format(
                         accuracy_threshold, accuracy))
def testLearnLyrics(self):
  """End-to-end next-word prediction test.

  NOTE(review): duplicates the earlier testLearnLyrics in this file; if both
  are in the same class the later definition shadows the earlier one —
  confirm and deduplicate.
  """
  lyrics = 'if I go there will be trouble and if I stay it will be double'
  lyrics_list = lyrics.split()
  sequence_length = len(lyrics_list)
  vocab = set(lyrics_list)
  batch_size = 16
  num_classes = len(vocab)
  num_unroll = 7  # not a divisor of sequence_length
  train_steps = 350
  eval_steps = 30
  num_units = [4]
  learning_rate = 0.4
  accuracy_threshold = 0.65

  def get_lyrics_input_fn(seed):
    # Returns an input_fn that starts at a random offset into the lyrics.

    def input_fn():
      start = random_ops.random_uniform(
          (), minval=0, maxval=sequence_length, dtype=dtypes.int32, seed=seed)
      # Concatenate lyrics_list so inputs and labels wrap when start > 0.
      lyrics_list_concat = lyrics_list + lyrics_list
      inputs_dense = array_ops.slice(lyrics_list_concat, [start],
                                     [sequence_length])
      indices = array_ops.constant(
          [[i, 0] for i in range(sequence_length)], dtype=dtypes.int64)
      dense_shape = [sequence_length, 1]
      inputs = sparse_tensor.SparseTensor(
          indices=indices, values=inputs_dense, dense_shape=dense_shape)
      table = lookup.string_to_index_table_from_tensor(
          mapping=list(vocab), default_value=-1, name='lookup')
      # Labels are the inputs shifted by one word (next-word prediction).
      labels = table.lookup(
          array_ops.slice(lyrics_list_concat, [start + 1],
                          [sequence_length]))
      return {'lyrics': inputs}, labels

    return input_fn

  sequence_feature_columns = [
      feature_column.embedding_column(
          feature_column.sparse_column_with_keys('lyrics', vocab),
          dimension=8)
  ]
  config = run_config.RunConfig(tf_random_seed=21212)
  sequence_estimator = ssre.StateSavingRnnEstimator(
      constants.ProblemType.CLASSIFICATION,
      num_units=num_units,
      cell_type='basic_rnn',
      num_unroll=num_unroll,
      batch_size=batch_size,
      sequence_feature_columns=sequence_feature_columns,
      num_classes=num_classes,
      learning_rate=learning_rate,
      config=config,
      predict_probabilities=True,
      queue_capacity=2 + batch_size,
      seed=1234)
  train_input_fn = get_lyrics_input_fn(seed=12321)
  eval_input_fn = get_lyrics_input_fn(seed=32123)
  sequence_estimator.fit(input_fn=train_input_fn, steps=train_steps)
  evaluation = sequence_estimator.evaluate(
      input_fn=eval_input_fn, steps=eval_steps)
  accuracy = evaluation['accuracy']
  self.assertGreater(accuracy, accuracy_threshold,
                     'Accuracy should be higher than {}; got {}'.format(
                         accuracy_threshold, accuracy))
def testCreateFeatureSpec(self):
  """Variant of the feature-spec test that also checks the core-library path.

  Besides fc.create_feature_spec_for_parsing, verifies that
  fc_core.make_parse_example_spec produces the identical spec for the same
  contrib columns.
  """
  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_col = fc.embedding_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_embedding", hash_bucket_size=10),
      dimension=4)
  str_sparse_id_col = fc.sparse_column_with_keys(
      "str_id_column", ["marlo", "omar", "stringer"])
  int32_sparse_id_col = fc.sparse_column_with_keys(
      "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
  int64_sparse_id_col = fc.sparse_column_with_keys(
      "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
  weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col,
                                              "str_id_weights_column")
  real_valued_col1 = fc.real_valued_column("real_valued_column1")
  real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
  bucketized_col1 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
  bucketized_col2 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization2", 4),
      [0, 4])
  a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
  b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
  cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
  one_hot_col = fc.one_hot_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_one_hot", hash_bucket_size=100))
  scattered_embedding_col = fc.scattered_embedding_column(
      "scattered_embedding_column", size=100, dimension=10, hash_key=1)
  feature_columns = set([
      sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
      int64_sparse_id_col, real_valued_col1, real_valued_col2,
      bucketized_col1, bucketized_col2, cross_col, one_hot_col,
      scattered_embedding_col
  ])
  expected_config = {
      "sparse_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_embedding":
          parsing_ops.VarLenFeature(dtypes.string),
      "str_id_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "int32_id_column":
          parsing_ops.VarLenFeature(dtypes.int32),
      "int64_id_column":
          parsing_ops.VarLenFeature(dtypes.int64),
      "str_id_weights_column":
          parsing_ops.VarLenFeature(dtypes.float32),
      "real_valued_column1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column2":
          parsing_ops.FixedLenFeature([5], dtype=dtypes.float32),
      "real_valued_column_for_bucketization1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column_for_bucketization2":
          parsing_ops.FixedLenFeature([4], dtype=dtypes.float32),
      "cross_aaa":
          parsing_ops.VarLenFeature(dtypes.string),
      "cross_bbb":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_one_hot":
          parsing_ops.VarLenFeature(dtypes.string),
      "scattered_embedding_column":
          parsing_ops.VarLenFeature(dtypes.string),
  }
  config = fc.create_feature_spec_for_parsing(feature_columns)
  self.assertDictEqual(expected_config, config)
  # Tests that contrib feature columns work with core library:
  config_core = fc_core.make_parse_example_spec(feature_columns)
  self.assertDictEqual(expected_config, config_core)
  # Test that the same config is parsed out if we pass a dictionary.
  feature_columns_dict = {
      str(i): val for i, val in enumerate(feature_columns)
  }
  config = fc.create_feature_spec_for_parsing(feature_columns_dict)
  self.assertDictEqual(expected_config, config)
def testWeightedSparseColumn(self):
  """Derived name of a weighted sparse column is '<ids>_weighted_by_<w>'."""
  keyed_column = fc.sparse_column_with_keys(
      "ids", ["marlo", "omar", "stringer"])
  weighted_column = fc.weighted_sparse_column(keyed_column, "weights")
  self.assertEqual(weighted_column.name, "ids_weighted_by_weights")
def testCreateFeatureSpec(self):
  """Parsing spec for a mix of columns, incl. a dimension=None column.

  NOTE(review): SOURCE defines `testCreateFeatureSpec` more than once; if
  these methods share one class, only the last definition is collected by
  the test runner — confirm whether the duplicates are intentional.
  """
  # Plain hashed sparse column.
  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  # Embedding on top of a hashed sparse column.
  embedding_col = fc.embedding_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_embedding", hash_bucket_size=10),
      dimension=4)
  # Keyed sparse column plus a weight column attached to it.
  sparse_id_col = fc.sparse_column_with_keys(
      "id_column", ["marlo", "omar", "stringer"])
  weighted_id_col = fc.weighted_sparse_column(
      sparse_id_col, "id_weights_column")
  # Real-valued columns: scalar, length-5 vector, and variable-length
  # (dimension=None — expected to parse as VarLenFeature below).
  real_valued_col1 = fc.real_valued_column("real_valued_column1")
  real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
  real_valued_col3 = fc.real_valued_column(
      "real_valued_column3", dimension=None)
  # Bucketized real-valued columns.
  bucketized_col1 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization1"),
      [0, 4])
  bucketized_col2 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization2", 4),
      [0, 4])
  # Crossed column over two hashed sparse columns.
  cross_a = fc.sparse_column_with_hash_bucket(
      "cross_aaa", hash_bucket_size=100)
  cross_b = fc.sparse_column_with_hash_bucket(
      "cross_bbb", hash_bucket_size=100)
  cross_col = fc.crossed_column(
      set([cross_a, cross_b]), hash_bucket_size=10000)

  feature_columns = set([
      sparse_col, embedding_col, weighted_id_col, real_valued_col1,
      real_valued_col2, real_valued_col3, bucketized_col1,
      bucketized_col2, cross_col
  ])

  expected_config = {
      "sparse_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_embedding":
          parsing_ops.VarLenFeature(dtypes.string),
      "id_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "id_weights_column":
          parsing_ops.VarLenFeature(dtypes.float32),
      "real_valued_column1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column2":
          parsing_ops.FixedLenFeature([5], dtype=dtypes.float32),
      "real_valued_column3":
          parsing_ops.VarLenFeature(dtype=dtypes.float32),
      "real_valued_column_for_bucketization1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column_for_bucketization2":
          parsing_ops.FixedLenFeature([4], dtype=dtypes.float32),
      "cross_aaa":
          parsing_ops.VarLenFeature(dtypes.string),
      "cross_bbb":
          parsing_ops.VarLenFeature(dtypes.string)
  }

  config = fc.create_feature_spec_for_parsing(feature_columns)
  self.assertDictEqual(expected_config, config)

  # Passing a dict of columns must yield the same spec as passing a set.
  feature_columns_dict = {
      str(idx): col for idx, col in enumerate(feature_columns)
  }
  config = fc.create_feature_spec_for_parsing(feature_columns_dict)
  self.assertDictEqual(expected_config, config)
def testCreateFeatureSpec(self):
  """Parsing spec covers every contrib column type, incl. core compat.

  NOTE(review): this method is textually identical to an earlier
  `testCreateFeatureSpec` in SOURCE; duplicate method names in one class
  shadow each other — confirm whether this repetition is intentional.
  """
  # Plain hashed sparse column.
  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  # Embedding on top of a hashed sparse column.
  embedding_col = fc.embedding_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_embedding", hash_bucket_size=10),
      dimension=4)
  # Keyed sparse columns, one per supported key dtype.
  str_sparse_id_col = fc.sparse_column_with_keys(
      "str_id_column", ["marlo", "omar", "stringer"])
  int32_sparse_id_col = fc.sparse_column_with_keys(
      "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
  int64_sparse_id_col = fc.sparse_column_with_keys(
      "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
  weighted_id_col = fc.weighted_sparse_column(
      str_sparse_id_col, "str_id_weights_column")
  # Real-valued columns: scalar and length-5 vector.
  real_valued_col1 = fc.real_valued_column("real_valued_column1")
  real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
  # Bucketized real-valued columns (scalar and length-4 vector inputs).
  bucketized_col1 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization1"),
      [0, 4])
  bucketized_col2 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization2", 4),
      [0, 4])
  # Crossed column over two hashed sparse columns.
  cross_a = fc.sparse_column_with_hash_bucket(
      "cross_aaa", hash_bucket_size=100)
  cross_b = fc.sparse_column_with_hash_bucket(
      "cross_bbb", hash_bucket_size=100)
  cross_col = fc.crossed_column(
      set([cross_a, cross_b]), hash_bucket_size=10000)
  # One-hot and scattered-embedding columns.
  one_hot_col = fc.one_hot_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_one_hot", hash_bucket_size=100))
  scattered_embedding_col = fc.scattered_embedding_column(
      "scattered_embedding_column", size=100, dimension=10, hash_key=1)

  feature_columns = set([
      sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
      int64_sparse_id_col, real_valued_col1, real_valued_col2,
      bucketized_col1, bucketized_col2, cross_col, one_hot_col,
      scattered_embedding_col
  ])

  # Every sparse-backed column parses as VarLenFeature; real-valued
  # columns with a fixed dimension parse as FixedLenFeature.
  expected_config = {
      "sparse_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_embedding":
          parsing_ops.VarLenFeature(dtypes.string),
      "str_id_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "int32_id_column":
          parsing_ops.VarLenFeature(dtypes.int32),
      "int64_id_column":
          parsing_ops.VarLenFeature(dtypes.int64),
      "str_id_weights_column":
          parsing_ops.VarLenFeature(dtypes.float32),
      "real_valued_column1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column2":
          parsing_ops.FixedLenFeature([5], dtype=dtypes.float32),
      "real_valued_column_for_bucketization1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column_for_bucketization2":
          parsing_ops.FixedLenFeature([4], dtype=dtypes.float32),
      "cross_aaa":
          parsing_ops.VarLenFeature(dtypes.string),
      "cross_bbb":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_one_hot":
          parsing_ops.VarLenFeature(dtypes.string),
      "scattered_embedding_column":
          parsing_ops.VarLenFeature(dtypes.string),
  }

  config = fc.create_feature_spec_for_parsing(feature_columns)
  self.assertDictEqual(expected_config, config)

  # Contrib feature columns must also work with the core library helper.
  config_core = fc_core.make_parse_example_spec(feature_columns)
  self.assertDictEqual(expected_config, config_core)

  # Passing a dict of columns must yield the same spec as passing a set.
  feature_columns_dict = {
      str(idx): col for idx, col in enumerate(feature_columns)
  }
  config = fc.create_feature_spec_for_parsing(feature_columns_dict)
  self.assertDictEqual(expected_config, config)