def testCrossedColumnNotSupportRealValuedColumn(self): b = fc.sparse_column_with_hash_bucket("bbb", hash_bucket_size=100) with self.assertRaisesRegexp( TypeError, "columns must be a set of _SparseColumn, _CrossedColumn, " "or _BucketizedColumn instances"): fc.crossed_column( set([b, fc.real_valued_column("real")]), hash_bucket_size=10000)
def testCrossedColumnNotSupportRealValuedColumn(self): b = fc.sparse_column_with_hash_bucket("bbb", hash_bucket_size=100) with self.assertRaisesRegexp( TypeError, "columns must be a set of _SparseColumn, _CrossedColumn, " "or _BucketizedColumn instances"): fc.crossed_column( set([b, fc.real_valued_column("real")]), hash_bucket_size=10000)
def testMixedFeatures(self): """Tests SDCALogisticClassifier with a mix of features.""" def input_fn(): return { 'example_id': constant_op.constant(['1', '2', '3']), 'price': constant_op.constant([[0.6], [0.8], [0.3]]), 'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]), 'country': sparse_tensor.SparseTensor( values=['IT', 'US', 'GB'], indices=[[0, 0], [1, 3], [2, 1]], dense_shape=[3, 5]), 'weights': constant_op.constant([[3.0], [1.0], [1.0]]) }, constant_op.constant([[1], [0], [1]]) price = feature_column_lib.real_valued_column('price') sq_footage_bucket = feature_column_lib.bucketized_column( feature_column_lib.real_valued_column('sq_footage'), boundaries=[650.0, 800.0]) country = feature_column_lib.sparse_column_with_hash_bucket( 'country', hash_bucket_size=5) sq_footage_country = feature_column_lib.crossed_column( [sq_footage_bucket, country], hash_bucket_size=10) classifier = sdca_estimator.SDCALogisticClassifier( example_id_column='example_id', feature_columns=[price, sq_footage_bucket, country, sq_footage_country], weight_column_name='weights') classifier.fit(input_fn=input_fn, steps=50) metrics = classifier.evaluate(input_fn=input_fn, steps=1) self.assertGreater(metrics['accuracy'], 0.9)
def testCrossedFeatures(self): """Tests SDCALogisticClassifier with crossed features.""" def input_fn(): return { 'example_id': constant_op.constant(['1', '2', '3']), 'language': sparse_tensor.SparseTensor( values=['english', 'italian', 'spanish'], indices=[[0, 0], [1, 0], [2, 0]], dense_shape=[3, 1]), 'country': sparse_tensor.SparseTensor( values=['US', 'IT', 'MX'], indices=[[0, 0], [1, 0], [2, 0]], dense_shape=[3, 1]) }, constant_op.constant([[0], [0], [1]]) language = feature_column_lib.sparse_column_with_hash_bucket( 'language', hash_bucket_size=5) country = feature_column_lib.sparse_column_with_hash_bucket( 'country', hash_bucket_size=5) country_language = feature_column_lib.crossed_column( [language, country], hash_bucket_size=10) classifier = sdca_estimator.SDCALogisticClassifier( example_id_column='example_id', feature_columns=[country_language]) classifier.fit(input_fn=input_fn, steps=10) metrics = classifier.evaluate(input_fn=input_fn, steps=1) self.assertGreater(metrics['accuracy'], 0.9)
def testCrossedFeatures(self): """Tests SDCALogisticClassifier with crossed features.""" def input_fn(): return { 'example_id': constant_op.constant(['1', '2', '3']), 'language': sparse_tensor.SparseTensor( values=['english', 'italian', 'spanish'], indices=[[0, 0], [1, 0], [2, 0]], dense_shape=[3, 1]), 'country': sparse_tensor.SparseTensor(values=['US', 'IT', 'MX'], indices=[[0, 0], [1, 0], [2, 0]], dense_shape=[3, 1]) }, constant_op.constant([[0], [0], [1]]) with self._single_threaded_test_session(): language = feature_column_lib.sparse_column_with_hash_bucket( 'language', hash_bucket_size=5) country = feature_column_lib.sparse_column_with_hash_bucket( 'country', hash_bucket_size=5) country_language = feature_column_lib.crossed_column( [language, country], hash_bucket_size=10) classifier = sdca_estimator.SDCALogisticClassifier( example_id_column='example_id', feature_columns=[country_language]) classifier.fit(input_fn=input_fn, steps=10) metrics = classifier.evaluate(input_fn=input_fn, steps=1) self.assertGreater(metrics['accuracy'], 0.9)
def testMixedFeatures(self): """Tests SDCALogisticClassifier with a mix of features.""" def input_fn(): return { 'example_id': constant_op.constant(['1', '2', '3']), 'price': constant_op.constant([[0.6], [0.8], [0.3]]), 'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]), 'country': sparse_tensor.SparseTensor(values=['IT', 'US', 'GB'], indices=[[0, 0], [1, 3], [2, 1]], dense_shape=[3, 5]), 'weights': constant_op.constant([[3.0], [1.0], [1.0]]) }, constant_op.constant([[1], [0], [1]]) price = feature_column_lib.real_valued_column('price') sq_footage_bucket = feature_column_lib.bucketized_column( feature_column_lib.real_valued_column('sq_footage'), boundaries=[650.0, 800.0]) country = feature_column_lib.sparse_column_with_hash_bucket( 'country', hash_bucket_size=5) sq_footage_country = feature_column_lib.crossed_column( [sq_footage_bucket, country], hash_bucket_size=10) classifier = SDCALogisticClassifier(example_id_column='example_id', feature_columns=[ price, sq_footage_bucket, country, sq_footage_country ], weight_column_name='weights') classifier.fit(input_fn=input_fn, steps=50) metrics = classifier.evaluate(input_fn=input_fn, steps=1) self.assertGreater(metrics['accuracy'], 0.9)
def testCrossedColumnNameCreatesSortedNames(self): a = fc.sparse_column_with_hash_bucket("aaa", hash_bucket_size=100) b = fc.sparse_column_with_hash_bucket("bbb", hash_bucket_size=100) bucket = fc.bucketized_column(fc.real_valued_column("cost"), [0, 4]) crossed = fc.crossed_column(set([b, bucket, a]), hash_bucket_size=10000) self.assertEqual("aaa_X_bbb_X_cost_bucketized", crossed.name, "name should be generated by sorted column names") self.assertEqual("aaa", crossed.columns[0].name) self.assertEqual("bbb", crossed.columns[1].name) self.assertEqual("cost_bucketized", crossed.columns[2].name)
def testCrossedColumnDeepCopy(self): a = fc.sparse_column_with_hash_bucket("aaa", hash_bucket_size=100) b = fc.sparse_column_with_hash_bucket("bbb", hash_bucket_size=100) bucket = fc.bucketized_column(fc.real_valued_column("cost"), [0, 4]) crossed = fc.crossed_column(set([b, bucket, a]), hash_bucket_size=10000) crossed_copy = copy.deepcopy(crossed) self.assertEqual("aaa_X_bbb_X_cost_bucketized", crossed_copy.name, "name should be generated by sorted column names") self.assertEqual("aaa", crossed_copy.columns[0].name) self.assertEqual("bbb", crossed_copy.columns[1].name) self.assertEqual("cost_bucketized", crossed_copy.columns[2].name)
def testMixedFeaturesArbitraryWeightsPartitioned(self): """Tests SDCALinearRegressor works with a mix of features (partitioned).""" def input_fn(): return { 'example_id': constant_op.constant(['1', '2', '3']), 'price': constant_op.constant([[0.6], [0.8], [0.3]]), 'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]), 'country': sparse_tensor.SparseTensor( values=['IT', 'US', 'GB'], indices=[[0, 0], [1, 3], [2, 1]], dense_shape=[3, 5]), 'weights': constant_op.constant([[3.0], [5.0], [7.0]]) }, constant_op.constant([[1.55], [-1.25], [-3.0]]) with self._single_threaded_test_session(): price = feature_column_lib.real_valued_column('price') sq_footage_bucket = feature_column_lib.bucketized_column( feature_column_lib.real_valued_column('sq_footage'), boundaries=[650.0, 800.0]) country = feature_column_lib.sparse_column_with_hash_bucket( 'country', hash_bucket_size=5) sq_footage_country = feature_column_lib.crossed_column( [sq_footage_bucket, country], hash_bucket_size=10) regressor = sdca_estimator.SDCALinearRegressor( example_id_column='example_id', feature_columns=[ price, sq_footage_bucket, country, sq_footage_country ], l2_regularization=1.0, weight_column_name='weights', partitioner=partitioned_variables.fixed_size_partitioner( num_shards=2, axis=0)) regressor.fit(input_fn=input_fn, steps=20) loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss'] self.assertLess(loss, 0.05)
def testMixedFeaturesArbitraryWeightsPartitioned(self): """Tests SDCALinearRegressor works with a mix of features (partitioned).""" def input_fn(): return { 'example_id': constant_op.constant(['1', '2', '3']), 'price': constant_op.constant([[0.6], [0.8], [0.3]]), 'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]), 'country': sparse_tensor.SparseTensor( values=['IT', 'US', 'GB'], indices=[[0, 0], [1, 3], [2, 1]], dense_shape=[3, 5]), 'weights': constant_op.constant([[3.0], [5.0], [7.0]]) }, constant_op.constant([[1.55], [-1.25], [-3.0]]) with self._single_threaded_test_session(): price = feature_column_lib.real_valued_column('price') sq_footage_bucket = feature_column_lib.bucketized_column( feature_column_lib.real_valued_column('sq_footage'), boundaries=[650.0, 800.0]) country = feature_column_lib.sparse_column_with_hash_bucket( 'country', hash_bucket_size=5) sq_footage_country = feature_column_lib.crossed_column( [sq_footage_bucket, country], hash_bucket_size=10) regressor = sdca_estimator.SDCALinearRegressor( example_id_column='example_id', feature_columns=[ price, sq_footage_bucket, country, sq_footage_country ], l2_regularization=1.0, weight_column_name='weights', partitioner=partitioned_variables.fixed_size_partitioner( num_shards=2, axis=0)) regressor.fit(input_fn=input_fn, steps=20) loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss'] self.assertLess(loss, 0.05)
def testMixedFeatures(self): """Tests SVM classifier with a mix of features.""" def input_fn(): return { 'example_id': constant_op.constant(['1', '2', '3']), 'price': constant_op.constant([0.6, 0.8, 0.3]), 'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]), 'country': sparse_tensor.SparseTensor( values=['IT', 'US', 'GB'], indices=[[0, 0], [1, 3], [2, 1]], dense_shape=[3, 5]), 'weights': constant_op.constant([[3.0], [1.0], [1.0]]) }, constant_op.constant([[1], [0], [1]]) price = feature_column.real_valued_column('price') sq_footage_bucket = feature_column.bucketized_column( feature_column.real_valued_column('sq_footage'), boundaries=[650.0, 800.0]) country = feature_column.sparse_column_with_hash_bucket( 'country', hash_bucket_size=5) sq_footage_country = feature_column.crossed_column( [sq_footage_bucket, country], hash_bucket_size=10) svm_classifier = svm.SVM( feature_columns=[price, sq_footage_bucket, country, sq_footage_country], example_id_column='example_id', weight_column_name='weights', l1_regularization=0.1, l2_regularization=1.0) svm_classifier.fit(input_fn=input_fn, steps=30) accuracy = svm_classifier.evaluate(input_fn=input_fn, steps=1)['accuracy'] self.assertAlmostEqual(accuracy, 1.0, places=3)
def testMixedFeatures(self): """Tests SVM classifier with a mix of features.""" def input_fn(): return { 'example_id': constant_op.constant(['1', '2', '3']), 'price': constant_op.constant([[0.6], [0.8], [0.3]]), 'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]), 'country': sparse_tensor.SparseTensor(values=['IT', 'US', 'GB'], indices=[[0, 0], [1, 3], [2, 1]], dense_shape=[3, 5]), 'weights': constant_op.constant([[3.0], [1.0], [1.0]]) }, constant_op.constant([[1], [0], [1]]) price = feature_column.real_valued_column('price') sq_footage_bucket = feature_column.bucketized_column( feature_column.real_valued_column('sq_footage'), boundaries=[650.0, 800.0]) country = feature_column.sparse_column_with_hash_bucket( 'country', hash_bucket_size=5) sq_footage_country = feature_column.crossed_column( [sq_footage_bucket, country], hash_bucket_size=10) svm_classifier = svm.SVM(feature_columns=[ price, sq_footage_bucket, country, sq_footage_country ], example_id_column='example_id', weight_column_name='weights', l1_regularization=0.1, l2_regularization=1.0) svm_classifier.fit(input_fn=input_fn, steps=30) accuracy = svm_classifier.evaluate(input_fn=input_fn, steps=1)['accuracy'] self.assertAlmostEqual(accuracy, 1.0, places=3)
def testCreateFeatureSpec(self): sparse_col = fc.sparse_column_with_hash_bucket("sparse_column", hash_bucket_size=100) embedding_col = fc.embedding_column(fc.sparse_column_with_hash_bucket( "sparse_column_for_embedding", hash_bucket_size=10), dimension=4) sparse_id_col = fc.sparse_column_with_keys( "id_column", ["marlo", "omar", "stringer"]) weighted_id_col = fc.weighted_sparse_column(sparse_id_col, "id_weights_column") real_valued_col1 = fc.real_valued_column("real_valued_column1") real_valued_col2 = fc.real_valued_column("real_valued_column2", 5) real_valued_col3 = fc.real_valued_column("real_valued_column3", dimension=None) bucketized_col1 = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4]) bucketized_col2 = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization2", 4), [0, 4]) a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100) b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100) cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000) feature_columns = set([ sparse_col, embedding_col, weighted_id_col, real_valued_col1, real_valued_col2, real_valued_col3, bucketized_col1, bucketized_col2, cross_col ]) expected_config = { "sparse_column": parsing_ops.VarLenFeature(dtypes.string), "sparse_column_for_embedding": parsing_ops.VarLenFeature(dtypes.string), "id_column": parsing_ops.VarLenFeature(dtypes.string), "id_weights_column": parsing_ops.VarLenFeature(dtypes.float32), "real_valued_column1": parsing_ops.FixedLenFeature([1], dtype=dtypes.float32), "real_valued_column2": parsing_ops.FixedLenFeature([5], dtype=dtypes.float32), "real_valued_column3": parsing_ops.VarLenFeature(dtype=dtypes.float32), "real_valued_column_for_bucketization1": parsing_ops.FixedLenFeature([1], dtype=dtypes.float32), "real_valued_column_for_bucketization2": parsing_ops.FixedLenFeature([4], dtype=dtypes.float32), "cross_aaa": parsing_ops.VarLenFeature(dtypes.string), "cross_bbb": parsing_ops.VarLenFeature(dtypes.string) } config = fc.create_feature_spec_for_parsing(feature_columns) self.assertDictEqual(expected_config, config) # Test that the same config is parsed out if we pass a dictionary. feature_columns_dict = { str(i): val for i, val in enumerate(feature_columns) } config = fc.create_feature_spec_for_parsing(feature_columns_dict) self.assertDictEqual(expected_config, config)
def testCreateFeatureSpec(self): sparse_col = fc.sparse_column_with_hash_bucket( "sparse_column", hash_bucket_size=100) embedding_col = fc.embedding_column( fc.sparse_column_with_hash_bucket( "sparse_column_for_embedding", hash_bucket_size=10), dimension=4) str_sparse_id_col = fc.sparse_column_with_keys( "str_id_column", ["marlo", "omar", "stringer"]) int32_sparse_id_col = fc.sparse_column_with_keys( "int32_id_column", [42, 1, -1000], dtype=dtypes.int32) int64_sparse_id_col = fc.sparse_column_with_keys( "int64_id_column", [42, 1, -1000], dtype=dtypes.int64) weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col, "str_id_weights_column") real_valued_col1 = fc.real_valued_column("real_valued_column1") real_valued_col2 = fc.real_valued_column("real_valued_column2", 5) real_valued_col3 = fc._real_valued_var_len_column( "real_valued_column3", is_sparse=True) real_valued_col4 = fc._real_valued_var_len_column( "real_valued_column4", dtype=dtypes.int64, default_value=0, is_sparse=False) bucketized_col1 = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4]) bucketized_col2 = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization2", 4), [0, 4]) a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100) b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100) cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000) one_hot_col = fc.one_hot_column(fc.sparse_column_with_hash_bucket( "sparse_column_for_one_hot", hash_bucket_size=100)) scattered_embedding_col = fc.scattered_embedding_column( "scattered_embedding_column", size=100, dimension=10, hash_key=1) feature_columns = set([ sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col, int64_sparse_id_col, real_valued_col1, real_valued_col2, real_valued_col3, real_valued_col4, bucketized_col1, bucketized_col2, cross_col, one_hot_col, scattered_embedding_col ]) expected_config = { "sparse_column": parsing_ops.VarLenFeature(dtypes.string), "sparse_column_for_embedding": parsing_ops.VarLenFeature(dtypes.string), "str_id_column": parsing_ops.VarLenFeature(dtypes.string), "int32_id_column": parsing_ops.VarLenFeature(dtypes.int32), "int64_id_column": parsing_ops.VarLenFeature(dtypes.int64), "str_id_weights_column": parsing_ops.VarLenFeature(dtypes.float32), "real_valued_column1": parsing_ops.FixedLenFeature( [1], dtype=dtypes.float32), "real_valued_column2": parsing_ops.FixedLenFeature( [5], dtype=dtypes.float32), "real_valued_column3": parsing_ops.VarLenFeature(dtype=dtypes.float32), "real_valued_column4": parsing_ops.FixedLenSequenceFeature( [], dtype=dtypes.int64, allow_missing=True, default_value=0), "real_valued_column_for_bucketization1": parsing_ops.FixedLenFeature( [1], dtype=dtypes.float32), "real_valued_column_for_bucketization2": parsing_ops.FixedLenFeature( [4], dtype=dtypes.float32), "cross_aaa": parsing_ops.VarLenFeature(dtypes.string), "cross_bbb": parsing_ops.VarLenFeature(dtypes.string), "sparse_column_for_one_hot": parsing_ops.VarLenFeature(dtypes.string), "scattered_embedding_column": parsing_ops.VarLenFeature(dtypes.string), } config = fc.create_feature_spec_for_parsing(feature_columns) self.assertDictEqual(expected_config, config) # Test that the same config is parsed out if we pass a dictionary. feature_columns_dict = { str(i): val for i, val in enumerate(feature_columns) } config = fc.create_feature_spec_for_parsing(feature_columns_dict) self.assertDictEqual(expected_config, config)
def testCreateFeatureSpec(self): sparse_col = fc.sparse_column_with_hash_bucket( "sparse_column", hash_bucket_size=100) embedding_col = fc.embedding_column( fc.sparse_column_with_hash_bucket( "sparse_column_for_embedding", hash_bucket_size=10), dimension=4) str_sparse_id_col = fc.sparse_column_with_keys( "str_id_column", ["marlo", "omar", "stringer"]) int32_sparse_id_col = fc.sparse_column_with_keys( "int32_id_column", [42, 1, -1000], dtype=dtypes.int32) int64_sparse_id_col = fc.sparse_column_with_keys( "int64_id_column", [42, 1, -1000], dtype=dtypes.int64) weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col, "str_id_weights_column") real_valued_col1 = fc.real_valued_column("real_valued_column1") real_valued_col2 = fc.real_valued_column("real_valued_column2", 5) bucketized_col1 = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4]) bucketized_col2 = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization2", 4), [0, 4]) a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100) b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100) cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000) one_hot_col = fc.one_hot_column(fc.sparse_column_with_hash_bucket( "sparse_column_for_one_hot", hash_bucket_size=100)) scattered_embedding_col = fc.scattered_embedding_column( "scattered_embedding_column", size=100, dimension=10, hash_key=1) feature_columns = set([ sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col, int64_sparse_id_col, real_valued_col1, real_valued_col2, bucketized_col1, bucketized_col2, cross_col, one_hot_col, scattered_embedding_col ]) expected_config = { "sparse_column": parsing_ops.VarLenFeature(dtypes.string), "sparse_column_for_embedding": parsing_ops.VarLenFeature(dtypes.string), "str_id_column": parsing_ops.VarLenFeature(dtypes.string), "int32_id_column": parsing_ops.VarLenFeature(dtypes.int32), "int64_id_column": parsing_ops.VarLenFeature(dtypes.int64), "str_id_weights_column": parsing_ops.VarLenFeature(dtypes.float32), "real_valued_column1": parsing_ops.FixedLenFeature( [1], dtype=dtypes.float32), "real_valued_column2": parsing_ops.FixedLenFeature( [5], dtype=dtypes.float32), "real_valued_column_for_bucketization1": parsing_ops.FixedLenFeature( [1], dtype=dtypes.float32), "real_valued_column_for_bucketization2": parsing_ops.FixedLenFeature( [4], dtype=dtypes.float32), "cross_aaa": parsing_ops.VarLenFeature(dtypes.string), "cross_bbb": parsing_ops.VarLenFeature(dtypes.string), "sparse_column_for_one_hot": parsing_ops.VarLenFeature(dtypes.string), "scattered_embedding_column": parsing_ops.VarLenFeature(dtypes.string), } config = fc.create_feature_spec_for_parsing(feature_columns) self.assertDictEqual(expected_config, config) # Tests that contrib feature columns work with core library: config_core = fc_core.make_parse_example_spec(feature_columns) self.assertDictEqual(expected_config, config_core) # Test that the same config is parsed out if we pass a dictionary. feature_columns_dict = { str(i): val for i, val in enumerate(feature_columns) } config = fc.create_feature_spec_for_parsing(feature_columns_dict) self.assertDictEqual(expected_config, config)
def testCreateFeatureSpec(self): sparse_col = fc.sparse_column_with_hash_bucket( "sparse_column", hash_bucket_size=100) embedding_col = fc.embedding_column( fc.sparse_column_with_hash_bucket( "sparse_column_for_embedding", hash_bucket_size=10), dimension=4) sparse_id_col = fc.sparse_column_with_keys("id_column", ["marlo", "omar", "stringer"]) weighted_id_col = fc.weighted_sparse_column(sparse_id_col, "id_weights_column") real_valued_col1 = fc.real_valued_column("real_valued_column1") real_valued_col2 = fc.real_valued_column("real_valued_column2", 5) real_valued_col3 = fc.real_valued_column( "real_valued_column3", dimension=None) bucketized_col1 = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4]) bucketized_col2 = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization2", 4), [0, 4]) a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100) b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100) cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000) feature_columns = set([ sparse_col, embedding_col, weighted_id_col, real_valued_col1, real_valued_col2, real_valued_col3, bucketized_col1, bucketized_col2, cross_col ]) expected_config = { "sparse_column": parsing_ops.VarLenFeature(dtypes.string), "sparse_column_for_embedding": parsing_ops.VarLenFeature(dtypes.string), "id_column": parsing_ops.VarLenFeature(dtypes.string), "id_weights_column": parsing_ops.VarLenFeature(dtypes.float32), "real_valued_column1": parsing_ops.FixedLenFeature( [1], dtype=dtypes.float32), "real_valued_column2": parsing_ops.FixedLenFeature( [5], dtype=dtypes.float32), "real_valued_column3": parsing_ops.VarLenFeature(dtype=dtypes.float32), "real_valued_column_for_bucketization1": parsing_ops.FixedLenFeature( [1], dtype=dtypes.float32), "real_valued_column_for_bucketization2": parsing_ops.FixedLenFeature( [4], dtype=dtypes.float32), "cross_aaa": parsing_ops.VarLenFeature(dtypes.string), "cross_bbb": parsing_ops.VarLenFeature(dtypes.string) } config = fc.create_feature_spec_for_parsing(feature_columns) self.assertDictEqual(expected_config, config) # Test that the same config is parsed out if we pass a dictionary. feature_columns_dict = { str(i): val for i, val in enumerate(feature_columns) } config = fc.create_feature_spec_for_parsing(feature_columns_dict) self.assertDictEqual(expected_config, config)
def testInitCrossedColumnWeightsFromCkpt(self): sparse_col_1 = fc.sparse_column_with_hash_bucket( column_name="col_1", hash_bucket_size=4) sparse_col_2 = fc.sparse_column_with_hash_bucket( column_name="col_2", hash_bucket_size=4) crossed_col = fc.crossed_column( columns=[sparse_col_1, sparse_col_2], hash_bucket_size=4) input_tensor = sparse_tensor_lib.SparseTensor( indices=[[0, 0], [1, 1], [2, 2], [3, 3]], values=[0, 1, 2, 3], dense_shape=[4, 4]) # Invoking 'weighted_sum_from_feature_columns' will create the crossed # column weights variable. with variable_scope.variable_scope("run_1"): with variable_scope.variable_scope(crossed_col.name): # Returns looked up column weights which is same as crossed column # weights as well as actual references to weights variables. _, col_weights, _ = ( feature_column_ops.weighted_sum_from_feature_columns({ sparse_col_1.name: input_tensor, sparse_col_2.name: input_tensor }, [crossed_col], 1)) # Update the weights since default initializer initializes all weights # to 0.0. for weight in col_weights.values(): assign_op = state_ops.assign(weight[0], weight[0] + 0.5) save = saver.Saver() ckpt_dir_prefix = os.path.join(self.get_temp_dir(), "init_crossed_col_w_from_ckpt") ckpt_dir = tempfile.mkdtemp(prefix=ckpt_dir_prefix) checkpoint_path = os.path.join(ckpt_dir, "model.ckpt") with self.test_session() as sess: sess.run(variables.global_variables_initializer()) sess.run(assign_op) saved_col_weights = col_weights[crossed_col][0].eval() save.save(sess, checkpoint_path) crossed_col_initialized = fc.crossed_column( columns=[sparse_col_1, sparse_col_2], hash_bucket_size=4, ckpt_to_load_from=checkpoint_path, tensor_name_in_ckpt=("run_1/col_1_X_col_2/" "weighted_sum_from_feature_columns/" "col_1_X_col_2/weights")) with variable_scope.variable_scope("run_2"): # This will initialize the crossed column weights from provided checkpoint # and return a [4, 1] tensor which is same as weights variable. Since we # won't modify weights, this should be same as 'saved_col_weights'. _, col_weights, _ = (feature_column_ops.weighted_sum_from_feature_columns( { sparse_col_1.name: input_tensor, sparse_col_2.name: input_tensor }, [crossed_col_initialized], 1)) col_weights_from_ckpt = col_weights[crossed_col_initialized][0] with self.test_session() as sess: sess.run(variables.global_variables_initializer()) loaded_col_weights = col_weights_from_ckpt.eval() self.assertAllClose(saved_col_weights, loaded_col_weights)
def testCreateFeatureSpec(self): sparse_col = fc.sparse_column_with_hash_bucket( "sparse_column", hash_bucket_size=100) embedding_col = fc.embedding_column( fc.sparse_column_with_hash_bucket( "sparse_column_for_embedding", hash_bucket_size=10), dimension=4) str_sparse_id_col = fc.sparse_column_with_keys( "str_id_column", ["marlo", "omar", "stringer"]) int32_sparse_id_col = fc.sparse_column_with_keys( "int32_id_column", [42, 1, -1000], dtype=dtypes.int32) int64_sparse_id_col = fc.sparse_column_with_keys( "int64_id_column", [42, 1, -1000], dtype=dtypes.int64) weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col, "str_id_weights_column") real_valued_col1 = fc.real_valued_column("real_valued_column1") real_valued_col2 = fc.real_valued_column("real_valued_column2", 5) bucketized_col1 = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4]) bucketized_col2 = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization2", 4), [0, 4]) a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100) b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100) cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000) one_hot_col = fc.one_hot_column(fc.sparse_column_with_hash_bucket( "sparse_column_for_one_hot", hash_bucket_size=100)) scattered_embedding_col = fc.scattered_embedding_column( "scattered_embedding_column", size=100, dimension=10, hash_key=1) feature_columns = set([ sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col, int64_sparse_id_col, real_valued_col1, real_valued_col2, bucketized_col1, bucketized_col2, cross_col, one_hot_col, scattered_embedding_col ]) expected_config = { "sparse_column": parsing_ops.VarLenFeature(dtypes.string), "sparse_column_for_embedding": parsing_ops.VarLenFeature(dtypes.string), "str_id_column": parsing_ops.VarLenFeature(dtypes.string), "int32_id_column": parsing_ops.VarLenFeature(dtypes.int32), "int64_id_column": parsing_ops.VarLenFeature(dtypes.int64), "str_id_weights_column": parsing_ops.VarLenFeature(dtypes.float32), "real_valued_column1": parsing_ops.FixedLenFeature( [1], dtype=dtypes.float32), "real_valued_column2": parsing_ops.FixedLenFeature( [5], dtype=dtypes.float32), "real_valued_column_for_bucketization1": parsing_ops.FixedLenFeature( [1], dtype=dtypes.float32), "real_valued_column_for_bucketization2": parsing_ops.FixedLenFeature( [4], dtype=dtypes.float32), "cross_aaa": parsing_ops.VarLenFeature(dtypes.string), "cross_bbb": parsing_ops.VarLenFeature(dtypes.string), "sparse_column_for_one_hot": parsing_ops.VarLenFeature(dtypes.string), "scattered_embedding_column": parsing_ops.VarLenFeature(dtypes.string), } config = fc.create_feature_spec_for_parsing(feature_columns) self.assertDictEqual(expected_config, config) # Tests that contrib feature columns work with core library: config_core = fc_core.make_parse_example_spec(feature_columns) self.assertDictEqual(expected_config, config_core) # Test that the same config is parsed out if we pass a dictionary. feature_columns_dict = { str(i): val for i, val in enumerate(feature_columns) } config = fc.create_feature_spec_for_parsing(feature_columns_dict) self.assertDictEqual(expected_config, config)
def testInitCrossedColumnWeightsFromCkpt(self): sparse_col_1 = fc.sparse_column_with_hash_bucket(column_name="col_1", hash_bucket_size=4) sparse_col_2 = fc.sparse_column_with_hash_bucket(column_name="col_2", hash_bucket_size=4) crossed_col = fc.crossed_column(columns=[sparse_col_1, sparse_col_2], hash_bucket_size=4) input_tensor = sparse_tensor_lib.SparseTensor(indices=[[0, 0], [1, 1], [2, 2], [3, 3]], values=[0, 1, 2, 3], dense_shape=[4, 4]) # Invoking 'weighted_sum_from_feature_columns' will create the crossed # column weights variable. with variable_scope.variable_scope("run_1"): with variable_scope.variable_scope(crossed_col.name): # Returns looked up column weights which is same as crossed column # weights as well as actual references to weights variables. _, col_weights, _ = ( feature_column_ops.weighted_sum_from_feature_columns( { sparse_col_1.name: input_tensor, sparse_col_2.name: input_tensor }, [crossed_col], 1)) # Update the weights since default initializer initializes all weights # to 0.0. for weight in col_weights.values(): assign_op = state_ops.assign(weight[0], weight[0] + 0.5) save = saver.Saver() ckpt_dir_prefix = os.path.join(self.get_temp_dir(), "init_crossed_col_w_from_ckpt") ckpt_dir = tempfile.mkdtemp(prefix=ckpt_dir_prefix) checkpoint_path = os.path.join(ckpt_dir, "model.ckpt") with self.test_session() as sess: sess.run(variables.global_variables_initializer()) sess.run(assign_op) saved_col_weights = col_weights[crossed_col][0].eval() save.save(sess, checkpoint_path) crossed_col_initialized = fc.crossed_column( columns=[sparse_col_1, sparse_col_2], hash_bucket_size=4, ckpt_to_load_from=checkpoint_path, tensor_name_in_ckpt=("run_1/col_1_X_col_2/" "weighted_sum_from_feature_columns/" "col_1_X_col_2/weights")) with variable_scope.variable_scope("run_2"): # This will initialize the crossed column weights from provided checkpoint # and return a [4, 1] tensor which is same as weights variable. Since we # won't modify weights, this should be same as 'saved_col_weights'. _, col_weights, _ = ( feature_column_ops.weighted_sum_from_feature_columns( { sparse_col_1.name: input_tensor, sparse_col_2.name: input_tensor }, [crossed_col_initialized], 1)) col_weights_from_ckpt = col_weights[crossed_col_initialized][0] with self.test_session() as sess: sess.run(variables.global_variables_initializer()) loaded_col_weights = col_weights_from_ckpt.eval() self.assertAllClose(saved_col_weights, loaded_col_weights)
def testCreateFeatureSpec(self): sparse_col = fc.sparse_column_with_hash_bucket( "sparse_column", hash_bucket_size=100) embedding_col = fc.embedding_column( fc.sparse_column_with_hash_bucket( "sparse_column_for_embedding", hash_bucket_size=10), dimension=4) str_sparse_id_col = fc.sparse_column_with_keys( "str_id_column", ["marlo", "omar", "stringer"]) int32_sparse_id_col = fc.sparse_column_with_keys( "int32_id_column", [42, 1, -1000], dtype=dtypes.int32) int64_sparse_id_col = fc.sparse_column_with_keys( "int64_id_column", [42, 1, -1000], dtype=dtypes.int64) weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col, "str_id_weights_column") real_valued_col1 = fc.real_valued_column("real_valued_column1") real_valued_col2 = fc.real_valued_column("real_valued_column2", 5) real_valued_col3 = fc._real_valued_var_len_column( "real_valued_column3", is_sparse=True) real_valued_col4 = fc._real_valued_var_len_column( "real_valued_column4", dtype=dtypes.int64, default_value=0, is_sparse=False) bucketized_col1 = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4]) bucketized_col2 = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization2", 4), [0, 4]) a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100) b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100) cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000) one_hot_col = fc.one_hot_column(fc.sparse_column_with_hash_bucket( "sparse_column_for_one_hot", hash_bucket_size=100)) scattered_embedding_col = fc.scattered_embedding_column( "scattered_embedding_column", size=100, dimension=10, hash_key=1) feature_columns = set([ sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col, int64_sparse_id_col, real_valued_col1, real_valued_col2, real_valued_col3, real_valued_col4, bucketized_col1, bucketized_col2, cross_col, one_hot_col, scattered_embedding_col ]) expected_config = { "sparse_column": parsing_ops.VarLenFeature(dtypes.string), "sparse_column_for_embedding": parsing_ops.VarLenFeature(dtypes.string), "str_id_column": parsing_ops.VarLenFeature(dtypes.string), "int32_id_column": parsing_ops.VarLenFeature(dtypes.int32), "int64_id_column": parsing_ops.VarLenFeature(dtypes.int64), "str_id_weights_column": parsing_ops.VarLenFeature(dtypes.float32), "real_valued_column1": parsing_ops.FixedLenFeature( [1], dtype=dtypes.float32), "real_valued_column2": parsing_ops.FixedLenFeature( [5], dtype=dtypes.float32), "real_valued_column3": parsing_ops.VarLenFeature(dtype=dtypes.float32), "real_valued_column4": parsing_ops.FixedLenSequenceFeature( [], dtype=dtypes.int64, allow_missing=True, default_value=0), "real_valued_column_for_bucketization1": parsing_ops.FixedLenFeature( [1], dtype=dtypes.float32), "real_valued_column_for_bucketization2": parsing_ops.FixedLenFeature( [4], dtype=dtypes.float32), "cross_aaa": parsing_ops.VarLenFeature(dtypes.string), "cross_bbb": parsing_ops.VarLenFeature(dtypes.string), "sparse_column_for_one_hot": parsing_ops.VarLenFeature(dtypes.string), "scattered_embedding_column": parsing_ops.VarLenFeature(dtypes.string), } config = fc.create_feature_spec_for_parsing(feature_columns) self.assertDictEqual(expected_config, config) # Test that the same config is parsed out if we pass a dictionary. feature_columns_dict = { str(i): val for i, val in enumerate(feature_columns) } config = fc.create_feature_spec_for_parsing(feature_columns_dict) self.assertDictEqual(expected_config, config)