def testBucketizedFeatures(self):
  """Tests LinearClassifier with LinearSDCA and bucketized features."""

  def input_fn():
    return {
        'example_id': constant_op.constant(['1', '2', '3']),
        'price': constant_op.constant([[600.0], [1000.0], [400.0]]),
        'sq_footage': constant_op.constant([[1000.0], [600.0], [700.0]]),
        'weights': constant_op.constant([[1.0], [1.0], [1.0]])
    }, constant_op.constant([[1], [0], [1]])

  price_bucket = feature_column_v2.bucketized_column(
      feature_column_v2.numeric_column('price'),
      boundaries=[500.0, 700.0])
  sq_footage_bucket = feature_column_v2.bucketized_column(
      feature_column_v2.numeric_column('sq_footage'), boundaries=[650.0])
  optimizer = linear.LinearSDCA(
      example_id_column='example_id', symmetric_l2_regularization=0.01)
  classifier = linear.LinearClassifier(
      feature_columns=[price_bucket, sq_footage_bucket],
      weight_column='weights',
      optimizer=optimizer)
  classifier.train(input_fn=input_fn, steps=100)
  loss = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
  self.assertLess(loss, 0.2)
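# Illustrative-only sketch (not part of the test above) of the bucket
# assignment that bucketized_column performs: boundaries=[500.0, 700.0]
# define three buckets, (-inf, 500), [500, 700), and [700, +inf), with a
# value equal to a boundary falling into the upper bucket. bisect_right
# reproduces that rule in plain Python.
import bisect

def _assign_bucket(value, boundaries):
  # The number of boundaries <= value is exactly the bucket index.
  return bisect.bisect_right(boundaries, value)

assert _assign_bucket(600.0, [500.0, 700.0]) == 1   # price 600.0 -> middle bucket
assert _assign_bucket(1000.0, [500.0, 700.0]) == 2  # price 1000.0 -> upper bucket
assert _assign_bucket(400.0, [500.0, 700.0]) == 0   # price 400.0 -> lowest bucket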
def embedding_varlen(batch_size, max_length):
  """Benchmark a Discretization layer against an equivalent bucketized_column."""
  # Data and constants.
  max_value = 25.0
  bins = np.arange(1.0, max_value)
  data = fc_bm.create_data(
      max_length, batch_size * NUM_REPEATS, 100000, dtype=float)

  # Keras implementation.
  model = keras.Sequential()
  model.add(keras.Input(shape=(max_length,), name="data", dtype=dt.float32))
  model.add(discretization.Discretization(bins))

  # FC implementation.
  fc = fcv2.bucketized_column(
      fcv2.numeric_column("data"), boundaries=list(bins))

  # Wrap the FC implementation in a tf.function for a fair comparison.
  @tf_function()
  def fc_fn(tensors):
    fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

  # Benchmark runs.
  keras_data = {"data": data.to_tensor(default_value=0.0)}
  k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

  fc_data = {"data": data.to_tensor(default_value=0.0)}
  fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

  return k_avg_time, fc_avg_time
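# A hypothetical driver for the benchmark above; `embedding_varlen` and
# NUM_REPEATS are assumed to be defined as in this file, and the batch
# sizes and reporting format here are invented for illustration.
def run_comparison(batch_sizes=(32, 256), max_length=50):
  for batch in batch_sizes:
    k_time, f_time = embedding_varlen(batch_size=batch, max_length=max_length)
    print("batch=%d keras=%.6fs fc=%.6fs ratio=%.2f" %
          (batch, k_time, f_time, f_time / k_time))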
def testPartitionedVariables(self):
  """Tests LinearClassifier with LinearSDCA with partitioned variables."""

  def input_fn():
    return {
        'example_id': constant_op.constant(['1', '2', '3']),
        'price': constant_op.constant([[0.6], [0.8], [0.3]]),
        'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]),
        'country': sparse_tensor.SparseTensor(
            values=['IT', 'US', 'GB'],
            indices=[[0, 0], [1, 3], [2, 1]],
            dense_shape=[3, 5]),
        'weights': constant_op.constant([[3.0], [1.0], [1.0]])
    }, constant_op.constant([[1], [0], [1]])

  price = feature_column_v2.numeric_column('price')
  sq_footage_bucket = feature_column_v2.bucketized_column(
      feature_column_v2.numeric_column('sq_footage'),
      boundaries=[650.0, 800.0])
  country = feature_column_v2.categorical_column_with_hash_bucket(
      'country', hash_bucket_size=5)
  sq_footage_country = feature_column_v2.crossed_column(
      [sq_footage_bucket, 'country'], hash_bucket_size=10)
  optimizer = linear.LinearSDCA(
      example_id_column='example_id', symmetric_l2_regularization=0.01)
  classifier = linear.LinearClassifier(
      feature_columns=[price, sq_footage_bucket, country, sq_footage_country],
      weight_column='weights',
      partitioner=partitioned_variables.fixed_size_partitioner(
          num_shards=2, axis=0),
      optimizer=optimizer)
  classifier.train(input_fn=input_fn, steps=100)
  loss = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
  self.assertLess(loss, 0.2)
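# A minimal standalone sketch (an assumption using TF1-style variables, not
# taken from the test above) of what fixed_size_partitioner(num_shards=2,
# axis=0) does: each variable created under it is split row-wise into two
# shards.
import tensorflow.compat.v1 as tf1

def _partitioner_demo():
  tf1.disable_eager_execution()
  with tf1.variable_scope(
      'demo', partitioner=tf1.fixed_size_partitioner(num_shards=2, axis=0)):
    weights = tf1.get_variable('weights', shape=[10, 1])
  # `weights` is a PartitionedVariable backed by two [5, 1] shards.
  return weights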
def testFeatureColumns(self):
  # TODO(b/120099662): Error with table initialization with Keras models in
  # graph mode.
  if context.executing_eagerly():
    numeric = fc.numeric_column('a')
    bucketized = fc.bucketized_column(numeric, boundaries=[5, 10, 15])
    cat_vocab = fc.categorical_column_with_vocabulary_list(
        'b', ['1', '2', '3'])
    one_hot = fc.indicator_column(cat_vocab)
    embedding = fc.embedding_column(cat_vocab, dimension=8)
    feature_layer = DenseFeatures([bucketized, one_hot, embedding])
    model = keras.models.Sequential(feature_layer)

    features = {'a': np.array([13, 15]), 'b': np.array(['1', '2'])}
    predictions = model.predict(features)

    saved_model_dir = self._save_model_dir()
    model.save(saved_model_dir, save_format='tf')
    loaded = keras_load.load(saved_model_dir)
    loaded_predictions = loaded.predict(features)
    self.assertAllClose(predictions, loaded_predictions)
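# A hedged sanity-check sketch (not in the original test): the 3 boundaries
# on 'a' yield 4 bucket indicators, and 'b' contributes a 3-wide one-hot
# plus an 8-dimensional embedding, so the DenseFeatures layer should emit
# 4 + 3 + 8 = 15 values per example and `predictions` should have shape
# (2, 15).
def _expected_dense_features_width():
  num_buckets = len([5, 10, 15]) + 1  # one bucket per boundary, plus one
  one_hot_width = len(['1', '2', '3'])
  embedding_dim = 8
  return num_buckets + one_hot_width + embedding_dim

assert _expected_dense_features_width() == 15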
def test_deserialization_deduping(self):
  price = fc.numeric_column('price')
  bucketized_price = fc.bucketized_column(price, boundaries=[0, 1])

  configs = serialization.serialize_feature_columns(
      [price, bucketized_price])

  deserialized_feature_columns = serialization.deserialize_feature_columns(
      configs)
  self.assertLen(deserialized_feature_columns, 2)
  new_price = deserialized_feature_columns[0]
  new_bucketized_price = deserialized_feature_columns[1]

  # Ensure these are not the original objects:
  self.assertIsNot(price, new_price)
  self.assertIsNot(bucketized_price, new_bucketized_price)
  # But they are equivalent:
  self.assertEqual(price, new_price)
  self.assertEqual(bucketized_price, new_bucketized_price)

  # Check that deduping worked:
  self.assertIs(new_bucketized_price.source_column, new_price)
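# A simplified, illustrative model (names invented here, not the library's
# actual API) of the deduping the final assertion relies on: deserialization
# threads a shared cache keyed by column name, so 'price' is rebuilt exactly
# once even though it is referenced both directly and as
# bucketized_price.source_column.
def _deserialize_with_cache(configs, build_fn):
  cache = {}

  def get_or_build(config):
    name = config['name']
    if name not in cache:
      cache[name] = build_fn(config, get_or_build)
    return cache[name]

  return [get_or_build(config) for config in configs]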
def _get_categorical_column(params: dict) -> fc.CategoricalColumn:
  """Builds a categorical feature column from a params dict.

  Exactly one of 'vocabulary', 'bucket_size', 'file', 'num_buckets', or
  'boundaries' selects the column type; 'key' names the input feature.
  """
  if 'vocabulary' in params:
    feature = fc.categorical_column_with_vocabulary_list(
        params['key'],
        vocabulary_list=_parse_vocabulary(params['vocabulary']),
        default_value=0)
  elif 'bucket_size' in params:
    feature = fc.categorical_column_with_hash_bucket(
        params['key'], hash_bucket_size=params['bucket_size'])
  elif 'file' in params:
    feature = fc.categorical_column_with_vocabulary_file(
        params['key'], vocabulary_file=params['file'], default_value=0)
  elif 'num_buckets' in params:
    feature = fc.categorical_column_with_identity(
        params['key'], num_buckets=params['num_buckets'])
  elif 'boundaries' in params:
    feature = fc.bucketized_column(
        fc.numeric_column(params['key']), boundaries=params['boundaries'])
  else:
    raise ValueError(
        "params must contain one of 'vocabulary', 'bucket_size', 'file', "
        "'num_buckets', or 'boundaries'; got: %r" % params)
  return feature
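# Hypothetical usage of the helper above (these params dicts are invented
# for illustration): each dict selects exactly one column type via its keys.
price_bucket = _get_categorical_column(
    {'key': 'price', 'boundaries': [500.0, 700.0]})
country = _get_categorical_column({'key': 'country', 'bucket_size': 5})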
def testMixedFeaturesArbitraryWeights(self):
  """Tests LinearRegressor with LinearSDCA and a mix of features."""

  def input_fn():
    return {
        'example_id': constant_op.constant(['1', '2', '3']),
        'price': constant_op.constant([0.6, 0.8, 0.3]),
        'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]),
        'country': sparse_tensor.SparseTensor(
            values=['IT', 'US', 'GB'],
            indices=[[0, 0], [1, 3], [2, 1]],
            dense_shape=[3, 5]),
        'weights': constant_op.constant([[3.0], [5.0], [7.0]])
    }, constant_op.constant([[1.55], [-1.25], [-3.0]])

  price = feature_column_v2.numeric_column('price')
  sq_footage_bucket = feature_column_v2.bucketized_column(
      feature_column_v2.numeric_column('sq_footage'),
      boundaries=[650.0, 800.0])
  country = feature_column_v2.categorical_column_with_hash_bucket(
      'country', hash_bucket_size=5)
  sq_footage_country = feature_column_v2.crossed_column(
      [sq_footage_bucket, 'country'], hash_bucket_size=10)
  optimizer = linear.LinearSDCA(
      example_id_column='example_id', symmetric_l2_regularization=0.1)
  regressor = linear.LinearRegressor(
      feature_columns=[price, sq_footage_bucket, country, sq_footage_country],
      weight_column='weights',
      optimizer=optimizer)
  regressor.train(input_fn=input_fn, steps=20)
  loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
  self.assertLess(loss, 0.05)
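# An intuition-only sketch (not TF-exact) of the crossed_column used above:
# each (sq_footage bucket, country) pair is fused into a single categorical
# id by hashing into hash_bucket_size buckets. TF uses a deterministic
# fingerprint internally; Python's hash() here is only for illustration and
# is not stable across processes.
def _cross_ids(bucket_idx, country, hash_bucket_size=10):
  return hash((bucket_idx, country)) % hash_bucket_size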