Example #1
    def testWeightedSparseFeatures(self):
        """LinearClassifier with LinearSDCA and weighted sparse features."""
        def input_fn():
            return {
                'example_id':
                constant_op.constant(['1', '2', '3']),
                'price':
                sparse_tensor.SparseTensor(values=[2., 3., 1.],
                                           indices=[[0, 0], [1, 0], [2, 0]],
                                           dense_shape=[3, 5]),
                'country':
                sparse_tensor.SparseTensor(values=['IT', 'US', 'GB'],
                                           indices=[[0, 0], [1, 0], [2, 0]],
                                           dense_shape=[3, 5])
            }, constant_op.constant([[1], [0], [1]])

        country = feature_column_lib.categorical_column_with_hash_bucket(
            'country', hash_bucket_size=5)
        country_weighted_by_price = (
            feature_column_lib.weighted_categorical_column(country, 'price'))
        optimizer = linear.LinearSDCA(example_id_column='example_id',
                                      symmetric_l2_regularization=0.01)
        classifier = linear.LinearClassifierV2(
            feature_columns=[country_weighted_by_price], optimizer=optimizer)
        classifier.train(input_fn=input_fn, steps=100)
        loss = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
        self.assertLess(loss, 0.2)
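A minimal sketch of what `weighted_categorical_column` does here, written against the public `tf.feature_column` endpoints rather than the internal `feature_column_lib` alias (an assumption on my part): each hashed 'country' id carries its 'price' value as a weight, so the indicator representation holds prices instead of 1.0.

import tensorflow as tf

country = tf.feature_column.categorical_column_with_hash_bucket(
    'country', hash_bucket_size=5)
# Pair each country id with the matching 'price' entry as its weight.
country_weighted_by_price = tf.feature_column.weighted_categorical_column(
    country, 'price')
features = {
    'country': tf.constant([['IT'], ['US'], ['GB']]),
    'price': tf.constant([[2.], [3.], [1.]]),
}
dense = tf.keras.layers.DenseFeatures(
    [tf.feature_column.indicator_column(country_weighted_by_price)])(features)
print(dense)  # each row holds its price at the hashed bucket for its country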
Example #2
 def test_denylisted_column(self):
   # HashedCategoricalColumn is denylisted and so will raise an exception.
   categorical_column = fc_lib.categorical_column_with_hash_bucket(
       key='aaa', hash_bucket_size=3)
   embedding_dimension = 2
   with self.assertRaises(TypeError):
     tpu_fc.embedding_column(categorical_column, dimension=embedding_dimension)
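A hedged counterpart to the denylist test: only the hashed column is called out as rejected, so the sketch below assumes an identity column passes. Whether `tpu_fc.embedding_column` accepts identity columns is an assumption inferred from the test, not confirmed by it.

   # Hypothetical accepted path (assumption: identity columns are not
   # denylisted, since only HashedCategoricalColumn is named above).
   categorical_column = fc_lib.categorical_column_with_identity(
       key='aaa', num_buckets=3)
   embedding = tpu_fc.embedding_column(categorical_column, dimension=2)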
Example #3
  def test_string_input(self):
    x = {'age': np.random.random((1024, 1)),
         'cabin': np.array(['a'] * 1024)}
    y = np.random.randint(2, size=(1024, 1))
    ds1 = dataset_ops.Dataset.from_tensor_slices(x)
    ds2 = dataset_ops.Dataset.from_tensor_slices(y)
    dataset = dataset_ops.Dataset.zip((ds1, ds2)).batch(4)
    categorical_cols = [fc.categorical_column_with_hash_bucket('cabin', 10)]
    feature_cols = ([fc.numeric_column('age')]
                    + [fc.indicator_column(cc) for cc in categorical_cols])
    layers = [fc.DenseFeatures(feature_cols),
              keras.layers.Dense(128),
              keras.layers.Dense(1)]

    model = keras.models.Sequential(layers)
    model.compile(optimizer='sgd',
                  loss=keras.losses.BinaryCrossentropy())
    model.fit(dataset)
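For intuition on how the 'cabin' strings become ids: a hashed column fingerprints each string into one of `hash_bucket_size` buckets. A sketch assuming the column's hashing matches `tf.strings.to_hash_bucket_fast`, which is my reading of the implementation:

import tensorflow as tf

# Deterministic, non-cryptographic hashing; distinct strings usually get
# distinct buckets, but collisions are expected with a small bucket count.
ids = tf.strings.to_hash_bucket_fast(['a', 'b', 'first', 'second'], 10)
print(ids)  # bucket ids in [0, 10)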
Example #4
def sequence_categorical_column_with_hash_bucket(
    key, hash_bucket_size, dtype=dtypes.string):
  """A sequence of categorical terms where ids are set by hashing.

  Pass this to `embedding_column` or `indicator_column` to convert sequence
  categorical data into dense representation for input to sequence NN, such as
  RNN.

  Example:

  ```python
  tokens = sequence_categorical_column_with_hash_bucket(
      'tokens', hash_bucket_size=1000)
  tokens_embedding = embedding_column(tokens, dimension=10)
  columns = [tokens_embedding]

  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input feature.
    hash_bucket_size: An int > 1. The number of buckets.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `SequenceCategoricalColumn`.

  Raises:
    ValueError: `hash_bucket_size` is not greater than 1.
    ValueError: `dtype` is neither string nor integer.
  """
  return fc.SequenceCategoricalColumn(
      fc.categorical_column_with_hash_bucket(
          key=key,
          hash_bucket_size=hash_bucket_size,
          dtype=dtype))
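A runnable version of the docstring example's front half, under the assumption that `tf.keras.experimental.SequenceFeatures` and the public `tf.feature_column` endpoints are available (TF 2.x):

import tensorflow as tf

tokens = tf.feature_column.sequence_categorical_column_with_hash_bucket(
    'tokens', hash_bucket_size=1000)
tokens_embedding = tf.feature_column.embedding_column(tokens, dimension=10)
sequence_layer = tf.keras.experimental.SequenceFeatures([tokens_embedding])
features = {
    # tf.sparse.from_dense drops empty strings, giving ragged-length rows.
    'tokens': tf.sparse.from_dense(
        tf.constant([['the', 'cat', 'sat'], ['a', 'dog', '']])),
}
sequence_input, sequence_length = sequence_layer(features)
print(sequence_input.shape)  # (2, 3, 10): batch, max steps, embedding dim
print(sequence_length)       # [3 2]: per-example valid step counts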
Example #6
    def testPartitionedVariables(self):
        """Tests LinearClassifier with LinearSDCA with partitioned variables."""
        def input_fn():
            return {
                'example_id':
                constant_op.constant(['1', '2', '3']),
                'price':
                constant_op.constant([[0.6], [0.8], [0.3]]),
                'sq_footage':
                constant_op.constant([[900.0], [700.0], [600.0]]),
                'country':
                sparse_tensor.SparseTensor(values=['IT', 'US', 'GB'],
                                           indices=[[0, 0], [1, 3], [2, 1]],
                                           dense_shape=[3, 5]),
                'weights':
                constant_op.constant([[3.0], [1.0], [1.0]])
            }, constant_op.constant([[1], [0], [1]])

        price = feature_column_lib.numeric_column('price')
        sq_footage_bucket = feature_column_lib.bucketized_column(
            feature_column_lib.numeric_column('sq_footage'),
            boundaries=[650.0, 800.0])
        country = feature_column_lib.categorical_column_with_hash_bucket(
            'country', hash_bucket_size=5)
        sq_footage_country = feature_column_lib.crossed_column(
            [sq_footage_bucket, 'country'], hash_bucket_size=10)

        optimizer = linear.LinearSDCA(example_id_column='example_id',
                                      symmetric_l2_regularization=0.01)

        classifier = linear.LinearClassifierV2(
            feature_columns=[
                price, sq_footage_bucket, country, sq_footage_country
            ],
            weight_column='weights',
            partitioner=partitioned_variables.fixed_size_partitioner(
                num_shards=2, axis=0),
            optimizer=optimizer)
        classifier.train(input_fn=input_fn, steps=100)
        loss = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
        self.assertLess(loss, 0.2)
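A hedged sketch of what the partitioner itself does, separate from the estimator (uses TF1-style variable scopes via `tf.compat.v1`, since partitioned variables are a graph-mode feature): a [10, 4] variable is stored as two [5, 4] shards along axis 0.

import tensorflow as tf

g = tf.Graph()
with g.as_default():
  with tf.compat.v1.variable_scope(
      'demo',
      partitioner=tf.compat.v1.fixed_size_partitioner(num_shards=2, axis=0)):
    v = tf.compat.v1.get_variable('w', shape=[10, 4])
  # Iterating a PartitionedVariable yields its shards.
  print([tuple(s.shape) for s in v])  # [(5, 4), (5, 4)]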
Example #7
    def testMixedFeaturesArbitraryWeights(self):
        """Tests LinearRegressor with LinearSDCA and a mix of features."""
        def input_fn():
            return {
                'example_id':
                constant_op.constant(['1', '2', '3']),
                'price':
                constant_op.constant([0.6, 0.8, 0.3]),
                'sq_footage':
                constant_op.constant([[900.0], [700.0], [600.0]]),
                'country':
                sparse_tensor.SparseTensor(values=['IT', 'US', 'GB'],
                                           indices=[[0, 0], [1, 3], [2, 1]],
                                           dense_shape=[3, 5]),
                'weights':
                constant_op.constant([[3.0], [5.0], [7.0]])
            }, constant_op.constant([[1.55], [-1.25], [-3.0]])

        price = feature_column_lib.numeric_column('price')
        sq_footage_bucket = feature_column_lib.bucketized_column(
            feature_column_lib.numeric_column('sq_footage'),
            boundaries=[650.0, 800.0])
        country = feature_column_lib.categorical_column_with_hash_bucket(
            'country', hash_bucket_size=5)
        sq_footage_country = feature_column_lib.crossed_column(
            [sq_footage_bucket, 'country'], hash_bucket_size=10)
        optimizer = linear.LinearSDCA(example_id_column='example_id',
                                      symmetric_l2_regularization=0.1)
        regressor = linear.LinearRegressorV2(
            feature_columns=[
                price, sq_footage_bucket, country, sq_footage_country
            ],
            weight_column='weights',
            optimizer=optimizer)
        regressor.train(input_fn=input_fn, steps=20)
        loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
        self.assertLess(loss, 0.05)
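A minimal sketch of the `crossed_column` used above, via the public API (assumed equivalent to the internal alias): the cross hashes each (sq_footage bucket, country) pair into one of 10 buckets, giving the model a conjunction feature neither input provides alone.

import tensorflow as tf

sq_footage_bucket = tf.feature_column.bucketized_column(
    tf.feature_column.numeric_column('sq_footage'),
    boundaries=[650.0, 800.0])
cross = tf.feature_column.crossed_column(
    [sq_footage_bucket, 'country'], hash_bucket_size=10)
features = {
    'sq_footage': tf.constant([[900.0], [700.0], [600.0]]),
    'country': tf.constant([['IT'], ['US'], ['GB']]),
}
dense = tf.keras.layers.DenseFeatures(
    [tf.feature_column.indicator_column(cross)])(features)
print(dense)  # shape (3, 10); one hot bucket per (bucket, country) pair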
Example #8
 def test_one_shot_prediction_head_export(self, estimator_factory):
   def _new_temp_dir():
     return os.path.join(test.get_temp_dir(), str(ops.uid()))
   model_dir = _new_temp_dir()
   categorical_column = feature_column.categorical_column_with_hash_bucket(
       key="categorical_exogenous_feature", hash_bucket_size=16)
   exogenous_feature_columns = [
       feature_column.numeric_column(
           "2d_exogenous_feature", shape=(2,)),
       feature_column.embedding_column(
           categorical_column=categorical_column, dimension=10)]
   estimator = estimator_factory(
       model_dir=model_dir,
       exogenous_feature_columns=exogenous_feature_columns,
       head_type=ts_head_lib.OneShotPredictionHead)
   train_features = {
       feature_keys.TrainEvalFeatures.TIMES: numpy.arange(
           20, dtype=numpy.int64),
       feature_keys.TrainEvalFeatures.VALUES: numpy.tile(numpy.arange(
           20, dtype=numpy.float32)[:, None], [1, 5]),
       "2d_exogenous_feature": numpy.ones([20, 2]),
       "categorical_exogenous_feature": numpy.array(
           ["strkey"] * 20)[:, None]
   }
   train_input_fn = input_pipeline.RandomWindowInputFn(
       input_pipeline.NumpyReader(train_features), shuffle_seed=2,
       num_threads=1, batch_size=16, window_size=16)
   estimator.train(input_fn=train_input_fn, steps=5)
   result = estimator.evaluate(input_fn=train_input_fn, steps=1)
   self.assertIn("average_loss", result)
   self.assertNotIn(feature_keys.State.STATE_TUPLE, result)
   input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
   export_location = estimator.export_saved_model(_new_temp_dir(),
                                                  input_receiver_fn)
   graph = ops.Graph()
   with graph.as_default():
     with session_lib.Session() as session:
       signatures = loader.load(
           session, [tag_constants.SERVING], export_location)
       self.assertEqual([feature_keys.SavedModelLabels.PREDICT],
                        list(signatures.signature_def.keys()))
       predict_signature = signatures.signature_def[
           feature_keys.SavedModelLabels.PREDICT]
       six.assertCountEqual(
           self,
           [feature_keys.FilteringFeatures.TIMES,
            feature_keys.FilteringFeatures.VALUES,
            "2d_exogenous_feature",
            "categorical_exogenous_feature"],
           predict_signature.inputs.keys())
       features = {
           feature_keys.TrainEvalFeatures.TIMES: numpy.tile(
               numpy.arange(35, dtype=numpy.int64)[None, :], [2, 1]),
           feature_keys.TrainEvalFeatures.VALUES: numpy.tile(numpy.arange(
               20, dtype=numpy.float32)[None, :, None], [2, 1, 5]),
           "2d_exogenous_feature": numpy.ones([2, 35, 2]),
           "categorical_exogenous_feature": numpy.tile(numpy.array(
               ["strkey"] * 35)[None, :, None], [2, 1, 1])
       }
       feeds = {
           graph.as_graph_element(input_value.name): features[input_key]
           for input_key, input_value in predict_signature.inputs.items()}
       fetches = {output_key: graph.as_graph_element(output_value.name)
                  for output_key, output_value
                  in predict_signature.outputs.items()}
       output = session.run(fetches, feed_dict=feeds)
       self.assertEqual((2, 15, 5), output["mean"].shape)
   # Build a parsing input function, then make a tf.Example for it to parse.
   export_location = estimator.export_saved_model(
       _new_temp_dir(),
       estimator.build_one_shot_parsing_serving_input_receiver_fn(
           filtering_length=20, prediction_length=15))
   graph = ops.Graph()
   with graph.as_default():
     with session_lib.Session() as session:
       example = example_pb2.Example()
       times = example.features.feature[feature_keys.TrainEvalFeatures.TIMES]
       values = example.features.feature[feature_keys.TrainEvalFeatures.VALUES]
       times.int64_list.value.extend(range(35))
       for i in range(20):
         values.float_list.value.extend(
             [float(i) * 2. + feature_number
              for feature_number in range(5)])
       real_feature = example.features.feature["2d_exogenous_feature"]
        categorical_feature = example.features.feature[
            "categorical_exogenous_feature"]
       for i in range(35):
         real_feature.float_list.value.extend([1, 1])
          categorical_feature.bytes_list.value.append(b"strkey")
       # Serialize the tf.Example for feeding to the Session
       examples = [example.SerializeToString()] * 2
       signatures = loader.load(
           session, [tag_constants.SERVING], export_location)
       predict_signature = signatures.signature_def[
           feature_keys.SavedModelLabels.PREDICT]
       ((_, input_value),) = predict_signature.inputs.items()
       feeds = {graph.as_graph_element(input_value.name): examples}
       fetches = {output_key: graph.as_graph_element(output_value.name)
                  for output_key, output_value
                  in predict_signature.outputs.items()}
       output = session.run(fetches, feed_dict=feeds)
       self.assertEqual((2, 15, 5), output["mean"].shape)
Example #9
    def testSparseFeaturesWithL1Reg(self):
        """Tests LinearRegressor with LinearSDCA and sparse features."""
        def input_fn():
            return {
                'example_id':
                constant_op.constant(['1', '2', '3']),
                'price':
                constant_op.constant([[0.4], [0.6], [0.3]]),
                'country':
                sparse_tensor.SparseTensor(values=['IT', 'US', 'GB'],
                                           indices=[[0, 0], [1, 3], [2, 1]],
                                           dense_shape=[3, 5]),
                'weights':
                constant_op.constant([[10.0], [10.0], [10.0]])
            }, constant_op.constant([[1.4], [-0.8], [2.6]])

        price = feature_column_lib.numeric_column('price')
        country = feature_column_lib.categorical_column_with_hash_bucket(
            'country', hash_bucket_size=5)
        # Regressor with no L1 regularization.
        optimizer = linear.LinearSDCA(example_id_column='example_id',
                                      symmetric_l2_regularization=0.1)
        regressor = linear.LinearRegressorV2(feature_columns=[price, country],
                                             weight_column='weights',
                                             optimizer=optimizer)
        regressor.train(input_fn=input_fn, steps=20)
        no_l1_reg_loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
        variable_names = regressor.get_variable_names()
        self.assertIn('linear/linear_model/price/weights', variable_names)
        self.assertIn('linear/linear_model/country/weights', variable_names)
        no_l1_reg_weights = {
            'linear/linear_model/price/weights':
            regressor.get_variable_value('linear/linear_model/price/weights'),
            'linear/linear_model/country/weights':
            regressor.get_variable_value(
                'linear/linear_model/country/weights'),
        }

        # Regressor with L1 regularization.
        optimizer = linear.LinearSDCA(example_id_column='example_id',
                                      symmetric_l1_regularization=1.0,
                                      symmetric_l2_regularization=0.1)
        regressor = linear.LinearRegressorV2(feature_columns=[price, country],
                                             weight_column='weights',
                                             optimizer=optimizer)
        regressor.train(input_fn=input_fn, steps=20)
        l1_reg_loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
        l1_reg_weights = {
            'linear/linear_model/price/weights':
            regressor.get_variable_value('linear/linear_model/price/weights'),
            'linear/linear_model/country/weights':
            regressor.get_variable_value(
                'linear/linear_model/country/weights'),
        }

        # Unregularized loss is lower when there is no L1 regularization.
        self.assertLess(no_l1_reg_loss, l1_reg_loss)
        self.assertLess(no_l1_reg_loss, 0.05)

        # But weights returned by the regressor with L1 regularization have smaller
        # L1 norm.
        l1_reg_weights_norm, no_l1_reg_weights_norm = 0.0, 0.0
        for var_name in sorted(l1_reg_weights):
            l1_reg_weights_norm += sum(
                np.absolute(l1_reg_weights[var_name].flatten()))
            no_l1_reg_weights_norm += sum(
                np.absolute(no_l1_reg_weights[var_name].flatten()))
            print('Var name: %s, value: %s' %
                  (var_name, no_l1_reg_weights[var_name].flatten()))
        self.assertLess(l1_reg_weights_norm, no_l1_reg_weights_norm)
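An illustration of why the L1-regularized weights end up with the smaller norm (not from the source; this is the standard soft-thresholding view of the L1 proximal operator, which shrinkage-based solvers like SDCA effectively apply):

import numpy as np

def soft_threshold(w, lam):
  # Proximal operator of lam * ||w||_1: shrink toward zero, clip at zero.
  return np.sign(w) * np.maximum(np.abs(w) - lam, 0.0)

w = np.array([0.8, -0.05, 0.3, -0.6])
shrunk = soft_threshold(w, 0.1)
print(shrunk)                                  # [ 0.7 -0.   0.2 -0.5]
print(np.abs(shrunk).sum() < np.abs(w).sum())  # True: smaller L1 norm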