Example No. 1
    def movielen_get_fc():
        sparse_features = ["movieId", "userId", 'imdbId', 'tmdbId', 'genres']
        # movieId 59047
        # userId 162541
        # imdbId 59047
        # tmdbId 61342
        # genres 21
        ln_embedding = [59047, 162541, 59047, 61342, 21]

        param.max_bucket = find_bucket_size(ln_embedding)
        print("TensorFlow, max_bucket_size = ", param.max_bucket)
        if param.max_bucket is not None:
            ln_embedding = [min(n, param.max_bucket) for n in ln_embedding]

        dnn_feature_columns = []
        linear_feature_columns = []

        for i, feat in enumerate(sparse_features):
            ids = fc.categorical_column_with_hash_bucket(feat,
                                                         ln_embedding[i],
                                                         dtype=tf.string)
            dnn_feature_columns.append(fc.embedding_column(ids, 4))
            linear_feature_columns.append(ids)

        return dnn_feature_columns, linear_feature_columns
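The snippet stops at the column lists; as a hedged wiring sketch (assuming `movielen_get_fc` is callable at module scope, a wide-and-deep estimator, and illustrative hidden-unit sizes), the two returned lists would typically feed a model like this:

```python
import tensorflow as tf

# Hedged sketch: linear columns drive the wide part, embedding columns the
# deep part. Hidden-unit sizes are illustrative assumptions, not from the
# original snippet.
dnn_cols, linear_cols = movielen_get_fc()
model = tf.estimator.DNNLinearCombinedClassifier(
    linear_feature_columns=linear_cols,
    dnn_feature_columns=dnn_cols,
    dnn_hidden_units=[256, 128, 64])
```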
Example No. 2
 def test_should_be_dense_column(self):
     with self.assertRaisesRegexp(ValueError, 'must be a .*DenseColumn'):
         df.DenseFeatures(feature_columns=[
             fc.categorical_column_with_hash_bucket('wire_cast', 4)
         ])(features={
             'a': [[0]]
         })
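For contrast with the error case above, a minimal passing-case sketch (feature values invented, reusing the test's `fc` and `df` imports): wrapping the same hashed column in `indicator_column` yields a `DenseColumn`, which `DenseFeatures` accepts.

```python
# Hedged sketch: indicator_column is a DenseColumn, so this call succeeds.
wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
layer = df.DenseFeatures(feature_columns=[fc.indicator_column(wire_cast)])
output = layer({'wire_cast': [['marlo'], ['omar']]})  # one-hot rows, shape [2, 4]
```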
Example No. 3
    def test_from_config(self, trainable, name):
        cols = [
            fc.numeric_column('a'),
            fc.embedding_column(fc.categorical_column_with_vocabulary_list(
                'b', vocabulary_list=['1', '2', '3']),
                                dimension=2),
            fc.indicator_column(
                fc.categorical_column_with_hash_bucket(key='c',
                                                       hash_bucket_size=3))
        ]
        orig_layer = df.DenseFeatures(cols, trainable=trainable, name=name)
        config = orig_layer.get_config()

        new_layer = df.DenseFeatures.from_config(config)

        self.assertEqual(new_layer.name, orig_layer.name)
        self.assertEqual(new_layer.trainable, trainable)
        self.assertLen(new_layer._feature_columns, 3)
        self.assertEqual(new_layer._feature_columns[0].name, 'a')
        self.assertEqual(new_layer._feature_columns[1].initializer.mean, 0.0)
        self.assertEqual(new_layer._feature_columns[1].categorical_column.name,
                         'b')
        self.assertIsInstance(new_layer._feature_columns[0], cols[0].__class__)
        self.assertIsInstance(new_layer._feature_columns[1], cols[1].__class__)
        self.assertIsInstance(new_layer._feature_columns[2], cols[2].__class__)
Example No. 4
    def test_from_config(self, units, sparse_combiner, trainable, name):
        cols = [
            fc.numeric_column('a'),
            fc.categorical_column_with_vocabulary_list('b',
                                                       vocabulary_list=('1',
                                                                        '2',
                                                                        '3')),
            fc.categorical_column_with_hash_bucket(key='c', hash_bucket_size=3)
        ]
        orig_layer = fc._LinearModelLayer(cols,
                                          units=units,
                                          sparse_combiner=sparse_combiner,
                                          trainable=trainable,
                                          name=name)
        config = orig_layer.get_config()

        new_layer = fc._LinearModelLayer.from_config(config)

        self.assertEqual(new_layer.name, orig_layer.name)
        self.assertEqual(new_layer._units, units)
        self.assertEqual(new_layer._sparse_combiner, sparse_combiner)
        self.assertEqual(new_layer.trainable, trainable)
        self.assertLen(new_layer._feature_columns, 3)
        self.assertEqual(new_layer._feature_columns[0].name, 'a')
        self.assertEqual(new_layer._feature_columns[1].vocabulary_list,
                         ('1', '2', '3'))
        self.assertEqual(new_layer._feature_columns[2].num_buckets, 3)
Example No. 5
  def testWeightedSparseFeatures(self):
    """LinearClassifier with LinearSDCA and weighted sparse features."""

    def input_fn():
      return {
          'example_id':
              constant_op.constant(['1', '2', '3']),
          'price':
              sparse_tensor.SparseTensor(
                  values=[2., 3., 1.],
                  indices=[[0, 0], [1, 0], [2, 0]],
                  dense_shape=[3, 5]),
          'country':
              sparse_tensor.SparseTensor(
                  values=['IT', 'US', 'GB'],
                  indices=[[0, 0], [1, 0], [2, 0]],
                  dense_shape=[3, 5])
      }, constant_op.constant([[1], [0], [1]])

    country = feature_column_v2.categorical_column_with_hash_bucket(
        'country', hash_bucket_size=5)
    country_weighted_by_price = feature_column_v2.weighted_categorical_column(
        country, 'price')
    optimizer = linear.LinearSDCA(
        example_id_column='example_id', symmetric_l2_regularization=0.01)
    classifier = linear.LinearClassifier(
        feature_columns=[country_weighted_by_price], optimizer=optimizer)
    classifier.train(input_fn=input_fn, steps=100)
    loss = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
    self.assertLess(loss, 0.2)
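As context for the weighted column above: each sparse 'country' id is paired with the 'price' value at the same sparse index, and the linear model scales that id's learned weight by the price. A standalone construction sketch using the public `tf.feature_column` API (an assumption; the test itself imports internal modules):

```python
import tensorflow as tf

# 'price' values act as per-example multipliers on the 'country' ids.
country = tf.feature_column.categorical_column_with_hash_bucket(
    'country', hash_bucket_size=5)
country_weighted = tf.feature_column.weighted_categorical_column(
    country, weight_feature_key='price')
```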
Example No. 6
    def criteo_get_fc():
        ln_embedding = [
            1461, 584, 10131227, 2202608, 306, 24, 12518, 634, 4, 93146, 5684,
            8351593, 3195, 28, 14993, 5461306, 11, 5653, 2173, 4, 7046547, 18,
            16, 286181, 105, 142572
        ]

        param.max_bucket = find_bucket_size(ln_embedding)
        print("TensorFlow, max_bucket_size = ", param.max_bucket)
        if param.max_bucket is not None:
            ln_embedding = [min(n, param.max_bucket) for n in ln_embedding]
        dense = []
        for i in range(13):
            dense.append(
                fc.numeric_column("I{}".format(i),
                                  dtype=tf.int64,
                                  default_value=0))

        dnn_feature_columns = []
        linear_feature_columns = []

        dnn_feature_columns += dense
        linear_feature_columns += dense

        sparse_emb = []
        for i in range(26):
            ids = fc.categorical_column_with_hash_bucket("C{}".format(i),
                                                         ln_embedding[i],
                                                         dtype=tf.int64)
            linear_feature_columns.append(ids)
            sparse_emb.append(fc.embedding_column(ids, param.embedding_size))
        dnn_feature_columns += sparse_emb
        return dnn_feature_columns, linear_feature_columns
Example No. 7
    def avazu_get_fc():
        # number of samples: 32747463
        ln_embedding = [
            40428967, 7, 7, 4737, 7745, 26, 8552, 559, 36, 2686408, 6729486,
            8251, 5, 4, 2626, 8, 9, 435, 4, 68, 172, 60
        ]
        param.max_bucket = find_bucket_size(ln_embedding)
        print("TensorFlow, max_bucket_size = ", param.max_bucket)
        if param.max_bucket is not None:
            ln_embedding = [
                each if each < param.max_bucket else param.max_bucket
                for each in ln_embedding
            ]

        dnn_feature_columns = []
        linear_feature_columns = []

        sparse_features = ['id', 'C1', 'banner_pos', 'site_id', 'site_domain',
                           'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
                           'device_ip', 'device_model', 'device_type', 'device_conn_type', ] \
            + ['C' + str(i) for i in range(14, 22)]

        sparse_emb = []
        for i, feat in enumerate(sparse_features):
            ids = fc.categorical_column_with_hash_bucket(feat,
                                                         ln_embedding[i],
                                                         dtype=tf.string)
            linear_feature_columns.append(ids)
            sparse_emb.append(fc.embedding_column(ids, param.embedding_size))

        dnn_feature_columns += sparse_emb

        return dnn_feature_columns, linear_feature_columns
Example No. 9
def sequence_categorical_column_with_hash_bucket(
    key, hash_bucket_size, dtype=dtypes.string):
  """A sequence of categorical terms where ids are set by hashing.

  Pass this to `embedding_column` or `indicator_column` to convert sequence
  categorical data into a dense representation for input to a sequence NN,
  such as an RNN.

  Example:

  ```python
  tokens = sequence_categorical_column_with_hash_bucket(
      'tokens', hash_bucket_size=1000)
  tokens_embedding = embedding_column(tokens, dimension=10)
  columns = [tokens_embedding]

  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input feature.
    hash_bucket_size: An int > 1. The number of buckets.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `SequenceCategoricalColumn`.

  Raises:
    ValueError: `hash_bucket_size` is not greater than 1.
    ValueError: `dtype` is neither string nor integer.
  """
  return fc.SequenceCategoricalColumn(
      fc.categorical_column_with_hash_bucket(
          key=key,
          hash_bucket_size=hash_bucket_size,
          dtype=dtype))
Example No. 10
  def testPartitionedVariables(self):
    """Tests LinearClassifier with LinearSDCA with partitioned variables."""

    def input_fn():
      return {
          'example_id':
              constant_op.constant(['1', '2', '3']),
          'price':
              constant_op.constant([[0.6], [0.8], [0.3]]),
          'sq_footage':
              constant_op.constant([[900.0], [700.0], [600.0]]),
          'country':
              sparse_tensor.SparseTensor(
                  values=['IT', 'US', 'GB'],
                  indices=[[0, 0], [1, 3], [2, 1]],
                  dense_shape=[3, 5]),
          'weights':
              constant_op.constant([[3.0], [1.0], [1.0]])
      }, constant_op.constant([[1], [0], [1]])

    price = feature_column_v2.numeric_column('price')
    sq_footage_bucket = feature_column_v2.bucketized_column(
        feature_column_v2.numeric_column('sq_footage'),
        boundaries=[650.0, 800.0])
    country = feature_column_v2.categorical_column_with_hash_bucket(
        'country', hash_bucket_size=5)
    sq_footage_country = feature_column_v2.crossed_column(
        [sq_footage_bucket, 'country'], hash_bucket_size=10)

    optimizer = linear.LinearSDCA(
        example_id_column='example_id', symmetric_l2_regularization=0.01)

    classifier = linear.LinearClassifier(
        feature_columns=[price, sq_footage_bucket, country, sq_footage_country],
        weight_column='weights',
        partitioner=partitioned_variables.fixed_size_partitioner(
            num_shards=2, axis=0),
        optimizer=optimizer)
    classifier.train(input_fn=input_fn, steps=100)
    loss = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
    self.assertLess(loss, 0.2)
Example No. 11
def _get_categorical_column(params: dict) -> fc.CategoricalColumn:
    if 'vocabulary' in params:
        feature = fc.categorical_column_with_vocabulary_list(
            params['key'],
            vocabulary_list=_parse_vocabulary(params['vocabulary']),
            default_value=0)
    elif 'bucket_size' in params:
        feature = fc.categorical_column_with_hash_bucket(
            params['key'], hash_bucket_size=params['bucket_size'])
    elif 'file' in params:
        feature = fc.categorical_column_with_vocabulary_file(
            params['key'], vocabulary_file=params['file'], default_value=0)
    elif 'num_buckets' in params:
        feature = fc.categorical_column_with_identity(
            params['key'], num_buckets=params['num_buckets'])
    elif 'boundaries' in params:
        feature = fc.bucketized_column(
            fc.numeric_column(params['key']), boundaries=params['boundaries'])
    else:
        raise ValueError(
            'params must contain one of: vocabulary, bucket_size, file, '
            'num_buckets, boundaries; got {}'.format(params))

    return feature
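A quick usage sketch for the dispatcher above; the params dicts and feature names are hypothetical:

```python
# Hypothetical configs; each dict selects one branch of the dispatcher.
hashed = _get_categorical_column({'key': 'city', 'bucket_size': 1000})
ident = _get_categorical_column({'key': 'hour', 'num_buckets': 24})
binned = _get_categorical_column({'key': 'age', 'boundaries': [18, 30, 50]})
```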
Example No. 12
  def test_multiple_layers_with_same_embedding_column(self):
    some_sparse_column = fc.categorical_column_with_hash_bucket(
        'sparse_feature', hash_bucket_size=5)
    some_embedding_column = fc.embedding_column(
        some_sparse_column, dimension=10)

    with ops.Graph().as_default():
      features = {
          'sparse_feature': [['a'], ['x']],
      }
      all_cols = [some_embedding_column]
      df.DenseFeatures(all_cols)(features)
      df.DenseFeatures(all_cols)(features)
      # Make sure that 2 variables get created in this case.
      self.assertEqual(2,
                       len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
      expected_var_names = [
          'dense_features/sparse_feature_embedding/embedding_weights:0',
          'dense_features_1/sparse_feature_embedding/embedding_weights:0'
      ]
      self.assertItemsEqual(
          expected_var_names,
          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
Example No. 13
  def testMixedFeaturesArbitraryWeights(self):
    """Tests LinearRegressor with LinearSDCA and a mix of features."""

    def input_fn():
      return {
          'example_id':
              constant_op.constant(['1', '2', '3']),
          'price':
              constant_op.constant([0.6, 0.8, 0.3]),
          'sq_footage':
              constant_op.constant([[900.0], [700.0], [600.0]]),
          'country':
              sparse_tensor.SparseTensor(
                  values=['IT', 'US', 'GB'],
                  indices=[[0, 0], [1, 3], [2, 1]],
                  dense_shape=[3, 5]),
          'weights':
              constant_op.constant([[3.0], [5.0], [7.0]])
      }, constant_op.constant([[1.55], [-1.25], [-3.0]])

    price = feature_column_v2.numeric_column('price')
    sq_footage_bucket = feature_column_v2.bucketized_column(
        feature_column_v2.numeric_column('sq_footage'),
        boundaries=[650.0, 800.0])
    country = feature_column_v2.categorical_column_with_hash_bucket(
        'country', hash_bucket_size=5)
    sq_footage_country = feature_column_v2.crossed_column(
        [sq_footage_bucket, 'country'], hash_bucket_size=10)
    optimizer = linear.LinearSDCA(
        example_id_column='example_id', symmetric_l2_regularization=0.1)
    regressor = linear.LinearRegressor(
        feature_columns=[price, sq_footage_bucket, country, sq_footage_country],
        weight_column='weights',
        optimizer=optimizer)
    regressor.train(input_fn=input_fn, steps=20)
    loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
    self.assertLess(loss, 0.05)
Example No. 14
def embedding_varlen(batch_size, max_length):
    """Benchmark a variable-length embedding."""
    # Data and constants.

    num_buckets = 10000
    vocab = fc_bm.create_vocabulary(32768)
    data = fc_bm.create_string_data(max_length,
                                    batch_size * NUM_REPEATS,
                                    vocab,
                                    pct_oov=0.0)

    # Keras implementation
    model = keras.Sequential()
    model.add(
        keras.Input(shape=(max_length, ),
                    name="data",
                    ragged=True,
                    dtype=dt.string))
    model.add(hashing.Hashing(num_buckets))

    # FC implementation
    fc = fcv2.categorical_column_with_hash_bucket("data", num_buckets)

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
        fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

    # Benchmark runs
    keras_data = {"data": data}
    k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

    fc_data = {"data": data.to_sparse()}
    fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
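Both benchmark paths bucket raw strings into `num_buckets` integer ids: the hashed categorical column is built on `tf.strings.to_hash_bucket_fast`, while the Keras layer does the same job as a layer. A small side-by-side sketch (tokens invented, assuming TF 2.6+ where `Hashing` is a stable layer):

```python
import tensorflow as tf

# Both calls map each string to an id in [0, 10000).
tokens = tf.constant([["the"], ["quick"], ["fox"]])
keras_ids = tf.keras.layers.Hashing(num_bins=10000)(tokens)
op_ids = tf.strings.to_hash_bucket_fast(tokens, num_buckets=10000)
```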
Example No. 15
  def testSparseFeaturesWithL1Reg(self):
    """Tests LinearRegressor with LinearSDCA and sparse features."""

    def input_fn():
      return {
          'example_id':
              constant_op.constant(['1', '2', '3']),
          'price':
              constant_op.constant([[0.4], [0.6], [0.3]]),
          'country':
              sparse_tensor.SparseTensor(
                  values=['IT', 'US', 'GB'],
                  indices=[[0, 0], [1, 3], [2, 1]],
                  dense_shape=[3, 5]),
          'weights':
              constant_op.constant([[10.0], [10.0], [10.0]])
      }, constant_op.constant([[1.4], [-0.8], [2.6]])

    price = feature_column_v2.numeric_column('price')
    country = feature_column_v2.categorical_column_with_hash_bucket(
        'country', hash_bucket_size=5)
    # Regressor with no L1 regularization.
    optimizer = linear.LinearSDCA(
        example_id_column='example_id', symmetric_l2_regularization=0.1)
    regressor = linear.LinearRegressor(
        feature_columns=[price, country],
        weight_column='weights',
        optimizer=optimizer)
    regressor.train(input_fn=input_fn, steps=20)
    no_l1_reg_loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
    variable_names = regressor.get_variable_names()
    self.assertIn('linear/linear_model/price/weights', variable_names)
    self.assertIn('linear/linear_model/country/weights', variable_names)
    no_l1_reg_weights = {
        'linear/linear_model/price/weights': regressor.get_variable_value(
            'linear/linear_model/price/weights'),
        'linear/linear_model/country/weights': regressor.get_variable_value(
            'linear/linear_model/country/weights'),
    }

    # Regressor with L1 regularization.
    optimizer = linear.LinearSDCA(
        example_id_column='example_id',
        symmetric_l1_regularization=1.0,
        symmetric_l2_regularization=0.1)
    regressor = linear.LinearRegressor(
        feature_columns=[price, country],
        weight_column='weights',
        optimizer=optimizer)
    regressor.train(input_fn=input_fn, steps=20)
    l1_reg_loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
    l1_reg_weights = {
        'linear/linear_model/price/weights': regressor.get_variable_value(
            'linear/linear_model/price/weights'),
        'linear/linear_model/country/weights': regressor.get_variable_value(
            'linear/linear_model/country/weights'),
    }

    # Unregularized loss is lower when there is no L1 regularization.
    self.assertLess(no_l1_reg_loss, l1_reg_loss)
    self.assertLess(no_l1_reg_loss, 0.05)

    # But weights returned by the regressor with L1 regularization have smaller
    # L1 norm.
    l1_reg_weights_norm, no_l1_reg_weights_norm = 0.0, 0.0
    for var_name in sorted(l1_reg_weights):
      l1_reg_weights_norm += sum(
          np.absolute(l1_reg_weights[var_name].flatten()))
      no_l1_reg_weights_norm += sum(
          np.absolute(no_l1_reg_weights[var_name].flatten()))
      print('Var name: %s, value: %s' %
            (var_name, no_l1_reg_weights[var_name].flatten()))
    self.assertLess(l1_reg_weights_norm, no_l1_reg_weights_norm)