Example #1
def create_feature_columns():
    age = vocabulary_column('age_level', list(range(1, 7)))
    gender = vocabulary_column('gender', [-1, 1])

    all_cat_cross = crossed_column([age, gender], hash_bucket_size=100)

    categorical_column = [indicator_column(age), indicator_column(gender)]

    crossed_columns = [indicator_column(all_cat_cross)]

    numerical_column = []

    range_0_20 = list(range(0, 20))

    embedding_columns = [
        embedding_column(vocabulary_column("order_cnt", range_0_20),
                         dimension=1),
        embedding_column(age, dimension=1),
        embedding_column(gender, dimension=1),
        embedding_column(all_cat_cross, dimension=10)
    ]

    wide_columns = categorical_column + crossed_columns
    deep_columns = numerical_column + embedding_columns
    return wide_columns, deep_columns
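The bare names `vocabulary_column`, `crossed_column`, `indicator_column`, and `embedding_column` above are presumably project-local aliases over `tf.feature_column`; a minimal sketch of what such aliases could look like (the wrapper is an assumption, not part of the TF API):

# Hypothetical aliases assumed by Example #1; the actual project may define them differently.
import tensorflow as tf

def vocabulary_column(key, vocabulary):
    # Thin wrapper over the standard vocabulary-list categorical column.
    return tf.feature_column.categorical_column_with_vocabulary_list(key, vocabulary)

crossed_column = tf.feature_column.crossed_column
indicator_column = tf.feature_column.indicator_column
embedding_column = tf.feature_column.embedding_column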
Example #2
  def test_embedding_column(
      self, sparse_input_args_a, sparse_input_args_b, expected_input_layer,
      expected_sequence_length):

    sparse_input_a = sparse_tensor.SparseTensorValue(**sparse_input_args_a)
    sparse_input_b = sparse_tensor.SparseTensorValue(**sparse_input_args_b)
    vocabulary_size = 3
    embedding_dimension_a = 2
    embedding_values_a = (
        (1., 2.),  # id 0
        (3., 4.),  # id 1
        (5., 6.)  # id 2
    )
    embedding_dimension_b = 3
    embedding_values_b = (
        (11., 12., 13.),  # id 0
        (14., 15., 16.),  # id 1
        (17., 18., 19.)  # id 2
    )
    def _get_initializer(embedding_dimension, embedding_values):

      def _initializer(shape, dtype, partition_info=None):
        self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
        self.assertEqual(dtypes.float32, dtype)
        self.assertIsNone(partition_info)
        return embedding_values
      return _initializer

    categorical_column_a = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    embedding_column_a = fc.embedding_column(
        categorical_column_a,
        dimension=embedding_dimension_a,
        initializer=_get_initializer(embedding_dimension_a, embedding_values_a))
    categorical_column_b = sfc.sequence_categorical_column_with_identity(
        key='bbb', num_buckets=vocabulary_size)
    embedding_column_b = fc.embedding_column(
        categorical_column_b,
        dimension=embedding_dimension_b,
        initializer=_get_initializer(embedding_dimension_b, embedding_values_b))

    # Test that columns are reordered alphabetically.
    sequence_input_layer = ksfc.SequenceFeatures(
        [embedding_column_b, embedding_column_a])
    input_layer, sequence_length = sequence_input_layer({
        'aaa': sparse_input_a, 'bbb': sparse_input_b,})

    self.evaluate(variables_lib.global_variables_initializer())
    weights = sequence_input_layer.weights
    self.assertCountEqual(
        ('sequence_features/aaa_embedding/embedding_weights:0',
         'sequence_features/bbb_embedding/embedding_weights:0'),
        tuple([v.name for v in weights]))
    self.assertAllEqual(embedding_values_a, self.evaluate(weights[0]))
    self.assertAllEqual(embedding_values_b, self.evaluate(weights[1]))
    self.assertAllEqual(expected_input_layer, self.evaluate(input_layer))
    self.assertAllEqual(
        expected_sequence_length, self.evaluate(sequence_length))
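For orientation, a minimal standalone use of `SequenceFeatures` outside the test harness could look like the sketch below, assuming the public TF 2.x surface where the layer is exposed as `tf.keras.experimental.SequenceFeatures`:

import tensorflow as tf

col = tf.feature_column.sequence_categorical_column_with_identity('tokens', num_buckets=100)
emb = tf.feature_column.embedding_column(col, dimension=8)
layer = tf.keras.experimental.SequenceFeatures([emb])

# Two variable-length examples: lengths 2 and 1.
tokens = tf.sparse.SparseTensor(indices=[[0, 0], [0, 1], [1, 0]],
                                values=tf.constant([3, 7, 42], dtype=tf.int64),
                                dense_shape=[2, 2])
sequence_input, sequence_length = layer({'tokens': tokens})
# sequence_input: [batch, max_length, dimension]; sequence_length: [batch].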
Example #3
    def movielen_get_fc():
        sparse_features = ["movieId", "userId", 'imdbId', 'tmdbId', 'genres']
        # movieId 59047
        # userId 162541
        # imdbId 59047
        # tmdbId 61342
        # genres 21
        ln_embedding = [59047, 162541, 59047, 61342, 21]

        param.max_bucket = find_bucket_size(ln_embedding)
        print("TensorFlow, max_bucket_size = ", param.max_bucket)
        if param.max_bucket is not None:
            ln_embedding = [
                each if each < param.max_bucket else param.max_bucket
                for each in ln_embedding
            ]

        dnn_feature_columns = []
        linear_feature_columns = []

        for i, feat in enumerate(sparse_features):
            dnn_feature_columns.append(
                fc.embedding_column(
                    fc.categorical_column_with_hash_bucket(feat,
                                                           ln_embedding[i],
                                                           dtype=tf.string),
                    4))
            linear_feature_columns.append(
                fc.categorical_column_with_hash_bucket(feat,
                                                       ln_embedding[i],
                                                       dtype=tf.string))

        return dnn_feature_columns, linear_feature_columns
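`find_bucket_size` and `param` are project-specific helpers that are not shown here; the capping step itself simply clamps each vocabulary size, i.e. it is equivalent to:

ln_embedding = [min(each, param.max_bucket) for each in ln_embedding]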
Example #4
 def test_wide_deep_model_with_two_feature_columns(self):
     vocab_list = ['alpha', 'beta', 'gamma']
     vocab_val = [0.4, 0.6, 0.9]
     data = np.random.choice(vocab_list, size=256)
     y = np.zeros_like(data, dtype=np.float32)
     for vocab, val in zip(vocab_list, vocab_val):
         indices = np.where(data == vocab)
         y[indices] = val + np.random.uniform(
             low=-0.01, high=0.01, size=indices[0].shape)
     cat_column = fc.categorical_column_with_vocabulary_list(
         key='symbol', vocabulary_list=vocab_list)
     ind_column = fc.indicator_column(cat_column)
     emb_column = fc.embedding_column(cat_column, dimension=5)
     linear_feature_layer = dense_features_v2.DenseFeatures([ind_column])
     linear_model = linear.LinearModel(use_bias=False,
                                       kernel_initializer='zeros')
     combined_linear = sequential.Sequential(
         [linear_feature_layer, linear_model])
     dnn_model = sequential.Sequential([core.Dense(units=1)])
     dnn_feature_layer = dense_features_v2.DenseFeatures([emb_column])
     combined_dnn = sequential.Sequential([dnn_feature_layer, dnn_model])
     wide_deep_model = wide_deep.WideDeepModel(combined_linear,
                                               combined_dnn)
     opt = gradient_descent.SGD(learning_rate=0.1)
     wide_deep_model.compile(opt,
                             'mse', [],
                             run_eagerly=testing_utils.should_run_eagerly(),
                             experimental_run_tf_function=testing_utils.
                             should_run_tf_function())
     wide_deep_model.fit(x={'symbol': data}, y=y, batch_size=32, epochs=10)
     self.assertEqual(3, linear_model.inputs[0].shape[1])
     self.assertEqual(5, dnn_model.inputs[0].shape[1])
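The same wide-and-deep wiring is available publicly as `tf.keras.experimental.WideDeepModel`; a minimal sketch without feature columns (layer sizes are illustrative only):

import tensorflow as tf

linear = tf.keras.experimental.LinearModel()
dnn = tf.keras.Sequential([tf.keras.layers.Dense(16, activation='relu'),
                           tf.keras.layers.Dense(1)])
model = tf.keras.experimental.WideDeepModel(linear, dnn)
model.compile(optimizer='sgd', loss='mse')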
Example #5
def embedding_varlen(batch_size, max_length):
    """Benchmark a variable-length embedding."""
    # Data and constants.
    embedding_size = 32768
    data = fc_bm.create_data(max_length,
                             batch_size * NUM_REPEATS,
                             embedding_size - 1,
                             dtype=int)

    # Keras implementation
    model = keras.Sequential()
    model.add(
        keras.Input(shape=(None, ), ragged=True, name="data", dtype=dt.int64))
    model.add(keras.layers.Embedding(embedding_size, 256))
    model.add(keras.layers.Lambda(lambda x: math_ops.reduce_mean(x, axis=-1)))

    # FC implementation
    fc = fcv2.embedding_column(fcv2.categorical_column_with_identity(
        "data", num_buckets=embedding_size - 1),
                               dimension=256)

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
        fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

    # Benchmark runs
    keras_data = {"data": data}
    k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

    fc_data = {"data": data.to_sparse()}
    fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
Example #6
    def test_from_config(self, trainable, name):
        cols = [
            fc.numeric_column('a'),
            fc.embedding_column(fc.categorical_column_with_vocabulary_list(
                'b', vocabulary_list=['1', '2', '3']),
                                dimension=2),
            fc.indicator_column(
                fc.categorical_column_with_hash_bucket(key='c',
                                                       hash_bucket_size=3))
        ]
        orig_layer = df.DenseFeatures(cols, trainable=trainable, name=name)
        config = orig_layer.get_config()

        new_layer = df.DenseFeatures.from_config(config)

        self.assertEqual(new_layer.name, orig_layer.name)
        self.assertEqual(new_layer.trainable, trainable)
        self.assertLen(new_layer._feature_columns, 3)
        self.assertEqual(new_layer._feature_columns[0].name, 'a')
        self.assertEqual(new_layer._feature_columns[1].initializer.mean, 0.0)
        self.assertEqual(new_layer._feature_columns[1].categorical_column.name,
                         'b')
        self.assertIsInstance(new_layer._feature_columns[0], cols[0].__class__)
        self.assertIsInstance(new_layer._feature_columns[1], cols[1].__class__)
        self.assertIsInstance(new_layer._feature_columns[2], cols[2].__class__)
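The `initializer.mean` assertion works because `fc.embedding_column` defaults to a truncated-normal initializer with mean 0.0 (and stddev 1/sqrt(dimension)), and that default is expected to survive the get_config/from_config round trip.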
Example #7
  def test_sequence_length_with_empty_rows(self):
    """Tests _sequence_length when some examples do not have ids."""
    vocabulary_size = 3
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, ids []
        # example 1, ids [2]
        # example 2, ids [0, 1]
        # example 3, ids []
        # example 4, ids [1]
        # example 5, ids []
        indices=((1, 0), (2, 0), (2, 1), (4, 0)),
        values=(2, 0, 1, 1),
        dense_shape=(6, 2))
    expected_sequence_length = [0, 1, 2, 0, 1, 0]

    categorical_column = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    embedding_column = fc.embedding_column(
        categorical_column, dimension=2)

    _, sequence_length, _ = _get_sequence_dense_tensor_state(
        embedding_column, {'aaa': sparse_input})

    self.assertAllEqual(
        expected_sequence_length, self.evaluate(sequence_length))
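The expected lengths follow directly from the sparse `indices`: an example's sequence length is one more than its largest time index, or 0 when it has no entries at all. A quick check of that rule:

from collections import defaultdict

indices = ((1, 0), (2, 0), (2, 1), (4, 0))
max_time = defaultdict(lambda: -1)
for example, time in indices:
    max_time[example] = max(max_time[example], time)
assert [max_time[i] + 1 for i in range(6)] == [0, 1, 2, 0, 1, 0]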
Example #8
    def test_with_1d_unknown_shape_sparse_tensor(self):
        embedding_values = (
            (1., 2.),  # id 0
            (6., 7.),  # id 1
            (11., 12.)  # id 2
        )

        def _initializer(shape, dtype, partition_info=None):
            del shape, dtype, partition_info
            return embedding_values

        # price has 1 dimension in dense_features
        price = fc.numeric_column('price')

        # one_hot_body_style has 3 dims in dense_features.
        body_style = fc.categorical_column_with_vocabulary_list(
            'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
        one_hot_body_style = fc.indicator_column(body_style)

        # embedded_country has 2 dims in dense_features.
        country = fc.categorical_column_with_vocabulary_list(
            'country', vocabulary_list=['US', 'JP', 'CA'])
        embedded_country = fc.embedding_column(country,
                                               dimension=2,
                                               initializer=_initializer)

        # Provide placeholders whose shapes are unknown at graph-construction time.
        features = {
            'price': array_ops.placeholder(dtypes.float32),
            'body-style': array_ops.sparse_placeholder(dtypes.string),
            # This is dense tensor for the categorical_column.
            'country': array_ops.placeholder(dtypes.string),
        }
        self.assertIsNone(features['price'].shape.ndims)
        self.assertIsNone(features['body-style'].get_shape().ndims)
        self.assertIsNone(features['country'].shape.ndims)

        price_data = np.array([11., 12.])
        body_style_data = sparse_tensor.SparseTensorValue(indices=((0, ),
                                                                   (1, )),
                                                          values=('sedan',
                                                                  'hardtop'),
                                                          dense_shape=(2, ))
        country_data = np.array([['US'], ['CA']])

        net = df.DenseFeatures([price, one_hot_body_style,
                                embedded_country])(features)
        self.assertEqual(1 + 3 + 2, net.shape[1])
        with _initialized_session() as sess:

            # Each row is formed by concatenating `one_hot_body_style`,
            # `embedded_country`, and `price`, in that (alphabetical) order.
            self.assertAllEqual(
                [[0., 0., 1., 1., 2., 11.], [1., 0., 0., 11., 12., 12.]],
                sess.run(net,
                         feed_dict={
                             features['price']: price_data,
                             features['body-style']: body_style_data,
                             features['country']: country_data
                         }))
Example #9
    def test_get_sequence_dense_tensor(self, inputs_args, expected):
        inputs = sparse_tensor.SparseTensorValue(**inputs_args)
        vocabulary_size = 3
        embedding_dimension = 2
        embedding_values = (
            (1., 2.),  # id 0
            (3., 5.),  # id 1
            (7., 11.)  # id 2
        )

        def _initializer(shape, dtype, partition_info=None):
            self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
            self.assertEqual(dtypes.float32, dtype)
            self.assertIsNone(partition_info)
            return embedding_values

        categorical_column = sfc.sequence_categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size)
        embedding_column = fc.embedding_column(categorical_column,
                                               dimension=embedding_dimension,
                                               initializer=_initializer)

        embedding_lookup, _, state_manager = _get_sequence_dense_tensor_state(
            embedding_column, {'aaa': inputs})

        variables = state_manager._layer.weights
        self.evaluate(variables_lib.global_variables_initializer())
        self.assertCountEqual(('embedding_weights:0', ),
                              tuple([v.name for v in variables]))
        self.assertAllEqual(embedding_values, self.evaluate(variables[0]))
        self.assertAllEqual(expected, self.evaluate(embedding_lookup))
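Example #10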
    def _build_feature_columns(self):
        col = fc.categorical_column_with_identity('int_ctx', num_buckets=100)
        ctx_cols = [
            fc.embedding_column(col, dimension=10),
            fc.numeric_column('float_ctx')
        ]

        identity_col = sfc.sequence_categorical_column_with_identity(
            'int_list', num_buckets=10)
        bucket_col = sfc.sequence_categorical_column_with_hash_bucket(
            'bytes_list', hash_bucket_size=100)
        seq_cols = [
            fc.embedding_column(identity_col, dimension=10),
            fc.embedding_column(bucket_col, dimension=20)
        ]

        return ctx_cols, seq_cols
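Example #11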
  def _build_feature_columns(self):
    col = fc.categorical_column_with_identity('int_ctx', num_buckets=100)
    ctx_cols = [
        fc.embedding_column(col, dimension=10),
        fc.numeric_column('float_ctx')
    ]

    identity_col = sfc.sequence_categorical_column_with_identity(
        'int_list', num_buckets=10)
    bucket_col = sfc.sequence_categorical_column_with_hash_bucket(
        'bytes_list', hash_bucket_size=100)
    seq_cols = [
        fc.embedding_column(identity_col, dimension=10),
        fc.embedding_column(bucket_col, dimension=20)
    ]

    return ctx_cols, seq_cols
Example #12
    def test_with_1d_sparse_tensor(self):
        embedding_values = (
            (1., 2., 3., 4., 5.),  # id 0
            (6., 7., 8., 9., 10.),  # id 1
            (11., 12., 13., 14., 15.)  # id 2
        )

        def _initializer(shape, dtype, partition_info=None):
            del shape, dtype, partition_info
            return embedding_values

        # price has 1 dimension in dense_features
        price = fc.numeric_column('price')

        # one_hot_body_style has 3 dims in dense_features.
        body_style = fc.categorical_column_with_vocabulary_list(
            'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
        one_hot_body_style = fc.indicator_column(body_style)

        # embedded_country has 5 dims in dense_features.
        country = fc.categorical_column_with_vocabulary_list(
            'country', vocabulary_list=['US', 'JP', 'CA'])
        embedded_country = fc.embedding_column(country,
                                               dimension=5,
                                               initializer=_initializer)

        with ops.Graph().as_default():
            # Provides 1-dim tensor and dense tensor.
            features = {
                'price':
                constant_op.constant([
                    11.,
                    12.,
                ]),
                'body-style':
                sparse_tensor.SparseTensor(indices=((0, ), (1, )),
                                           values=('sedan', 'hardtop'),
                                           dense_shape=(2, )),
                # This is dense tensor for the categorical_column.
                'country':
                constant_op.constant(['CA', 'US']),
            }
            self.assertEqual(1, features['price'].shape.ndims)
            self.assertEqual(1,
                             features['body-style'].dense_shape.get_shape()[0])
            self.assertEqual(1, features['country'].shape.ndims)

            net = df.DenseFeatures(
                [price, one_hot_body_style, embedded_country])(features)
            self.assertEqual(1 + 3 + 5, net.shape[1])
            with _initialized_session() as sess:

                # Each row is formed by concatenating `one_hot_body_style`,
                # `embedded_country`, and `price`, in that (alphabetical) order.
                self.assertAllEqual(
                    [[0., 0., 1., 11., 12., 13., 14., 15., 11.],
                     [1., 0., 0., 1., 2., 3., 4., 5., 12.]], sess.run(net))
Example #13
    def test_dense_feature_with_partitioner(self):
        with context.eager_mode():
            sparse_input = sparse_tensor.SparseTensor(indices=((0, 0), (1, 0),
                                                               (2, 0), (3, 0)),
                                                      values=(0, 1, 3, 2),
                                                      dense_shape=(4, 4))

            # Create feature columns (categorical and embedding).
            categorical_column = fc.categorical_column_with_identity(
                key='a', num_buckets=4)
            embedding_dimension = 2

            def _embedding_column_initializer(shape,
                                              dtype,
                                              partition_info=None):
                offset = partition_info._var_offset[0]
                del shape  # unused
                del dtype  # unused
                if offset == 0:
                    embedding_values = (
                        (1, 0),  # id 0
                        (0, 1))  # id 1
                else:
                    embedding_values = (
                        (1, 1),  # id 2
                        (2, 2))  # id 3
                return embedding_values

            embedding_column = fc.embedding_column(
                categorical_column,
                dimension=embedding_dimension,
                initializer=_embedding_column_initializer)

            dense_features = df.DenseFeatures(
                [embedding_column],
                partitioner=partitioned_variables.fixed_size_partitioner(2))
            features = {'a': sparse_input}

            inputs = dense_features(features)
            variables = dense_features.variables

            # Sanity check: test that the inputs are correct.
            self.assertAllEqual([[1, 0], [0, 1], [2, 2], [1, 1]], inputs)

            # Check that the embedding variable was partitioned into two parts.
            self.assertEqual(2, len(variables))

            # Check that invoking dense_features on the same features does not create
            # additional variables
            _ = dense_features(features)
            self.assertEqual(2, len(variables))
            self.assertIs(variables[0], dense_features.variables[0])
            self.assertIs(variables[1], dense_features.variables[1])
Example #14
def _replace_edl_embedding_column_with_tf(dense_features_layer):
    new_feature_columns = []
    for column in dense_features_layer._feature_columns:
        if isinstance(column, EmbeddingColumn):
            logger.info("Replace embedding_column {} from ElasticDL "
                        "version to TF version".format(column.name))
            new_column = fc_lib.embedding_column(column.categorical_column,
                                                 dimension=column.dimension)
            new_feature_columns.append(new_column)
        else:
            new_feature_columns.append(column)

    return tf.keras.layers.DenseFeatures(feature_columns=new_feature_columns,
                                         name=dense_features_layer.name)
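Note that the swap preserves only the column definition: the returned `tf.keras.layers.DenseFeatures` layer will presumably build a fresh TF-native embedding table for each replaced column rather than carrying over ElasticDL's trained weights.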
Example #15
    def test_sequence_length(self, inputs_args, expected_sequence_length):
        inputs = sparse_tensor.SparseTensorValue(**inputs_args)
        vocabulary_size = 3

        categorical_column = sfc.sequence_categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size)
        embedding_column = fc.embedding_column(categorical_column, dimension=2)

        _, sequence_length, _ = _get_sequence_dense_tensor_state(
            embedding_column, {'aaa': inputs})

        sequence_length = self.evaluate(sequence_length)
        self.assertAllEqual(expected_sequence_length, sequence_length)
        self.assertEqual(np.int64, sequence_length.dtype)
Example #16
  def test_get_config(self, trainable, name):
    cols = [fc.numeric_column('a'),
            fc.embedding_column(fc.categorical_column_with_identity(
                key='b', num_buckets=3), dimension=2)]
    orig_layer = fc.DenseFeatures(cols, trainable=trainable, name=name)
    config = orig_layer.get_config()

    self.assertEqual(config['name'], orig_layer.name)
    self.assertEqual(config['trainable'], trainable)
    self.assertLen(config['feature_columns'], 2)
    self.assertEqual(
        config['feature_columns'][0]['class_name'], 'NumericColumn')
    self.assertEqual(config['feature_columns'][0]['config']['shape'], (1,))
    self.assertEqual(
        config['feature_columns'][1]['class_name'], 'EmbeddingColumn')
Example #17
    def test_feature_column_dense_features_gradient(self):
        with context.eager_mode():
            sparse_input = sparse_tensor.SparseTensor(indices=((0, 0), (1, 0),
                                                               (2, 0)),
                                                      values=(0, 1, 2),
                                                      dense_shape=(3, 3))

            # Create feature columns (categorical and embedding).
            categorical_column = fc.categorical_column_with_identity(
                key='a', num_buckets=3)
            embedding_dimension = 2

            def _embedding_column_initializer(shape,
                                              dtype,
                                              partition_info=None):
                del shape  # unused
                del dtype  # unused
                del partition_info  # unused
                embedding_values = (
                    (1, 0),  # id 0
                    (0, 1),  # id 1
                    (1, 1))  # id 2
                return embedding_values

            embedding_column = fc.embedding_column(
                categorical_column,
                dimension=embedding_dimension,
                initializer=_embedding_column_initializer)

            dense_features = df.DenseFeatures([embedding_column])
            features = {'a': sparse_input}

            def scale_matrix():
                matrix = dense_features(features)
                return 2 * matrix

            # Sanity check: Verify that scale_matrix returns the correct output.
            self.assertAllEqual([[2, 0], [0, 2], [2, 2]], scale_matrix())

            # Check that the returned gradient is correct.
            grad_function = backprop.implicit_grad(scale_matrix)
            grads_and_vars = grad_function()
            indexed_slice = grads_and_vars[0][0]
            gradient = indexed_slice.values

            self.assertAllEqual([0, 1, 2], indexed_slice.indices)
            self.assertAllEqual([[2, 2], [2, 2], [2, 2]], gradient)
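`backprop.implicit_grad` is an internal eager API; with public TF 2.x symbols the same check could be sketched with a gradient tape (reusing `dense_features` and `features` from the test above):

import tensorflow as tf

with tf.GradientTape() as tape:
    out = 2 * dense_features(features)
grads = tape.gradient(out, dense_features.variables)
# grads[0] is a tf.IndexedSlices covering the embedding rows that were looked up.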
Example #18
    def test_reuses_variables(self):
        with context.eager_mode():
            sparse_input = sparse_tensor.SparseTensor(indices=((0, 0), (1, 0),
                                                               (2, 0)),
                                                      values=(0, 1, 2),
                                                      dense_shape=(3, 3))

            # Create feature columns (categorical and embedding).
            categorical_column = fc.categorical_column_with_identity(
                key='a', num_buckets=3)
            embedding_dimension = 2

            def _embedding_column_initializer(shape,
                                              dtype,
                                              partition_info=None):
                del shape  # unused
                del dtype  # unused
                del partition_info  # unused
                embedding_values = (
                    (1, 0),  # id 0
                    (0, 1),  # id 1
                    (1, 1))  # id 2
                return embedding_values

            embedding_column = fc.embedding_column(
                categorical_column,
                dimension=embedding_dimension,
                initializer=_embedding_column_initializer)

            dense_features = df.DenseFeatures([embedding_column])
            features = {'a': sparse_input}

            inputs = dense_features(features)
            variables = dense_features.variables

            # Sanity check: test that the inputs are correct.
            self.assertAllEqual([[1, 0], [0, 1], [1, 1]], inputs)

            # Check that only one variable was created.
            self.assertEqual(1, len(variables))

            # Check that invoking dense_features on the same features does not create
            # additional variables
            _ = dense_features(features)
            self.assertEqual(1, len(variables))
            self.assertEqual(variables[0], dense_features.variables[0])
Example #19
def embedding_varlen(batch_size, max_length):
    """Benchmark a variable-length embedding."""
    # Data and constants.
    embedding_size = 32768
    data = fc_bm.create_data(max_length,
                             batch_size * NUM_REPEATS,
                             embedding_size - 1,
                             dtype=int)
    weight = array_ops.ones_like_v2(data, dtype=dt.float32)

    # Keras implementation
    data_input = keras.Input(shape=(None, ),
                             ragged=True,
                             name="data",
                             dtype=dt.int64)
    weight_input = keras.Input(shape=(None, ),
                               ragged=True,
                               name="weight",
                               dtype=dt.float32)
    embedded_data = keras.layers.Embedding(embedding_size, 256)(data_input)
    weighted_embedding = math_ops.multiply(
        embedded_data, array_ops.expand_dims(weight_input, -1))
    reduced_embedding = math_ops.reduce_sum(weighted_embedding, axis=1)
    model = keras.Model([data_input, weight_input], reduced_embedding)

    # FC implementation
    fc = fcv2.embedding_column(fcv2.weighted_categorical_column(
        fcv2.categorical_column_with_identity("data",
                                              num_buckets=embedding_size - 1),
        weight_feature_key="weight"),
                               dimension=256)

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
        fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

    # Benchmark runs
    keras_data = {"data": data, "weight": weight}
    k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

    fc_data = {"data": data.to_sparse(), "weight": weight.to_sparse()}
    fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
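Note that the two paths are not numerically identical: the Keras graph computes a weighted sum (`multiply` followed by `reduce_sum`), while `embedding_column` over a `weighted_categorical_column` uses its default combiner 'mean', i.e. a weighted average. For a throughput benchmark the difference is immaterial.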
Example #20
    def DISABLED_test_train_with_dense_features_v2(self):
        feature_dict = {
            'sex': np.int64([1, 1, 1, 1, 0]),
            'cp': np.int64([0, 3, 3, 2, 1]),
            'slope': np.int64([3, 2, 0, 3, 1]),
        }
        label = np.int64([0, 1, 0, 0, 0])
        train_input_fn = numpy_io.numpy_input_fn(x=feature_dict,
                                                 y=label,
                                                 num_epochs=1,
                                                 shuffle=False)
        feature_columns = list()
        input_features = dict()
        for feature_name, data_array in feature_dict.items():
            feature_columns.append(
                feature_column.embedding_column(
                    feature_column.categorical_column_with_identity(
                        key=feature_name,
                        num_buckets=np.size(np.unique(data_array))),
                    dimension=3))
            input_features[feature_name] = keras.layers.Input(
                name=feature_name,
                shape=(np.size(np.unique(data_array)), ),
                dtype=dtypes.int64)

        df = dense_features_v2.DenseFeatures(feature_columns)
        x = df(input_features)
        x = keras.layers.Dense(16, activation='relu')(x)
        logits = keras.layers.Dense(1, activation='linear')(x)
        model = keras.Model(inputs=input_features, outputs=logits)

        model.compile(optimizer='rmsprop',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        estimator_model = keras_lib.model_to_estimator(keras_model=model)
        estimator_model.train(input_fn=train_input_fn, steps=5)
        # We assert that we find the embedding_weights variables in the dependencies
        # for the DenseFeatures layer.
        dependency_names = [x.name for x in df._checkpoint_dependencies]
        self.assertNotIn('embedding_weights', dependency_names)
        self.assertIn('cp_embedding/embedding_weights', dependency_names)
        self.assertIn('sex_embedding/embedding_weights', dependency_names)
        self.assertIn('slope_embedding/embedding_weights', dependency_names)
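Example #21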
    def test_embedding_column_with_non_sequence_categorical(self):
        """Tests that error is raised for non-sequence embedding column."""
        vocabulary_size = 3
        sparse_input = sparse_tensor.SparseTensorValue(
            # example 0, ids [2]
            # example 1, ids [0, 1]
            indices=((0, 0), (1, 0), (1, 1)),
            values=(2, 0, 1),
            dense_shape=(2, 2))

        categorical_column_a = fc.categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size)
        embedding_column_a = fc.embedding_column(categorical_column_a,
                                                 dimension=2)
        sequence_input_layer = ksfc.SequenceFeatures([embedding_column_a])
        with self.assertRaisesRegex(
                ValueError,
                r'In embedding_column: aaa_embedding\. categorical_column must be of '
                r'type SequenceCategoricalColumn to use SequenceFeatures\.'):
            _, _ = sequence_input_layer({'aaa': sparse_input})
Example #22
    def testFeatureColumns(self):
        # TODO(b/120099662): Error with table initialization with Keras models in
        # graph mode.
        if context.executing_eagerly():
            numeric = fc.numeric_column('a')
            bucketized = fc.bucketized_column(numeric, boundaries=[5, 10, 15])
            cat_vocab = fc.categorical_column_with_vocabulary_list(
                'b', ['1', '2', '3'])
            one_hot = fc.indicator_column(cat_vocab)
            embedding = fc.embedding_column(cat_vocab, dimension=8)
            feature_layer = DenseFeatures([bucketized, one_hot, embedding])
            model = keras.models.Sequential([feature_layer])

            features = {'a': np.array([13, 15]), 'b': np.array(['1', '2'])}
            predictions = model.predict(features)

            saved_model_dir = self._save_model_dir()
            model.save(saved_model_dir, save_format='tf')
            loaded = keras_load.load(saved_model_dir)
            loaded_predictions = loaded.predict(features)
            self.assertAllClose(predictions, loaded_predictions)
Example #23
    def criteo_get_fc():
        ln_embedding = [
            1461, 584, 10131227, 2202608, 306, 24, 12518, 634, 4, 93146, 5684,
            8351593, 3195, 28, 14993, 5461306, 11, 5653, 2173, 4, 7046547, 18,
            16, 286181, 105, 142572
        ]

        param.max_bucket = find_bucket_size(ln_embedding)
        print("TensorFlow, max_bucket_size = ", param.max_bucket)
        if param.max_bucket is not None:
            ln_embedding = [
                each if each < param.max_bucket else param.max_bucket
                for each in ln_embedding
            ]
        dense = []
        for i in range(13):
            dense.append(
                fc.numeric_column("I{}".format(i),
                                  dtype=tf.int64,
                                  default_value=0))

        dnn_feature_columns = []
        linear_feature_columns = []

        dnn_feature_columns += dense
        linear_feature_columns += dense

        sparse_emb = []
        for i in range(26):
            ids = fc.categorical_column_with_hash_bucket("C{}".format(i),
                                                         ln_embedding[i],
                                                         dtype=tf.int64)
            linear_feature_columns.append(ids)
            sparse_emb.append(fc.embedding_column(ids, param.embedding_size))
        dnn_feature_columns += sparse_emb
        return dnn_feature_columns, linear_feature_columns
Example #24
  def test_multiple_layers_with_same_embedding_column(self):
    some_sparse_column = fc.categorical_column_with_hash_bucket(
        'sparse_feature', hash_bucket_size=5)
    some_embedding_column = fc.embedding_column(
        some_sparse_column, dimension=10)

    with ops.Graph().as_default():
      features = {
          'sparse_feature': [['a'], ['x']],
      }
      all_cols = [some_embedding_column]
      df.DenseFeatures(all_cols)(features)
      df.DenseFeatures(all_cols)(features)
      # Make sure that 2 variables get created in this case.
      self.assertEqual(2,
                       len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
      expected_var_names = [
          'dense_features/sparse_feature_embedding/embedding_weights:0',
          'dense_features_1/sparse_feature_embedding/embedding_weights:0'
      ]
      self.assertCountEqual(
          expected_var_names,
          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
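Each `DenseFeatures` layer owns its own embedding table, which is why two variables appear even though the column object is shared. If sharing the weights is actually the goal, `tf.feature_column.shared_embeddings` (TF2; `shared_embedding_columns` in TF1) is the intended mechanism, sketched here:

import tensorflow as tf

a = tf.feature_column.categorical_column_with_hash_bucket('a', hash_bucket_size=5)
b = tf.feature_column.categorical_column_with_hash_bucket('b', hash_bucket_size=5)
shared_a, shared_b = tf.feature_column.shared_embeddings([a, b], dimension=10)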
Example #25
    def avazu_get_fc():
        # nr samples 32747463
        ln_embedding = [
            40428967, 7, 7, 4737, 7745, 26, 8552, 559, 36, 2686408, 6729486,
            8251, 5, 4, 2626, 8, 9, 435, 4, 68, 172, 60
        ]
        param.max_bucket = find_bucket_size(ln_embedding)
        print("TensorFlow, max_bucket_size = ", param.max_bucket)
        if param.max_bucket is not None:
            ln_embedding = [
                each if each < param.max_bucket else param.max_bucket
                for each in ln_embedding
            ]

        dnn_feature_columns = []
        linear_feature_columns = []

        sparse_features = ['id', 'C1', 'banner_pos', 'site_id', 'site_domain',
                           'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
                           'device_ip', 'device_model', 'device_type', 'device_conn_type', ] \
            + ['C' + str(i) for i in range(14, 22)]

        sparse_emb = []
        for i in range(len(sparse_features)):
            ids = fc.categorical_column_with_hash_bucket(sparse_features[i],
                                                         ln_embedding[i],
                                                         dtype=tf.string)
            linear_feature_columns.append(ids)
            sparse_emb.append(fc.embedding_column(ids, param.embedding_size))

        dnn_feature_columns += sparse_emb

        return dnn_feature_columns, linear_feature_columns
Example #26
def make_feature_config(num_players):
    return FeatureConfig(
        context_features=[
            fc.numeric_column(
                "public_context__starting_stack_sizes",
                shape=num_players,
                dtype=tf.int64,
            ),
            fc.embedding_column(
                tf.feature_column.categorical_column_with_vocabulary_list(
                    "private_context__hand_encoded", range(1326)),
                dimension=4,
            ),
        ],
        sequence_features=[
            fc.indicator_column(
                sfc.sequence_categorical_column_with_identity(
                    "last_action__action_encoded", 22)),
            fc.indicator_column(
                sfc.sequence_categorical_column_with_identity(
                    "last_action__move", 5)),
            sfc.sequence_numeric_column(
                "last_action__amount_added",
                dtype=tf.int64,
                default_value=-1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "last_action__amount_added_percent_of_remaining",
                dtype=tf.float32,
                default_value=-1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "last_action__amount_raised",
                dtype=tf.int64,
                default_value=-1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "last_action__amount_raised_percent_of_pot",
                dtype=tf.float32,
                default_value=-1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "public_state__all_in_player_mask",
                dtype=tf.int64,
                default_value=-1,
                shape=num_players,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "public_state__stack_sizes",
                dtype=tf.int64,
                default_value=-1,
                shape=num_players,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "public_state__amount_to_call",
                dtype=tf.int64,
                default_value=-1,
                shape=num_players,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "public_state__current_player_mask",
                dtype=tf.int64,
                default_value=-1,
                shape=num_players,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "public_state__min_raise_amount",
                dtype=tf.int64,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "public_state__pot_size",
                dtype=tf.int64,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "public_state__street",
                dtype=tf.int64,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__is_current_player",
                dtype=tf.int64,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__current_player_offset",
                dtype=tf.int64,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            fc.indicator_column(
                sfc.sequence_categorical_column_with_identity(
                    "player_state__current_hand_type", 9)),
            sfc.sequence_numeric_column(
                "player_state__win_odds",
                dtype=tf.float32,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__win_odds_vs_better",
                dtype=tf.float32,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__win_odds_vs_tied",
                dtype=tf.float32,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__win_odds_vs_worse",
                dtype=tf.float32,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__frac_better_hands",
                dtype=tf.float32,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__frac_tied_hands",
                dtype=tf.float32,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__frac_worse_hands",
                dtype=tf.float32,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
        ],
        context_targets=[
            fc.numeric_column("public_context__num_players",
                              shape=1,
                              dtype=tf.int64),
        ],
        sequence_targets=[
            sfc.sequence_numeric_column("next_action__action_encoded",
                                        dtype=tf.int64,
                                        default_value=-1),
            sfc.sequence_numeric_column("reward__cumulative_reward",
                                        dtype=tf.int64,
                                        default_value=-1),
            sfc.sequence_numeric_column("public_state__pot_size",
                                        dtype=tf.int64,
                                        default_value=-1),
            sfc.sequence_numeric_column("player_state__is_current_player",
                                        dtype=tf.int64,
                                        default_value=-1),
            sfc.sequence_numeric_column("public_state__num_players_remaining",
                                        dtype=tf.int64,
                                        default_value=-1),
        ],
    )
Example #27
    def test_dense_features(self, use_safe_embedding_lookup,
                            partition_variables):
        # Inputs.
        vocabulary_size = 4
        sparse_input = sparse_tensor.SparseTensorValue(
            # example 0, ids [2]
            # example 1, ids [0, 1]
            # example 2, ids []
            # example 3, ids [1]
            indices=((0, 0), (1, 0), (1, 4), (3, 0)),
            values=(2, 0, 1, 1),
            dense_shape=(4, 5))

        # Embedding variable.
        embedding_dimension = 2
        embedding_values = (
            (1., 2.),  # id 0
            (3., 5.),  # id 1
            (7., 11.),  # id 2
            (9., 13.)  # id 3
        )

        def _initializer(shape, dtype, partition_info=None):
            if partition_variables:
                self.assertEqual([vocabulary_size, embedding_dimension],
                                 partition_info.full_shape)
                self.assertAllEqual((2, embedding_dimension), shape)
            else:
                self.assertAllEqual((vocabulary_size, embedding_dimension),
                                    shape)
                self.assertIsNone(partition_info)

            self.assertEqual(dtypes.float32, dtype)
            return embedding_values

        # Expected lookup result, using combiner='mean'.
        expected_lookups = (
            # example 0, ids [2], embedding = [7, 11]
            (7., 11.),
            # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
            (2., 3.5),
            # example 2, ids [], embedding = [0, 0]
            (0., 0.),
            # example 3, ids [1], embedding = [3, 5]
            (3., 5.),
        )

        # Build columns.
        categorical_column = fc.categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size)
        partitioner = None
        if partition_variables:
            partitioner = partitioned_variables.fixed_size_partitioner(2,
                                                                       axis=0)
        with variable_scope.variable_scope('vars', partitioner=partitioner):
            embedding_column = fc.embedding_column(
                categorical_column,
                dimension=embedding_dimension,
                initializer=_initializer,
                use_safe_embedding_lookup=use_safe_embedding_lookup)

            # Provide sparse input and get dense result.
            l = df.DenseFeatures((embedding_column, ))
            dense_features = l({'aaa': sparse_input})

        # Assert expected embedding variable and lookups.
        global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
        if partition_variables:
            self.assertCountEqual((
                'vars/dense_features/aaa_embedding/embedding_weights/part_0:0',
                'vars/dense_features/aaa_embedding/embedding_weights/part_1:0'
            ), tuple([v.name for v in global_vars]))
        else:
            self.assertCountEqual(
                ('vars/dense_features/aaa_embedding/embedding_weights:0', ),
                tuple([v.name for v in global_vars]))
        for v in global_vars:
            self.assertIsInstance(v, variables_lib.Variable)
        trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
        if partition_variables:
            self.assertCountEqual((
                'vars/dense_features/aaa_embedding/embedding_weights/part_0:0',
                'vars/dense_features/aaa_embedding/embedding_weights/part_1:0'
            ), tuple([v.name for v in trainable_vars]))
        else:
            self.assertCountEqual(
                ('vars/dense_features/aaa_embedding/embedding_weights:0', ),
                tuple([v.name for v in trainable_vars]))

        self.evaluate(variables_lib.global_variables_initializer())
        self.evaluate(lookup_ops.tables_initializer())

        self.assertAllEqual(embedding_values, self.evaluate(trainable_vars[0]))
        self.assertAllEqual(expected_lookups, self.evaluate(dense_features))

        if use_safe_embedding_lookup:
            self.assertIn(
                'SparseFillEmptyRows',
                [x.type for x in ops.get_default_graph().get_operations()])
        else:
            self.assertNotIn(
                'SparseFillEmptyRows',
                [x.type for x in ops.get_default_graph().get_operations()])
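For context: `use_safe_embedding_lookup` routes the lookup through `safe_embedding_lookup_sparse`, which first fills empty rows (hence the `SparseFillEmptyRows` op the final assertions probe for) so that examples with no ids come back as all-zero embeddings.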
Example #28
    def test_dense_features_not_trainable(self):
        # Inputs.
        vocabulary_size = 3
        sparse_input = sparse_tensor.SparseTensorValue(
            # example 0, ids [2]
            # example 1, ids [0, 1]
            # example 2, ids []
            # example 3, ids [1]
            indices=((0, 0), (1, 0), (1, 4), (3, 0)),
            values=(2, 0, 1, 1),
            dense_shape=(4, 5))

        # Embedding variable.
        embedding_dimension = 2
        embedding_values = (
            (1., 2.),  # id 0
            (3., 5.),  # id 1
            (7., 11.)  # id 2
        )

        def _initializer(shape, dtype, partition_info=None):
            self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
            self.assertEqual(dtypes.float32, dtype)
            self.assertIsNone(partition_info)
            return embedding_values

        # Expected lookup result, using combiner='mean'.
        expected_lookups = (
            # example 0, ids [2], embedding = [7, 11]
            (7., 11.),
            # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
            (2., 3.5),
            # example 2, ids [], embedding = [0, 0]
            (0., 0.),
            # example 3, ids [1], embedding = [3, 5]
            (3., 5.),
        )

        # Build columns.
        categorical_column = fc.categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size)
        embedding_column = fc.embedding_column(categorical_column,
                                               dimension=embedding_dimension,
                                               initializer=_initializer,
                                               trainable=False)

        # Provide sparse input and get dense result.
        dense_features = df.DenseFeatures((embedding_column, ))({
            'aaa':
            sparse_input
        })

        # Assert expected embedding variable and lookups.
        global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
        self.assertCountEqual(
            ('dense_features/aaa_embedding/embedding_weights:0', ),
            tuple([v.name for v in global_vars]))
        self.assertCountEqual([],
                              ops.get_collection(
                                  ops.GraphKeys.TRAINABLE_VARIABLES))

        self.evaluate(variables_lib.global_variables_initializer())
        self.evaluate(lookup_ops.tables_initializer())

        self.assertAllEqual(embedding_values, self.evaluate(global_vars[0]))
        self.assertAllEqual(expected_lookups, self.evaluate(dense_features))