Example #1
import tensorflow as tf


def process_wide(input_value, wide_para, name="wide_process"):
    '''
    Process wide features: split each string-list column, map it into its
    own one-hot id space, then append a hashed cross of all columns.
    wide_para is a list of (depth, col_type) pairs, one per input column.
    '''
    with tf.name_scope(name):
        wide_list = []
        cross_list = []
        fix_add_value = 0  # running id offset so concatenated columns share one id space
        for i, para in enumerate(wide_para):  # avoid shadowing the wide_para argument
            depth, col_type = para

            # Every column takes the string-list path here; a plain string
            # column could instead be hashed with tf.string_to_hash_bucket_fast
            # and one-hot encoded with tf.one_hot.
            sparse_value = _process_list_column(input_value[:, i], depth)

            # Shift this column's ids past the id spaces of the previous columns.
            fix_sparse_value = tf.SparseTensor(
                indices=sparse_value.indices,
                values=sparse_value.values + fix_add_value,
                dense_shape=sparse_value.dense_shape)

            wide_list.append(fix_sparse_value)
            cross_list.append(sparse_value)

            fix_add_value += depth

        # Cross all columns and hash the result into a fixed bucket range,
        # then shift the crossed ids past all the single-column id spaces.
        cross_sparse = _sparse_cross_hashed(cross_list, num_buckets=3000000)
        fix_cross_sparse = tf.SparseTensor(
            indices=cross_sparse.indices,
            values=cross_sparse.values + fix_add_value,
            dense_shape=cross_sparse.dense_shape)
        wide_list.append(fix_cross_sparse)

        # Concatenate the per-column id spaces and the crossed column; the
        # result can be densified with tf.sparse_to_indicator if an indicator
        # matrix is needed.
        wide_sparse = tf.sparse_concat(sp_inputs=wide_list, axis=1)
        return wide_sparse
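Neither `_process_list_column` nor `_sparse_cross_hashed` is shown in this example. The sketch below is one way to wire it up: `_process_list_column` is a hypothetical helper that splits comma-joined cells and hashes each token, and `_sparse_cross_hashed` is assumed to come from TensorFlow 1.x's internal tensorflow.python.ops.sparse_ops module (the same module the tests below exercise).

import tensorflow as tf
from tensorflow.python.ops.sparse_ops import _sparse_cross_hashed  # TF 1.x internal

def _process_list_column(column, depth):
    # Hypothetical helper: split "a,b,c" style cells into tokens and hash
    # each token into an id in [0, depth).
    tokens = tf.string_split(column, delimiter=",")
    ids = tf.string_to_hash_bucket_fast(tokens.values, depth)
    return tf.SparseTensor(tokens.indices, ids, tokens.dense_shape)

wide_para = [(1000, "stringList"), (500, "stringList")]
input_value = tf.placeholder(tf.string, shape=[None, len(wide_para)])
wide_sparse = process_wide(input_value, wide_para)

with tf.Session() as sess:
    result = sess.run(wide_sparse, feed_dict={
        input_value: [["red,blue", "s,m"], ["green", "l"]]})
    print(result)  # SparseTensorValue: per-column offset ids plus crossed ids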
# The following tests exercise _sparse_cross_hashed. They are methods of a
# tf.test.TestCase subclass; the _sparse_tensor and
# _assert_sparse_tensor_equals helpers are provided by that class.
import numpy

from tensorflow.python.client import session
from tensorflow.python.framework import constant_op
from tensorflow.python.ops import sparse_ops


def test_hashed_zero_bucket_no_hash_key(self):
    op = sparse_ops._sparse_cross_hashed([
        self._sparse_tensor([['batch1-FC1-F1']]),
        self._sparse_tensor([['batch1-FC2-F1']]),
        self._sparse_tensor([['batch1-FC3-F1']])
    ])
    # Check the actual hashed output to catch unintentional hashing changes.
    expected_out = self._sparse_tensor([[1971693436396284976]])
    with self.test_session() as sess:
        self._assert_sparse_tensor_equals(expected_out, sess.run(op))

def test_hashed_zero_bucket(self):
    op = sparse_ops._sparse_cross_hashed(
        [
            self._sparse_tensor([['batch1-FC1-F1']]),
            self._sparse_tensor([['batch1-FC2-F1']]),
            self._sparse_tensor([['batch1-FC3-F1']])
        ],
        hash_key=sparse_ops._DEFAULT_HASH_KEY + 1)
    # Check the actual hashed output to catch unintentional hashing changes.
    expected_out = self._sparse_tensor([[4847552627144134031]])
    with self.test_session() as sess:
        self._assert_sparse_tensor_equals(expected_out, sess.run(op))

def test_hashed_has_no_collision(self):
    """Tests that fingerprint concatenation has no collisions."""
    # Even though the last 10 bits of 359 and 359 + 1024 are identical,
    # none of the resulting crosses should collide.
    t1 = constant_op.constant([[359], [359 + 1024]])
    t2 = constant_op.constant([list(range(10)), list(range(10))])
    cross = sparse_ops._sparse_cross_hashed(
        [t2, t1],
        num_buckets=1024,
        hash_key=sparse_ops._DEFAULT_HASH_KEY + 1)
    cross_dense = sparse_ops.sparse_tensor_to_dense(cross)
    with session.Session():
        values = cross_dense.eval()
        self.assertTrue(numpy.not_equal(values[0], values[1]).all())

def test_hashed_3x1x2(self):
    """Tests a 3x1x2 permutation with hashed output."""
    op = sparse_ops._sparse_cross_hashed(
        [
            self._sparse_tensor(
                [['batch1-FC1-F1', 'batch1-FC1-F2', 'batch1-FC1-F3']]),
            self._sparse_tensor([['batch1-FC2-F1']]),
            self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']])
        ],
        num_buckets=1000)
    with self.test_session() as sess:
        out = sess.run(op)
        self.assertEqual(6, len(out.values))
        self.assertAllEqual([[0, i] for i in range(6)], out.indices)
        self.assertTrue(all(0 <= x < 1000 for x in out.values))
        all_values_are_different = len(out.values) == len(set(out.values))
        self.assertTrue(all_values_are_different)
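These tests rely on a `_sparse_tensor` helper from the enclosing test class. A plausible reconstruction, assuming it turns a list of row lists into a constant SparseTensor (string rows become string values, int rows become int64):

from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import sparse_tensor

def _sparse_tensor(self, data, batch_size=-1):
    # Collect (row, col) -> value triples from the nested lists.
    indices, values = [], []
    max_col_count = 0
    for row_ix, row in enumerate(data):
        for col_ix, value in enumerate(row):
            indices.append([row_ix, col_ix])
            values.append(value)
            max_col_count = max(max_col_count, col_ix + 1)
    shape = [batch_size if batch_size != -1 else len(data), max_col_count]
    value_type = dtypes.string if values and isinstance(values[0], str) else dtypes.int64
    return sparse_tensor.SparseTensor(
        constant_op.constant(indices, dtypes.int64, [len(indices), 2]),
        constant_op.constant(values, value_type, [len(values)]),
        constant_op.constant(shape, dtypes.int64))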
Example #6
    def build_columns(self, items, emb_dict):
        wide_columns = []
        deep_columns = []
        print("===")
        print(self.FEATURE_CONF)
        # input assignment
        for fea in self.PARSED_COLUMNS:
            fea_config = self._check_config(self.FEATURE_CONF, fea.split("$")[0])  # if fea is real vec, split by '$'
            if fea_config is None:
                print("[build_estimator] incorrect input %s: no feature_conf." % (str(fea)))
                continue
            feature_type = self._check_config(fea_config, 'feature_type', legal_list=['sparse', 'multi_sparse', 'real'])
            model_type = self._check_config(fea_config, 'model_type', legal_list=['wide', 'deep'])
            layer = None
            # assign input type.

            if model_type == 'wide':
                if feature_type == 'sparse':
                    # wide features must use hash buckets
                    feature_sparse = self._check_config(fea_config, 'feature_sparse', legal_list=['hash'], default='hash')
                    bucket_size = self._check_config(fea_config, 'bucket_size', default=1024)
                    print("[build_estimator] add sparse_column_with_hash_bucket, fea = %s, hash_bucket_size = %d" % (
                        str(fea), bucket_size))
                    _, onehot_emb = self.get_onehot(items[fea], bucket_size)
                    # Note: wide_columns is later fed to _sparse_cross_hashed,
                    # which accepts only string/int64 inputs, so this float
                    # one-hot row may need to be swapped for the hashed id
                    # that get_onehot also returns.
                    wide_columns.append(
                        tf.reshape(onehot_emb, [1, bucket_size])
                    )

                elif feature_type == 'multi_sparse':
                    # wide features must use hash buckets
                    feature_sparse = self._check_config(fea_config, 'feature_sparse', legal_list=['hash'], default='hash')
                    bucket_size = self._check_config(fea_config, 'bucket_size', default=1024)
                    print("[build_estimator] add multi_sparse_column_with_hash_bucket, fea = %s, hash_bucket_size = %d" % (
                        str(fea), bucket_size))
                    wide_columns.append(
                        self._process_list_column(tf.reshape(items[fea], [-1]), bucket_size)
                    )
                else:
                    raise ValueError("wide build column error: unsupported feature_type %s" % str(feature_type))

            elif model_type == 'deep':

                model_feed_type = self._check_config(fea_config, 'model_feed_type', legal_list=['embedding', 'onehot', 'real'],
                                                default='onehot')
                bucket_size = self._check_config(fea_config, 'bucket_size', default=1024)

                if model_feed_type == 'embedding':
                    dimension = self._check_config(fea_config, 'dimension', default=32)
                    print("[build_estimator] add embedding_column, fea = %s, hash_bucket_size = %d, dimension = %d" % (
                        str(fea), bucket_size, dimension))
                    onehot_value, onehot_emb = self.get_onehot(items[fea], bucket_size)
                    lookup_emb = tf.nn.embedding_lookup(emb_dict[fea], onehot_value)
                    deep_columns.append(
                        tf.reshape(lookup_emb, [1, dimension])
                    )

                elif model_feed_type == 'onehot':
                    print("[build_estimator] add one_hot_column, fea = %s, hash_bucket_size = %d" % (str(fea), bucket_size))
                    onehot_value, onehot_emb = self.get_onehot(items[fea], bucket_size)
                    deep_columns.append(
                        tf.reshape(onehot_emb, [1, bucket_size])
                    )

                elif model_feed_type == 'real':
                    print("[build_estimator] add real_valued_column, fea = %s" % (str(fea)))
                    deep_columns.append(
                        tf.reshape(items[fea], [1, 1])
                    )

                else:
                    print("[build_estimator] incorrect input %s: illegal model_feed_type." % (str(fea)))

            else:
                print("[build_estimator] incorrect input %s: illegal model_type" % (str(fea)))

        # Cross all wide columns and hash them into a fixed bucket range.
        cross_sparse = _sparse_cross_hashed(wide_columns, num_buckets=3000000)
        wide_line = cross_sparse

        deep_line = tf.concat(deep_columns, axis=1)
        deep_line = tf.reshape(deep_line, shape=[-1])

        # Report how many columns were registered for each model type.
        print("[build_estimator] wide columns: %d" % (len(wide_columns)))
        print("[build_estimator] deep columns: %d" % (len(deep_columns)))

        # wide_line (the crossed wide columns) is built but not returned;
        # switch to `return wide_line, deep_line` if the caller needs both.
        return deep_line
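`_check_config`, `get_onehot`, and `_process_list_column` are defined elsewhere in this class. A minimal sketch of `get_onehot`, consistent with how it is called above and with the hash-then-one-hot pattern from Example #1 (the exact hashing scheme is an assumption):

import tensorflow as tf

def get_onehot(self, value, bucket_size):
    # Assumed behavior: hash the raw string feature into an id in
    # [0, bucket_size), then expand it to a dense one-hot row.
    # build_columns uses the id for tf.nn.embedding_lookup and the
    # one-hot row as a wide/deep input.
    onehot_value = tf.string_to_hash_bucket_fast(tf.reshape(value, [-1]), bucket_size)
    onehot_emb = tf.one_hot(onehot_value, bucket_size, dtype=tf.float32)
    return onehot_value, onehot_emb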