def process_wide(input_value, wide_para, name="wide_process"):
    """Build the wide part of the model from string-list feature columns.

    Each configured column is converted to a SparseTensor via
    ``_process_list_column``; all columns are then hash-crossed together and
    the cross is appended before a final sparse concat along axis 1.

    Args:
        input_value: 2-D string tensor of raw features; column ``i`` feeds
            the i-th entry of ``wide_para``.
        wide_para: sequence of per-column configs, each indexable as
            ``(depth, col_type, ...)``.
        name: name scope wrapping all created ops.

    Returns:
        A SparseTensor: the per-column sparse features plus their hashed
        cross, concatenated along axis 1.
    """
    with tf.name_scope(name):
        wide_list = []
        cross_list = []
        # Running value offset so concatenated columns would not collide.
        fix_add_value = 0
        # BUG FIX: the loop variable used to be named `wide_para`, shadowing
        # the parameter of the same name; renamed to `col_para`.
        for i, col_para in enumerate(wide_para):
            depth = col_para[0]
            col_type = col_para[1]
            # NOTE(review): `or True` makes this branch unconditional; the
            # original non-list branch (string_to_hash_bucket + one_hot) was
            # commented out. Kept as-is to preserve behavior.
            if col_type == "stringList" or True:
                sparse_value = _process_list_column(input_value[:, i], depth)
                # Offset-shifted copy intended for the concat below.
                # NOTE(review): the original appends the UNshifted
                # `sparse_value`, leaving this tensor unused — preserved
                # here; confirm which one is intended.
                fix_sparse_value = tf.SparseTensor(
                    indices=sparse_value.indices,
                    values=sparse_value.values + fix_add_value,
                    dense_shape=sparse_value.dense_shape)
                print("wide_value")
                print(fix_sparse_value)
                wide_list.append(sparse_value)
                cross_list.append(sparse_value)
                # NOTE(review): this overwrites rather than accumulates; a
                # cumulative offset would be `fix_add_value += depth`.
                fix_add_value = depth
                print("add {}".format(fix_add_value))
        # Hash-cross every wide column into a single sparse feature.
        cross_sparse = _sparse_cross_hashed(cross_list, num_buckets=3000000)
        # NOTE(review): shifted by a constant 40000 but never used — the
        # unshifted `cross_sparse` is appended below; confirm intent.
        fix_cross_sparse = tf.SparseTensor(
            indices=cross_sparse.indices,
            values=cross_sparse.values + 40000,
            dense_shape=cross_sparse.dense_shape)
        print(cross_sparse)
        wide_list.append(cross_sparse)
        print(wide_list)
        wide_sparse = tf.sparse_concat(sp_inputs=wide_list, axis=1)
        return wide_sparse
def test_hashed_zero_bucket_no_hash_key(self):
    """Crossing with the default hash key yields the pinned fingerprint."""
    columns = [
        self._sparse_tensor([[feature]])
        for feature in ('batch1-FC1-F1', 'batch1-FC2-F1', 'batch1-FC3-F1')
    ]
    crossed = sparse_ops._sparse_cross_hashed(columns)
    # Pin the exact hashed output so unintentional hashing changes are caught.
    expected = self._sparse_tensor([[1971693436396284976]])
    with self.test_session() as sess:
        self._assert_sparse_tensor_equals(expected, sess.run(crossed))
def test_hashed_zero_bucket(self):
    """A non-default hash key produces a different, pinned fingerprint."""
    columns = [
        self._sparse_tensor([[feature]])
        for feature in ('batch1-FC1-F1', 'batch1-FC2-F1', 'batch1-FC3-F1')
    ]
    crossed = sparse_ops._sparse_cross_hashed(
        columns, hash_key=sparse_ops._DEFAULT_HASH_KEY + 1)
    # Pin the exact hashed output so unintentional hashing changes are caught.
    expected = self._sparse_tensor([[4847552627144134031]])
    with self.test_session() as sess:
        self._assert_sparse_tensor_equals(expected, sess.run(crossed))
def test_hashed__has_no_collision(self):
    """Tests that fingerprint concatenation has no collisions."""
    # 359 and 359 + 1024 share their low 10 bits, so a naive 1024-bucket
    # hash could collide; fingerprint concatenation must keep the two rows
    # of crosses fully distinct.
    keys = constant_op.constant([[359], [359 + 1024]])
    values = constant_op.constant([list(range(10)), list(range(10))])
    crossed = sparse_ops._sparse_cross_hashed(
        [values, keys],
        num_buckets=1024,
        hash_key=sparse_ops._DEFAULT_HASH_KEY + 1)
    dense = sparse_ops.sparse_tensor_to_dense(crossed)
    with session.Session():
        rows = dense.eval()
        self.assertTrue(numpy.not_equal(rows[0], rows[1]).all())
def test_hashed_3x1x2(self):
    """Tests 3x1x2 permutation with hashed output."""
    crossed = sparse_ops._sparse_cross_hashed(
        [
            self._sparse_tensor(
                [['batch1-FC1-F1', 'batch1-FC1-F2', 'batch1-FC1-F3']]),
            self._sparse_tensor([['batch1-FC2-F1']]),
            self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']]),
        ],
        num_buckets=1000)
    with self.test_session() as sess:
        result = sess.run(crossed)
        num_crosses = 3 * 1 * 2
        self.assertEqual(num_crosses, len(result.values))
        self.assertAllEqual(
            [[0, col] for col in range(num_crosses)], result.indices)
        # Every hashed value lands inside [0, num_buckets).
        self.assertTrue(all(0 <= v < 1000 for v in result.values))
        # All six crosses hash to distinct buckets.
        self.assertTrue(len(result.values) == len(set(result.values)))
def build_columns(self, items, emb_dict):
    """Assemble wide (sparse/crossed) and deep (dense) feature columns.

    Walks ``self.PARSED_COLUMNS``, looks up each feature's config in
    ``self.FEATURE_CONF``, and routes it to the wide side (hashed sparse
    columns, later crossed) or the deep side (embedding / one-hot / real).

    Args:
        items: mapping from feature name to its input tensor.
        emb_dict: mapping from feature name to its embedding variable
            (used for 'embedding'-fed deep features).

    Returns:
        The concatenated deep features as a flat 1-D tensor.
        NOTE(review): the crossed wide line is built but not returned —
        the original `return wide_line, deep_line` is commented out below.

    Raises:
        ValueError: if a wide feature has an unsupported feature_type.
    """
    wide_columns = []
    deep_columns = []
    print("===")
    print(self.FEATURE_CONF)
    # input assignment
    for fea in self.PARSED_COLUMNS:
        # If fea is a real vec, the config key is the part before '$'.
        fea_config = self._check_config(self.FEATURE_CONF, fea.split("$")[0])
        if fea_config is None:
            print("[build_estimator] incorrect input %s: no feature_conf." % (str(fea)))
            continue
        feature_type = self._check_config(
            fea_config, 'feature_type',
            legal_list=['sparse', 'multi_sparse', 'real'])
        model_type = self._check_config(
            fea_config, 'model_type', legal_list=['wide', 'deep'])
        # Route the feature to the wide or deep side.
        if model_type == 'wide':
            if feature_type == 'sparse':
                # Wide features must be hashed.
                feature_sparse = self._check_config(
                    fea_config, 'feature_sparse',
                    legal_list=['hash'], default='hash')
                bucket_size = self._check_config(
                    fea_config, 'bucket_size', default=1024)
                print("[build_estimator] add sparse_column_with_hash_bucket, fea = %s, hash_bucket_size = %d" % (
                    str(fea), bucket_size))
                _, onehot_emb = self.get_onehot(items[fea], bucket_size)
                wide_columns.append(tf.reshape(onehot_emb, [1, bucket_size]))
            elif feature_type == 'multi_sparse':
                # Wide features must be hashed.
                feature_sparse = self._check_config(
                    fea_config, 'feature_sparse',
                    legal_list=['hash'], default='hash')
                bucket_size = self._check_config(
                    fea_config, 'bucket_size', default=1024)
                print("[build_estimator] add multi_sparse_column_with_hash_bucket, fea = %s, hash_bucket_size = %d" % (
                    str(fea), bucket_size))
                wide_columns.append(
                    self._process_list_column(
                        tf.reshape(items[fea], [-1]), bucket_size))
            else:
                # BUG FIX: the original executed `raise ("...")`, which
                # raises a TypeError (exceptions must derive from
                # BaseException), not a meaningful error.
                raise ValueError("wide build column error!")
        elif model_type == 'deep':
            model_feed_type = self._check_config(
                fea_config, 'model_feed_type',
                legal_list=['embedding', 'onehot', 'real'], default='onehot')
            bucket_size = self._check_config(
                fea_config, 'bucket_size', default=1024)
            if model_feed_type == 'embedding':
                dimension = self._check_config(
                    fea_config, 'dimension', default=32)
                # BUG FIX: this format string was split across a physical
                # line in the original source (a syntax error); reassembled
                # into one literal.
                print("[build_estimator] add embedding_column, fea = %s, hash_bucket_size = %d, dimension = %d" % (
                    str(fea), bucket_size, dimension))
                onehot_value, onehot_emb = self.get_onehot(items[fea], bucket_size)
                lookup_emb = tf.nn.embedding_lookup(emb_dict[fea], onehot_value)
                deep_columns.append(tf.reshape(lookup_emb, [1, dimension]))
            elif model_feed_type == 'onehot':
                print("[build_estimator] add one_hot_column, fea = %s, hash_bucket_size = %d" % (str(fea), bucket_size))
                onehot_value, onehot_emb = self.get_onehot(items[fea], bucket_size)
                deep_columns.append(tf.reshape(onehot_emb, [1, bucket_size]))
            elif model_feed_type == 'real':
                print("[build_estimator] add real_valued_column, fea = %s" % (str(fea)))
                deep_columns.append(tf.reshape(items[fea], [1, 1]))
            else:
                print("[build_estimator] incorrect input %s: illegal model_feed_type." % (str(fea)))
        else:
            print("[build_estimator] incorrect input %s: illegal model_type" % (str(fea)))
    # Crossing: hash-cross every wide column into one sparse feature.
    cross_sparse = _sparse_cross_hashed(wide_columns, num_buckets=3000000)
    # NOTE(review): this copy is a no-op (values are unchanged); the sibling
    # process_wide() offsets the values here — confirm which is intended.
    fix_cross_sparse = tf.SparseTensor(
        indices=cross_sparse.indices,
        values=cross_sparse.values,
        dense_shape=cross_sparse.dense_shape)
    deep_line = tf.concat(deep_columns, axis=1)
    deep_line = tf.reshape(deep_line, shape=[-1])
    wide_line = fix_cross_sparse
    # print some info for the columns registered to different types
    print("[build_estimator] wide columns: %d" % (len(wide_columns)))
    print("[build estimator] deep columns: %d" % (len(deep_columns)))
    print("[build estimator] cross_sparse:")
    # crossed wide columns
    # return wide_line, deep_line
    print("[deep_line]")
    return deep_line