def create_feature_columns():
    # user feature
    bids = fc.categorical_column_with_hash_bucket("behaviorBids", 10240, dtype=tf.int64)
    c1ids = fc.categorical_column_with_hash_bucket("behaviorC1ids", 100, dtype=tf.int64)
    cids = fc.categorical_column_with_hash_bucket("behaviorCids", 10240, dtype=tf.int64)
    sids = fc.categorical_column_with_hash_bucket("behaviorSids", 10240, dtype=tf.int64)
    pids = fc.categorical_column_with_hash_bucket("behaviorPids", 1000000, dtype=tf.int64)
    bids_weighted = fc.weighted_categorical_column(bids, "bidWeights")
    c1ids_weighted = fc.weighted_categorical_column(c1ids, "c1idWeights")
    cids_weighted = fc.weighted_categorical_column(cids, "cidWeights")
    sids_weighted = fc.weighted_categorical_column(sids, "sidWeights")
    pids_weighted = fc.weighted_categorical_column(pids, "pidWeights")

    # item feature
    pid = fc.categorical_column_with_hash_bucket("productId", 1000000, dtype=tf.int64)
    sid = fc.categorical_column_with_hash_bucket("sellerId", 10240, dtype=tf.int64)
    bid = fc.categorical_column_with_hash_bucket("brandId", 10240, dtype=tf.int64)
    c1id = fc.categorical_column_with_hash_bucket("cate1Id", 100, dtype=tf.int64)
    cid = fc.categorical_column_with_hash_bucket("cateId", 10240, dtype=tf.int64)

    # context feature
    matchScore = fc.numeric_column("matchScore", default_value=0.0)
    popScore = fc.numeric_column("popScore", default_value=0.0)
    brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0)
    cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0)
    catePrefer = fc.numeric_column("catePrefer", default_value=0.0)
    sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0)
    matchType = fc.indicator_column(fc.categorical_column_with_identity("matchType", 9, default_value=0))
    position = fc.indicator_column(fc.categorical_column_with_identity("position", 201, default_value=200))
    triggerNum = fc.indicator_column(fc.categorical_column_with_identity("triggerNum", 51, default_value=50))
    triggerRank = fc.indicator_column(fc.categorical_column_with_identity("triggerRank", 51, default_value=50))
    sceneType = fc.indicator_column(fc.categorical_column_with_identity("type", 2, default_value=0))
    hour = fc.indicator_column(fc.categorical_column_with_identity("hour", 24, default_value=0))
    phoneBrand = fc.indicator_column(fc.categorical_column_with_hash_bucket("phoneBrand", 1000))
    phoneResolution = fc.indicator_column(fc.categorical_column_with_hash_bucket("phoneResolution", 500))
    phoneOs = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))
    tab = fc.indicator_column(fc.categorical_column_with_vocabulary_list(
        "tab", ["ALL", "TongZhuang", "XieBao", "MuYing", "NvZhuang", "MeiZhuang", "JuJia", "MeiShi"],
        default_value=0))

    # shared embeddings between the user behavior sequences and the item id features
    pid_embed = fc.shared_embedding_columns([pids_weighted, pid], 64, combiner='sum',
                                            shared_embedding_collection_name="pid")
    bid_embed = fc.shared_embedding_columns([bids_weighted, bid], 32, combiner='sum',
                                            shared_embedding_collection_name="bid")
    cid_embed = fc.shared_embedding_columns([cids_weighted, cid], 32, combiner='sum',
                                            shared_embedding_collection_name="cid")
    c1id_embed = fc.shared_embedding_columns([c1ids_weighted, c1id], 10, combiner='sum',
                                             shared_embedding_collection_name="c1id")
    sid_embed = fc.shared_embedding_columns([sids_weighted, sid], 32, combiner='sum',
                                            shared_embedding_collection_name="sid")

    global my_feature_columns
    my_feature_columns = [matchScore, matchType, position, triggerNum, triggerRank, sceneType, hour,
                          phoneBrand, phoneResolution, phoneOs, tab, popScore, sellerPrefer,
                          brandPrefer, cate2Prefer, catePrefer]
    my_feature_columns += pid_embed
    my_feature_columns += sid_embed
    my_feature_columns += bid_embed
    my_feature_columns += cid_embed
    my_feature_columns += c1id_embed
    print("feature columns:", my_feature_columns)
    return my_feature_columns
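# A minimal sketch of how the returned columns could be consumed. The estimator type,
# hidden-unit sizes, and model_dir below are assumptions for illustration only; the
# original pipeline may wire the columns up differently.
def build_estimator_example():
    columns = create_feature_columns()
    estimator = tf.estimator.DNNClassifier(
        feature_columns=columns,          # includes the weighted shared-embedding columns
        hidden_units=[256, 128, 64],      # assumed architecture
        model_dir="/tmp/weighted_fc_demo" # hypothetical path
    )
    return estimator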
import base64
import sys

import tensorflow as tf
from tensorflow import feature_column
from tensorflow.core.example import example_pb2


def test_weighted_categorical_column():
    f_in = open("new.tf.rec.base64", "r")
    for line in f_in:
        try:
            b = base64.b64decode(line.strip())
        except Exception as e:
            sys.stderr.write(str(e))
            continue
        exa = example_pb2.Example()
        print("before parse proto...........")
        try:
            exa.ParseFromString(b)
        except Exception as e:
            sys.stderr.write(str(e))
            continue
        print("after parse proto........")
        #print (exa)
        u_pocs_l1_norm = feature_column.categorical_column_with_hash_bucket(
            "u_pocs_l1_norm", 3000)
        u_pocs_l1_norm_weighted = feature_column.weighted_categorical_column(
            u_pocs_l1_norm, weight_feature_key='u_pocs_l1_norm_val')
        feature_columns = [u_pocs_l1_norm_weighted]
        features = tf.parse_single_example(
            b, tf.feature_column.make_parse_example_spec(feature_columns))
        print(features["u_pocs_l1_norm"])
        with tf.Session() as session:
            session.run(tf.global_variables_initializer())
            session.run(tf.tables_initializer())
            print(session.run(features["u_pocs_l1_norm"]))
        break
from tensorflow.python.feature_column.feature_column import _LazyBuilder


def test_weighted_categorical_column():
    # 1. Input features
    color_data = {
        'color': [['R'], ['G'], ['B'], ['A']],
        'weight': [[1.0], [2.0], [4.0], [8.0]]
    }
    # 2. Feature columns (sparse)
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    color_weight_categorical_column = feature_column.weighted_categorical_column(
        color_column, 'weight')

    builder = _LazyBuilder(color_data)
    id_tensor, weight = color_weight_categorical_column._get_sparse_tensors(builder)

    with tf.Session() as session:
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('weighted categorical' + '-' * 40)
        print(session.run([id_tensor]))
        print('-' * 40)
        print(session.run([weight]))

    # 2. Feature columns (dense)
    weighted_column = feature_column.indicator_column(color_weight_categorical_column)
    # 3. Feature tensor
    weighted_column_dense_tensor = feature_column.input_layer(color_data, [weighted_column])

    with tf.Session() as session:
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([weighted_column_dense_tensor]))
def _add_weighted_embedding_columns(columns, features, feature_table, vocabulary):
    for f in features:
        assert f in feature_table
        weighted_column = fc.weighted_categorical_column(
            fc.categorical_column_with_vocabulary_list(f, vocabulary.vocab[f]),
            f + _WEIGHTED_SUFFIX)
        emb_weighted_column = fc.embedding_column(
            weighted_column, feature_table[f].emb_width, combiner='sqrtn')
        columns.append(emb_weighted_column)
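# Hypothetical usage sketch for the helper above. _WEIGHTED_SUFFIX, FeatureSpec, and
# Vocabulary are stand-ins assumed for illustration; only the attribute accesses
# (vocabulary.vocab[f], feature_table[f].emb_width) are taken from the code itself.
import collections

_WEIGHTED_SUFFIX = "_weight"  # assumed suffix for the parallel weight feature
FeatureSpec = collections.namedtuple("FeatureSpec", ["emb_width"])
Vocabulary = collections.namedtuple("Vocabulary", ["vocab"])

columns = []
_add_weighted_embedding_columns(
    columns,
    features=["tags"],
    feature_table={"tags": FeatureSpec(emb_width=16)},
    vocabulary=Vocabulary(vocab={"tags": ["sports", "news", "music"]}))
# columns now holds one embedding_column built on a weighted categorical column;
# the input pipeline is expected to supply both "tags" and "tags_weight" features.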
def test_weighted_cate_column():
    # !!! id='' stands for a missing value, and its weight must be 0; otherwise the id and
    # !!! weight lists end up with different lengths and an error is raised.
    # !!! Weights must also be float; passing ints raises an error.
    x_values = {
        'id': [[b'a', b'z', b'a', b'c'], [b'b', b'', b'd', b'b']],
        'weight': [[1.0, 2.0, -3.0, 4.0], [5.0, 0.0, 7.0, -8.0]]
    }
    builder = _LazyBuilder(x_values)  # lazy representation of the input

    # ================== define ops
    sparse_id_featcol = feature_column.categorical_column_with_vocabulary_list(
        'id', ['a', 'b', 'c'], dtype=tf.string, default_value=-1)
    sparse_featcol = feature_column.weighted_categorical_column(
        categorical_column=sparse_id_featcol, weight_feature_key='weight')
    x_sparse_tensor = sparse_featcol._get_sparse_tensors(builder)

    # indicator_column converts the sparse tensor into a dense multi-hot encoding with
    # shape=[batch_size, #tokens]; each entry is the sum of all weights of that token.
    dense_featcol = feature_column.indicator_column(sparse_featcol)
    x_dense_tensor = feature_column.input_layer(x_values, [dense_featcol])

    # ================== run
    with tf.Session() as sess:
        # the lookup tables must be initialized, otherwise an error is raised
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())

        id_sparse_value, weight_sparse_value = sess.run(
            [x_sparse_tensor.id_tensor, x_sparse_tensor.weight_tensor])

        print("************************* sparse id tensor")
        # the id_tensor keeps the shape of the raw input, [batch_size, max_tokens_per_example]=[2,4]
        # SparseTensorValue(indices=array(
        #     [[0, 0],
        #      [0, 1],
        #      [0, 2],
        #      [0, 3],
        #      [1, 0],
        #      [1, 2],
        #      [1, 3]]), values=array([ 0, -1,  0,  2,  1, -1,  1]), dense_shape=array([2, 4]))
        print(id_sparse_value)

        print("************************* sparse weight tensor")
        # the weight_tensor keeps the shape of the raw input, [batch_size, max_tokens_per_example]=[2,4]
        # SparseTensorValue(indices=array(
        #     [[0, 0],
        #      [0, 1],
        #      [0, 2],
        #      [0, 3],
        #      [1, 0],
        #      [1, 2],
        #      [1, 3]]), values=array([ 1.,  2., -3.,  4.,  5.,  7., -8.], dtype=float32), dense_shape=array([2, 4]))
        print(weight_sparse_value)

        print("************************* dense MHE tensor")
        # indicator_column converts the sparse tensor into a dense multi-hot tensor of
        # shape=[batch_size, total_tokens_in_vocab]; each value is the sum of the weights
        # of that token within the example:
        # [[-2.  0.  4.]
        #  [ 0. -3.  0.]]
        print(sess.run(x_dense_tensor))
def test_weighted_categorical_column():
    color_data = {
        'color': [['R'], ['G'], ['B'], ['A']],
        'weight': [[1.0], [2.0], [4.0], [8.0]]
    }  # 4 example rows
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    color_weight_categorical_column = feature_column.weighted_categorical_column(
        color_column, 'weight')
    builder = _LazyBuilder(color_data)
    with tf.Session() as session:
        id_tensor, weight = color_weight_categorical_column._get_sparse_tensors(builder)
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('weighted categorical' + '-' * 40)
        print(session.run([id_tensor]))
        print('-' * 40)
        print(session.run([weight]))
def create_interaction_feature_columns(shared_embedding_dim=60):
    # user embedding features
    phoneBrandId = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
    phoneBrand = fc.shared_embedding_columns([phoneBrandId], shared_embedding_dim)
    phoneResolutionId = fc.categorical_column_with_hash_bucket("phoneResolution", 500)
    phoneResolution = fc.shared_embedding_columns([phoneResolutionId], shared_embedding_dim)
    bids = fc.categorical_column_with_hash_bucket("behaviorBids", 10240, dtype=tf.int64)
    c1ids = fc.categorical_column_with_hash_bucket("behaviorC1ids", 100, dtype=tf.int64)
    cids = fc.categorical_column_with_hash_bucket("behaviorCids", 10240, dtype=tf.int64)
    sids = fc.categorical_column_with_hash_bucket("behaviorSids", 10240, dtype=tf.int64)
    pids = fc.categorical_column_with_hash_bucket("behaviorPids", 1000000, dtype=tf.int64)
    bids_weighted = fc.weighted_categorical_column(bids, "bidWeights")
    c1ids_weighted = fc.weighted_categorical_column(c1ids, "c1idWeights")
    cids_weighted = fc.weighted_categorical_column(cids, "cidWeights")
    sids_weighted = fc.weighted_categorical_column(sids, "sidWeights")
    pids_weighted = fc.weighted_categorical_column(pids, "pidWeights")

    # item embedding features
    pid = fc.categorical_column_with_hash_bucket("productId", 1000000, dtype=tf.int64)
    sid = fc.categorical_column_with_hash_bucket("sellerId", 10240, dtype=tf.int64)
    bid = fc.categorical_column_with_hash_bucket("brandId", 10240, dtype=tf.int64)
    c1id = fc.categorical_column_with_hash_bucket("cate1Id", 100, dtype=tf.int64)
    c2id = fc.categorical_column_with_hash_bucket("cate2Id", 500, dtype=tf.int64)
    cid = fc.categorical_column_with_hash_bucket("cateId", 10240, dtype=tf.int64)

    # shared embedding
    pid_emb = fc.shared_embedding_columns([pids_weighted, pid], shared_embedding_dim, combiner='sum')
    bid_emb = fc.shared_embedding_columns([bids_weighted, bid], shared_embedding_dim, combiner='sum')
    cid_emb = fc.shared_embedding_columns([cids_weighted, cid], shared_embedding_dim, combiner='sum')
    c1id_emb = fc.shared_embedding_columns([c1ids_weighted, c1id], shared_embedding_dim, combiner='sum')
    sid_emb = fc.shared_embedding_columns([sids_weighted, sid], shared_embedding_dim, combiner='sum')
    c2id_emb = fc.shared_embedding_columns([c2id], shared_embedding_dim)

    columns = phoneBrand
    columns += phoneResolution
    columns += pid_emb
    columns += sid_emb
    columns += bid_emb
    columns += cid_emb
    columns += c1id_emb
    columns += c2id_emb
    print("interaction feature columns:", columns)
    return columns
def test_weighted_categorical_feature_embedding():
    color_data = {
        'color': [['R', 'R'], ['G', 'G'], ['B', 'B'], ['G', 'R'], ['G', 'B'], ['B', 'R']],
        'weight': [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.3, 0.2], [0.4, 0.3], [0.4, 0.6]]
    }  # 6 example rows
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)

    color_embeding = feature_column.embedding_column(color_column, 7, combiner="sum")
    color_embeding_dense_tensor = feature_column.input_layer(color_data, [color_embeding])

    color_weight_categorical_column = feature_column.weighted_categorical_column(
        color_column, 'weight')
    color_embeding_weighted = feature_column.embedding_column(
        color_weight_categorical_column, 7, combiner="sum")
    color_embeding_dense_tensor_2 = feature_column.input_layer(
        color_data, [color_embeding_weighted])

    builder = _LazyBuilder(color_data)
    color_column_tensor = color_column._get_sparse_tensors(builder)
    # _get_sparse_tensors returns a pair (id_tensor, weight_tensor)
    color_weighted_tensor = color_weight_categorical_column._get_sparse_tensors(builder)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))
        print("color column weight:")
        print(color_column_tensor.weight_tensor)
        print("color column weighted categorical, weight:")
        print(session.run([color_weighted_tensor.id_tensor]))
        print(session.run([color_weighted_tensor.weight_tensor]))

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('embeding' + '-' * 40)
        print(session.run([color_embeding_dense_tensor]))
        print('embeding weighted categorical column')
        print(session.run([color_embeding_dense_tensor_2]))
def create_embedding_feature_columns(shared_embedding_dim=64):
    """Used when several features need to share the same embedding space.

    :return: the shared embedding columns
    """
    # clicked category ids from the user's behavior sequence
    c1ids = fc.categorical_column_with_hash_bucket("behaviorC1ids", 100, dtype=tf.int64)
    # attach weights to c1ids, somewhat like attention
    c1ids_weighted = fc.weighted_categorical_column(c1ids, "c1idWeights")
    # category id of the candidate item
    c1id = fc.categorical_column_with_hash_bucket("cate1Id", 100, dtype=tf.int64)
    # c1ids_weighted and c1id both carry category ids; sharing guarantees they live in the
    # same embedding space, not that the features themselves are identical.
    # c1id_emb is a list of length 2; each element is a shared_embedding_dim-dimensional
    # tensor, 2 * shared_embedding_dim values in total.
    c1id_emb = fc.shared_embedding_columns([c1ids_weighted, c1id], shared_embedding_dim,
                                           combiner='sum')
    return c1id_emb
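# A small, hypothetical demonstration of how the shared columns could be materialized with
# tf.feature_column.input_layer; the toy ids and weights below are made up for illustration.
def demo_shared_embedding():
    columns = create_embedding_feature_columns(shared_embedding_dim=8)
    toy_features = {
        "behaviorC1ids": tf.constant([[11, 23, 5], [7, 9, 13]], dtype=tf.int64),
        "c1idWeights": tf.constant([[0.5, 0.3, 0.2], [0.9, 0.1, 0.4]]),
        "cate1Id": tf.constant([[23], [42]], dtype=tf.int64),
    }
    # yields a [batch_size, 2 * 8] tensor: weighted behavior embedding + item embedding,
    # both looked up from the same shared embedding table
    net = tf.feature_column.input_layer(toy_features, columns)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        print(sess.run(net))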
def __init__(self, name, classifier=None, delimiter="|", sep="@", **kwargs):
    """Weibo tag column.

    :param name:
    :param delimiter: char, separator between tags, defaults to "|"
    :param sep: char, separator between a tag's name and its weight, defaults to "@"
    :param kwargs:
    """
    super(TagColumn, self).__init__(name, classifier)
    assert isinstance(delimiter, str) and len(delimiter) == 1, (
        "{}: delimiter must be char, while get: {}".format(name, delimiter))
    self.delimiter = delimiter
    assert isinstance(sep, str) and len(sep) == 1, (
        "{}: sep must be char, while get: {}".format(name, sep))
    self.sep = sep
    self._tag_name = name + "_tag"
    self._weight_name = name + "_weight"
    if "num_buckets" in kwargs:
        raise ValueError("num_buckets is not supported in TagColumn.")
    self._input_column = _get_category_column_from_dict(self._tag_name, kwargs)
    self._weight_input_column = fc.weighted_categorical_column(
        self._input_column, self._weight_name)
    self._feature_column = fc.indicator_column(self._weight_input_column)
import numpy as np
import tensorflow as tf
import tensorflow.feature_column as fc

categorical_column = fc.categorical_column_with_hash_bucket(
    key='query_plan_ops', hash_bucket_size=200)
weighted_column = fc.weighted_categorical_column(
    categorical_column=categorical_column, weight_feature_key='op_freq')

#feature_env = fc.numeric_column('env', shape=(1,4), dtype=tf.int64)
#feature_label = fc.numeric_column('label', shape=(1,), dtype=tf.float32)
#env_columns = tf.FixedLenFeature([1, 4], tf.int64)
#exec_time = tf.FixedLenFeature([], tf.float32)

cpu_column = fc.numeric_column('cpu', (1, 1))
env_columns = fc.numeric_column('env', (1, 3))
total_ops = fc.numeric_column('total_ops')
#exec_time = fc.numeric_column('label')

cat_table_size = fc.categorical_column_with_hash_bucket(key='table_size', hash_bucket_size=20)
weighted_column_table = fc.weighted_categorical_column(
    categorical_column=cat_table_size, weight_feature_key='table_size_weight')

feature_columns = [
    cpu_column, env_columns, weighted_column, total_ops, weighted_column_table
]
fmap = fc.make_parse_example_spec(feature_columns)
#fmap['env'] = env_columns
#fmap['label'] = exec_time
#print(fmap)
# item field (tail of the preceding, unweighted sum-pooling snippet)
pid = fc.categorical_column_with_hash_bucket("productId", 1000000, dtype=tf.int64)
pid_embed = fc.shared_embedding_columns([pids, pid], 100, combiner='sum',
                                        shared_embedding_collection_name="pid")

"""
So how do we implement the weighted sum pooling operation? The answer is the
weighted_categorical_column function. When building the training examples we must add an
extra weight feature that holds the weight of each product in the behavior sequence, so the
weight feature is a list (vector) parallel to the behavior sequence and the two must have the
same length. In addition, if the behavior sequence is padded with the default value -1, the
weights at those padded positions must be 0. A code example follows:
"""
from tensorflow import feature_column as fc

# user field
pids = fc.categorical_column_with_hash_bucket("behaviorPids", 10240, dtype=tf.int64)
pids_weighted = fc.weighted_categorical_column(pids, "pidWeights")
# item field
pid = fc.categorical_column_with_hash_bucket("productId", 1000000, dtype=tf.int64)
pid_embed = fc.shared_embedding_columns([pids_weighted, pid], 100, combiner='sum',
                                        shared_embedding_collection_name="pid")

"""
Model function
The other components of the base model are not covered in detail here; the model function
looks like this:
"""
def my_model(features, labels, mode, params):
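# Hypothetical illustration of the rule described above: pidWeights is parallel to
# behaviorPids, and padded positions (-1) carry weight 0. The concrete ids and weights
# are made up for illustration only.
example_features = {
    "behaviorPids": [[1001, 2002, 3003, -1, -1]],   # behavior sequence padded with -1
    "pidWeights":   [[0.5, 0.3, 0.2, 0.0, 0.0]],    # same length; padded slots get weight 0
    "productId":    [[2002]],                        # candidate item id
}
# With combiner='sum', the user-side embedding becomes
# 0.5 * emb(1001) + 0.3 * emb(2002) + 0.2 * emb(3003), i.e. a weighted sum pooling.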
def create_feature_columns():
    # user feature
    bids = fc.categorical_column_with_hash_bucket("behaviorBids", 10000, dtype=tf.int64)
    c1ids = fc.categorical_column_with_hash_bucket("behaviorC1ids", 100, dtype=tf.int64)
    cids = fc.categorical_column_with_hash_bucket("behaviorCids", 10000, dtype=tf.int64)
    sids = fc.categorical_column_with_hash_bucket("behaviorSids", 10000, dtype=tf.int64)
    pids = fc.categorical_column_with_hash_bucket("behaviorPids", 500000, dtype=tf.int64)
    bids_weighted = fc.weighted_categorical_column(bids, "bidWeights")
    c1ids_weighted = fc.weighted_categorical_column(c1ids, "c1idWeights")
    cids_weighted = fc.weighted_categorical_column(cids, "cidWeights")
    sids_weighted = fc.weighted_categorical_column(sids, "sidWeights")
    pids_weighted = fc.weighted_categorical_column(pids, "pidWeights")

    pid_embed = fc.embedding_column(pids_weighted, 64)
    bid_embed = fc.embedding_column(bids_weighted, 32)
    cid_embed = fc.embedding_column(cids_weighted, 48)
    c1id_embed = fc.embedding_column(c1ids_weighted, 10)
    sid_embed = fc.embedding_column(sids_weighted, 32)

    phoneBrandId = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
    phoneBrand = fc.embedding_column(phoneBrandId, 20)
    phoneResolutionId = fc.categorical_column_with_hash_bucket("phoneResolution", 500)
    phoneResolution = fc.embedding_column(phoneResolutionId, 10)
    phoneOs = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))
    gender = fc.indicator_column(
        fc.categorical_column_with_identity("gender", num_buckets=3, default_value=0))
    age_class = fc.indicator_column(
        fc.categorical_column_with_identity("age_class", num_buckets=7, default_value=0))
    has_baby = fc.indicator_column(
        fc.categorical_column_with_identity("has_baby", num_buckets=2, default_value=0))
    baby_gender = fc.indicator_column(
        fc.categorical_column_with_identity("baby_gender", num_buckets=3, default_value=0))
    baby_age = fc.indicator_column(
        fc.categorical_column_with_identity("baby_age", num_buckets=7, default_value=0))
    grade = fc.indicator_column(
        fc.categorical_column_with_identity("grade", num_buckets=7, default_value=0))
    rfm_type = fc.indicator_column(
        fc.categorical_column_with_identity("bi_rfm_type", num_buckets=12, default_value=0))
    city_id = fc.categorical_column_with_hash_bucket("city", 700)
    city = fc.embedding_column(city_id, 16)
    userType = fc.indicator_column(
        fc.categorical_column_with_identity("user_type", 6, default_value=0))
    hour = fc.indicator_column(
        fc.categorical_column_with_identity("hour", 24, default_value=0))

    global my_feature_columns
    my_feature_columns = [
        userType, hour, gender, age_class, has_baby, baby_gender, baby_age, grade, rfm_type,
        phoneBrand, phoneResolution, phoneOs, pid_embed, sid_embed, bid_embed, cid_embed,
        c1id_embed, city
    ]
    print("feature columns:", my_feature_columns)
    return my_feature_columns
import numpy as np
import tensorflow as tf
import tensorflow.feature_column as fc

categorical_column = fc.categorical_column_with_hash_bucket(
    key='query_plan_ops', hash_bucket_size=20)
weighted_column = fc.weighted_categorical_column(
    categorical_column=categorical_column, weight_feature_key='op_freq')

#feature_env = fc.numeric_column('env', shape=(1,4), dtype=tf.int64)
#feature_label = fc.numeric_column('label', shape=(1,), dtype=tf.float32)
#env_columns = tf.FixedLenFeature([1, 4], tf.int64)
#exec_time = tf.FixedLenFeature([], tf.float32)

cpu = fc.numeric_column('cpu')
env_columns = fc.numeric_column('env', (1, 3))
total_ops = fc.numeric_column('total_ops')
exec_time = fc.numeric_column('label')

feature_columns = [cpu, env_columns, weighted_column, total_ops, exec_time]
fmap = fc.make_parse_example_spec(feature_columns)
#fmap['env'] = env_columns
#fmap['label'] = exec_time
#print(fmap)


#https://jhui.github.io/2017/11/21/TensorFlow-Importing-data/
def parser(serialized_example):
    """Parses a single tf.Example into image and label tensors."""
    features = tf.parse_single_example(
        serialized_example,
        # features={