def test_shared_embedding_column_with_hash_bucket():
    """Demo: hash two integer features and embed them with a shared table.

    Prints the hashed sparse ids of both features, then the dense tensor
    that ``input_layer`` produces for the shared embedding columns.
    """
    # Four sample rows: 'color' holds two ids per row, 'color2' holds one.
    samples = {
        'color': [[2, 2], [5, 5], [0, -1], [0, 0]],
        'color2': [[2], [5], [-1], [0]],
    }
    lazy_builder = _LazyBuilder(samples)

    hashed_color = feature_column.categorical_column_with_hash_bucket(
        'color', 7, dtype=tf.int32)
    sparse_color = hashed_color._get_sparse_tensors(lazy_builder)

    hashed_color2 = feature_column.categorical_column_with_hash_bucket(
        'color2', 7, dtype=tf.int32)
    sparse_color2 = hashed_color2._get_sparse_tensors(lazy_builder)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('not use input_layer' + '_' * 40)
        for sparse in (sparse_color, sparse_color2):
            print(session.run([sparse.id_tensor]))

    # Turn the sparse columns into a dense tensor through a shared embedding
    # (dimension 3, multiple ids in one row are combined with 'sum').
    shared_embeddings = feature_column.shared_embedding_columns(
        [hashed_color2, hashed_color], 3, combiner='sum')
    print(type(shared_embeddings))
    dense_output = feature_column.input_layer(samples, shared_embeddings)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run(dense_output))
def create_feature_columns():
    """Build the model's feature columns and publish them globally.

    The resulting list is stored in the module-level ``my_feature_columns``,
    printed, and returned.
    """
    global my_feature_columns

    def identity_indicator(key, num_buckets, default_value):
        # One-hot (indicator) encoding of an integer-identity feature.
        return fc.indicator_column(
            fc.categorical_column_with_identity(
                key, num_buckets, default_value=default_value))

    def truncated_numeric(key):
        # Numeric feature passed through the external `truncate` normalizer.
        return fc.numeric_column(key, default_value=0.0,
                                 normalizer_fn=truncate)

    # User features
    phone_brand = fc.embedding_column(
        fc.categorical_column_with_hash_bucket("phoneBrand", 1000), 20)
    phone_resolution = fc.embedding_column(
        fc.categorical_column_with_hash_bucket("phoneResolution", 500), 10)
    phone_os = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list(
            "phoneOs", ["android", "ios"], default_value=0))

    # Context features
    match_score = fc.numeric_column("matchScore", default_value=0.0)
    pop_score = fc.numeric_column("popScore", default_value=0.0)
    brand_prefer = truncated_numeric("brandPrefer")
    cate2_prefer = truncated_numeric("cate2Prefer")
    cate_prefer = truncated_numeric("catePrefer")
    seller_prefer = truncated_numeric("sellerPrefer")
    match_type = identity_indicator("matchType", 9, 0)
    position = identity_indicator("position", 201, 200)
    trigger_num = identity_indicator("triggerNum", 51, 50)
    trigger_rank = identity_indicator("triggerRank", 51, 50)
    scene_type = identity_indicator("type", 2, 0)
    hour = identity_indicator("hour", 24, 0)

    my_feature_columns = [
        match_score, match_type, position, trigger_num, trigger_rank,
        scene_type, hour, phone_brand, phone_resolution, phone_os,
        pop_score, seller_prefer, brand_prefer, cate2_prefer, cate_prefer,
    ]
    print("feature columns:", my_feature_columns)
    return my_feature_columns
def create_features_columns(self):
    """Register userID/itemID embedding columns and derive the parse spec.

    Stores the columns in ``self.all_columns``, sets ``self.feature_spec``
    from all registered columns, and returns ``self`` for chaining.
    """
    def hashed_id_embedding(key, bucket_size):
        # int64 id -> hash bucket -> embedding with a small-scale initializer.
        hashed = fc.categorical_column_with_hash_bucket(
            key=key, hash_bucket_size=bucket_size, dtype=tf.int64)
        init = tf.uniform_unit_scaling_initializer(
            factor=1e-5, seed=1, dtype=tf.float32)
        return fc.embedding_column(
            hashed, dimension=FLAGS.embed_size, initializer=init)

    user_embedding = hashed_id_embedding("userID", FLAGS.user_did_size)
    item_embedding = hashed_id_embedding("itemID", FLAGS.item_uuid_size)

    self.all_columns["userID"] = user_embedding
    self.all_columns["itemID"] = item_embedding

    self.feature_spec = tf.feature_column.make_parse_example_spec(
        self.all_columns.values())
    return self
def get_feature_columns():
    """Build the feature columns.

    Returns:
        A ``(dnn_feature_columns, linear_feature_columns)`` tuple: hashed-id
        embedding columns for the DNN part and numeric columns for the
        linear part.
    """
    # DNN features: (feature key, hash bucket size, pass max_norm?)
    id_specs = (
        ("userid", 40000, True),
        ("feedid", 240000, True),
        ("authorid", 40000, True),
        ("bgm_singer_id", 40000, False),
        ("bgm_song_id", 60000, False),
    )
    dnn_feature_columns = []
    for key, bucket_size, with_max_norm in id_specs:
        hashed = fc.categorical_column_with_hash_bucket(
            key, bucket_size, tf.int64)
        if with_max_norm:
            embedded = fc.embedding_column(
                hashed, FLAGS.embed_dim, max_norm=FLAGS.embed_l2)
        else:
            embedded = fc.embedding_column(hashed, FLAGS.embed_dim)
        dnn_feature_columns.append(embedded)

    # Linear features
    linear_feature_columns = [
        fc.numeric_column(key, default_value=0.0)
        for key in ("videoplayseconds", "device")
    ]
    # Behaviour statistics: a feed-level and a user-level sum per behaviour.
    for b in FEA_COLUMN_LIST:
        for suffix in ("sum", "sum_user"):
            linear_feature_columns.append(
                fc.numeric_column(b + suffix, default_value=0.0))

    return dnn_feature_columns, linear_feature_columns
def make_feature_layer(self):
    """Build the list of feature columns for all configured input columns.

    Numeric columns become ``numeric_column``s. Categorical (numeric and
    text) columns become 1-dimensional embeddings over a hash bucket sized
    at three times the number of distinct values found in ``self.data``.
    Boolean columns use a fixed hash bucket of size 3.

    Returns:
        list: feature columns in the order numeric, categorical-numeric,
        categorical-text, boolean.
    """
    def hashed_embedding(col, bucket_size):
        # Shared construction for every hashed, 1-d embedded column.
        return feature_column.embedding_column(
            feature_column.categorical_column_with_hash_bucket(
                col, hash_bucket_size=bucket_size),
            dimension=1)

    feature_cols = [
        feature_column.numeric_column(col) for col in self.numeric_column
    ]

    for group in (self.categorical_column_num, self.categorical_column_text):
        for col in group:
            unique_count = self.data[col].nunique()
            feature_cols.append(
                hashed_embedding(col, int(3 * unique_count)))

    # Boolean columns use a fixed bucket size, so the per-column unique
    # count (previously computed and discarded) is not needed.
    for col in self.bool_column:
        feature_cols.append(hashed_embedding(col, 3))

    return feature_cols
def test_shared_embedding_column_with_hash_bucket():
    """Demo: shared embedding over two hashed integer features.

    Prints the hashed sparse ids of 'range' and 'id', then the dense tensor
    that ``input_layer`` produces for the shared embedding columns.
    """
    # 1. Input features
    inputs = {
        'range': [[2, 2], [5, 5], [0, -1], [0, 0]],
        'id': [[2], [5], [-1], [0]],
    }
    lazy_builder = _LazyBuilder(inputs)

    # 2. Feature columns (sparse)
    range_column = feature_column.categorical_column_with_hash_bucket(
        'range', 7, dtype=tf.int32)
    range_sparse = range_column._get_sparse_tensors(lazy_builder)

    id_column = feature_column.categorical_column_with_hash_bucket(
        'id', 7, dtype=tf.int32)
    id_sparse = id_column._get_sparse_tensors(lazy_builder)

    with tf.Session() as session:
        # Only the tables initializer is run here; the variables
        # initializer is deliberately left disabled for this session.
        session.run(tf.tables_initializer())
        print('not use input_layer' + '_' * 40)
        for sparse in (range_sparse, id_sparse):
            print(session.run([sparse.id_tensor]))

    # 2. Feature columns (dense): one embedding table shared by both columns
    shared_embeddings = feature_column.shared_embedding_columns(
        [id_column, range_column], 3, combiner='sum')
    print(type(shared_embeddings))

    # 3. Feature tensor
    dense_output = feature_column.input_layer(inputs, shared_embeddings)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run(dense_output))
def create_feature_columns(note_emb_size=10, note_user_emb_size=6):
    """Build note/creator shared embeddings plus a bucketized duration.

    Args:
        note_emb_size: embedding dimension shared by the note-id columns.
        note_user_emb_size: embedding dimension shared by the creator-id
            columns.

    Returns:
        list: note embeddings, creator embeddings, then the bucketized
        video duration column.
    """
    # Categorical columns first: history lists and the current note/creator.
    history_creators = fc.categorical_column_with_hash_bucket(
        "last_note_creators", hash_bucket_size=2000, dtype=tf.string)
    history_notes = fc.categorical_column_with_hash_bucket(
        "last_note_ids", 20000, dtype=tf.int64)
    current_creator = fc.categorical_column_with_hash_bucket(
        "note_open_id", 2000)
    current_note = fc.categorical_column_with_hash_bucket(
        "note_id", 20000, dtype=tf.int64)

    duration_bucket = fc.bucketized_column(
        source_column=fc.numeric_column("note_video_duration"),
        boundaries=[5, 10, 30, 60])

    # History and current ids share one embedding table per entity type.
    note_emb = fc.shared_embedding_columns(
        [history_notes, current_note], note_emb_size, combiner='sum')
    creator_emb = fc.shared_embedding_columns(
        [history_creators, current_creator], note_user_emb_size,
        combiner='sum')

    my_feature_columns = note_emb + creator_emb + [duration_bucket]

    print("*" * 100)
    print("feature columns:")
    for column in my_feature_columns:
        print(column)
    print("*" * 100)
    return my_feature_columns
def build_ama_ele_columns():
    """Build 32-d embedding columns for the hashed user and item ids.

    Returns:
        tuple: ``(feature_columns, feat_field_size)`` where the size is the
        number of columns.
    """
    # (feature key, hash bucket size). The 'seq' and 'seq_cate' embeddings
    # (200000 buckets each) are currently disabled.
    id_specs = (
        ('user_id', 200000),
        ('item_id', 1000),
    )
    feature_columns = [
        fc.embedding_column(
            fc.categorical_column_with_hash_bucket(
                key, hash_bucket_size=bucket_size),
            dimension=32)
        for key, bucket_size in id_specs
    ]
    return feature_columns, len(feature_columns)
def build_model_columns(embedding_size):
    """Build linear (indicator) and embedding columns for user and item ids.

    Args:
        embedding_size: dimension of each id embedding.

    Returns:
        tuple: ``(linear_feature_columns, embedding_feature_columns)``, each
        ordered user id first, item id second.
    """
    linear_feature_columns = []
    embedding_feature_columns = []

    # (feature key, hash bucket size)
    for key, bucket_size in (('u_id', 500000), ('i_id', 100000)):
        hashed = feature_column.categorical_column_with_hash_bucket(
            key, bucket_size, dtype=tf.dtypes.int64)
        embedded = feature_column.embedding_column(hashed, embedding_size)
        linear_feature_columns.append(feature_column.indicator_column(hashed))
        embedding_feature_columns.append(embedded)

    return linear_feature_columns, embedding_feature_columns
def create_features_columns(self):
    """Register the user/item feature columns and derive the parse spec.

    Fills ``self.user_columns`` and ``self.item_columns``, builds
    ``self.feature_spec`` from both, and returns ``self`` for chaining.
    """
    # Dense vector features
    user_vector = fc.numeric_column(key="user_vector",
                                    shape=(128, ),
                                    default_value=[0.0] * 128,
                                    dtype=tf.float32)
    item_vector = fc.numeric_column(key="item_vector",
                                    shape=(128, ),
                                    default_value=[0.0] * 128,
                                    dtype=tf.float32)

    # Bucketized feature. The bucketized column must wrap the numeric age
    # column defined here (the previous code referenced an undefined name).
    age_numeric = fc.numeric_column(key="age",
                                    shape=(1, ),
                                    default_value=[0],
                                    dtype=tf.int64)
    age_bucketized = fc.bucketized_column(
        age_numeric, boundaries=[0, 10, 20, 30, 40, 50, 60, 70, 80])
    age = fc.embedding_column(age_bucketized, dimension=32, combiner='mean')

    # Identity categorical feature
    city_categorical = fc.categorical_column_with_identity(
        key="city", num_buckets=1000, default_value=0)
    city = fc.embedding_column(city_categorical, dimension=32,
                               combiner='mean')

    # Hashed features
    device_id_hashed = fc.categorical_column_with_hash_bucket(
        key="device_id", hash_bucket_size=1000000, dtype=tf.int64)
    device_id = fc.embedding_column(device_id_hashed, dimension=32,
                                    combiner='mean')

    item_id_hashed = fc.categorical_column_with_hash_bucket(
        key="item_id", hash_bucket_size=10000, dtype=tf.int64)
    # Embed the item-id hash column itself, not the device-id embedding.
    item_id = fc.embedding_column(item_id_hashed, dimension=32,
                                  combiner='mean')

    self.user_columns["user_vector"] = user_vector
    self.user_columns["age"] = age
    self.user_columns["city"] = city
    self.user_columns["device_id"] = device_id
    self.item_columns["item_vector"] = item_vector
    self.item_columns["item_id"] = item_id

    # dict views cannot be concatenated with `+` on Python 3, so
    # materialise them as lists first (also valid on Python 2).
    self.feature_spec = tf.feature_column.make_parse_example_spec(
        list(self.user_columns.values()) + list(self.item_columns.values()))

    return self
def test_categorical_column_with_hash_bucket():
    """Demo: hash a string feature and show its sparse ids and dense form.

    Prints the hashed id tensor first, then the dense tensor produced by
    ``input_layer`` for an indicator column over the same hashed column.
    """
    # Five sample rows with two string values each.
    samples = {
        'color': [['R', 'G'], ['G', 'A'], ['B', 'G'], ['A', 'G'], ['A', '']]
    }
    lazy_builder = _LazyBuilder(samples)

    hashed_color = feature_column.categorical_column_with_hash_bucket(
        'color', 7)
    hashed_ids = hashed_color._get_sparse_tensors(lazy_builder).id_tensor

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([hashed_ids]))

    # Convert the sparse column to dense via an indicator column
    # (multi-hot when a row carries several values).
    color_indicator = feature_column.indicator_column(hashed_color)
    dense_output = feature_column.input_layer(samples, [color_indicator])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([dense_output]))
def _generate_cat_column(name, data, vocab_threshold=50, bucket_size=100):
    """Generate a feature column from a categorical string data set

    Parameters
    ----------
    name : str
        Name of categorical columns
    data : np.ndarray | list
        String data array
    vocab_threshold : int
        Number of unique entries in the data array below which this will use
        a vocabulary list, above which a hash bucket will be used.
    bucket_size : int
        Hash bucket size.

    Returns
    -------
    f_col : IndicatorColumn
        Categorical feature column.
    """
    # Build the set of distinct values once and reuse it.
    unique_values = set(data)

    if len(unique_values) < vocab_threshold:
        # Sort the vocabulary: set iteration order for strings is not
        # stable between interpreter runs, which would change the id
        # assigned to each category from one run to the next.
        f_col = feature_column.categorical_column_with_vocabulary_list(
            name, sorted(unique_values))
    else:
        f_col = feature_column.categorical_column_with_hash_bucket(
            name, bucket_size)

    return feature_column.indicator_column(f_col)
def fc_transform(feature_name, hash_bucket_size, dtype=tf.string):
    """Return a DenseFeatures layer embedding one hashed feature.

    Args:
        feature_name: key of the input feature.
        hash_bucket_size: number of hash buckets for the feature.
        dtype: dtype of the raw feature values.

    Returns:
        A ``tf.keras.layers.DenseFeatures`` layer wrapping a 4-dimensional
        embedding of the hashed feature.
    """
    hashed = feature_column.categorical_column_with_hash_bucket(
        feature_name, hash_bucket_size=hash_bucket_size, dtype=dtype)
    embedded = feature_column.embedding_column(hashed, 4)
    # DenseFeatures converts the feature-column object into a tensor.
    return tf.keras.layers.DenseFeatures([embedded])
def _build_census_wide_columns(numeric_range=None):
    """Build the wide-part columns described by ``ALI_DISPLAY_ADS_CONFIG``.

    Args:
        numeric_range: mapping from column name to a ``(min, max)``-style
            pair, indexed as ``[0]`` and ``[1]``; read for every column in
            ``wide_bucket_cols``.

    Returns:
        tuple: ``(feature_columns, feat_field_size)``; base columns come
        first, crossed columns last.
    """
    config = ALI_DISPLAY_ADS_CONFIG
    base_columns = []
    cross_columns = []

    # Multi-hot categorical columns: small vocabularies get 1000 buckets,
    # larger ones get the vocabulary size plus 10000.
    for col in config['wide_muti_hot_cols']:
        if config['vocab_size'][col] <= 1000:
            bucket_size = 1000
        else:
            bucket_size = config['vocab_size'][col] + 10000
        hashed = fc.categorical_column_with_hash_bucket(
            col, hash_bucket_size=bucket_size)
        base_columns.append(fc.indicator_column(hashed))

    # Numeric columns bucketized on 1000 evenly spaced boundaries.
    for col in config['wide_bucket_cols']:
        edges = np.linspace(numeric_range[col][0], numeric_range[col][1],
                            1000)
        base_columns.append(
            fc.bucketized_column(fc.numeric_column(col),
                                 boundaries=list(edges)))

    # Pairwise feature crosses.
    for col in config['wide_cross_cols']:
        crossed = fc.crossed_column([col[0], col[1]], hash_bucket_size=10000)
        cross_columns.append(fc.indicator_column(crossed))

    feature_columns = base_columns + cross_columns
    return feature_columns, len(feature_columns)
def fc_transform(feature_name, hash_bucket_size, dtype=tf.string):
    """Return a DenseFeatures layer embedding one hashed feature.

    Args:
        feature_name: key of the input feature.
        hash_bucket_size: number of hash buckets for the feature.
        dtype: dtype of the raw feature values.

    Returns:
        A ``tf.keras.layers.DenseFeatures`` layer wrapping a 4-dimensional
        embedding of the hashed feature.
    """
    hashed = feature_column.categorical_column_with_hash_bucket(
        feature_name, hash_bucket_size=hash_bucket_size, dtype=dtype)
    # The embedding is trainable, so the caller has to initialise variables
    # before running it.
    embedded = feature_column.embedding_column(hashed, 4)
    # Every feature column has to go through this step to become a dense
    # tensor; DenseFeatures is a Keras Layer and is therefore callable.
    return tf.keras.layers.DenseFeatures([embedded])
def test_categorical_column_with_hash_bucket():
    """Demo: hash an integer feature and show its sparse ids and dense form.

    Prints the hashed id tensor first, then the dense tensor produced by
    ``input_layer`` for an indicator column over the same hashed column.
    """
    # Source data: four sample rows, one id each (shape [4, 1]).
    samples = {'color': [[2], [5], [-1], [0]]}
    lazy_builder = _LazyBuilder(samples)

    # Categorical column
    hashed_color = feature_column.categorical_column_with_hash_bucket(
        'color', 7, dtype=tf.int32)
    # Sparse tensor representation
    sparse_color = hashed_color._get_sparse_tensors(lazy_builder)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([sparse_color.id_tensor]))

    # indicator_column turns the sparse column into a dense one
    # (multi-hot when a row carries several values).
    color_indicator = feature_column.indicator_column(hashed_color)
    # input_layer joins the data source with the declared columns and
    # produces a new tensor.
    dense_output = feature_column.input_layer(samples, [color_indicator])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([dense_output]))
def test_weighted_categorical_column():
    """Parse the first decodable record of a base64 TFRecord dump.

    Reads ``new.tf.rec.base64`` line by line, skipping lines that fail
    base64 decoding or protobuf parsing (the error is written to stderr).
    For the first good record it builds a weighted categorical column,
    parses the example with the derived spec, prints the
    ``u_pocs_l1_norm`` feature, and stops.
    """
    # Context manager so the file is closed even when parsing raises.
    with open("new.tf.rec.base64", "r") as f_in:
        for line in f_in:
            try:
                b = base64.b64decode(line.strip())
            except Exception as e:
                # stderr.write needs a string, not the exception object.
                sys.stderr.write(str(e) + "\n")
                continue

            exa = example_pb2.Example()
            print("before parse proto...........")
            try:
                exa.ParseFromString(b)
            except Exception as e:
                sys.stderr.write(str(e) + "\n")
                continue
            print("after parse proto........")

            u_pocs_l1_norm = feature_column.categorical_column_with_hash_bucket(
                "u_pocs_l1_norm", 3000)
            u_pocs_l1_norm_weighted = feature_column.weighted_categorical_column(
                u_pocs_l1_norm, weight_feature_key='u_pocs_l1_norm_val')
            feature_columns = [u_pocs_l1_norm_weighted]

            features = tf.parse_single_example(
                b, tf.feature_column.make_parse_example_spec(feature_columns))
            print(features["u_pocs_l1_norm"])

            with tf.Session() as session:
                session.run(tf.global_variables_initializer())
                session.run(tf.tables_initializer())
                print(session.run(features["u_pocs_l1_norm"]))
            # Only the first successfully parsed record is inspected.
            break
def test_categorical_column_with_hash_bucket():
    """Demo: hash an integer feature and show its sparse ids and dense form.

    Prints the hashed id tensor first, then the dense tensor produced by
    ``input_layer`` for an indicator column over the same hashed column.
    """
    # 1. Input features
    inputs = {'color': [[2], [5], [-1], [0]]}
    lazy_builder = _LazyBuilder(inputs)

    # 2. Feature columns (sparse)
    hashed_color = feature_column.categorical_column_with_hash_bucket(
        'color', 7, dtype=tf.int32)
    sparse_color = hashed_color._get_sparse_tensors(lazy_builder)

    with tf.Session() as session:
        # Only the tables initializer is run; the variables initializer is
        # deliberately left disabled in this demo.
        session.run(tf.tables_initializer())
        print(session.run([sparse_color.id_tensor]))

    # 2. Feature columns (dense): wrap the categorical column in an
    # indicator column.
    color_indicator = feature_column.indicator_column(hashed_color)

    # 3. Feature tensor
    dense_output = feature_column.input_layer(inputs, [color_indicator])

    with tf.Session() as session:
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([dense_output]))
def hash_embedding(self, hash_bucket, embedding_dim, name):
    """Build an embedding and an indicator column over one hashed feature.

    Args:
        hash_bucket: number of hash buckets.
        embedding_dim: dimension of the embedding column.
        name: key of the string input feature.

    Returns:
        tuple: ``(embedding_column, indicator_column)`` sharing the same
        hashed categorical column.
    """
    hashed = feature_column.categorical_column_with_hash_bucket(
        name, hash_bucket, dtype=tf.string)
    embedded = feature_column.embedding_column(
        hashed, dimension=embedding_dim, combiner='mean')
    indicator = feature_column.indicator_column(hashed)
    return embedded, indicator
def hashed_columns(self, hashed_columns_dict):
    """Register an indicator column over a hash bucket for each entry.

    Args:
        hashed_columns_dict: mapping of column name to hash bucket size.

    Returns:
        The indicator column created for the last entry of the mapping, or
        ``None`` when the mapping is empty.
    """
    # Initialised up front so an empty mapping returns None instead of
    # raising UnboundLocalError at the return statement.
    hashed_feature = None
    for col_name, bucket_size in hashed_columns_dict.items():
        hashed_col = feature_column.categorical_column_with_hash_bucket(
            col_name, hash_bucket_size=bucket_size)
        hashed_feature = feature_column.indicator_column(hashed_col)
        self.sparse_columns[col_name] = hashed_feature
    return hashed_feature
def practise():
    """Demo: print hashed ids and the indicator output for feature 'x'."""
    samples = {
        'x': [['a', 'a'], ['b', 'c'], ['c', 'e'], ['d', ''], ['e', 'f']]
    }
    hashed = feature_column.categorical_column_with_hash_bucket('x', 5)
    indicator = feature_column.indicator_column(hashed)

    hashed_ids = hashed._get_sparse_tensors(_LazyBuilder(samples)).id_tensor
    dense_output = feature_column.input_layer(samples, indicator)

    with tf.Session() as sess:
        for tensor in (hashed_ids, dense_output):
            print(sess.run(tensor))
def create_feature_columns(note_emb_size=10, note_user_emb_size=6):
    """Build note/creator shared embeddings plus a bucketized duration.

    Args:
        note_emb_size: embedding dimension shared by the note-id columns.
        note_user_emb_size: embedding dimension shared by the creator-id
            columns.

    Returns:
        list: note embeddings, creator embeddings, then the bucketized
        video duration column.
    """
    # Categorical columns first: history lists and the current note/creator.
    history_creators = fc.categorical_column_with_hash_bucket(
        "last_note_creators", hash_bucket_size=2000, dtype=tf.string)
    history_notes = fc.categorical_column_with_hash_bucket(
        "last_note_ids", 20000, dtype=tf.int64)
    current_creator = fc.categorical_column_with_hash_bucket(
        "note_open_id", 2000)
    current_note = fc.categorical_column_with_hash_bucket(
        "note_id", 20000, dtype=tf.int64)

    duration_bucket = fc.bucketized_column(
        source_column=fc.numeric_column("note_video_duration"),
        boundaries=[5, 10, 30, 60])

    # History and current ids share one embedding table per entity type.
    note_emb = fc.shared_embedding_columns(
        [history_notes, current_note], note_emb_size, combiner='sum')
    creator_emb = fc.shared_embedding_columns(
        [history_creators, current_creator], note_user_emb_size,
        combiner='sum')

    # NOTE: phone brand/resolution/OS, gender, city and hour columns are
    # currently disabled and not part of the returned list.
    my_feature_columns = note_emb + creator_emb + [duration_bucket]

    print("*" * 100)
    print("feature columns:")
    for column in my_feature_columns:
        print(column)
    print("*" * 100)
    return my_feature_columns
def embeddings_columns(self, coldim_dict):
    """Register an embedding column over a hash bucket for each entry.

    The hash bucket size of each column is the square of its embedding
    dimension.

    Args:
        coldim_dict: mapping of column name to embedding dimension.

    Returns:
        The embedding column created for the last entry of the mapping, or
        ``None`` when the mapping is empty.
    """
    # Initialised up front so an empty mapping returns None instead of
    # raising UnboundLocalError at the return statement.
    embedding = None
    for col_name, dimension in coldim_dict.items():
        bucket_size = dimension * dimension
        hashed_col = feature_column.categorical_column_with_hash_bucket(
            col_name, hash_bucket_size=bucket_size)
        embedding = feature_column.embedding_column(hashed_col,
                                                    dimension=dimension)
        self.real_columns[col_name] = embedding
    return embedding
def build_census_emb_columns():
    """Build 32-d embedding columns for the census features.

    Numeric features are bucketized on 1000 evenly spaced boundaries over
    the range reported by ``get_census_numeric_feat_range()``; categorical
    features are hashed into 1000 buckets.

    Returns:
        tuple: ``(feature_columns, feat_field_size)``; numeric embeddings
        come first, categorical embeddings after.
    """
    n_range = get_census_numeric_feat_range()

    numeric_keys = ('age', 'education_num', 'capital_gain', 'capital_loss',
                    'hours_per_week')
    categorical_keys = ('gender', 'education', 'marital_status',
                        'relationship', 'workclass', 'native_country',
                        'occupation')

    feature_columns = []

    # Numeric feature embeddings
    for key in numeric_keys:
        edges = np.linspace(n_range[key][0], n_range[key][1], 1000)
        bucketized = fc.bucketized_column(fc.numeric_column(key),
                                          boundaries=list(edges))
        feature_columns.append(fc.embedding_column(bucketized, dimension=32))

    # Category feature embeddings
    for key in categorical_keys:
        hashed = fc.categorical_column_with_hash_bucket(
            key, hash_bucket_size=1000)
        feature_columns.append(fc.embedding_column(hashed, dimension=32))

    return feature_columns, len(feature_columns)
def create_linear_feature_columns():
    """Build, print and return the feature columns of the linear part."""
    def truncated_numeric(key):
        # Numeric feature passed through the external `truncate` normalizer.
        return fc.numeric_column(key, default_value=0.0,
                                 normalizer_fn=truncate)

    def identity(key, num_buckets, default_value):
        return fc.categorical_column_with_identity(
            key, num_buckets, default_value=default_value)

    # Device features
    phone_brand = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
    phone_resolution = fc.categorical_column_with_hash_bucket(
        "phoneResolution", 500)
    phone_os = fc.categorical_column_with_vocabulary_list(
        "phoneOs", ["android", "ios"], default_value=0)

    # Score and preference features
    match_score = fc.numeric_column("matchScore", default_value=0.0)
    pop_score = fc.numeric_column("popScore", default_value=0.0)
    brand_prefer = truncated_numeric("brandPrefer")
    cate2_prefer = truncated_numeric("cate2Prefer")
    cate_prefer = truncated_numeric("catePrefer")
    seller_prefer = truncated_numeric("sellerPrefer")

    # Integer-identity context features
    match_type = identity("matchType", 9, 0)
    position = identity("position", 201, 200)
    trigger_num = identity("triggerNum", 51, 50)
    trigger_rank = identity("triggerRank", 51, 50)
    scene_type = identity("type", 2, 0)
    hour = identity("hour", 24, 0)

    columns = [
        phone_brand, phone_resolution, phone_os, match_score, pop_score,
        brand_prefer, cate2_prefer, cate_prefer, seller_prefer, match_type,
        position, trigger_rank, trigger_num, scene_type, hour,
    ]
    print("linear feature columns:", columns)
    return columns
def create_embedding_feature_columns(shared_embedding_dim=64):
    """Build category-id columns that share one embedding table.

    Use this when several features should be embedded into the same space.

    Args:
        shared_embedding_dim: dimension of the shared embedding.

    Returns:
        The list returned by ``shared_embedding_columns`` for the weighted
        clicked-category column and the candidate category column.
    """
    # Category ids the user clicked on.
    clicked_ids = fc.categorical_column_with_hash_bucket(
        "behaviorC1ids", 100, dtype=tf.int64)
    # Attach a per-id weight to the clicked ids, somewhat like attention.
    clicked_ids_weighted = fc.weighted_categorical_column(
        clicked_ids, "c1idWeights")
    # Category id of the candidate.
    candidate_id = fc.categorical_column_with_hash_bucket(
        "cate1Id", 100, dtype=tf.int64)

    # Both inputs carry category ids; sharing the table keeps them in one
    # embedding space without making the features themselves identical.
    return fc.shared_embedding_columns(
        [clicked_ids_weighted, candidate_id], shared_embedding_dim,
        combiner='sum')
def shared_embedding_column_with_hash_bucket():
    """Demo: hash two integer features and embed them with a shared table.

    Prints both hashed columns and their sparse ids, then the shared
    embedding columns and the dense tensor ``input_layer`` builds from them.
    """
    samples = {
        'color': [[2, 2], [5, 5], [0, -1], [0, 0]],  # four rows, shape [4, 2]
        'color2': [[2], [5], [-1], [0]],             # four rows, shape [4, 1]
    }
    lazy_builder = _LazyBuilder(samples)

    # First categorical column and its sparse tensors
    hashed_color = feature_column.categorical_column_with_hash_bucket(
        'color', 7, dtype=tf.int32)
    print(hashed_color)
    sparse_color = hashed_color._get_sparse_tensors(lazy_builder)

    # Second categorical column and its sparse tensors
    hashed_color2 = feature_column.categorical_column_with_hash_bucket(
        'color2', 7, dtype=tf.int32)
    print(hashed_color2)
    sparse_color2 = hashed_color2._get_sparse_tensors(lazy_builder)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('categorical_column_with_hash_bucket' + '_' * 40)
        for sparse in (sparse_color, sparse_color2):
            print(session.run([sparse.id_tensor]))
        print('not use input_layer' + '_' * 40)

    # One embedding table (dimension 3) shared by both hashed columns.
    shared_embeddings = feature_column.shared_embedding_columns(
        [hashed_color2, hashed_color], 3, combiner='sum')
    print(type(shared_embeddings))
    print((shared_embeddings))
    dense_output = feature_column.input_layer(samples, shared_embeddings)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('shared_embedding_columns' + '_' * 40)
        print(session.run(dense_output))
def __init__(self, name, params):
    """Create the feature layer and dense stack of the MLP.

    Args:
        name: model variant; selects the width of the output layer and,
            for "MLP_FSIW", adds the numeric "elapse" feature.
        params: hyper-parameter mapping; ``params["l2_reg"]`` is the L2
            factor applied to the hidden layers' kernels.

    Raises:
        ValueError: if ``name`` is not one of the known variants.
    """
    super(MLP, self).__init__()
    self.model_name = name
    self.params = params

    # Features "0".."7": numeric, bucketized on boundaries j / (bins - 1).
    num_features = []
    for i in range(8):
        last_bin = num_bin_size[i] - 1
        boundaries = [j / last_bin for j in range(last_bin)]
        num_features.append(
            feature_column.bucketized_column(
                feature_column.numeric_column(str(i)),
                boundaries=boundaries))
    if name == "MLP_FSIW":
        print("using elapse feature")
        num_features.append(feature_column.numeric_column("elapse"))

    # Features "8".."16": hashed categorical, 8-d embeddings.
    cate_features = []
    for i in range(8, 17):
        hashed = feature_column.categorical_column_with_hash_bucket(
            str(i), hash_bucket_size=cate_bin_size[i - 8])
        cate_features.append(
            feature_column.embedding_column(hashed, dimension=8))

    all_features = num_features + cate_features
    self.feature_layer = tf.keras.layers.DenseFeatures(all_features)

    def hidden(units):
        # Leaky-ReLU dense layer with L2 kernel regularisation.
        return layers.Dense(
            units,
            activation=tf.nn.leaky_relu,
            kernel_regularizer=regularizers.l2(params["l2_reg"]))

    self.fc1 = hidden(256)
    self.bn1 = layers.BatchNormalization()
    self.fc2 = hidden(256)
    self.bn2 = layers.BatchNormalization()
    self.fc3 = hidden(128)
    self.bn3 = layers.BatchNormalization()
    print("build model {}".format(name))

    # Output layer: two units for the delay/tn_dp variants, one otherwise.
    if self.model_name in ("MLP_EXP_DELAY", "MLP_tn_dp"):
        self.fc4 = layers.Dense(2)
    elif self.model_name in ("MLP_SIG", "MLP_FSIW"):
        self.fc4 = layers.Dense(1)
    else:
        raise ValueError("model name {} not exist".format(name))
def create_user_feature_columns():
    """Build the user-profile feature columns.

    Returns:
        list: indicator columns for the identity-encoded profile fields,
        followed by the city embedding column(s).
    """
    # (feature key, number of identity buckets); default bucket is 0.
    identity_specs = (
        ("gender", 3),
        ("age_class", 7),
        ("has_baby", 2),
        ("baby_gender", 3),
        ("baby_age", 7),
        ("grade", 7),
        ("bi_rfm_type", 12),
        ("cate1_price_prefer", 6),
        ("cate2_price_prefer", 6),
        ("cate3_price_prefer", 6),
    )
    cols = [
        fc.indicator_column(
            fc.categorical_column_with_identity(
                key, num_buckets=num_buckets, default_value=0))
        for key, num_buckets in identity_specs
    ]

    city_id = fc.categorical_column_with_hash_bucket("city", 700)
    city = fc.shared_embedding_columns([city_id], 16)
    return cols + city
def transform(self, output_tensors):
    """Create a hash-bucket categorical column and store it by output name.

    Reads "input_tensor", "output_tensor" and "hash_bucket_size" from
    ``self.parameters``; the dtype comes from
    ``self.get_value_tf_type("dtype")`` and falls back to ``tf.string``.

    Args:
        output_tensors: mapping that receives the new column under the
            configured output tensor name.

    Raises:
        ParametersError: if "hash_bucket_size" is not configured.
    """
    input_tensor_name = self.parameters.get("input_tensor")
    output_tensor_name = self.parameters.get("output_tensor")

    # Resolve the dtype once; fall back to string when none is configured.
    dtype = self.get_value_tf_type("dtype")
    if dtype is None:
        dtype = tf.string

    # `in` replaces dict.has_key(), which no longer exists on Python 3.
    if "hash_bucket_size" not in self.parameters:
        msg = "parameters error, sparse_column_with_hash_bucket must need hash_bucket_size"
        logger.error(msg)
        raise ParametersError(msg)
    hash_bucket_size = self.parameters.get("hash_bucket_size")

    print("bucket output_tensor_name:",output_tensor_name)
    output_tensor = fc.categorical_column_with_hash_bucket(
        key=input_tensor_name,
        hash_bucket_size=hash_bucket_size,
        dtype=dtype
    )
    output_tensors[output_tensor_name] = output_tensor