def test_crossed_column():
    """Test crossed_column."""
    # Input data
    features = {
        'price': [['A'], ['B'], ['C']],  # 0, 1, 2
        'color': [['R'], ['G'], ['B']]   # 0, 1, 2
    }
    # Categorical columns
    price = feature_column.categorical_column_with_vocabulary_list(
        'price', ['A', 'B', 'C', 'D'])
    color = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'])
    # crossed_column produces a sparse representation
    p_x_c = feature_column.crossed_column([price, color], 16)
    # Dense representation
    p_x_c_identity = feature_column.indicator_column(p_x_c)
    # input_layer connects the crossed column to the input data
    p_x_c_identity_dense_tensor = feature_column.input_layer(
        features, [p_x_c_identity])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([p_x_c_identity_dense_tensor]))
def get_item_feature_columns(business_vocab_list, item_type_dict):
    items_feature_columns = []
    bucketized_boundary = {'stars': [2.5, 4]}
    embedding_size = {"categories": 8, "city": 4}
    for k, v in business_vocab_list.items():
        if k in ['review_count']:
            col = numeric_column(k, default_value=0, dtype=item_type_dict[k])
        elif k in ['stars']:
            col = bucketized_column(
                numeric_column(k, default_value=0, dtype=item_type_dict[k]),
                bucketized_boundary[k])
        elif k in ['categories', 'city']:
            col = embedding_column(
                categorical_column_with_vocabulary_list(
                    k, sorted(v), default_value=-1, dtype=item_type_dict[k]),
                dimension=embedding_size[k])
        else:
            col = indicator_column(
                categorical_column_with_vocabulary_list(
                    k, sorted(v), default_value=-1, dtype=item_type_dict[k]))
        items_feature_columns.append(col)
    return items_feature_columns
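# A minimal call sketch for get_item_feature_columns above; the vocab lists and
# dtype values here are hypothetical stand-ins, not values from the real dataset.
import tensorflow as tf

example_vocab = {
    'review_count': [],
    'stars': [],
    'city': ['Las Vegas', 'Phoenix'],
    'categories': ['Food', 'Bars'],
}
example_types = {
    'review_count': tf.int64,
    'stars': tf.float32,
    'city': tf.string,
    'categories': tf.string,
}
item_cols = get_item_feature_columns(example_vocab, example_types)
# -> [numeric, bucketized, 4-dim embedding, 8-dim embedding], in dict insertion order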
def create_feature_layer() -> tf.keras.layers.DenseFeatures:
    # Feature column for height
    feature_height = feature_column.numeric_column("Groesse")
    # Feature column for weight
    feature_weight = feature_column.numeric_column("Gewicht")
    # Feature column for age
    feature_age = feature_column.numeric_column("Alter")
    # Category column for gender
    feature_gender = feature_column.categorical_column_with_vocabulary_list(
        'Geschlecht', ['w', 'm'])
    feature_gender_one_hot = feature_column.indicator_column(feature_gender)
    # Category column for activities
    feature_activities = feature_column.categorical_column_with_vocabulary_list(
        'Betaetigung', ['keinSport', 'Kraftsport', 'Ausdauersport'])
    feature_activities_one_hot = feature_column.indicator_column(
        feature_activities)

    feature_columns = [
        feature_height, feature_weight, feature_age, feature_gender_one_hot,
        feature_activities_one_hot
    ]
    return tf.keras.layers.DenseFeatures(feature_columns)
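# A minimal sketch of applying the returned DenseFeatures layer to a raw batch
# (the sample values are made up; under TF 1.x a Session plus table
# initialization would be needed instead of eager execution):
import tensorflow as tf

layer = create_feature_layer()
sample_batch = {
    'Groesse': tf.constant([[180.0], [165.0]]),
    'Gewicht': tf.constant([[80.0], [55.0]]),
    'Alter': tf.constant([[30.0], [25.0]]),
    'Geschlecht': tf.constant([['m'], ['w']]),
    'Betaetigung': tf.constant([['Kraftsport'], ['keinSport']]),
}
dense = layer(sample_batch)  # shape [2, 8]: 3 numeric + 2 gender + 3 activity slots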
def _get_tf_feature_cols(dataframe: pd.DataFrame):
    feature_columns = []

    # numeric cols
    for header in ['PhotoAmt', 'Fee', 'Age']:
        feature_columns.append(feature_column.numeric_column(header))

    # bucketized cols
    age = feature_column.numeric_column('Age')
    age_buckets = feature_column.bucketized_column(
        age, boundaries=[1, 2, 3, 4, 5])
    feature_columns.append(age_buckets)

    # indicator columns
    indicator_column_names = [
        'Type', 'Color1', 'Color2', 'Gender', 'MaturitySize', 'FurLength',
        'Vaccinated', 'Sterilized', 'Health'
    ]
    for col_name in indicator_column_names:
        categorical_column = feature_column.categorical_column_with_vocabulary_list(
            col_name, dataframe[col_name].unique())
        indicator_column = feature_column.indicator_column(categorical_column)
        feature_columns.append(indicator_column)

    # embedding columns
    breed1 = feature_column.categorical_column_with_vocabulary_list(
        'Breed1', dataframe.Breed1.unique())
    breed1_embedding = feature_column.embedding_column(breed1, dimension=8)
    feature_columns.append(breed1_embedding)

    return feature_columns
def define_feature_columns(dataframe):
    print("Defining feature columns...")
    feature_columns = []

    # Create embedding column for name IDs
    name_id = feature_column.categorical_column_with_vocabulary_list(
        'nconst', dataframe.nconst.unique())
    # Dimension set to 30 (approximately the fourth root of the number of unique name IDs)
    name_id_embedding = feature_column.embedding_column(name_id, dimension=30)
    feature_columns.append(name_id_embedding)

    # Create indicator columns for category and genres
    indicator_column_names = ['category', 'genres']
    for col_name in indicator_column_names:
        categorical_column = feature_column.categorical_column_with_vocabulary_list(
            col_name, dataframe[col_name].unique())
        indicator_column = feature_column.indicator_column(categorical_column)
        feature_columns.append(indicator_column)

    # Create bucketized column for startYear (a.k.a. release date)
    start_year_numeric = feature_column.numeric_column('startYear')
    start_year_bucket = feature_column.bucketized_column(
        start_year_numeric,
        boundaries=[1927, 1940, 1950, 1960, 1970, 1980, 1990, 1995, 2000,
                    2005, 2010, 2015])
    feature_columns.append(start_year_bucket)

    print("Feature columns defined")
    return feature_columns
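# The dimension=30 above follows the common fourth-root rule of thumb for
# embedding sizes; a sketch of that arithmetic (the unique count is illustrative):
n_unique = 800000              # e.g. number of distinct nconst name IDs
dim = round(n_unique ** 0.25)  # 800000 ** 0.25 ~= 29.9 -> 30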
def create_feature_columns():
    # user features
    bids = fc.categorical_column_with_hash_bucket("behaviorBids", 10240, dtype=tf.int64)
    c1ids = fc.categorical_column_with_hash_bucket("behaviorC1ids", 100, dtype=tf.int64)
    cids = fc.categorical_column_with_hash_bucket("behaviorCids", 10240, dtype=tf.int64)
    sids = fc.categorical_column_with_hash_bucket("behaviorSids", 10240, dtype=tf.int64)
    pids = fc.categorical_column_with_hash_bucket("behaviorPids", 1000000, dtype=tf.int64)
    bids_weighted = fc.weighted_categorical_column(bids, "bidWeights")
    c1ids_weighted = fc.weighted_categorical_column(c1ids, "c1idWeights")
    cids_weighted = fc.weighted_categorical_column(cids, "cidWeights")
    sids_weighted = fc.weighted_categorical_column(sids, "sidWeights")
    pids_weighted = fc.weighted_categorical_column(pids, "pidWeights")

    # item features
    pid = fc.categorical_column_with_hash_bucket("productId", 1000000, dtype=tf.int64)
    sid = fc.categorical_column_with_hash_bucket("sellerId", 10240, dtype=tf.int64)
    bid = fc.categorical_column_with_hash_bucket("brandId", 10240, dtype=tf.int64)
    c1id = fc.categorical_column_with_hash_bucket("cate1Id", 100, dtype=tf.int64)
    cid = fc.categorical_column_with_hash_bucket("cateId", 10240, dtype=tf.int64)

    # context features
    matchScore = fc.numeric_column("matchScore", default_value=0.0)
    popScore = fc.numeric_column("popScore", default_value=0.0)
    brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0)
    cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0)
    catePrefer = fc.numeric_column("catePrefer", default_value=0.0)
    sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0)
    matchType = fc.indicator_column(fc.categorical_column_with_identity("matchType", 9, default_value=0))
    position = fc.indicator_column(fc.categorical_column_with_identity("position", 201, default_value=200))
    triggerNum = fc.indicator_column(fc.categorical_column_with_identity("triggerNum", 51, default_value=50))
    triggerRank = fc.indicator_column(fc.categorical_column_with_identity("triggerRank", 51, default_value=50))
    sceneType = fc.indicator_column(fc.categorical_column_with_identity("type", 2, default_value=0))
    hour = fc.indicator_column(fc.categorical_column_with_identity("hour", 24, default_value=0))
    phoneBrand = fc.indicator_column(fc.categorical_column_with_hash_bucket("phoneBrand", 1000))
    phoneResolution = fc.indicator_column(fc.categorical_column_with_hash_bucket("phoneResolution", 500))
    phoneOs = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))
    tab = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list(
            "tab",
            ["ALL", "TongZhuang", "XieBao", "MuYing", "NvZhuang",
             "MeiZhuang", "JuJia", "MeiShi"],
            default_value=0))

    # shared embeddings: behavior sequence and item id columns share one table each
    pid_embed = fc.shared_embedding_columns([pids_weighted, pid], 64, combiner='sum',
                                            shared_embedding_collection_name="pid")
    bid_embed = fc.shared_embedding_columns([bids_weighted, bid], 32, combiner='sum',
                                            shared_embedding_collection_name="bid")
    cid_embed = fc.shared_embedding_columns([cids_weighted, cid], 32, combiner='sum',
                                            shared_embedding_collection_name="cid")
    c1id_embed = fc.shared_embedding_columns([c1ids_weighted, c1id], 10, combiner='sum',
                                             shared_embedding_collection_name="c1id")
    sid_embed = fc.shared_embedding_columns([sids_weighted, sid], 32, combiner='sum',
                                            shared_embedding_collection_name="sid")

    global my_feature_columns
    my_feature_columns = [matchScore, matchType, position, triggerNum, triggerRank,
                          sceneType, hour, phoneBrand, phoneResolution, phoneOs, tab,
                          popScore, sellerPrefer, brandPrefer, cate2Prefer, catePrefer]
    my_feature_columns += pid_embed
    my_feature_columns += sid_embed
    my_feature_columns += bid_embed
    my_feature_columns += cid_embed
    my_feature_columns += c1id_embed
    print("feature columns:", my_feature_columns)
    return my_feature_columns
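# Standalone sketch of the shared-embedding idea used above (toy key names and
# bucket sizes, not from the source): shared_embedding_columns returns one
# column per input, all reading the same embedding variable, so the user's
# behaviour sequence and the candidate item id land in the same vector space.
import tensorflow as tf
from tensorflow import feature_column as fc

hist = fc.categorical_column_with_hash_bucket('hist_ids', 100, dtype=tf.int64)
item = fc.categorical_column_with_hash_bucket('item_id', 100, dtype=tf.int64)
hist_emb, item_emb = fc.shared_embedding_columns([hist, item], 8, combiner='sum')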
def add_feature_columns(train_df):
    feature_columns = []
    feature_columns.append(tf.feature_column.numeric_column('day_of_year'))
    feature_columns.append(tf.feature_column.numeric_column('hour_of_day'))

    # boundaries_day_of_year = list(np.arange(int(min(train_df['day_of_year'])), int(max(train_df['day_of_year'])), 1))
    # bucketized_day_of_year = tf.feature_column.bucketized_column(tf.feature_column.numeric_column("day_of_year"), boundaries_day_of_year)
    # boundaries_hour = list(np.arange(int(min(train_df['hour'])), int(max(train_df['hour'])), 1))
    # bucketized_hour = tf.feature_column.bucketized_column(tf.feature_column.numeric_column("hour"), boundaries_hour)
    # latitude_x_longitude = tf.feature_column.crossed_column([bucketized_day_of_year, bucketized_hour], hash_bucket_size=2200)
    # crossed_feature = tf.feature_column.indicator_column(latitude_x_longitude)
    # feature_columns.append(crossed_feature)

    wind_speed = tf.feature_column.numeric_column("wind_speed")
    feature_columns.append(wind_speed)
    wind_direction = tf.feature_column.numeric_column("wind_direction")
    feature_columns.append(wind_direction)
    humidity = tf.feature_column.numeric_column("humidity")
    feature_columns.append(humidity)
    temperature = tf.feature_column.numeric_column("temperature")
    feature_columns.append(temperature)
    increased_traffic = tf.feature_column.numeric_column("increased_traffic")
    feature_columns.append(increased_traffic)

    print(train_df['weather_code'].unique())
    weather_current = tf.feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list(
            "weather_code", train_df['weather_code'].unique()))
    feature_columns.append(weather_current)

    print(train_df['past_weather_code'].unique())
    weather_past = tf.feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list(
            "past_weather_code", train_df['past_weather_code'].unique()))
    feature_columns.append(weather_past)

    # pm10_last = tf.feature_column.numeric_column("pm10_last")
    # feature_columns.append(pm10_last)
    # pm25_last = tf.feature_column.numeric_column("pm25_last")
    # feature_columns.append(pm25_last)
    # pm10_last2 = tf.feature_column.numeric_column("pm10_last2")
    # feature_columns.append(pm10_last2)
    # pm25_last2 = tf.feature_column.numeric_column("pm25_last2")
    # feature_columns.append(pm25_last2)

    return feature_columns
def get_feature_columns(dataframe):
    """Creates feature columns from pd.DataFrame."""
    feature_columns = []
    feature_layer_inputs = {}

    # numeric cols
    for col_name in ['PhotoAmt', 'Fee', 'Age']:
        feature_columns.append(feature_column.numeric_column(col_name))
        feature_layer_inputs[col_name] = tf.keras.Input(shape=(1,), name=col_name)

    # bucketized cols
    age = feature_column.numeric_column('Age')
    age_buckets = feature_column.bucketized_column(age, boundaries=[1, 2, 3, 4, 5])
    feature_columns.append(age_buckets)

    # indicator columns
    indicator_column_names = [
        'Type', 'Color1', 'Color2', 'Gender', 'MaturitySize', 'FurLength',
        'Vaccinated', 'Sterilized', 'Health'
    ]
    for col_name in indicator_column_names:
        categorical_column = feature_column.categorical_column_with_vocabulary_list(
            col_name, dataframe[col_name].unique())
        indicator_column = feature_column.indicator_column(categorical_column)
        feature_columns.append(indicator_column)
        feature_layer_inputs[col_name] = tf.keras.Input(shape=(1,), name=col_name,
                                                        dtype=tf.string)

    # embedding columns
    breed1 = feature_column.categorical_column_with_vocabulary_list(
        'Breed1', dataframe.Breed1.unique())
    breed1_embedding = feature_column.embedding_column(breed1, dimension=16)
    feature_columns.append(breed1_embedding)
    feature_layer_inputs['Breed1'] = tf.keras.Input(shape=(1,), name='Breed1',
                                                    dtype=tf.string)

    # crossed columns
    animal_type = feature_column.categorical_column_with_vocabulary_list(
        'Type', ['Cat', 'Dog'])
    feature_columns.append(feature_column.indicator_column(animal_type))
    age_type_feature = feature_column.crossed_column(
        [age_buckets, animal_type], hash_bucket_size=100)
    feature_columns.append(feature_column.indicator_column(age_type_feature))
    feature_layer_inputs['Type'] = tf.keras.Input(shape=(1,), name='Type',
                                                  dtype=tf.string)

    return feature_columns, feature_layer_inputs
def create_feature_columns(train_data):
    n_users = train_data.user.nunique()
    users = fc.categorical_column_with_vocabulary_list(
        "user", np.arange(n_users), default_value=-1, dtype=tf.int64)
    gender = fc.categorical_column_with_vocabulary_list("gender", ["M", "F"])
    age = fc.categorical_column_with_vocabulary_list(
        "age", [1, 18, 25, 35, 45, 50, 56], dtype=tf.int64)
    occupation = fc.categorical_column_with_vocabulary_list(
        "occupation", np.arange(21), dtype=tf.int64)
    all_feature_cols = [
        fc.embedding_column(users, 32),
        fc.indicator_column(gender),
        fc.embedding_column(age, 32),
        fc.embedding_column(occupation, 32),
    ]
    return all_feature_cols
def _generate_cat_column(name, data, vocab_threshold=50, bucket_size=100):
    """Generate a feature column from a categorical string data set.

    Parameters
    ----------
    name : str
        Name of the categorical column.
    data : np.ndarray | list
        String data array.
    vocab_threshold : int
        Number of unique entries in the data array below which this will
        use a vocabulary list, above which a hash bucket will be used.
    bucket_size : int
        Hash bucket size.

    Returns
    -------
    f_col : IndicatorColumn
        Categorical feature column.
    """
    n_unique = len(set(data))
    if n_unique < vocab_threshold:
        f_col = feature_column.categorical_column_with_vocabulary_list(
            name, list(set(data)))
    else:
        f_col = feature_column.categorical_column_with_hash_bucket(
            name, bucket_size)
    f_col = feature_column.indicator_column(f_col)
    return f_col
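# A quick sketch exercising both branches of _generate_cat_column (the data
# values are made up): a low-cardinality field stays under vocab_threshold and
# yields a vocabulary-list column; a high-cardinality one falls back to hashing.
small_col = _generate_cat_column('state', ['CA', 'NY', 'CA', 'TX'])        # vocab list
large_col = _generate_cat_column('user_id', [str(i) for i in range(500)])  # hash bucket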
def test_weighted_categorical_column():
    # 1. Input features
    color_data = {
        'color': [['R'], ['G'], ['B'], ['A']],
        'weight': [[1.0], [2.0], [4.0], [8.0]]
    }
    # 2. Feature columns (Sparse)
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    # 2. Feature columns (Sparse): attach per-example weights
    color_weight_categorical_column = \
        feature_column.weighted_categorical_column(color_column, 'weight')

    builder = _LazyBuilder(color_data)
    id_tensor, weight = color_weight_categorical_column._get_sparse_tensors(
        builder)

    with tf.Session() as session:
        # session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('weighted categorical' + '-' * 40)
        print(session.run([id_tensor]))
        print('-' * 40)
        print(session.run([weight]))

    # 2. Feature columns (Dense)
    weighted_column = feature_column.indicator_column(
        color_weight_categorical_column)
    # 3. Feature tensor
    weighted_column_dense_tensor = feature_column.input_layer(
        color_data, [weighted_column])

    with tf.Session() as session:
        # session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([weighted_column_dense_tensor]))
def test_embedding():
    tf.set_random_seed(1)
    # 1. Input features
    color_data = {'color': [['R', 'G'], ['G', 'A'], ['B', 'B'], ['A', 'A']]}
    builder = _LazyBuilder(color_data)
    # 2. Feature columns (Sparse)
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    color_column_tensor = color_column._get_sparse_tensors(builder)

    with tf.Session() as session:
        # session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))

    # 2. Feature columns (Dense)
    color_embedding = feature_column.embedding_column(color_column, 4,
                                                      combiner='sum')
    # 3. Feature tensor
    color_embedding_dense_tensor = feature_column.input_layer(
        color_data, [color_embedding])

    with tf.Session() as session:
        # Embedding needs variables (weights) to do the embedding
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('embedding' + '_' * 40)
        print(session.run([color_embedding_dense_tensor]))
def test_embedding():
    tf.set_random_seed(1)
    # Input data: 4 example rows
    color_data = {'color': [['R', 'G'], ['G', 'A'], ['B', 'B'], ['A', 'A']]}
    builder = _LazyBuilder(color_data)
    # To embed a column, first express it as a categorical_column; this only
    # declares the column, it carries no data yet
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    # Express the data source as a (sparse) tensor
    color_column_tensor = color_column._get_sparse_tensors(builder)
    # Build the embedding_column: the first argument is the categorical_column,
    # the second is the embedding dimension
    color_embedding_column = feature_column.embedding_column(color_column, 4,
                                                             combiner='sum')
    # input_layer(data, columns) connects the data source to the
    # embedding_column and yields a dense tensor
    color_embedding_dense_tensor = feature_column.input_layer(
        color_data, [color_embedding_column])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))
        print('embedding' + '_' * 40)
        print(session.run([color_embedding_dense_tensor]))
def make_columns():
    user_id = fc.embedding_column(
        fc.categorical_column_with_vocabulary_file(
            'user_id', vocabulary_file='user_id', dtype=tf.string,
            num_oov_buckets=4),
        dimension=6)
    partner_ids = fc.categorical_column_with_vocabulary_file(
        'reserve_partner_car_type_id', vocabulary_file='partner_car_type_id',
        dtype=tf.string, num_oov_buckets=1)
    partner_ids_embedding = fc.embedding_column(partner_ids, dimension=3)
    # partner_ids_embedding = fc.indicator_column(partner_ids)
    dayofweek = fc.embedding_column(
        fc.categorical_column_with_vocabulary_list(
            'dayofweek', [str(d) for d in range(0, 8)], dtype=tf.string,
            num_oov_buckets=1),
        dimension=3)
    timeSlice = fc.embedding_column(
        fc.categorical_column_with_vocabulary_file(
            'timeSlice', vocabulary_file='timeSlice', dtype=tf.string,
            num_oov_buckets=1),
        dimension=3)
    sHexID = fc.embedding_column(
        fc.categorical_column_with_vocabulary_file(
            'sHexID', vocabulary_file='sHexID', num_oov_buckets=1),
        dimension=6)
    eHexID = fc.embedding_column(
        fc.categorical_column_with_vocabulary_file(
            'eHexID', vocabulary_file='eHexID', num_oov_buckets=1),
        dimension=6)

    order_columns = [fc.numeric_column('dist')]
    user_columns = [fc.numeric_column(c)
                    for c in user_null_columns + user_float32_columns]
    spacetime_columns = [fc.numeric_column(c)
                         for c in int64_columns + float32_columns]
    embedding_columns = [user_id, partner_ids_embedding, dayofweek, timeSlice,
                         sHexID]
    # embedding_columns = [dayofweek]
    # return embedding_columns + order_columns, embedding_columns + order_columns + spacetime_columns
    return embedding_columns, order_columns, spacetime_columns, user_columns
def test_weighted_categorical_column():
    color_data = {
        'color': [['R'], ['G'], ['B'], ['A']],
        'weight': [[1.0], [2.0], [4.0], [8.0]]
    }  # 4 example rows
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    color_weight_categorical_column = feature_column.weighted_categorical_column(
        color_column, 'weight')
    builder = _LazyBuilder(color_data)

    with tf.Session() as session:
        id_tensor, weight = color_weight_categorical_column._get_sparse_tensors(
            builder)
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('weighted categorical' + '-' * 40)
        print(session.run([id_tensor]))
        print('-' * 40)
        print(session.run([weight]))
def participant_focus(part_f):
    participant_focus = feature_column.categorical_column_with_vocabulary_list(
        part_f, [
            'engineering', 'sales', 'marketing', 'management', 'financial',
            'other', 'none'
        ])
    return participant_focus
def _add_weighted_embedding_columns(columns, features, feature_table, vocabulary):
    for f in features:
        assert f in feature_table
        weighted_column = fc.weighted_categorical_column(
            fc.categorical_column_with_vocabulary_list(f, vocabulary.vocab[f]),
            f + _WEIGHTED_SUFFIX)
        emb_weighted_column = fc.embedding_column(
            weighted_column, feature_table[f].emb_width, combiner='sqrtn')
        columns.append(emb_weighted_column)
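# Hypothetical call sketch for _add_weighted_embedding_columns; FeatureSpec,
# the vocab contents, and the '_weight' suffix value are all assumptions made
# up for illustration (the real _WEIGHTED_SUFFIX is defined elsewhere).
import collections

FeatureSpec = collections.namedtuple('FeatureSpec', ['emb_width'])

class _VocabStub:
    vocab = {'tags': ['a', 'b', 'c']}

_WEIGHTED_SUFFIX = '_weight'  # assumed; weights then arrive under 'tags_weight'
cols = []
_add_weighted_embedding_columns(cols, ['tags'], {'tags': FeatureSpec(8)}, _VocabStub())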
def test_weighted_cate_column():
    # NOTE: id='' stands for missing; its weight must be 0, otherwise the id
    # and weight tensors end up with different lengths and an error is raised.
    # Also, weights must be float; passing int raises an error.
    x_values = {
        'id': [[b'a', b'z', b'a', b'c'], [b'b', b'', b'd', b'b']],
        'weight': [[1.0, 2.0, -3.0, 4.0], [5.0, 0.0, 7.0, -8.0]]
    }
    builder = _LazyBuilder(x_values)  # lazy representation of input

    # ================== define ops
    sparse_id_featcol = feature_column.categorical_column_with_vocabulary_list(
        'id', ['a', 'b', 'c'], dtype=tf.string, default_value=-1)
    sparse_featcol = feature_column.weighted_categorical_column(
        categorical_column=sparse_id_featcol, weight_feature_key='weight')
    x_sparse_tensor = sparse_featcol._get_sparse_tensors(builder)

    # indicator_column turns the sparse tensor into a dense multi-hot encoding
    # of shape [batch_size, #tokens], where each entry is the sum of all
    # weights with which that token occurs
    dense_featcol = feature_column.indicator_column(sparse_featcol)
    x_dense_tensor = feature_column.input_layer(x_values, [dense_featcol])

    # ================== run
    with tf.Session() as sess:
        # the lookup table must be initialized, otherwise an error is raised
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())

        id_sparse_value, weight_sparse_value = sess.run(
            [x_sparse_tensor.id_tensor, x_sparse_tensor.weight_tensor])

        print("************************* sparse id tensor")
        # the sparse id_tensor keeps the shape of the raw input,
        # [batch_size, max_tokens_per_example] = [2, 4]:
        # SparseTensorValue(indices=array(
        #     [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 2], [1, 3]]),
        #     values=array([ 0, -1,  0,  2,  1, -1,  1]), dense_shape=array([2, 4]))
        print(id_sparse_value)

        print("************************* sparse weight tensor")
        # the sparse weight_tensor also keeps the shape of the raw input,
        # [batch_size, max_tokens_per_example] = [2, 4]:
        # SparseTensorValue(indices=array(
        #     [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 2], [1, 3]]),
        #     values=array([ 1.,  2., -3.,  4.,  5.,  7., -8.], dtype=float32),
        #     dense_shape=array([2, 4]))
        print(weight_sparse_value)

        print("************************* dense MHE tensor")
        # indicator_column converts the sparse tensor into a dense multi-hot
        # tensor of shape [batch_size, total_tokens_in_vocab], where each value
        # is the sum of all weights with which that token occurs:
        # [[-2.  0.  4.]
        #  [ 0. -3.  0.]]
        print(sess.run(x_dense_tensor))
def build_feature_layer():
    feature_columns = []

    report_id = feature_column.categorical_column_with_vocabulary_list(
        'report_id', [1, 2, 3, 4, 5])
    report_id_one_hot = feature_column.indicator_column(report_id)
    feature_columns.append(report_id_one_hot)

    feature_columns.append(feature_column.numeric_column('report_params'))

    day_part = feature_column.categorical_column_with_vocabulary_list(
        'day_part', [1, 2, 3])
    day_part_one_hot = feature_column.indicator_column(day_part)
    feature_columns.append(day_part_one_hot)

    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
    return feature_layer
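# A minimal sketch wiring the feature layer into a Keras model (hidden sizes
# are illustrative); train_ds is assumed to yield
# ({'report_id': ..., 'report_params': ..., 'day_part': ...}, label) batches.
import tensorflow as tf

model = tf.keras.Sequential([
    build_feature_layer(),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy')
# model.fit(train_ds, epochs=5)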
def transform(self, output_tensors):
    input_tensor_name = self.parameters.get("input_tensor")
    output_tensor_name = self.parameters.get("output_tensor")

    if "vocabulary_list" in self.parameters:
        vocabulary_list = self.parameters.get("vocabulary_list")
        if isinstance(vocabulary_list, list):
            vocabulary_list = tuple(vocabulary_list)
        if isinstance(vocabulary_list, str):
            vocabulary_list = tuple(vocabulary_list.split(','))
    else:
        msg = "parameters error, sparse_column_with_keys must have keys"
        logger.error(msg)
        raise ParametersError(msg)

    # combiner = self.parameters.get("combiner", 'sum')
    dtype = self.get_value_tf_type("dtype") \
        if self.get_value_tf_type("dtype") is not None else tf.string
    # default_value = self.parameters.get("default_value", None)
    num_oov_buckets = 1

    output_tensors[output_tensor_name] = fc.categorical_column_with_vocabulary_list(
        key=input_tensor_name,
        vocabulary_list=vocabulary_list,
        dtype=dtype,
        # default_value=default_value,
        num_oov_buckets=num_oov_buckets)
def test_linear_model():
    features = {
        'price': [[1.0], [5.0], [10.0]],
        'color': [['R'], ['G'], ['B']]
    }
    price_column = feature_column.numeric_column('price')
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'])
    prediction = feature_column.linear_model(features,
                                             [price_column, color_column])
    bias = get_linear_model_bias()
    price_var = get_linear_model_column_var(price_column)
    color_var = get_linear_model_column_var(color_column)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(tf.tables_initializer())

        sess.run(bias.assign([7.0]))
        sess.run(price_var.assign([[10.0]]))
        sess.run(color_var.assign([[2.0], [2.0], [2.0]]))

        prediction_result = sess.run([prediction])
        print(prediction_result)
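# Sanity check for the weights assigned above: per row the linear model
# computes price * 10.0 + color_weight + bias, with every color weight 2.0 and
# bias 7.0, so the printed result should be [[19.0], [59.0], [107.0]].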
def create_feature_columns():
    # user features
    phoneBrandId = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
    phoneResolutionId = fc.categorical_column_with_hash_bucket("phoneResolution", 500)
    phoneBrand = fc.embedding_column(phoneBrandId, 20)
    phoneResolution = fc.embedding_column(phoneResolutionId, 10)
    phoneOs = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"],
                                                   default_value=0))

    # context features
    matchScore = fc.numeric_column("matchScore", default_value=0.0)
    popScore = fc.numeric_column("popScore", default_value=0.0)
    brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0, normalizer_fn=truncate)
    cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0, normalizer_fn=truncate)
    catePrefer = fc.numeric_column("catePrefer", default_value=0.0, normalizer_fn=truncate)
    sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0, normalizer_fn=truncate)
    matchType = fc.indicator_column(fc.categorical_column_with_identity("matchType", 9, default_value=0))
    position = fc.indicator_column(fc.categorical_column_with_identity("position", 201, default_value=200))
    triggerNum = fc.indicator_column(fc.categorical_column_with_identity("triggerNum", 51, default_value=50))
    triggerRank = fc.indicator_column(fc.categorical_column_with_identity("triggerRank", 51, default_value=50))
    sceneType = fc.indicator_column(fc.categorical_column_with_identity("type", 2, default_value=0))
    hour = fc.indicator_column(fc.categorical_column_with_identity("hour", 24, default_value=0))

    global my_feature_columns
    my_feature_columns = [matchScore, matchType, position, triggerNum, triggerRank,
                          sceneType, hour, phoneBrand, phoneResolution, phoneOs,
                          popScore, sellerPrefer, brandPrefer, cate2Prefer, catePrefer]
    print("feature columns:", my_feature_columns)
    return my_feature_columns
def get_unique_categories_and_append(key):
    # relies on the module-level df and feature_columns
    col = df[key]
    arr = col.to_numpy()
    unique_arr = np.unique(arr)
    feat_col = feature_column.categorical_column_with_vocabulary_list(
        key, unique_arr)
    one_hot = feature_column.indicator_column(feat_col)
    feature_columns.append(one_hot)
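# A self-contained sketch of how the helper above might be driven; the sample
# frame is made up for illustration.
import numpy as np
import pandas as pd

df = pd.DataFrame({'color': ['R', 'G', 'B', 'G']})
feature_columns = []
get_unique_categories_and_append('color')  # appends one indicator column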
def test_multi_value_embedding():
    color_data = {
        'color': [['G', 'G'], ['G', 'B'], ['B', 'B'], ['G', 'R'], ['R', 'R'],
                  ['B', 'R']]
    }
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    color_embedding = feature_column.embedding_column(color_column, 7)
    color_embedding_dense_tensor = feature_column.input_layer(
        color_data, [color_embedding])

    builder = _LazyBuilder(color_data)
    color_column_tensor = color_column._get_sparse_tensors(builder)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('embedding' + '-' * 40)
        print(session.run([color_embedding_dense_tensor]))
def test_categorical_column_with_vocabulary_list():
    color_data = {
        'color': [['R', 'R'], ['G', 'R'], ['B', 'G'], ['A', 'A']]
    }  # 4 example rows
    builder = _LazyBuilder(color_data)
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    color_column_tensor = color_column._get_sparse_tensors(builder)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))

    # Convert the sparse tensor to dense one-hot form -- multi-hot here, since
    # each example carries two tokens
    color_column_identity = feature_column.indicator_column(color_column)
    color_dense_tensor = feature_column.input_layer(color_data,
                                                    [color_column_identity])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([color_dense_tensor]))
def build_feature_columns():
    age = feature_column.numeric_column('age')
    age_bucket = feature_column.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    workclass = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list('workclass', [
            'Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov',
            'Local-gov', 'State-gov', 'Without-pay', 'Never-worked'
        ]))
    fnlwgt = feature_column.numeric_column('fnlwgt')
    education = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list('education', [
            'Bachelors', 'Some-college', '11th', 'HS-grad', 'Prof-school',
            'Assoc-acdm', 'Assoc-voc', '9th', '7th-8th', '12th', 'Masters',
            '1st-4th', '10th', 'Doctorate', '5th-6th', 'Preschool'
        ]))
    education_num = feature_column.numeric_column('education_num')
    marital_status = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list('marital_status', [
            'Married-civ-spouse', 'Divorced', 'Never-married', 'Separated',
            'Widowed', 'Married-spouse-absent', 'Married-AF-spouse'
        ]))
    occupation = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list('occupation', [
            'Tech-support', 'Craft-repair', 'Other-service', 'Sales',
            'Exec-managerial', 'Prof-specialty', 'Handlers-cleaners',
            'Machine-op-inspct', 'Adm-clerical', 'Farming-fishing',
            'Transport-moving', 'Priv-house-serv', 'Protective-serv',
            'Armed-Forces'
        ]))
    relationship = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list('relationship', [
            'Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative',
            'Unmarried'
        ]))
    race = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list('race', [
            'White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black'
        ]))
    gender = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list(
            'gender', ['Female', 'Male']))
    capital_gain = feature_column.numeric_column('capital_gain')
    capital_loss = feature_column.numeric_column('capital_loss')
    hours_per_week = feature_column.numeric_column('hours_per_week')
    native_country = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list('native_country', [
            'United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada',
            'Germany', 'Outlying-US(Guam-USVI-etc)', 'India', 'Japan', 'Greece',
            'South', 'China', 'Cuba', 'Iran', 'Honduras', 'Philippines', 'Italy',
            'Poland', 'Jamaica', 'Vietnam', 'Mexico', 'Portugal', 'Ireland',
            'France', 'Dominican-Republic', 'Laos', 'Ecuador', 'Taiwan', 'Haiti',
            'Columbia', 'Hungary', 'Guatemala', 'Nicaragua', 'Scotland',
            'Thailand', 'Yugoslavia', 'El-Salvador', 'Trinadad&Tobago', 'Peru',
            'Hong', 'Holand-Netherlands'
        ]))
    race_gender = feature_column.indicator_column(
        feature_column.crossed_column([
            feature_column.categorical_column_with_vocabulary_list('race', [
                'White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other',
                'Black'
            ]),
            feature_column.categorical_column_with_vocabulary_list(
                'gender', ['Female', 'Male'])
        ], hash_bucket_size=10))

    wide = [age_bucket, workclass, fnlwgt, education, education_num, occupation,
            relationship, race, gender, capital_gain, capital_loss,
            hours_per_week, native_country, race_gender]
    deep = [age, workclass, fnlwgt, education, education_num, occupation,
            relationship, race, gender, capital_gain, capital_loss,
            hours_per_week, native_country]
    return (wide, deep)
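# A minimal sketch of how the (wide, deep) split is typically consumed by a
# wide-and-deep estimator; the hidden units and the input_fn are illustrative,
# not from the source.
import tensorflow as tf

wide, deep = build_feature_columns()
estimator = tf.estimator.DNNLinearCombinedClassifier(
    linear_feature_columns=wide,
    dnn_feature_columns=deep,
    dnn_hidden_units=[128, 64])
# estimator.train(input_fn=train_input_fn)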
def data_preprocessing(self):
    """
    batch_size = 5  # a small batch size is used for this example
    train_ds = self.df_to_dataset(self.train, batch_size=batch_size)
    val_ds = self.df_to_dataset(self.val, shuffle=False, batch_size=batch_size)
    test_ds = self.df_to_dataset(self.test, shuffle=False, batch_size=batch_size)

    for feature_batch, label_batch in train_ds.take(1):
        print('All features:', list(feature_batch.keys()))
        print('A batch of ages:', feature_batch['age'])
        print('A batch of targets:', label_batch)

    # build a sample batch to try out the feature columns
    self.example_batch = next(iter(train_ds))[0]
    age = feature_column.numeric_column("age")
    self.demo(age)
    """
    feature_columns = []

    # numeric columns
    for header in [
            'age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca'
    ]:
        feature_columns.append(feature_column.numeric_column(header))

    # bucketized column
    age = feature_column.numeric_column("age")
    age_buckets = feature_column.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    feature_columns.append(age_buckets)

    # categorical column
    thal = feature_column.categorical_column_with_vocabulary_list(
        'thal', ['fixed', 'normal', 'reversible'])
    thal_one_hot = feature_column.indicator_column(thal)
    feature_columns.append(thal_one_hot)

    # embedding column
    thal_embedding = feature_column.embedding_column(thal, dimension=8)
    feature_columns.append(thal_embedding)

    # crossed feature column
    crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                    hash_bucket_size=1000)
    crossed_feature = feature_column.indicator_column(crossed_feature)
    feature_columns.append(crossed_feature)

    self.feature_layer = layers.DenseFeatures(feature_columns)

    batch_size = 32
    self.train_ds = self.df_to_dataset(self.train, batch_size=batch_size)
    self.val_ds = self.df_to_dataset(self.val, shuffle=False,
                                     batch_size=batch_size)
    self.test_ds = self.df_to_dataset(self.test, shuffle=False,
                                      batch_size=batch_size)
def create_feature_layer(df):
    week = feature_column.numeric_column("Week")
    week = feature_column.bucketized_column(week, boundaries=list(range(1, 53)))

    day = feature_column.numeric_column("Day")
    day = feature_column.bucketized_column(day, boundaries=list(range(1, 8)))

    year = feature_column.numeric_column("Year")
    year = feature_column.bucketized_column(year,
                                            boundaries=list(range(2013, 2017)))

    hour = feature_column.numeric_column("std_hour")
    hour = feature_column.bucketized_column(hour, boundaries=list(range(0, 24)))

    arrival = feature_column.categorical_column_with_vocabulary_list(
        "Arrival", vocabulary_list=pd.Series.unique(df.Arrival).tolist())
    airline = feature_column.categorical_column_with_vocabulary_list(
        "Airline", vocabulary_list=pd.Series.unique(df.Airline).tolist())
    flight_no = feature_column.categorical_column_with_vocabulary_list(
        "flight_no", vocabulary_list=pd.Series.unique(df.flight_no).tolist())

    arrival_one_hot = feature_column.indicator_column(arrival)
    airline_one_hot = feature_column.indicator_column(airline)
    flight_no_one_hot = feature_column.indicator_column(flight_no)

    arrival_length = len(pd.Series.unique(df.Arrival).tolist())
    arrival_and_week = feature_column.crossed_column(
        [arrival, week], hash_bucket_size=(arrival_length * 52))
    arrival_and_week = feature_column.indicator_column(arrival_and_week)

    airline_length = len(pd.Series.unique(df.Airline).tolist())
    year_and_airline = feature_column.crossed_column(
        [year, airline], hash_bucket_size=(airline_length * 4))
    year_and_airline = feature_column.indicator_column(year_and_airline)

    feature_columns = [
        week, arrival_one_hot, airline_one_hot, flight_no_one_hot, hour,
        arrival_and_week, year, year_and_airline
    ]
    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
    return feature_layer
def _add_embedding_columns(columns, features, feature_table, vocabulary):
    for f in features:
        assert f in feature_table
        cate_col = fc.categorical_column_with_vocabulary_list(
            f, vocabulary.vocab[f])
        column = fc.embedding_column(cate_col, feature_table[f].emb_width,
                                     combiner='sqrtn')
        columns.append(column)
def test_crossed_column():
    features = {
        'price': [['A', 'A'], ['B', 'D'], ['C', 'A']],
        'color': [['R', 'R'], ['G', 'G'], ['B', 'B']]
    }
    price = feature_column.categorical_column_with_vocabulary_list(
        'price', ['A', 'B', 'C', 'D'])
    color = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'])
    p_x_c = feature_column.crossed_column([price, color], 16)
    # indicator_column converts the sparse crossed column into a dense
    # multi-hot vector over the 16 hash buckets
    p_x_c_identity = feature_column.indicator_column(p_x_c)
    p_x_c_identity_dense_tensor = feature_column.input_layer(
        features, [p_x_c_identity])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([p_x_c_identity_dense_tensor]))
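# Conceptual sketch of what the crossed column computes per example; plain
# hash() below only approximates the idea (TF hashes the cross internally,
# via FingerprintCat64):
#   bucket = hash((price_token, color_token)) % 16
# Each example's 2 price values x 2 color values give 4 crossed tokens, so each
# row of the printed multi-hot tensor sums to 4 (collisions add into one bucket).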