def _build_census_wide_columns(numeric_range=None):
    base_columns, cross_columns = [], []
    for col in ALI_DISPLAY_ADS_CONFIG['wide_muti_hot_cols']:
        base_columns.append(
            fc.indicator_column(
                fc.categorical_column_with_hash_bucket(
                    col,
                    hash_bucket_size=1000
                    if ALI_DISPLAY_ADS_CONFIG['vocab_size'][col] <= 1000
                    else ALI_DISPLAY_ADS_CONFIG['vocab_size'][col] + 10000)))
    for col in ALI_DISPLAY_ADS_CONFIG['wide_bucket_cols']:
        base_columns.append(
            fc.bucketized_column(
                fc.numeric_column(col),
                boundaries=list(
                    np.linspace(numeric_range[col][0], numeric_range[col][1],
                                1000))))
    for col in ALI_DISPLAY_ADS_CONFIG['wide_cross_cols']:
        cross_columns.append(
            fc.indicator_column(
                fc.crossed_column([col[0], col[1]], hash_bucket_size=10000)))
    feature_columns = base_columns + cross_columns
    feat_field_size = len(feature_columns)
    return feature_columns, feat_field_size
def create_feature_layer() -> tf.keras.layers.DenseFeatures:
    # Feature column for height
    feature_height = feature_column.numeric_column("Groesse")
    # Feature column for weight
    feature_weight = feature_column.numeric_column("Gewicht")
    # Feature column for age
    feature_age = feature_column.numeric_column("Alter")

    # Categorical column for gender
    feature_gender = feature_column.categorical_column_with_vocabulary_list(
        'Geschlecht', ['w', 'm'])
    feature_gender_one_hot = feature_column.indicator_column(feature_gender)

    # Categorical column for activities
    feature_activities = feature_column.categorical_column_with_vocabulary_list(
        'Betaetigung', ['keinSport', 'Kraftsport', 'Ausdauersport'])
    feature_activities_one_hot = feature_column.indicator_column(
        feature_activities)

    feature_columns = [
        feature_height, feature_weight, feature_age, feature_gender_one_hot,
        feature_activities_one_hot
    ]

    return tf.keras.layers.DenseFeatures(feature_columns)
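# Usage sketch for create_feature_layer() above (an illustrative assumption, not
# part of the original code): apply the returned DenseFeatures layer to a
# hand-made batch with the German-named columns it expects.
import tensorflow as tf

def _demo_create_feature_layer():
    feature_layer = create_feature_layer()
    sample_batch = {
        'Groesse': tf.constant([[180.0], [165.0]]),                   # height
        'Gewicht': tf.constant([[80.0], [60.0]]),                     # weight
        'Alter': tf.constant([[30.0], [25.0]]),                       # age
        'Geschlecht': tf.constant([['m'], ['w']]),                    # gender
        'Betaetigung': tf.constant([['Kraftsport'], ['keinSport']]),  # activity
    }
    # 3 numeric + 2 gender one-hot + 3 activity one-hot columns -> shape (2, 8)
    return feature_layer(sample_batch)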
def transform(inputs, num_cols, cat_cols):
    print("Inputs before features transformation: {}".format(inputs.keys()))

    # Pass-through columns
    transformed = inputs.copy()
    feature_columns = {
        colname: tf.feature_column.numeric_column(colname)
        for colname in num_cols
    }

    # Add Euclidean distance
    transformed['euclidean'] = layers.Lambda(euclidean, name='euclidean')([
        inputs['pickuplon'], inputs['pickuplat'], inputs['dropofflon'],
        inputs['dropofflat']
    ])
    feature_columns['euclidean'] = fc.numeric_column('euclidean')

    # Shift 'dayofweek' feature to a value range of 0-6
    transformed['dayofweek'] = transformed['dayofweek'] - 1

    # Create categorical columns (wrapped in indicator columns)
    feature_columns['hourofday'] = fc.indicator_column(
        fc.categorical_column_with_identity('hourofday', 24))
    feature_columns['dayofweek'] = fc.indicator_column(
        fc.categorical_column_with_identity('dayofweek', 7))

    print("Transformed features: {}".format(transformed.keys()))
    print("Feature columns: {}".format(feature_columns.keys()))
    return transformed, feature_columns
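# transform() above relies on a `euclidean` helper defined elsewhere in the
# original project. A minimal sketch of what it is assumed to compute: the
# straight-line distance from the four coordinate tensors passed to the Lambda
# layer. The exact original implementation may differ.
import tensorflow as tf

def euclidean(params):
    lon1, lat1, lon2, lat2 = params
    londiff = lon2 - lon1
    latdiff = lat2 - lat1
    return tf.sqrt(londiff * londiff + latdiff * latdiff)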
def data_preprocessing(self):
    """
    batch_size = 5  # A small batch size is used for the example.
    train_ds = self.df_to_dataset(self.train, batch_size=batch_size)
    val_ds = self.df_to_dataset(self.val, shuffle=False, batch_size=batch_size)
    test_ds = self.df_to_dataset(self.test, shuffle=False, batch_size=batch_size)

    for feature_batch, label_batch in train_ds.take(1):
        print('All features:', list(feature_batch.keys()))
        print('A batch of the age feature:', feature_batch['age'])
        print('A batch of targets:', label_batch)

    # Create a sample batch to try out the feature columns.
    self.example_batch = next(iter(train_ds))[0]
    age = feature_column.numeric_column("age")
    self.demo(age)
    """
    feature_columns = []

    # Numeric columns
    for header in [
            'age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca'
    ]:
        feature_columns.append(feature_column.numeric_column(header))

    # Bucketized column
    age = feature_column.numeric_column("age")
    age_buckets = feature_column.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    feature_columns.append(age_buckets)

    # Categorical column
    thal = feature_column.categorical_column_with_vocabulary_list(
        'thal', ['fixed', 'normal', 'reversible'])
    thal_one_hot = feature_column.indicator_column(thal)
    feature_columns.append(thal_one_hot)

    # Embedding column
    thal_embedding = feature_column.embedding_column(thal, dimension=8)
    feature_columns.append(thal_embedding)

    # Crossed feature column
    crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                    hash_bucket_size=1000)
    crossed_feature = feature_column.indicator_column(crossed_feature)
    feature_columns.append(crossed_feature)

    self.feature_layer = layers.DenseFeatures(feature_columns)

    batch_size = 32
    self.train_ds = self.df_to_dataset(self.train, batch_size=batch_size)
    self.val_ds = self.df_to_dataset(self.val, shuffle=False,
                                     batch_size=batch_size)
    self.test_ds = self.df_to_dataset(self.test, shuffle=False,
                                      batch_size=batch_size)
def create_feature_columns():
    # user features
    bids = fc.categorical_column_with_hash_bucket("behaviorBids", 10240, dtype=tf.int64)
    c1ids = fc.categorical_column_with_hash_bucket("behaviorC1ids", 100, dtype=tf.int64)
    cids = fc.categorical_column_with_hash_bucket("behaviorCids", 10240, dtype=tf.int64)
    sids = fc.categorical_column_with_hash_bucket("behaviorSids", 10240, dtype=tf.int64)
    pids = fc.categorical_column_with_hash_bucket("behaviorPids", 1000000, dtype=tf.int64)
    bids_weighted = fc.weighted_categorical_column(bids, "bidWeights")
    c1ids_weighted = fc.weighted_categorical_column(c1ids, "c1idWeights")
    cids_weighted = fc.weighted_categorical_column(cids, "cidWeights")
    sids_weighted = fc.weighted_categorical_column(sids, "sidWeights")
    pids_weighted = fc.weighted_categorical_column(pids, "pidWeights")

    # item features
    pid = fc.categorical_column_with_hash_bucket("productId", 1000000, dtype=tf.int64)
    sid = fc.categorical_column_with_hash_bucket("sellerId", 10240, dtype=tf.int64)
    bid = fc.categorical_column_with_hash_bucket("brandId", 10240, dtype=tf.int64)
    c1id = fc.categorical_column_with_hash_bucket("cate1Id", 100, dtype=tf.int64)
    cid = fc.categorical_column_with_hash_bucket("cateId", 10240, dtype=tf.int64)

    # context features
    matchScore = fc.numeric_column("matchScore", default_value=0.0)
    popScore = fc.numeric_column("popScore", default_value=0.0)
    brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0)
    cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0)
    catePrefer = fc.numeric_column("catePrefer", default_value=0.0)
    sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0)
    matchType = fc.indicator_column(fc.categorical_column_with_identity("matchType", 9, default_value=0))
    position = fc.indicator_column(fc.categorical_column_with_identity("position", 201, default_value=200))
    triggerNum = fc.indicator_column(fc.categorical_column_with_identity("triggerNum", 51, default_value=50))
    triggerRank = fc.indicator_column(fc.categorical_column_with_identity("triggerRank", 51, default_value=50))
    sceneType = fc.indicator_column(fc.categorical_column_with_identity("type", 2, default_value=0))
    hour = fc.indicator_column(fc.categorical_column_with_identity("hour", 24, default_value=0))
    phoneBrand = fc.indicator_column(fc.categorical_column_with_hash_bucket("phoneBrand", 1000))
    phoneResolution = fc.indicator_column(fc.categorical_column_with_hash_bucket("phoneResolution", 500))
    phoneOs = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))
    tab = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list(
            "tab",
            ["ALL", "TongZhuang", "XieBao", "MuYing", "NvZhuang", "MeiZhuang", "JuJia", "MeiShi"],
            default_value=0))

    # shared embeddings between behavior sequences and item ids
    pid_embed = fc.shared_embedding_columns([pids_weighted, pid], 64, combiner='sum',
                                            shared_embedding_collection_name="pid")
    bid_embed = fc.shared_embedding_columns([bids_weighted, bid], 32, combiner='sum',
                                            shared_embedding_collection_name="bid")
    cid_embed = fc.shared_embedding_columns([cids_weighted, cid], 32, combiner='sum',
                                            shared_embedding_collection_name="cid")
    c1id_embed = fc.shared_embedding_columns([c1ids_weighted, c1id], 10, combiner='sum',
                                             shared_embedding_collection_name="c1id")
    sid_embed = fc.shared_embedding_columns([sids_weighted, sid], 32, combiner='sum',
                                            shared_embedding_collection_name="sid")

    global my_feature_columns
    my_feature_columns = [matchScore, matchType, position, triggerNum, triggerRank, sceneType,
                          hour, phoneBrand, phoneResolution, phoneOs, tab, popScore, sellerPrefer,
                          brandPrefer, cate2Prefer, catePrefer]
    my_feature_columns += pid_embed
    my_feature_columns += sid_embed
    my_feature_columns += bid_embed
    my_feature_columns += cid_embed
    my_feature_columns += c1id_embed
    print("feature columns:", my_feature_columns)
    return my_feature_columns
def get_feature_columns(dataframe):
    """Creates feature columns from pd.DataFrame."""
    feature_columns = []
    feature_layer_inputs = {}

    # numeric cols
    for col_name in ['PhotoAmt', 'Fee', 'Age']:
        feature_columns.append(feature_column.numeric_column(col_name))
        feature_layer_inputs[col_name] = tf.keras.Input(shape=(1, ),
                                                        name=col_name)

    # bucketized cols
    age = feature_column.numeric_column('Age')
    age_buckets = feature_column.bucketized_column(age,
                                                   boundaries=[1, 2, 3, 4, 5])
    feature_columns.append(age_buckets)

    # indicator_columns
    indicator_column_names = [
        'Type', 'Color1', 'Color2', 'Gender', 'MaturitySize', 'FurLength',
        'Vaccinated', 'Sterilized', 'Health'
    ]
    for col_name in indicator_column_names:
        categorical_column = feature_column.categorical_column_with_vocabulary_list(
            col_name, dataframe[col_name].unique())
        indicator_column = feature_column.indicator_column(categorical_column)
        feature_columns.append(indicator_column)
        feature_layer_inputs[col_name] = tf.keras.Input(shape=(1, ),
                                                        name=col_name,
                                                        dtype=tf.string)

    # embedding columns
    breed1 = feature_column.categorical_column_with_vocabulary_list(
        'Breed1', dataframe.Breed1.unique())
    breed1_embedding = feature_column.embedding_column(breed1, dimension=16)
    feature_columns.append(breed1_embedding)
    feature_layer_inputs['Breed1'] = tf.keras.Input(shape=(1, ),
                                                    name='Breed1',
                                                    dtype=tf.string)

    # crossed columns
    animal_type = feature_column.categorical_column_with_vocabulary_list(
        'Type', ['Cat', 'Dog'])
    feature_columns.append(feature_column.indicator_column(animal_type))
    age_type_feature = feature_column.crossed_column(
        [age_buckets, animal_type], hash_bucket_size=100)
    feature_columns.append(feature_column.indicator_column(age_type_feature))
    feature_layer_inputs['Type'] = tf.keras.Input(shape=(1, ),
                                                  name='Type',
                                                  dtype=tf.string)

    return feature_columns, feature_layer_inputs
def classify_data(batch_size=5):
    from tensorflow import feature_column
    from tensorflow.keras import layers
    from sklearn.model_selection import train_test_split

    URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
    dataframe = pd.read_csv(URL)
    tr, te = train_test_split(dataframe, test_size=0.2)
    tr, va = train_test_split(tr, test_size=0.2)
    print(len(tr), len(va), len(te))

    def df_to_dataset(dataframe, shuffle=True, batch_size=32):
        # Copy first so the caller's DataFrame is not mutated by pop().
        dataframe = dataframe.copy()
        labels = dataframe.pop('target')
        ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
        if shuffle:
            ds = ds.shuffle(buffer_size=len(dataframe))
        # Batch regardless of shuffling.
        return ds.batch(batch_size)

    tr_ds = df_to_dataset(tr, batch_size=batch_size)
    va_ds = df_to_dataset(va, shuffle=False, batch_size=batch_size)
    te_ds = df_to_dataset(te, shuffle=False, batch_size=batch_size)

    feature_columns = []
    for header in [
            'age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca'
    ]:
        feature_columns.append(feature_column.numeric_column(header))

    age = feature_column.numeric_column('age')
    age_buckets = feature_column.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    feature_columns.append(age_buckets)

    thal = feature_column.categorical_column_with_vocabulary_list(
        'thal', ['fixed', 'normal', 'reversible'])
    feature_columns.append(feature_column.indicator_column(thal))
    feature_columns.append(feature_column.embedding_column(thal, dimension=8))

    crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                    hash_bucket_size=1000)
    feature_columns.append(feature_column.indicator_column(crossed_feature))

    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

    model = tf.keras.Sequential([
        feature_layer,
        layers.Dense(128, activation='relu'),
        layers.Dense(128, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.fit(tr_ds, validation_data=va_ds, epochs=5)

    loss, accuracy = model.evaluate(te_ds)
    print(accuracy)
def create_user_feature_columns():
    gender = fc.indicator_column(fc.categorical_column_with_identity("gender", num_buckets=3, default_value=0))
    age_class = fc.indicator_column(fc.categorical_column_with_identity("age_class", num_buckets=7, default_value=0))
    has_baby = fc.indicator_column(fc.categorical_column_with_identity("has_baby", num_buckets=2, default_value=0))
    baby_gender = fc.indicator_column(fc.categorical_column_with_identity("baby_gender", num_buckets=3, default_value=0))
    baby_age = fc.indicator_column(fc.categorical_column_with_identity("baby_age", num_buckets=7, default_value=0))
    grade = fc.indicator_column(fc.categorical_column_with_identity("grade", num_buckets=7, default_value=0))
    rfm_type = fc.indicator_column(fc.categorical_column_with_identity("bi_rfm_type", num_buckets=12, default_value=0))
    cate1_price_prefer = fc.indicator_column(fc.categorical_column_with_identity("cate1_price_prefer", num_buckets=6, default_value=0))
    cate2_price_prefer = fc.indicator_column(fc.categorical_column_with_identity("cate2_price_prefer", num_buckets=6, default_value=0))
    cate3_price_prefer = fc.indicator_column(fc.categorical_column_with_identity("cate3_price_prefer", num_buckets=6, default_value=0))
    city_id = fc.categorical_column_with_hash_bucket("city", 700)
    city = fc.shared_embedding_columns([city_id], 16)
    cols = [gender, age_class, has_baby, baby_gender, baby_age, grade, rfm_type,
            cate1_price_prefer, cate2_price_prefer, cate3_price_prefer]
    return cols + city
def build_model_columns(embedding_size):
    linear_feature_columns = []
    embedding_feature_columns = []

    u_id = feature_column.categorical_column_with_hash_bucket('u_id', 500000, dtype=tf.dtypes.int64)
    u_id_embedded = feature_column.embedding_column(u_id, embedding_size)
    linear_feature_columns.append(feature_column.indicator_column(u_id))
    embedding_feature_columns.append(u_id_embedded)

    i_id = feature_column.categorical_column_with_hash_bucket('i_id', 100000, dtype=tf.dtypes.int64)
    i_id_embedded = feature_column.embedding_column(i_id, embedding_size)
    linear_feature_columns.append(feature_column.indicator_column(i_id))
    embedding_feature_columns.append(i_id_embedded)

    return linear_feature_columns, embedding_feature_columns
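# Usage sketch for build_model_columns() above (illustrative assumption, not part
# of the original code): the indicator columns feed the wide/linear part and the
# embedding columns feed the deep part of a wide & deep estimator. `train_input_fn`
# is a hypothetical input pipeline yielding ({'u_id': ..., 'i_id': ...}, labels).
import tensorflow as tf

def _demo_build_model_columns(train_input_fn):
    linear_cols, embedding_cols = build_model_columns(embedding_size=32)
    estimator = tf.estimator.DNNLinearCombinedClassifier(
        linear_feature_columns=linear_cols,   # wide part: hashed one-hot ids
        dnn_feature_columns=embedding_cols,   # deep part: learned embeddings
        dnn_hidden_units=[128, 64])
    estimator.train(input_fn=train_input_fn, steps=1000)
    return estimator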
def build_features(statistics):
    pu_location_id = fc.categorical_column_with_identity(key='PULocationID',
                                                         num_buckets=265)
    do_location_id = fc.categorical_column_with_identity(key='DOLocationID',
                                                         num_buckets=265)
    day_of_week = fc.categorical_column_with_identity(key='day_of_week',
                                                      num_buckets=7)
    weekend = fc.categorical_column_with_identity(key='weekend', num_buckets=2)
    speed_buckets = fc.bucketized_column(
        fc.numeric_column('speed'), boundaries=[10, 20, 30, 40, 50, 60, 70])
    distance_buckets = fc.bucketized_column(
        fc.numeric_column('trip_distance'),
        boundaries=[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
    duration_buckets = fc.bucketized_column(
        fc.numeric_column('duration'),
        boundaries=[500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500])
    fare_buckets = fc.bucketized_column(
        fc.numeric_column('fare_amount'),
        boundaries=[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
    passenger_buckets = fc.bucketized_column(
        fc.numeric_column('passenger_count'), boundaries=[1, 3, 5, 7, 9])
    location = fc.crossed_column([pu_location_id, do_location_id],
                                 hash_bucket_size=1000)
    cross_all = fc.crossed_column([
        location, speed_buckets, distance_buckets, duration_buckets,
        fare_buckets, passenger_buckets
    ], hash_bucket_size=1000)

    categorical_columns = [
        fc.embedding_column(pu_location_id, dimension=32),
        fc.embedding_column(do_location_id, dimension=32),
        fc.indicator_column(day_of_week),
        fc.indicator_column(weekend)
    ]
    numeric_columns = [
        custom_numeric_column('passenger_count', statistics),
        custom_numeric_column('trip_distance', statistics),
        custom_numeric_column('fare_amount', statistics),
        custom_numeric_column('extra', statistics),
        custom_numeric_column('mta_tax', statistics),
        custom_numeric_column('tolls_amount', statistics),
        custom_numeric_column('improvement_surcharge', statistics),
        custom_numeric_column('duration', statistics),
        custom_numeric_column('speed', statistics)
    ]
    dnn_feature_columns = numeric_columns + categorical_columns
    linear_feature_columns = [location, cross_all]
    return dnn_feature_columns, linear_feature_columns
def test_categorical_column_with_hash_bucket():
    # 1. Input features
    color_data = {'color': [[2], [5], [-1], [0]]}
    builder = _LazyBuilder(color_data)

    # 2. Feature columns (Sparse)
    color_column = feature_column.categorical_column_with_hash_bucket(
        'color', 7, dtype=tf.int32)
    color_column_tensor = color_column._get_sparse_tensors(builder)
    with tf.Session() as session:
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))

    # 2. Feature columns (Dense)
    # Convert the Categorical Column to Dense Column
    color_column_identity = feature_column.indicator_column(color_column)

    # 3. Feature tensor
    color_dense_tensor = feature_column.input_layer(color_data,
                                                    [color_column_identity])
    with tf.Session() as session:
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([color_dense_tensor]))
def test_categorical_column_with_vocabulary_list():
    color_data = {
        'color': [['R', 'R'], ['G', 'R'], ['B', 'G'], ['A', 'A']]
    }  # 4 sample rows
    builder = _LazyBuilder(color_data)

    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    color_column_tensor = color_column._get_sparse_tensors(builder)
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))

    # Convert the sparse representation to dense, i.e. one-hot encoding
    # (multi-hot here, since each example holds several tokens).
    color_column_identy = feature_column.indicator_column(color_column)
    color_dense_tensor = feature_column.input_layer(color_data,
                                                    [color_column_identy])
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([color_dense_tensor]))
def build_feature_layer():
    feature_columns = []

    report_id = feature_column.categorical_column_with_vocabulary_list(
        'report_id', [1, 2, 3, 4, 5])
    report_id_one_hot = feature_column.indicator_column(report_id)
    feature_columns.append(report_id_one_hot)

    feature_columns.append(feature_column.numeric_column('report_params'))

    day_part = feature_column.categorical_column_with_vocabulary_list(
        'day_part', [1, 2, 3])
    day_part_one_hot = feature_column.indicator_column(day_part)
    feature_columns.append(day_part_one_hot)

    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
    return feature_layer
def test_crossed_column():
    """Test of crossed_column."""
    # Source data
    features = {
        'price': [['A'], ['B'], ['C']],  # 0, 1, 2
        'color': [['R'], ['G'], ['B']]   # 0, 1, 2
    }
    # categorical_column
    price = feature_column.categorical_column_with_vocabulary_list(
        'price', ['A', 'B', 'C', 'D'])
    color = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'])

    # crossed_column produces a sparse representation
    p_x_c = feature_column.crossed_column([price, color], 16)
    # Dense representation
    p_x_c_identy = feature_column.indicator_column(p_x_c)
    # input_layer connects the crossed column to the source data
    p_x_c_identy_dense_tensor = feature_column.input_layer(
        features, [p_x_c_identy])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([p_x_c_identy_dense_tensor]))
def test_weighted_cate_column():
    # !!! id='' marks a missing entry; its weight must be 0, otherwise the id and
    # !!! weight lists differ in length and an error is raised.
    # !!! Weights must be float; passing ints raises an error.
    x_values = {
        'id': [[b'a', b'z', b'a', b'c'], [b'b', b'', b'd', b'b']],
        'weight': [[1.0, 2.0, -3.0, 4.0], [5.0, 0.0, 7.0, -8.0]]
    }
    builder = _LazyBuilder(x_values)  # lazy representation of input

    # ================== define ops
    sparse_id_featcol = feature_column.categorical_column_with_vocabulary_list(
        'id', ['a', 'b', 'c'], dtype=tf.string, default_value=-1)
    sparse_featcol = feature_column.weighted_categorical_column(
        categorical_column=sparse_id_featcol, weight_feature_key='weight')
    x_sparse_tensor = sparse_featcol._get_sparse_tensors(builder)

    # indicator_column turns the sparse tensor into a dense multi-hot encoding,
    # shape=[batch_size, vocab_size]; each entry is the sum of the weights of
    # every occurrence of that token.
    dense_featcol = feature_column.indicator_column(sparse_featcol)
    x_dense_tensor = feature_column.input_layer(x_values, [dense_featcol])

    # ================== run
    with tf.Session() as sess:
        # The lookup tables must be initialized, otherwise an error is raised.
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())

        id_sparse_value, weight_sparse_value = sess.run(
            [x_sparse_tensor.id_tensor, x_sparse_tensor.weight_tensor])

        print("************************* sparse id tensor")
        # The id_tensor keeps the shape of the raw input,
        # [batch_size, max_tokens_per_example] = [2, 4]:
        # SparseTensorValue(indices=array(
        #     [[0, 0],
        #      [0, 1],
        #      [0, 2],
        #      [0, 3],
        #      [1, 0],
        #      [1, 2],
        #      [1, 3]]), values=array([ 0, -1,  0,  2,  1, -1,  1]), dense_shape=array([2, 4]))
        print(id_sparse_value)

        print("************************* sparse weight tensor")
        # The weight_tensor also keeps the shape of the raw input,
        # [batch_size, max_tokens_per_example] = [2, 4]:
        # SparseTensorValue(indices=array(
        #     [[0, 0],
        #      [0, 1],
        #      [0, 2],
        #      [0, 3],
        #      [1, 0],
        #      [1, 2],
        #      [1, 3]]), values=array([ 1.,  2., -3.,  4.,  5.,  7., -8.], dtype=float32), dense_shape=array([2, 4]))
        print(weight_sparse_value)

        print("************************* dense MHE tensor")
        # indicator_column converts the sparse tensor into a dense multi-hot tensor,
        # shape=[batch_size, total_tokens_in_vocab]; each value is the sum of the
        # weights of every occurrence of that token:
        # [[-2.  0.  4.]
        #  [ 0. -3.  0.]]
        print(sess.run(x_dense_tensor))
def get_unique_categories_and_append(key):
    col = df[key]
    arr = col.to_numpy()
    unique_arr = np.unique(arr)
    feat_col = feature_column.categorical_column_with_vocabulary_list(
        key, unique_arr)
    one_hot = feature_column.indicator_column(feat_col)
    feature_columns.append(one_hot)
def tf_inputs_dataframe(self, batch_size=1, buffer_size=1000):
    dataframe = read_csv(
        os.path.join(os.path.dirname(self.json_filename),
                     self.data_description["csv"]))

    labels_name = 'ga_edd'
    y_name = labels_name

    for column_name in dataframe.columns:
        if column_name.startswith('_'):
            dataframe.pop(column_name)

    for header in [
            'fl_1', 'bp_1', 'hc_1', 'ac_1', 'mom_age_edd', 'mom_weight_lb',
            'mom_height_in'
    ]:
        r = max(dataframe[header]) - min(dataframe[header])
        dataframe[header] = (dataframe[header] - min(dataframe[header])) / r

    dataframe = dataframe[(dataframe[y_name] != '.')
                          & (notna(dataframe[y_name]))
                          & (notnull(dataframe[y_name]))].copy()
    dataframe = dataframe.astype({y_name: 'int32'})

    feature_columns = []
    feature_names = []
    num_channels = 0

    for header in [
            'fl_1', 'bp_1', 'hc_1', 'ac_1', 'mom_age_edd', 'mom_weight_lb',
            'mom_height_in'
    ]:
        feature_columns.append(feature_column.numeric_column(header))
        feature_names.append(header)
        num_channels += 1

    num_identity = 2
    for header in [
            'hiv', 'current_smoker', 'former_smoker', 'chronic_htn',
            'preg_induced_htn', 'diabetes', 'gest_diabetes'
    ]:
        col = feature_column.categorical_column_with_identity(
            header, num_identity)
        col = feature_column.indicator_column(col)
        feature_columns.append(col)
        feature_names.append(header)
        num_channels += num_identity

    self.num_channels = num_channels

    feature_layer = tf.keras.layers.DenseFeatures(
        feature_columns=feature_columns)

    dataframe = dataframe.copy()
    labels = dataframe.pop(labels_name)
    dataset = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    dataset = dataset.shuffle(buffer_size=buffer_size)
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(lambda x, y: (feature_layer(x), y))
    return dataset
def get_item_feature_columns(business_vocab_list, item_type_dict):
    items_feature_columns = []

    bucketized_boundary = {'stars': [2.5, 4]}
    embedding_size = {"categories": 8, "city": 4}

    for k, v in business_vocab_list.items():
        if k in ['review_count']:
            col = numeric_column(k, default_value=0, dtype=item_type_dict[k])
        elif k in ['stars']:
            col = bucketized_column(
                numeric_column(k, default_value=0, dtype=item_type_dict[k]),
                bucketized_boundary[k])
        elif k in ['categories', 'city']:
            col = embedding_column(
                categorical_column_with_vocabulary_list(
                    k, sorted(v), default_value=-1, dtype=item_type_dict[k]),
                dimension=embedding_size[k])
        else:
            col = indicator_column(
                categorical_column_with_vocabulary_list(
                    k, sorted(v), default_value=-1, dtype=item_type_dict[k]))
        items_feature_columns.append(col)

    return items_feature_columns
def _generate_cat_column(name, data, vocab_threshold=50, bucket_size=100):
    """Generate a feature column from a categorical string data set.

    Parameters
    ----------
    name : str
        Name of the categorical column.
    data : np.ndarray | list
        String data array.
    vocab_threshold : int
        Number of unique entries in the data array below which this will
        use a vocabulary list, above which a hash bucket will be used.
    bucket_size : int
        Hash bucket size.

    Returns
    -------
    f_col : IndicatorColumn
        Categorical feature column.
    """
    n_unique = len(set(data))

    if n_unique < vocab_threshold:
        f_col = feature_column.categorical_column_with_vocabulary_list(
            name, list(set(data)))
    else:
        f_col = feature_column.categorical_column_with_hash_bucket(
            name, bucket_size)

    f_col = feature_column.indicator_column(f_col)
    return f_col
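# Usage sketch for _generate_cat_column() above (hypothetical data, not part of
# the original code): a small vocabulary falls back to a vocabulary list, a
# large one to a hash bucket.
def _demo_generate_cat_column():
    colors = ['red', 'green', 'blue', 'green']      # 3 unique values -> vocabulary list
    user_ids = ['user_%d' % i for i in range(500)]  # 500 unique values -> hash bucket
    color_col = _generate_cat_column('color', colors)
    user_col = _generate_cat_column('user_id', user_ids,
                                    vocab_threshold=50, bucket_size=100)
    return color_col, user_col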
def define_feature_columns(dataframe):
    print("Defining feature columns...")
    feature_columns = []

    # Create embedding column for name IDs
    name_id = feature_column.categorical_column_with_vocabulary_list(
        'nconst', dataframe.nconst.unique())
    # Dimension set to 30 (approximately fourth root of the number of unique name IDs)
    name_id_embedding = feature_column.embedding_column(name_id, dimension=30)
    feature_columns.append(name_id_embedding)

    # Create indicator columns for category and genres
    indicator_column_names = ['category', 'genres']
    for col_name in indicator_column_names:
        categorical_column = feature_column.categorical_column_with_vocabulary_list(
            col_name, dataframe[col_name].unique())
        indicator_column = feature_column.indicator_column(categorical_column)
        feature_columns.append(indicator_column)

    # Create bucketized column for startYear (a.k.a. release date)
    start_year_numeric = feature_column.numeric_column('startYear')
    start_year_bucket = feature_column.bucketized_column(
        start_year_numeric,
        boundaries=[1927, 1940, 1950, 1960, 1970, 1980, 1990, 1995, 2000,
                    2005, 2010, 2015])
    feature_columns.append(start_year_bucket)

    print("Feature columns defined")
    return feature_columns
def test_weighted_categorical_column():
    # 1. Input features
    color_data = {
        'color': [['R'], ['G'], ['B'], ['A']],
        'weight': [[1.0], [2.0], [4.0], [8.0]]
    }

    # 2. Feature columns (Sparse)
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    color_weight_categorical_column \
        = feature_column.weighted_categorical_column(color_column, 'weight')

    builder = _LazyBuilder(color_data)
    id_tensor, weight = color_weight_categorical_column._get_sparse_tensors(
        builder)

    with tf.Session() as session:
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('weighted categorical' + '-' * 40)
        print(session.run([id_tensor]))
        print('-' * 40)
        print(session.run([weight]))

    # 2. Feature columns (Dense)
    weighted_column = feature_column.indicator_column(
        color_weight_categorical_column)

    # 3. Feature tensor
    weighted_column_dense_tensor = feature_column.input_layer(
        color_data, [weighted_column])

    with tf.Session() as session:
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([weighted_column_dense_tensor]))
def test_categorical_column_with_hash_bucket():
    # Source data: 4 sample rows, shape=[4, 1]
    color_data = {'color': [[2], [5], [-1], [0]]}
    builder = _LazyBuilder(color_data)

    # categorical_column
    color_column = feature_column.categorical_column_with_hash_bucket(
        'color', 7, dtype=tf.int32)

    # Sparse representation
    color_column_tensor = color_column._get_sparse_tensors(builder)
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))

    # indicator_column converts the sparse representation to dense,
    # i.e. one-hot encoding (multi-hot when an example holds several tokens).
    color_column_identy = feature_column.indicator_column(color_column)

    # input_layer connects the data source with the declared columns and
    # produces a new dense tensor.
    color_dense_tensor = feature_column.input_layer(color_data,
                                                    [color_column_identy])
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([color_dense_tensor]))
def _get_tf_feature_cols(dataframe: pd.DataFrame):
    feature_columns = []

    # numeric cols
    for header in ['PhotoAmt', 'Fee', 'Age']:
        feature_columns.append(feature_column.numeric_column(header))

    # bucketized cols
    age = feature_column.numeric_column('Age')
    age_buckets = feature_column.bucketized_column(
        age, boundaries=[1, 2, 3, 4, 5])
    feature_columns.append(age_buckets)

    # indicator_columns
    indicator_column_names = [
        'Type', 'Color1', 'Color2', 'Gender', 'MaturitySize', 'FurLength',
        'Vaccinated', 'Sterilized', 'Health'
    ]
    for col_name in indicator_column_names:
        categorical_column = feature_column.categorical_column_with_vocabulary_list(
            col_name, dataframe[col_name].unique())
        indicator_column = feature_column.indicator_column(categorical_column)
        feature_columns.append(indicator_column)

    # embedding columns
    breed1 = feature_column.categorical_column_with_vocabulary_list(
        'Breed1', dataframe.Breed1.unique())
    breed1_embedding = feature_column.embedding_column(breed1, dimension=8)
    feature_columns.append(breed1_embedding)

    return feature_columns
def hash_embedding(self, hash_bucket, embedding_dim, name):
    cate_feature = feature_column.categorical_column_with_hash_bucket(
        name, hash_bucket, dtype=tf.string)
    emb_col = feature_column.embedding_column(cate_feature,
                                              dimension=embedding_dim,
                                              combiner='mean')
    ind_col = feature_column.indicator_column(cate_feature)
    return emb_col, ind_col
def hashed_columns(self, hashed_columns_dict):
    ### Independence
    for col_name, bucket_size in hashed_columns_dict.items():
        hashedCol = feature_column.categorical_column_with_hash_bucket(
            col_name, hash_bucket_size=bucket_size)
        hashedFeature = feature_column.indicator_column(hashedCol)
        self.sparse_columns[col_name] = hashedFeature
    return hashedFeature
def create_feature_columns():
    # user features
    phoneBrandId = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
    phoneResolutionId = fc.categorical_column_with_hash_bucket("phoneResolution", 500)
    phoneBrand = fc.embedding_column(phoneBrandId, 20)
    phoneResolution = fc.embedding_column(phoneResolutionId, 10)
    phoneOs = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))

    # context features
    matchScore = fc.numeric_column("matchScore", default_value=0.0)
    popScore = fc.numeric_column("popScore", default_value=0.0)
    brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0, normalizer_fn=truncate)
    cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0, normalizer_fn=truncate)
    catePrefer = fc.numeric_column("catePrefer", default_value=0.0, normalizer_fn=truncate)
    sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0, normalizer_fn=truncate)
    matchType = fc.indicator_column(fc.categorical_column_with_identity("matchType", 9, default_value=0))
    position = fc.indicator_column(fc.categorical_column_with_identity("position", 201, default_value=200))
    triggerNum = fc.indicator_column(fc.categorical_column_with_identity("triggerNum", 51, default_value=50))
    triggerRank = fc.indicator_column(fc.categorical_column_with_identity("triggerRank", 51, default_value=50))
    sceneType = fc.indicator_column(fc.categorical_column_with_identity("type", 2, default_value=0))
    hour = fc.indicator_column(fc.categorical_column_with_identity("hour", 24, default_value=0))

    global my_feature_columns
    my_feature_columns = [matchScore, matchType, position, triggerNum, triggerRank, sceneType,
                          hour, phoneBrand, phoneResolution, phoneOs, popScore, sellerPrefer,
                          brandPrefer, cate2Prefer, catePrefer]
    print("feature columns:", my_feature_columns)
    return my_feature_columns
def create_feature_layer(df):
    week = feature_column.numeric_column("Week")
    week = feature_column.bucketized_column(week, boundaries=list(range(1, 53)))

    day = feature_column.numeric_column("Day")
    day = feature_column.bucketized_column(day, boundaries=list(range(1, 8)))

    year = feature_column.numeric_column("Year")
    year = feature_column.bucketized_column(year, boundaries=list(range(2013, 2017)))

    hour = feature_column.numeric_column("std_hour")
    hour = feature_column.bucketized_column(hour, boundaries=list(range(0, 24)))

    arrival = feature_column.categorical_column_with_vocabulary_list(
        "Arrival", vocabulary_list=pd.Series.unique(df.Arrival).tolist())
    airline = feature_column.categorical_column_with_vocabulary_list(
        "Airline", vocabulary_list=pd.Series.unique(df.Airline).tolist())
    flight_no = feature_column.categorical_column_with_vocabulary_list(
        "flight_no", vocabulary_list=pd.Series.unique(df.flight_no).tolist())

    arrival_one_hot = feature_column.indicator_column(arrival)
    airline_one_hot = feature_column.indicator_column(airline)
    flight_no_one_hot = feature_column.indicator_column(flight_no)

    arrival_length = len(pd.Series.unique(df.Arrival).tolist())
    arrival_and_week = feature_column.crossed_column(
        [arrival, week], hash_bucket_size=(arrival_length * 52))
    arrival_and_week = feature_column.indicator_column(arrival_and_week)

    airline_length = len(pd.Series.unique(df.Airline).tolist())
    year_and_airline = feature_column.crossed_column(
        [year, airline], hash_bucket_size=(airline_length * 4))
    year_and_airline = feature_column.indicator_column(year_and_airline)

    feature_columns = [
        week, arrival_one_hot, airline_one_hot, flight_no_one_hot, hour,
        arrival_and_week, year, year_and_airline
    ]

    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
    return feature_layer
def _base():
    education_num = fc.numeric_column('education_num')
    capital_gain = fc.numeric_column('capital_gain')
    capital_loss = fc.numeric_column('capital_loss')
    hours_per_week = fc.numeric_column('hours_per_week')

    # categorical columns wrapped as indicator columns
    relationship = fc.categorical_column_with_vocabulary_file(
        'relationship', vocabulary_file='data/relationship')
    relationship = fc.indicator_column(relationship)
    education = fc.categorical_column_with_vocabulary_file(
        'education', vocabulary_file='data/education')
    education = fc.indicator_column(education)
    race = fc.categorical_column_with_vocabulary_file(
        'race', vocabulary_file='data/race')
    race = fc.indicator_column(race)
    occupation = fc.indicator_column(
        fc.categorical_column_with_hash_bucket('occupation', 20))

    return [education_num, capital_gain, capital_loss, hours_per_week,
            relationship, education, race, occupation]
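# Usage sketch for _base() above (illustrative assumption, not part of the
# original code): feeding the census columns into a linear estimator.
# `census_input_fn` is a hypothetical input pipeline yielding a features dict
# with the column names used above plus a binary label.
import tensorflow as tf

def _demo_base_columns(census_input_fn):
    model = tf.estimator.LinearClassifier(feature_columns=_base())
    model.train(input_fn=census_input_fn, steps=1000)
    return model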
def crossed_feature_columns(self, columns_crossed, nameOfLayer, bucket_size=10):
    crossed_feature = feature_column.crossed_column(
        columns_crossed, hash_bucket_size=bucket_size)
    crossed_feature = feature_column.indicator_column(crossed_feature)
    self.sparse_columns[nameOfLayer] = crossed_feature
    return crossed_feature
def create_deep_feature_columns():
    phoneOs = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))

    # context features
    matchScore = fc.numeric_column("matchScore", default_value=0.0)
    popScore = fc.numeric_column("popScore", default_value=0.0)
    brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0, normalizer_fn=truncate)
    cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0, normalizer_fn=truncate)
    catePrefer = fc.numeric_column("catePrefer", default_value=0.0, normalizer_fn=truncate)
    sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0, normalizer_fn=truncate)
    matchType = fc.indicator_column(fc.categorical_column_with_identity("matchType", 9, default_value=0))
    triggerNum = fc.indicator_column(fc.categorical_column_with_identity("triggerNum", 41, default_value=40))
    triggerRank = fc.indicator_column(fc.categorical_column_with_identity("triggerRank", 41, default_value=40))
    sceneType = fc.indicator_column(fc.categorical_column_with_identity("type", 2, default_value=0))

    columns = [matchScore, matchType, triggerNum, triggerRank, sceneType, phoneOs,
               popScore, sellerPrefer, brandPrefer, cate2Prefer, catePrefer]
    print("deep feature columns:", columns)
    return columns