def main():
    URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
    dataframe = pd.read_csv(URL)
    print(dataframe.head())

    train, test = train_test_split(dataframe, test_size=0.2)
    train, val = train_test_split(train, test_size=0.2)
    print(len(train), 'train examples')
    print(len(val), 'validation examples')
    print(len(test), 'test examples')

    batch_size = 5  # a small batch size for demonstration
    train_ds = df_to_dataset(train, batch_size=batch_size)
    val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
    test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

    for feature_batch, label_batch in train_ds.take(1):
        print('Every feature:', list(feature_batch.keys()))
        print('A batch of ages:', feature_batch['age'])
        print('A batch of targets:', label_batch)

    # We will use this batch to demonstrate several types of feature columns.
    example_batch = next(iter(train_ds))[0]

    # A utility method that creates a feature column
    # and transforms a batch of data with it.
    def demo(feature_column):
        feature_layer = layers.DenseFeatures(feature_column)
        print(feature_layer(example_batch).numpy())

    age = feature_column.numeric_column("age")
    demo(age)

    age_buckets = feature_column.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    demo(age_buckets)

    thal = feature_column.categorical_column_with_vocabulary_list(
        'thal', ['fixed', 'normal', 'reversible'])
    thal_one_hot = feature_column.indicator_column(thal)
    demo(thal_one_hot)

    # Note that the input to the embedding column is the categorical column
    # we created earlier.
    thal_embedding = feature_column.embedding_column(thal, dimension=8)
    demo(thal_embedding)

    thal_hashed = feature_column.categorical_column_with_hash_bucket(
        'thal', hash_bucket_size=1000)
    demo(feature_column.indicator_column(thal_hashed))

    crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                    hash_bucket_size=1000)
    demo(feature_column.indicator_column(crossed_feature))

    feature_columns = []

    # numeric columns
    for header in [
            'age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca'
    ]:
        feature_columns.append(feature_column.numeric_column(header))

    # bucketized columns
    age_buckets = feature_column.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    feature_columns.append(age_buckets)

    # categorical (indicator) columns
    thal = feature_column.categorical_column_with_vocabulary_list(
        'thal', ['fixed', 'normal', 'reversible'])
    thal_one_hot = feature_column.indicator_column(thal)
    feature_columns.append(thal_one_hot)

    # embedding columns
    thal_embedding = feature_column.embedding_column(thal, dimension=8)
    feature_columns.append(thal_embedding)

    # crossed columns
    crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                    hash_bucket_size=1000)
    crossed_feature = feature_column.indicator_column(crossed_feature)
    feature_columns.append(crossed_feature)

    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

    batch_size = 32
    train_ds = df_to_dataset(train, batch_size=batch_size)
    val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
    test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

    model = tf.keras.Sequential([
        feature_layer,
        layers.Dense(128, activation='relu'),
        layers.Dense(128, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'],
                  run_eagerly=True)

    model.fit(train_ds, validation_data=val_ds, epochs=5)

    loss, accuracy = model.evaluate(test_ds)
    print("Accuracy", accuracy)
def df_to_dataset(dataframe, shuffle=True, batch_size=32): dataframe = dataframe.copy() labels = dataframe.pop('species') ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels)) if shuffle: ds = ds.shuffle(buffer_size=len(dataframe)) ds = ds.batch(batch_size) return ds feature_columns = [] for header in ['sl', 'sw', 'pl', 'pw']: feature_columns.append(feature_column.numeric_column(header)) feature_layer = tf.keras.layers.DenseFeatures(feature_columns) batch_size = 32 train_ds = df_to_dataset(train, batch_size=batch_size) val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size) test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size) model = tf.keras.Sequential([ feature_layer, layers.Dense(128, activation='relu'), layers.Dense(128, activation='relu'), layers.Dense(1, activation='sigmoid') ])
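# A minimal training sketch for the model above (an assumption, not part of the
# original snippet): `train_ds`, `val_ds` and `test_ds` come from the
# `df_to_dataset` helper defined here. Note: if 'species' holds the three iris
# classes encoded as integers 0-2, the head would need
# `layers.Dense(3, activation='softmax')` with a sparse categorical loss instead
# of the single sigmoid unit shown above.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(train_ds, validation_data=val_ds, epochs=5)
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)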
def create_feature_columns(): # user feature gender = fc.indicator_column( fc.categorical_column_with_identity("gender", num_buckets=3, default_value=0)) age_class = fc.indicator_column( fc.categorical_column_with_identity("age_class", num_buckets=7, default_value=0)) has_baby = fc.indicator_column( fc.categorical_column_with_identity("has_baby", num_buckets=2, default_value=0)) baby_gender = fc.indicator_column( fc.categorical_column_with_identity("baby_gender", num_buckets=3, default_value=0)) baby_age = fc.indicator_column( fc.categorical_column_with_identity("baby_age", num_buckets=7, default_value=0)) grade = fc.indicator_column( fc.categorical_column_with_identity("grade", num_buckets=7, default_value=0)) rfm_type = fc.indicator_column( fc.categorical_column_with_identity("bi_rfm_type", num_buckets=12, default_value=0)) cate1_price_prefer = fc.indicator_column( fc.categorical_column_with_identity("cate1_price_prefer", num_buckets=6, default_value=0)) cate2_price_prefer = fc.indicator_column( fc.categorical_column_with_identity("cate2_price_prefer", num_buckets=6, default_value=0)) cate3_price_prefer = fc.indicator_column( fc.categorical_column_with_identity("cate3_price_prefer", num_buckets=6, default_value=0)) city_id = fc.categorical_column_with_hash_bucket("city", 700) city = fc.embedding_column(city_id, 16) shop_visit_cnt = fc.indicator_column( fc.categorical_column_with_identity("shop_visit_cnt_rank", num_buckets=20, default_value=19)) shop_visit_usr = fc.indicator_column( fc.categorical_column_with_identity("shop_visit_usr_rank", num_buckets=20, default_value=19)) # item feature c2id = fc.categorical_column_with_hash_bucket("cate2Id", 10000, dtype=tf.int64) c2id_embed = fc.embedding_column(c2id, 32) modified_time = fc.numeric_column("modified_time", default_value=0.0) modified_time_sqrt = fc.numeric_column("modified_time_sqrt", default_value=0.0) modified_time_square = fc.numeric_column("modified_time_square", default_value=0.0) props_sex = fc.indicator_column( fc.categorical_column_with_vocabulary_list("props_sex", ["男", "女", "通用", "情侣"], default_value=0)) brand_grade = fc.indicator_column( fc.categorical_column_with_vocabulary_list( "brand_grade", ["A类品牌", "B类品牌", "C类品牌", "D类品牌"], default_value=0)) shipment_rate = fc.numeric_column("shipment_rate", default_value=0.0) shipping_rate = fc.numeric_column("shipping_rate", default_value=0.0) ipv_ntile = fc.bucketized_column( fc.numeric_column("ipv_ntile", dtype=tf.int64, default_value=99), boundaries=[1, 2, 3, 4, 5, 10, 20, 50, 80]) pay_ntile = fc.bucketized_column( fc.numeric_column("pay_ntile", dtype=tf.int64, default_value=99), boundaries=[1, 2, 3, 4, 5, 10, 20, 50, 80]) price = fc.numeric_column("price_norm", default_value=0.0) ctr_1d = fc.numeric_column("ctr_1d", default_value=0.0) cvr_1d = fc.numeric_column("cvr_1d", default_value=0.0) uv_cvr_1d = fc.numeric_column("uv_cvr_1d", default_value=0.0) ctr_1w = fc.numeric_column("ctr_1w", default_value=0.0) cvr_1w = fc.numeric_column("cvr_1w", default_value=0.0) uv_cvr_1w = fc.numeric_column("uv_cvr_1w", default_value=0.0) ctr_2w = fc.numeric_column("ctr_2w", default_value=0.0) cvr_2w = fc.numeric_column("cvr_2w", default_value=0.0) uv_cvr_2w = fc.numeric_column("uv_cvr_2w", default_value=0.0) ctr_1m = fc.numeric_column("ctr_1m", default_value=0.0) cvr_1m = fc.numeric_column("cvr_1m", default_value=0.0) uv_cvr_1m = fc.numeric_column("uv_cvr_1m", default_value=0.0) pay_qty_1d = fc.numeric_column("pay_qty_1d", default_value=0.0) pay_qty_1w = fc.numeric_column("pay_qty_1w", 
default_value=0.0) pay_qty_2w = fc.numeric_column("pay_qty_2w", default_value=0.0) pay_qty_1m = fc.numeric_column("pay_qty_1m", default_value=0.0) cat2_pay_qty = fc.numeric_column("cat2_pay_qty_1d", default_value=0.0) cat1_pay_qty = fc.numeric_column("cat1_pay_qty_1d", default_value=0.0) brd_pay_qty = fc.numeric_column("brd_pay_qty_1d", default_value=0.0) slr_pay_qty_1d = fc.numeric_column("slr_pay_qty_1d", default_value=0.0) slr_pay_qty_1w = fc.numeric_column("slr_pay_qty_1w", default_value=0.0) slr_pay_qty_2w = fc.numeric_column("slr_pay_qty_2w", default_value=0.0) slr_pay_qty_1m = fc.numeric_column("slr_pay_qty_1m", default_value=0.0) slr_brd_pay_qty_1d = fc.numeric_column("slr_brd_pay_qty_1d", default_value=0.0) slr_brd_pay_qty_1w = fc.numeric_column("slr_brd_pay_qty_1w", default_value=0.0) slr_brd_pay_qty_2w = fc.numeric_column("slr_brd_pay_qty_2w", default_value=0.0) slr_brd_pay_qty_1m = fc.numeric_column("slr_brd_pay_qty_1m", default_value=0.0) weighted_ipv = fc.numeric_column("weighted_ipv", default_value=0.0) cat1_weighted_ipv = fc.numeric_column("cat1_weighted_ipv", default_value=0.0) cate_weighted_ipv = fc.numeric_column("cate_weighted_ipv", default_value=0.0) slr_weighted_ipv = fc.numeric_column("slr_weighted_ipv", default_value=0.0) brd_weighted_ipv = fc.numeric_column("brd_weighted_ipv", default_value=0.0) cms_scale = fc.numeric_column("cms_scale", default_value=0.0) cms_scale_sqrt = fc.numeric_column("cms_scale_sqrt", default_value=0.0) # context feature matchScore = fc.numeric_column("matchScore", default_value=0.0) popScore = fc.numeric_column("popScore", default_value=0.0) brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0) cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0) catePrefer = fc.numeric_column("catePrefer", default_value=0.0) sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0) matchType = fc.indicator_column( fc.categorical_column_with_identity("matchType", 9, default_value=0)) position = fc.bucketized_column(fc.numeric_column("position", dtype=tf.int64, default_value=301), boundaries=[ 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 20, 30, 40, 50, 80, 100, 150, 200, 300 ]) triggerNum = fc.indicator_column( fc.categorical_column_with_identity("triggerNum", 41, default_value=40)) triggerRank = fc.indicator_column( fc.categorical_column_with_identity("triggerRank", 41, default_value=40)) sceneType = fc.indicator_column( fc.categorical_column_with_identity("type", 2, default_value=0)) hour = fc.indicator_column( fc.categorical_column_with_identity("hour", 24, default_value=0)) phoneBrandId = fc.categorical_column_with_hash_bucket("phoneBrand", 1000) phoneResolutionId = fc.categorical_column_with_hash_bucket( "phoneResolution", 500) phoneBrand = fc.embedding_column(phoneBrandId, 20) phoneResolution = fc.embedding_column(phoneResolutionId, 10) phoneOs = fc.indicator_column( fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0)) global my_feature_columns my_feature_columns = [ matchScore, matchType, position, triggerNum, triggerRank, sceneType, hour, phoneBrand, phoneResolution, phoneOs, popScore, sellerPrefer, brandPrefer, cate2Prefer, catePrefer, gender, age_class, has_baby, baby_gender, baby_age, grade, rfm_type, city, price, props_sex, brand_grade, cate1_price_prefer, cate2_price_prefer, cate3_price_prefer, modified_time, modified_time_sqrt, modified_time_square, shipment_rate, shipping_rate, ipv_ntile, pay_ntile, shop_visit_cnt, shop_visit_usr, c2id_embed, uv_cvr_1d, uv_cvr_1w, uv_cvr_2w, 
uv_cvr_1m, ctr_1d, ctr_1w, ctr_2w, ctr_1m, cvr_1d, cvr_1w, cvr_2w, cvr_1m, pay_qty_1d, pay_qty_1w, pay_qty_2w, pay_qty_1m, cat2_pay_qty, cat1_pay_qty, brd_pay_qty, slr_pay_qty_1d, slr_pay_qty_1w, slr_pay_qty_2w, slr_pay_qty_1m, slr_brd_pay_qty_1d, slr_brd_pay_qty_1w, slr_brd_pay_qty_2w, slr_brd_pay_qty_1m, weighted_ipv, cat1_weighted_ipv, cate_weighted_ipv, slr_weighted_ipv, brd_weighted_ipv, cms_scale, cms_scale_sqrt ] return my_feature_columns
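# A minimal usage sketch (an assumption, not part of the original code): every
# column returned by create_feature_columns() is dense-compatible (numeric,
# indicator, bucketized, embedding), so the whole list can back a DenseFeatures
# input layer of a Keras CTR-style model.
columns = create_feature_columns()
feature_layer = tf.keras.layers.DenseFeatures(columns)

model = tf.keras.Sequential([
    feature_layer,
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=[tf.keras.metrics.AUC()])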
def transform_from_code_gen(source_inputs): education_hash_fc = fc.categorical_column_with_hash_bucket( "education", hash_bucket_size=education_hash.hash_bucket_size) occupation_hash_fc = fc.categorical_column_with_hash_bucket( "occupation", hash_bucket_size=occupation_hash.hash_bucket_size) native_country_hash_fc = fc.categorical_column_with_hash_bucket( "native_country", hash_bucket_size=native_country_hash.hash_bucket_size) workclass_lookup_fc = fc.categorical_column_with_vocabulary_list( "workclass", vocabulary_list=workclass_lookup.vocabulary_list) marital_status_lookup_fc = fc.categorical_column_with_vocabulary_list( "marital_status", vocabulary_list=marital_status_lookup.vocabulary_list) relationship_lookup_fc = fc.categorical_column_with_vocabulary_list( "relationship", vocabulary_list=relationship_lookup.vocabulary_list) race_lookup_fc = fc.categorical_column_with_vocabulary_list( "race", vocabulary_list=race_lookup.vocabulary_list) sex_lookup_fc = fc.categorical_column_with_vocabulary_list( "sex", vocabulary_list=sex_lookup.vocabulary_list) age_bucketize_fc = fc.bucketized_column( fc.numeric_column("age"), boundaries=age_bucketize.boundaries) capital_gain_bucketize_fc = fc.bucketized_column( fc.numeric_column("capital_gain"), boundaries=capital_gain_bucketize.boundaries, ) capital_loss_bucketize_fc = fc.bucketized_column( fc.numeric_column("capital_loss"), boundaries=capital_loss_bucketize.boundaries, ) hours_per_week_bucketize_fc = fc.bucketized_column( fc.numeric_column("hours_per_week"), boundaries=hours_per_week_bucketize.boundaries, ) group1_fc = edl_fc.concatenated_categorical_column(categorical_columns=[ workclass_lookup_fc, hours_per_week_bucketize_fc, capital_gain_bucketize_fc, capital_loss_bucketize_fc, ]) group2_fc = edl_fc.concatenated_categorical_column(categorical_columns=[ education_hash_fc, marital_status_lookup_fc, relationship_lookup_fc, occupation_hash_fc, ]) group3_fc = edl_fc.concatenated_categorical_column(categorical_columns=[ age_bucketize_fc, sex_lookup_fc, race_lookup_fc, native_country_hash_fc, ]) group1_wide_embedding_fc = fc.embedding_column( group1_fc, dimension=group1_embedding_wide.output_dim, ) group2_wide_embedding_fc = fc.embedding_column( group2_fc, dimension=group2_embedding_wide.output_dim, ) group1_deep_embedding_fc = fc.embedding_column( group1_fc, dimension=group1_embedding_deep.output_dim, ) group2_deep_embedding_fc = fc.embedding_column( group2_fc, dimension=group2_embedding_deep.output_dim, ) group3_deep_embedding_fc = fc.embedding_column( group3_fc, dimension=group3_embedding_deep.output_dim, ) wide_feature_columns = [ group1_wide_embedding_fc, group2_wide_embedding_fc, ] deep_feature_columns = [ group1_deep_embedding_fc, group2_deep_embedding_fc, group3_deep_embedding_fc, ] return ( tf.keras.layers.DenseFeatures(wide_feature_columns)(source_inputs), tf.keras.layers.DenseFeatures(deep_feature_columns)(source_inputs), )
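# A minimal wide & deep assembly sketch (assumed wiring, not from the original
# source): transform_from_code_gen returns one dense tensor for the wide part
# and one for the deep part, which can be combined into a single sigmoid output.
def wide_and_deep_model(source_inputs):
    wide, deep = transform_from_code_gen(source_inputs)
    deep = tf.keras.layers.Dense(64, activation='relu')(deep)
    deep = tf.keras.layers.Dense(32, activation='relu')(deep)
    combined = tf.keras.layers.concatenate([wide, deep])
    logits = tf.keras.layers.Dense(1)(combined)
    return tf.keras.layers.Activation('sigmoid')(logits)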
label_key='charged_off', num_epochs=5, shuffle=True, batch_size=20000) #300000 #230934 #val_inpf = functools.partial(easy_input_function, val_df, label_key='charged_off', num_epochs=1, shuffle=False, batch_size=val_df.shape[0]) #200000 test_inpf = functools.partial(easy_input_function, test_df, label_key='charged_off', num_epochs=1, shuffle=False, batch_size=test_df.shape[0]) #200000 ################################################################### #DEFINE ALL NUMERIC COLUMNS loan_amnt = fc.numeric_column('loan_amnt') term = fc.numeric_column('term') installment = fc.numeric_column('installment') emp_length = fc.numeric_column('emp_length') dti = fc.numeric_column('dti') earliest_cr_line = fc.numeric_column('earliest_cr_line') open_acc = fc.numeric_column('open_acc') pub_rec = fc.numeric_column('pub_rec') revol_util = fc.numeric_column('revol_util') total_acc = fc.numeric_column('total_acc') mort_acc = fc.numeric_column('mort_acc') pub_rec_bankruptcies = fc.numeric_column('pub_rec_bankruptcies') log_annual_inc = fc.numeric_column('log_annual_inc') fico_score = fc.numeric_column('fico_score') log_revol_bal = fc.numeric_column('log_revol_bal')
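# A minimal usage sketch (assumed; `train_inpf` is a hypothetical name for the
# training input-function partial whose tail is visible at the top of this
# excerpt): the numeric columns feed a canned linear classifier for the binary
# 'charged_off' target.
classifier = tf.estimator.LinearClassifier(
    feature_columns=[loan_amnt, term, installment, emp_length, dti,
                     earliest_cr_line, open_acc, pub_rec, revol_util,
                     total_acc, mort_acc, pub_rec_bankruptcies,
                     log_annual_inc, fico_score, log_revol_bal])
classifier.train(train_inpf)
print(classifier.evaluate(test_inpf))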
if isinstance(x_train, np.ndarray):
    print("data has been loaded as a numpy array")

features = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
x_train_df = pd.DataFrame(data=x_train, columns=features)
x_test_df = pd.DataFrame(data=x_test, columns=features)
y_train_df = pd.DataFrame(data=y_train, columns=['price'])
y_test_df = pd.DataFrame(data=y_test, columns=['price'])
print(x_train_df.head())

feature_columns = []
for feature_name in features:
    feature_columns.append(
        feature_column.numeric_column(feature_name, dtype=tf.float32))


# We have to create an input pipeline using tf.data.
def estimator_input_fn(df_data, df_label, epochs=10, shuffle=True, batch_size=32):
    def input_function():
        ds = tf.data.Dataset.from_tensor_slices((dict(df_data), df_label))
        if shuffle:
            ds = ds.shuffle(100)
        ds = ds.batch(batch_size).repeat(epochs)
        return ds
    return input_function
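# A minimal usage sketch (assumed, not part of the original snippet): wire the
# input functions into a canned estimator for the regression target 'price'.
train_input_fn = estimator_input_fn(x_train_df, y_train_df)
eval_input_fn = estimator_input_fn(x_test_df, y_test_df, epochs=1, shuffle=False)

linear_est = tf.estimator.LinearRegressor(feature_columns=feature_columns)
linear_est.train(train_input_fn, steps=100)
print(linear_est.evaluate(eval_input_fn))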
labels_one_hot = to_categorical(labels, num_classes=40) ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels_one_hot)) if shuffle: ds = ds.shuffle(buffer_size=len(dataframe)) ds = ds.batch(batch_size) return ds batch_size = 20 train_ds = df_to_dataset(train, batch_size=batch_size) val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size) test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size) feature_columns = [] fc_age = feature_column.numeric_column('age') fc_age_buckets = feature_column.bucketized_column(fc_age, boundaries=[20, 30]) feature_columns.append(fc_age_buckets) fc_gender = feature_column.categorical_column_with_vocabulary_list( 'gender', ['Male', 'Female']) fc_gender_one_hot = feature_column.indicator_column(fc_gender) feature_columns.append(fc_gender_one_hot) fc_emotion = feature_column.categorical_column_with_vocabulary_list( 'emotion', ['Happy', 'Sad', 'Fear', 'Disgust', 'Anger', 'Surprice']) fc_emotion_one_hot = feature_column.indicator_column(fc_emotion) feature_columns.append(fc_emotion_one_hot) fc_color = feature_column.categorical_column_with_vocabulary_list( 'color', ['Red', 'Blue', 'Green', 'White', 'Gray'])
def create_feature_columns(self): feature_columns = [] numeric_cols = [ 'owner_influence', 'is_commented_by_connections', 'is_liked_by_me', 'is_liked_by_connections', 'poster_gender', 'poster_influence', 'participant1_gender', 'participant1_influence', 'participant2_gender', 'participant2_influence', 'participant3_gender', 'participant3_influence' ] # numeric cols for header in numeric_cols: feature_columns.append(feature_column.numeric_column(header)) # bucketized columns # age step = len(self.df.age) // 8 sorted_ages = sorted(self.df.age) age_boundaries = [sorted_ages[i * step] for i in range(1, 8)] age = feature_column.numeric_column("age") age_buckets = feature_column.bucketized_column( age, boundaries=age_boundaries) feature_columns.append(age_buckets) # number_of_likes likes_num = feature_column.numeric_column("number_of_likes") likes_num_buckets = feature_column.bucketized_column( likes_num, boundaries=[2, 5, 10, 20, 50, 100]) feature_columns.append(likes_num_buckets) # number_of_comments comments_num = feature_column.numeric_column("number_of_comments") comments_num_buckets = feature_column.bucketized_column( comments_num, boundaries=[1, 2, 5, 10, 20, 50, 100]) feature_columns.append(comments_num_buckets) # indicator columns for categorical features app_type = feature_column.categorical_column_with_vocabulary_list( 'app_type', self.df.app_type.unique()) app_type_1hot = feature_column.indicator_column(app_type) feature_columns.append(app_type_1hot) owner_type = feature_column.categorical_column_with_vocabulary_list( 'owner_type', self.df.owner_type.unique()) owner_type_1hot = feature_column.indicator_column(owner_type) feature_columns.append(owner_type_1hot) poster_focus = feature_column.categorical_column_with_vocabulary_list( 'poster_focus', [ 'engineering', 'sales', 'marketing', 'management', 'financial', 'other' ]) poster_focus_1hot = feature_column.indicator_column(poster_focus) feature_columns.append(poster_focus_1hot) # functions to reduce code duplication def participant_action(part_action): participant_action = feature_column.categorical_column_with_vocabulary_list( part_action, ['commented', 'liked', 'viewed']) return participant_action def participant_focus(part_f): participant_focus = feature_column.categorical_column_with_vocabulary_list( part_f, [ 'engineering', 'sales', 'marketing', 'management', 'financial', 'other', 'none' ]) return participant_focus participant1_action = participant_action("participant1_action") participant2_action = participant_action("participant2_action") participant3_action = participant_action("participant3_action") participant1_focus = participant_focus("participant1_focus") participant2_focus = participant_focus("participant2_focus") participant3_focus = participant_focus("participant3_focus") feature_columns.append( feature_column.indicator_column(participant1_action)) feature_columns.append( feature_column.indicator_column(participant1_focus)) feature_columns.append( feature_column.indicator_column(participant2_action)) feature_columns.append( feature_column.indicator_column(participant2_focus)) feature_columns.append( feature_column.indicator_column(participant3_action)) feature_columns.append( feature_column.indicator_column(participant3_focus)) # feature crosses for participant action and focus crossed_feature1 = feature_column.crossed_column( [participant1_action, participant1_focus], hash_bucket_size=1000) crossed_feature1 = feature_column.indicator_column(crossed_feature1) feature_columns.append(crossed_feature1) crossed_feature2 = 
feature_column.crossed_column( [participant2_action, participant2_focus], hash_bucket_size=1000) crossed_feature2 = feature_column.indicator_column(crossed_feature2) feature_columns.append(crossed_feature2) crossed_feature3 = feature_column.crossed_column( [participant3_action, participant3_focus], hash_bucket_size=1000) crossed_feature3 = feature_column.indicator_column(crossed_feature3) feature_columns.append(crossed_feature3) self.feature_columns = feature_columns
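# A minimal sketch of how these columns might be consumed (assumed usage, not
# shown in the original class): a DenseFeatures layer turns the stored columns
# into the input of a Keras binary classifier.
def build_model(self):
    feature_layer = tf.keras.layers.DenseFeatures(self.feature_columns)
    model = tf.keras.Sequential([
        feature_layer,
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model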
print("Every feature:", list(feature_batch.keys())) print("A batch of ages", feature_batch["Age"]) print("A batch of targets", label_batch) example_batch = next(iter(train_ds))[0] # 一个批量的数据 def demo(feature_column): # 显示相关数据 # 进行显示 feature_layer = layers.DenseFeatures(feature_columns=feature_column) print(feature_layer(example_batch).numpy()) feature_columns = [] # ---------------------------------------数值列-------------------------------------------- for header in ["PhotoAmt", "Fee", "Age"]: feature_columns.append(feature_column.numeric_column(header)) # 测试 # photo_count = feature_column.numeric_column('PhotoAmt') # demo(photo_count) # --------------------------------------分桶列-------------------------------------------------- age = feature_column.numeric_column(key="Age") age_buckets = feature_column.bucketized_column(source_column=age, boundaries=[1, 2, 3, 4, 5]) # 测试 # demo(age_buckets) feature_columns.append(age_buckets) # --------------------------------------种类列-------------------------------------------------- animal_type = feature_column.categorical_column_with_vocabulary_list( key="Type", vocabulary_list=["Cat", "Dog"])
if options['distribute']: return dataset else: return dataset.make_one_shot_iterator().get_next() return _input_fn # # The input layer: See Feature_Engineering.ipynb for explanations # from tensorflow.feature_column import numeric_column from tensorflow.feature_column import crossed_column from tensorflow.feature_column import indicator_column from tensorflow.feature_column import categorical_column_with_identity from tensorflow_transform.tf_metadata import dataset_schema beta1 = numeric_column('beta1') beta2 = numeric_column('beta2') weekday = categorical_column_with_identity('weekday', num_buckets=7) hour = categorical_column_with_identity('hour', num_buckets=24) hour_of_week = indicator_column(crossed_column([weekday, hour], 24 * 7)) all_feature_columns = [beta1, beta2, hour_of_week] def input_layer(features): return tf.feature_column.input_layer(features, feature_columns=all_feature_columns)
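# A minimal sketch (assumed, not part of the original notebook) of how
# `input_layer` plugs into a TF1-style estimator model_fn: the dense tensor it
# returns feeds an ordinary output stack.
def simple_model_fn(features, labels, mode):
    net = input_layer(features)                       # [batch, 2 + 24*7] dense tensor
    logits = tf.compat.v1.layers.dense(net, units=1)  # linear output on top of the features
    predictions = tf.squeeze(logits, axis=-1)
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions={'pred': predictions})
    loss = tf.compat.v1.losses.mean_squared_error(labels, predictions)
    optimizer = tf.compat.v1.train.AdamOptimizer()
    train_op = optimizer.minimize(loss, global_step=tf.compat.v1.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)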
# ax.scatter3D(data_train["Temp"], data_train["QDot"], data_train["HTC"], c='k', marker='x', label="Train") # ax.set_xlabel("Temp") # ax.set_ylabel("QDot") # ax.set_zlabel("HTC") # plt.legend() #plt.show() data_train.head() #%% # Feature Columns: As for any other TF estimator, data needs to be passed to the estimator, # which is typically via an input_fn and parsed using FeatureColumns. feature_columns = [ fc.numeric_column("Temp"), fc.numeric_column("QDot") ] # creating input_fn: As for any other estimator, you can use an input_fn to feed data to the # model for training and evaluation. TFL estimators automatically calculate quantiles of # the features and use them as input keypoints for the PWL calibration layer. To do so, # they require passing a feature_analysis_input_fn, which is similar to the training # input_fn but with a single epoch or a subsample of the data. train_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn( x=data_train[data_train.columns[:2]], y=data_train["HTC"], shuffle=False, batch_size=BATCH_SIZE, num_epochs=NUM_EPOCHS, num_threads=1
#demo(feature_column.indicator_column(crossed_feature)) feature_columns = [] #Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset #feature_columns.append(gender_one_hot) #feature_columns.append(age_buckets) #feature_columns.append(age) # numeric cols for header in numeric_columns: print('Printing header') print(header) feature_columns.append( feature_column.numeric_column(header, dtype=tf.float64)) # bucketized cols age = feature_column.numeric_column('Age') age_buckets = feature_column.bucketized_column( age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90]) gender = feature_column.categorical_column_with_vocabulary_list( 'Gender', ['Male', 'Female']) gender_one_hot = feature_column.indicator_column(gender) feature_columns.append(age_buckets) feature_columns.append(gender_one_hot)
train_ds = df_to_dataset(train, shuffle=True, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=True, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=True, batch_size=batch_size)

for feature_batch, label_batch in train_ds.take(1):
    print("density feature", feature_batch['density'])

example_batch = next(iter(train_ds))[0]


def demo(feature_column):
    feature_layer = layers.DenseFeatures(feature_column)
    print(feature_layer(example_batch).numpy())


dioxide = feature_column.numeric_column('free_sulfur_dioxide')
demo(dioxide)

dioxide_buckets = feature_column.bucketized_column(
    dioxide, boundaries=[2, 6, 10, 14, 18, 24, 30, 50])
demo(dioxide_buckets)

# fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  free sulfur dioxide  total sulfur dioxide  density  pH  sulphates  alcohol  quality
# 0  7.4  0.700  0.00  1.9  0.076  11.0  34.0  0.9978  3.51  0.56  9.4  5
feature_columns = []
for header in [
        'fixed_acidity', 'volatile_acidity', 'residual_sugar', 'chlorides',
        'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'pH',
        'sulphates', 'alcohol'
print('Target group: ', label_batch)

# We will use this batch to demonstrate several types of feature columns
example_batch = next(iter(train_ds))[0]


# A utility method to create a feature column
# and to transform a batch of data
def demo(feature_column):
    feature_layer = layers.DenseFeatures(feature_column)
    print(feature_layer(example_batch).numpy())


feature_columns = []

# We create a different column for each feature to inspect its transformed output
age = feature_column.numeric_column("edad")
demo(age)

age_buckets = feature_column.bucketized_column(age,
                                               boundaries=[16, 25, 30, 35, 40, 45, 50])
demo(age_buckets)
feature_columns.append(age_buckets)

genero = feature_column.categorical_column_with_vocabulary_list(
    'genero', ['Hombre', 'Mujer'])
genero_one_hot = feature_column.indicator_column(genero)
demo(genero_one_hot)
feature_columns.append(genero_one_hot)

ubicacion = feature_column.categorical_column_with_vocabulary_list(
    'ubicacion', ['Pueblo', 'Ciudad'])
def get_compiled_model(headers, targetGroup, denseNum):
    # Build one numeric feature column per header (kept for reference; the
    # dense model below consumes the raw numeric inputs directly).
    feature_columns = []
    for header in headers:
        feature_columns.append(feature_column.numeric_column(header))
    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

    if len(targetGroup) > 2:
        # Multi-class case: one softmax unit per target class.
        denseCount = len(targetGroup)
        model = keras.Sequential([
            keras.layers.Dense(64, activation='relu', input_dim=len(headers)),
            keras.layers.Dense(64, activation='relu'),
            # AlphaDropout keeps the mean and variance of its inputs unchanged,
            # preserving the self-normalizing property.
            keras.layers.AlphaDropout(rate=0.5),
            keras.layers.Dense(denseCount, activation='softmax')
        ])
        # The final layer already applies softmax, so the loss is computed
        # from probabilities rather than logits.
        model.compile(optimizer='adam',
                      loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                      metrics=['accuracy'])
    else:
        # Binary case: a single sigmoid unit.
        model = keras.Sequential([
            keras.layers.Dense(64, activation='relu', input_dim=len(headers)),
            keras.layers.Dense(64, activation='relu'),
            keras.layers.AlphaDropout(rate=0.5),
            keras.layers.Dense(1, activation='sigmoid')
        ])
        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'],
                      run_eagerly=True)

    return model
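# A minimal usage sketch (assumed names, not part of the original code:
# `x_data` is a float feature matrix with one column per entry in `headers`,
# `y_data` holds integer class ids or 0/1 labels):
model = get_compiled_model(headers, targetGroup, denseNum)
model.fit(x_data, y_data, epochs=5, validation_split=0.2)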
print(numeric_data)

print('GENDER============>')
gender = feature_column.categorical_column_with_vocabulary_list(
    'Gender', ['Male', 'Female'])
gender_one_hot = feature_column.indicator_column(gender)
print('Gender one hot')
print(gender_one_hot)

print('AGE==============>')
age = feature_column.numeric_column('Age')
print("Demo AGE")
demo(age)
age_buckets = feature_column.bucketized_column(
    age,
    boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90])
print(age_buckets)

# Feature columns are transformation specs, not data, so they cannot be wrapped
# in a DataFrame directly; use the DenseFeatures-based demo helper to inspect
# the transformed values instead.
print('DEBUG==========>')
demo(age_buckets)
print('Gender one hot')
demo(gender_one_hot)
def main(_): # Parse configs updates from command line flags. config_updates = [] for update in FLAGS.config_updates: config_updates.extend(re.findall(r'(\S*)\s*=\s*(\S*)', update)) # UCI Statlog (Heart) dataset. csv_file = tf.keras.utils.get_file( 'heart.csv', 'http://storage.googleapis.com/download.tensorflow.org/data/heart.csv') df = pd.read_csv(csv_file) target = df.pop('target') train_size = int(len(df) * 0.8) train_x = df[:train_size] train_y = target[:train_size] test_x = df[train_size:] test_y = target[train_size:] # feature_analysis_input_fn is used to collect statistics about the input # features, thus requiring only one loop of the dataset. # # feature_analysis_input_fn is required if you have at least one FeatureConfig # with "pwl_calibration_input_keypoints='quantiles'". Note that 'quantiles' is # default keypoints configuration so most likely you'll need it. feature_analysis_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn( x=train_x, y=train_y, shuffle=False, batch_size=FLAGS.batch_size, num_epochs=1, num_threads=1) # prefitting_input_fn is used to prefit an initial ensemble that is used to # estimate feature interactions. This prefitting step does not need to fully # converge and thus requiring fewer epochs than the main training. # # prefitting_input_fn is only required if your model_config is # CalibratedLatticeEnsembleConfig with "lattices='crystals'" prefitting_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn( x=train_x, y=train_y, shuffle=True, batch_size=FLAGS.batch_size, num_epochs=FLAGS.prefitting_num_epochs, num_threads=1) train_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn( x=train_x, y=train_y, shuffle=True, batch_size=FLAGS.batch_size, num_epochs=FLAGS.num_epochs, num_threads=1) test_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn( x=test_x, y=test_y, shuffle=False, batch_size=FLAGS.batch_size, num_epochs=FLAGS.num_epochs, num_threads=1) # Feature columns. # - age # - sex # - cp chest pain type (4 values) # - trestbps resting blood pressure # - chol serum cholestoral in mg/dl # - fbs fasting blood sugar > 120 mg/dl # - restecg resting electrocardiographic results (values 0,1,2) # - thalach maximum heart rate achieved # - exang exercise induced angina # - oldpeak ST depression induced by exercise relative to rest # - slope the slope of the peak exercise ST segment # - ca number of major vessels (0-3) colored by flourosopy # - thal 3 = normal; 6 = fixed defect; 7 = reversable defect feature_columns = [ fc.numeric_column('age', default_value=-1), fc.categorical_column_with_vocabulary_list('sex', [0, 1]), fc.numeric_column('cp'), fc.numeric_column('trestbps', default_value=-1), fc.numeric_column('chol'), fc.categorical_column_with_vocabulary_list('fbs', [0, 1]), fc.categorical_column_with_vocabulary_list('restecg', [0, 1, 2]), fc.numeric_column('thalach'), fc.categorical_column_with_vocabulary_list('exang', [0, 1]), fc.numeric_column('oldpeak'), fc.categorical_column_with_vocabulary_list('slope', [0, 1, 2]), fc.numeric_column('ca'), fc.categorical_column_with_vocabulary_list( 'thal', ['normal', 'fixed', 'reversible']), ] # Feature configs are used to specify how each feature is calibrated and used. feature_configs = [ configs.FeatureConfig( name='age', lattice_size=3, # By default, input keypoints of pwl are quantiles of the feature. pwl_calibration_num_keypoints=5, monotonicity='increasing', pwl_calibration_clip_max=100, ), configs.FeatureConfig( name='cp', pwl_calibration_num_keypoints=4, # Keypoints can be uniformly spaced. 
pwl_calibration_input_keypoints='uniform', monotonicity='increasing', ), configs.FeatureConfig( name='chol', # Explicit input keypoint initialization. pwl_calibration_input_keypoints=[ 126.0, 210.0, 247.0, 286.0, 564.0 ], monotonicity='increasing', pwl_calibration_clip_min=130, # Calibration can be forced to span the full output range by clamping. pwl_calibration_clamp_min=True, pwl_calibration_clamp_max=True, # Per feature regularization. regularizer_configs=[ configs.RegularizerConfig(name='calib_hessian', l2=1e-4), ], ), configs.FeatureConfig( name='fbs', # Monotonicity: output for 1 should be larger than output for 0. monotonicity=[(0, 1)], ), configs.FeatureConfig( name='trestbps', pwl_calibration_num_keypoints=5, monotonicity='decreasing', ), configs.FeatureConfig( name='thalach', pwl_calibration_num_keypoints=5, monotonicity='decreasing', ), configs.FeatureConfig( name='restecg', # Categorical monotonicity can be partial order. monotonicity=[(0, 1), (0, 2)], ), configs.FeatureConfig( name='exang', monotonicity=[(0, 1)], ), configs.FeatureConfig( name='oldpeak', pwl_calibration_num_keypoints=5, monotonicity='increasing', ), configs.FeatureConfig( name='slope', monotonicity=[(0, 1), (1, 2)], ), configs.FeatureConfig( name='ca', pwl_calibration_num_keypoints=4, monotonicity='increasing', ), configs.FeatureConfig( name='thal', monotonicity=[('normal', 'fixed'), ('normal', 'reversible')], ), ] # Serving input fn is used to create saved models. serving_input_fn = ( tf.estimator.export.build_parsing_serving_input_receiver_fn( feature_spec=fc.make_parse_example_spec(feature_columns))) # Model config defines the model structure for the estimator. # This is calibrated linear model with output calibration: Inputs are # calibrated, linearly combined and the output of the linear layer is # calibrated again using a PWL function. model_config = configs.CalibratedLinearConfig( feature_configs=feature_configs, use_bias=True, output_calibration=True, regularizer_configs=[ # Regularizer for the output calibrator. configs.RegularizerConfig(name='output_calib_hessian', l2=1e-4), ]) # Update model configuration. # See tfl.configs.apply_updates for details. configs.apply_updates(model_config, config_updates) estimator = estimators.CannedClassifier( feature_columns=feature_columns, model_config=model_config, feature_analysis_input_fn=feature_analysis_input_fn, optimizer=tf.keras.optimizers.Adam(FLAGS.learning_rate)) estimator.train(input_fn=train_input_fn) results = estimator.evaluate(input_fn=test_input_fn) print('Calibrated linear results: {}'.format(results)) print('Calibrated linear model exported to {}'.format( estimator.export_saved_model(estimator.model_dir, serving_input_fn))) # This is calibrated lattice model: Inputs are calibrated, then combined # non-linearly using a lattice layer. model_config = configs.CalibratedLatticeConfig( feature_configs=feature_configs, regularizer_configs=[ # Torsion regularizer applied to the lattice to make it more linear. configs.RegularizerConfig(name='torsion', l2=1e-4), # Globally defined calibration regularizer is applied to all features.
configs.RegularizerConfig(name='calib_hessian', l2=1e-4), ]) estimator = estimators.CannedClassifier( feature_columns=feature_columns, model_config=model_config, feature_analysis_input_fn=feature_analysis_input_fn, optimizer=tf.keras.optimizers.Adam(FLAGS.learning_rate)) estimator.train(input_fn=train_input_fn) results = estimator.evaluate(input_fn=test_input_fn) print('Calibrated lattice results: {}'.format(results)) print('Calibrated lattice model exported to {}'.format( estimator.export_saved_model(estimator.model_dir, serving_input_fn))) # This is random lattice ensemble model with separate calibration: # model output is the average output of separately calibrated lattices. model_config = configs.CalibratedLatticeEnsembleConfig( feature_configs=feature_configs, num_lattices=6, lattice_rank=5, separate_calibrators=True, regularizer_configs=[ # Torsion regularizer applied to the lattice to make it more linear. configs.RegularizerConfig(name='torsion', l2=1e-4), # Globally defined calibration regularizer is applied to all features. configs.RegularizerConfig(name='calib_hessian', l2=1e-4), ]) configs.apply_updates(model_config, config_updates) estimator = estimators.CannedClassifier( feature_columns=feature_columns, model_config=model_config, feature_analysis_input_fn=feature_analysis_input_fn, optimizer=tf.keras.optimizers.Adam(FLAGS.learning_rate)) estimator.train(input_fn=train_input_fn) results = estimator.evaluate(input_fn=test_input_fn) print('Random ensemble results: {}'.format(results)) print('Random ensemble model exported to {}'.format( estimator.export_saved_model(estimator.model_dir, serving_input_fn))) # This is Crystals ensemble model with separate calibration: model output is # the average output of separately calibrated lattices. # Crystals algorithm first trains a prefitting model and uses the interactions # between features to form the final lattice ensemble. model_config = configs.CalibratedLatticeEnsembleConfig( feature_configs=feature_configs, # Using Crystals algorithm. lattices='crystals', num_lattices=6, lattice_rank=5, separate_calibrators=True, regularizer_configs=[ # Torsion regularizer applied to the lattice to make it more linear. configs.RegularizerConfig(name='torsion', l2=1e-4), # Globally defined calibration regularizer is applied to all features. configs.RegularizerConfig(name='calib_hessian', l2=1e-4), ]) configs.apply_updates(model_config, config_updates) estimator = estimators.CannedClassifier( feature_columns=feature_columns, model_config=model_config, feature_analysis_input_fn=feature_analysis_input_fn, # prefitting_input_fn is required to train the prefitting model. prefitting_input_fn=prefitting_input_fn, optimizer=tf.keras.optimizers.Adam(FLAGS.learning_rate)) estimator.train(input_fn=train_input_fn) results = estimator.evaluate(input_fn=test_input_fn) print('Crystals ensemble results: {}'.format(results)) print('Crystals ensemble model exported to {}'.format( estimator.export_saved_model(estimator.model_dir, serving_input_fn)))
print('A batch of targets:', label_batch) # We will use this batch to demonstrate several types of feature columns example_batch = next(iter(train_ds))[0] # A utility method to create a feature column # and to transform a batch of data def demo(feature_column): feature_layer = layers.DenseFeatures(feature_column) print(feature_layer(example_batch).numpy()) #Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset age = feature_column.numeric_column('Age') print("Demo AGE") demo(age) age_buckets = feature_column.bucketized_column( age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 75, 85, 90]) print('Demo AGE buckets') demo(age_buckets) gender = feature_column.categorical_column_with_vocabulary_list( 'Gender', ['Male', 'Female']) gender_one_hot = feature_column.indicator_column(gender) print('Gender one hot') demo(gender_one_hot)
#for feature_batch, label_batch in train_ds.take(1): #print('Every feature:', list(feature_batch.keys())) #print('A batch of YTG:', feature_batch['YardsToGo']) #print('A batch of targets:', label_batch) #print(tf.shape(label_batch)) example_batch = next(iter(train_ds))[0] def demo(feature_column): feature_layer = layers.DenseFeatures(feature_column) print(feature_layer(example_batch).numpy()) feature_columns = [] for header in ['Down', 'ScoreDifferential', 'CurrentQuarter', 'OffenseSPRank', 'RushRank', 'PassRank', 'OpponentPlay', 'YardsToGo', 'YardsToGoal', 'Location']: feature_columns.append(feature_column.numeric_column(header)) Down = feature_column.numeric_column("Down") Quarter = feature_column.numeric_column("CurrentQuarter") Score = feature_column.numeric_column("ScoreDifferential") SP = feature_column.numeric_column("OffenseSPRank") RushR = feature_column.numeric_column("RushRank") PassR = feature_column.numeric_column("PassRank") YTGo = feature_column.numeric_column("YardsToGo") YTGoal = feature_column.numeric_column("YardsToGoal") Play = feature_column.categorical_column_with_vocabulary_list('OpponentPlay', ['Pass', 'Run']) Location = feature_column.categorical_column_with_vocabulary_list('Location', ['Home', 'Neutral', 'Away']) Down_buckets = feature_column.bucketized_column(Down, boundaries=[4]) feature_columns.append(Down_buckets) Quarter_buckets = feature_column.bucketized_column(Quarter, boundaries=[2, 3, 4])
def create_feature_columns(): c2id = fc.categorical_column_with_hash_bucket("cate2Id", 5000, dtype=tf.int64) modified_time = fc.numeric_column("modified_time", default_value=0.0) modified_time_sqrt = fc.numeric_column("modified_time_sqrt", default_value=0.0) modified_time_square = fc.numeric_column("modified_time_square", default_value=0.0) props_sex = fc.indicator_column( fc.categorical_column_with_vocabulary_list("props_sex", ["男", "女", "通用", "情侣"], default_value=0)) brand_grade = fc.indicator_column( fc.categorical_column_with_vocabulary_list( "brand_grade", ["A类品牌", "B类品牌", "C类品牌", "D类品牌"], default_value=0)) shipment_rate = fc.numeric_column("shipment_rate", default_value=0.0) shipping_rate = fc.numeric_column("shipping_rate", default_value=0.0) ipv_ntile = fc.bucketized_column( fc.numeric_column("ipv_ntile", dtype=tf.int64, default_value=99), boundaries=[1, 2, 3, 4, 5, 10, 20, 50, 80]) pay_ntile = fc.bucketized_column( fc.numeric_column("pay_ntile", dtype=tf.int64, default_value=99), boundaries=[1, 2, 3, 4, 5, 10, 20, 50, 80]) price = fc.numeric_column("price_norm", default_value=0.0) ctr_1d = fc.numeric_column("ctr_1d", default_value=0.0) cvr_1d = fc.numeric_column("cvr_1d", default_value=0.0) uv_cvr_1d = fc.numeric_column("uv_cvr_1d", default_value=0.0) ctr_1w = fc.numeric_column("ctr_1w", default_value=0.0) cvr_1w = fc.numeric_column("cvr_1w", default_value=0.0) uv_cvr_1w = fc.numeric_column("uv_cvr_1w", default_value=0.0) ctr_2w = fc.numeric_column("ctr_2w", default_value=0.0) cvr_2w = fc.numeric_column("cvr_2w", default_value=0.0) uv_cvr_2w = fc.numeric_column("uv_cvr_2w", default_value=0.0) ctr_1m = fc.numeric_column("ctr_1m", default_value=0.0) cvr_1m = fc.numeric_column("cvr_1m", default_value=0.0) uv_cvr_1m = fc.numeric_column("uv_cvr_1m", default_value=0.0) pay_qty_1d = fc.numeric_column("pay_qty_1d", default_value=0.0) pay_qty_1w = fc.numeric_column("pay_qty_1w", default_value=0.0) pay_qty_2w = fc.numeric_column("pay_qty_2w", default_value=0.0) pay_qty_1m = fc.numeric_column("pay_qty_1m", default_value=0.0) cat2_pay_qty = fc.numeric_column("cat2_pay_qty_1d", default_value=0.0) cat1_pay_qty = fc.numeric_column("cat1_pay_qty_1d", default_value=0.0) brd_pay_qty = fc.numeric_column("brd_pay_qty_1d", default_value=0.0) slr_pay_qty_1d = fc.numeric_column("slr_pay_qty_1d", default_value=0.0) slr_pay_qty_1w = fc.numeric_column("slr_pay_qty_1w", default_value=0.0) slr_pay_qty_2w = fc.numeric_column("slr_pay_qty_2w", default_value=0.0) slr_pay_qty_1m = fc.numeric_column("slr_pay_qty_1m", default_value=0.0) slr_brd_pay_qty_1d = fc.numeric_column("slr_brd_pay_qty_1d", default_value=0.0) slr_brd_pay_qty_1w = fc.numeric_column("slr_brd_pay_qty_1w", default_value=0.0) slr_brd_pay_qty_2w = fc.numeric_column("slr_brd_pay_qty_2w", default_value=0.0) slr_brd_pay_qty_1m = fc.numeric_column("slr_brd_pay_qty_1m", default_value=0.0) weighted_ipv = fc.numeric_column("weighted_ipv", default_value=0.0) cat1_weighted_ipv = fc.numeric_column("cat1_weighted_ipv", default_value=0.0) cate_weighted_ipv = fc.numeric_column("cate_weighted_ipv", default_value=0.0) slr_weighted_ipv = fc.numeric_column("slr_weighted_ipv", default_value=0.0) brd_weighted_ipv = fc.numeric_column("brd_weighted_ipv", default_value=0.0) cms_scale = fc.numeric_column("cms_scale", default_value=0.0) cms_scale_sqrt = fc.numeric_column("cms_scale_sqrt", default_value=0.0) # context feature matchScore = fc.numeric_column("matchScore", default_value=0.0) popScore = fc.numeric_column("popScore", default_value=0.0) brandPrefer = 
fc.numeric_column("brandPrefer", default_value=0.0) cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0) catePrefer = fc.numeric_column("catePrefer", default_value=0.0) sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0) matchType = fc.indicator_column( fc.categorical_column_with_identity("matchType", 9, default_value=0)) position = fc.bucketized_column(fc.numeric_column("position", dtype=tf.int64, default_value=301), boundaries=[ 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 20, 30, 40, 50, 80, 100, 150, 200, 300 ]) triggerNum = fc.indicator_column( fc.categorical_column_with_identity("triggerNum", 41, default_value=40)) triggerRank = fc.indicator_column( fc.categorical_column_with_identity("triggerRank", 41, default_value=40)) sceneType = fc.indicator_column( fc.categorical_column_with_identity("type", 2, default_value=0)) hour = fc.indicator_column( fc.categorical_column_with_identity("hour", 24, default_value=0)) phoneBrandId = fc.categorical_column_with_hash_bucket("phoneBrand", 1000) phoneBrand = fc.shared_embedding_columns([phoneBrandId], 20) phoneResolutionId = fc.categorical_column_with_hash_bucket( "phoneResolution", 500) phoneResolution = fc.shared_embedding_columns([phoneResolutionId], 10) phoneOs = fc.indicator_column( fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0)) tab = fc.indicator_column( fc.categorical_column_with_vocabulary_list("tab", [ "ALL", "TongZhuang", "XieBao", "MuYing", "NvZhuang", "MeiZhuang", "JuJia", "MeiShi" ], default_value=0)) c2id_embed = fc.shared_embedding_columns( [c2id], 16, shared_embedding_collection_name="c2id") feature_columns = [ matchScore, matchType, position, triggerNum, triggerRank, sceneType, hour, phoneOs, tab, popScore, sellerPrefer, brandPrefer, cate2Prefer, catePrefer, price, props_sex, brand_grade, modified_time, modified_time_sqrt, modified_time_square, shipment_rate, shipping_rate, ipv_ntile, pay_ntile, uv_cvr_1d, uv_cvr_1w, uv_cvr_2w, uv_cvr_1m, ctr_1d, ctr_1w, ctr_2w, ctr_1m, cvr_1d, cvr_1w, cvr_2w, cvr_1m, pay_qty_1d, pay_qty_1w, pay_qty_2w, pay_qty_1m, cat2_pay_qty, cat1_pay_qty, brd_pay_qty, slr_pay_qty_1d, slr_pay_qty_1w, slr_pay_qty_2w, slr_pay_qty_1m, slr_brd_pay_qty_1d, slr_brd_pay_qty_1w, slr_brd_pay_qty_2w, slr_brd_pay_qty_1m, weighted_ipv, cat1_weighted_ipv, cate_weighted_ipv, slr_weighted_ipv, brd_weighted_ipv, cms_scale, cms_scale_sqrt ] feature_columns += c2id_embed feature_columns += phoneResolution feature_columns += phoneBrand return feature_columns
import pandas as pd import numpy as np import tensorflow as tf from tensorflow import feature_column as fc from tensorflow.python.lib.io import file_io from tensorflow.python.ops import init_ops from .eval_metrics import AverageNClass, HitAtOne N_CLASS = 3862 BATCH_SIZE = 1024 VOCAB_FILE = "data/vocabulary.csv" # Exclude audio feature since we didn't implement audio feature extraction. # Even if the model can be trained on audio feature, # they won't be available for inference on new video. FEAT_COL_VIDEO = [ fc.numeric_column(key="mean_rgb", shape=(1024, ), dtype=tf.float32), #fc.numeric_column(key="mean_audio", shape=(128,), dtype=tf.float32), fc.indicator_column( fc.categorical_column_with_identity(key="labels", num_buckets=N_CLASS)) ] FEAT_X = ["mean_rgb"] FEAT_SPEC_VIDEO = fc.make_parse_example_spec(FEAT_COL_VIDEO) MULTI_HOT_ENCODER = tf.keras.layers.DenseFeatures(FEAT_COL_VIDEO[-1]) # If we'd like to use a custom serving input function, we need to use the estimator API. # There is no document on how a keras model can use a custom serving input function. KERAS_TO_ESTIMATOR = True def calc_class_weight(infile, scale=1): """Calculate class weight to re-balance label distribution. The class weight for class i (w_i) is determined by:
def _sparse_fc(sparse_feature: SparseFeature): return fc.numeric_column(key=sparse_feature.feature_name, dtype=tf.int64)
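# A tiny usage sketch (an assumption: `sparse_features` is an iterable of the
# project's own SparseFeature configs, each carrying a `feature_name`):
def sparse_feature_columns(sparse_features):
    return [_sparse_fc(f) for f in sparse_features]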
def create_feature_columns(self):
    _NUMERIC_COLUMNS = ['posted_time', 'owner_influence', 'poster_influence',
                        'participant1_influence', 'participant2_influence', 'participant3_influence',
                        'participant4_influence', 'participant5_influence']
    _BINARY_COLUMNS = ["is_mentions_me", "is_mentions_connections", "is_commented_by_me",
                       "is_commented_by_connections", "is_liked_by_me", "is_liked_by_connections",
                       "poster_is_employee", "poster_is_in_connections",
                       "participant1_is_employee", "participant1_is_in_connections",
                       "participant2_is_employee", "participant2_is_in_connections",
                       "participant3_is_employee", "participant3_is_in_connections",
                       "participant4_is_employee", "participant4_is_in_connections",
                       "participant5_is_employee", "participant5_is_in_connections"]
    _GENDER_COLUMNS = ["poster_gender", "participant1_gender", "participant2_gender",
                       "participant3_gender", "participant4_gender", "participant5_gender"]

    self.real = {
        colname: feature_column.numeric_column(colname)
        for colname in _NUMERIC_COLUMNS
    }
    self.sparse = dict()

    app_type = feature_column.categorical_column_with_vocabulary_list(
        'app_type', self.df.app_type.unique())
    app_type_1hot = feature_column.indicator_column(app_type)
    self.sparse["app_type"] = app_type_1hot

    owner_type = feature_column.categorical_column_with_vocabulary_list(
        'owner_type', self.df.owner_type.unique())
    owner_type_1hot = feature_column.indicator_column(owner_type)
    self.sparse["owner_type"] = owner_type_1hot

    poster_focus = feature_column.categorical_column_with_vocabulary_list(
        'poster_focus',
        ['engineering', 'sales', 'marketing', 'management', 'financial', 'other'])
    poster_focus_1hot = feature_column.indicator_column(poster_focus)
    self.sparse["poster_focus"] = poster_focus_1hot

    for col in _GENDER_COLUMNS:
        feature = feature_column.categorical_column_with_vocabulary_list(col, self.df[col].unique())
        feature_1hot = feature_column.indicator_column(feature)
        self.sparse[col] = feature_1hot

    participant1_action = participant_action("participant1_action")
    participant2_action = participant_action("participant2_action")
    participant3_action = participant_action("participant3_action")
    participant4_action = participant_action("participant4_action")
    participant5_action = participant_action("participant5_action")
    participant1_focus = participant_focus("participant1_focus")
    participant2_focus = participant_focus("participant2_focus")
    participant3_focus = participant_focus("participant3_focus")
    participant4_focus = participant_focus("participant4_focus")
    participant5_focus = participant_focus("participant5_focus")
    self.sparse["participant1_action"] = feature_column.indicator_column(participant1_action)
    self.sparse["participant2_action"] = feature_column.indicator_column(participant2_action)
    self.sparse["participant3_action"] = feature_column.indicator_column(participant3_action)
    self.sparse["participant4_action"] = feature_column.indicator_column(participant4_action)
    self.sparse["participant5_action"] = feature_column.indicator_column(participant5_action)
    self.sparse["participant1_focus"] = feature_column.indicator_column(participant1_focus)
    self.sparse["participant2_focus"] = feature_column.indicator_column(participant2_focus)
    self.sparse["participant3_focus"] = feature_column.indicator_column(participant3_focus)
    self.sparse["participant4_focus"] = feature_column.indicator_column(participant4_focus)
    self.sparse["participant5_focus"] = feature_column.indicator_column(participant5_focus)

    self.inputs = {
        colname: tf.keras.layers.Input(name=colname, shape=(), dtype='float32')
        for colname in self.real.keys()
    }
    self.inputs.update({
        colname: tf.keras.layers.Input(name=colname, shape=(), dtype='string')
        for colname in self.sparse.keys()
    })

    for col in _BINARY_COLUMNS:
        feature = feature_column.categorical_column_with_vocabulary_list(
            col, self.df[col].unique())
        feature_1hot = feature_column.indicator_column(feature)
        self.sparse[col] = feature_1hot

    likes_num = feature_column.numeric_column("number_of_likes")
    likes_num_buckets = feature_column.bucketized_column(likes_num, boundaries=[2, 5, 10, 20, 50, 100])
    self.sparse["number_of_likes"] = likes_num_buckets

    comments_num = feature_column.numeric_column("number_of_comments")
    comments_num_buckets = feature_column.bucketized_column(comments_num, boundaries=[1, 2, 5, 10, 20, 50, 100])
    self.sparse["number_of_comments"] = comments_num_buckets

    age_boundaries = [30 * _ONE_MIN, _ONE_HOUR, 2 * _ONE_HOUR, 3 * _ONE_HOUR, 4 * _ONE_HOUR, 24 * _ONE_HOUR]
    age = feature_column.numeric_column("effective_age_long")
    age_buckets = feature_column.bucketized_column(age, boundaries=age_boundaries)
    self.sparse["effective_age_long"] = age_buckets

    daytime = feature_column.numeric_column("daytime")
    daytime_buckets = feature_column.bucketized_column(daytime, boundaries=[8, 12, 16, 24])
    self.sparse["daytime"] = daytime_buckets

    weekday = feature_column.categorical_column_with_vocabulary_list(
        'weekday', self.df.weekday.unique())
    weekday_1hot = feature_column.indicator_column(weekday)
    self.sparse["weekday"] = weekday_1hot

    self.inputs.update({
        colname: tf.keras.layers.Input(name=colname, shape=(), dtype='int64')
        for colname in _BINARY_COLUMNS + ["number_of_likes", "number_of_comments",
                                          "effective_age_long", "daytime", "weekday"]
    })

    # hash_bucket_size=30: weekday has 7 possible values and daytime has 4 bins,
    # so all possible combinations come to 28 (~30).
    weekday_x_daytime = feature_column.crossed_column([weekday, daytime_buckets], hash_bucket_size=30)
    self.sparse["weekday_x_daytime"] = feature_column.indicator_column(weekday_x_daytime)

    # 6 bins in likes and 7 in comments
    likes_x_comments = feature_column.crossed_column([likes_num_buckets, comments_num_buckets], hash_bucket_size=45)
    self.sparse["likes_x_comments"] = feature_column.indicator_column(likes_x_comments)

    # 6 bins in likes, 3 in action and 7 in focus: 6*3*7 = 126 (~130)
    likes_x_participant1_focus_n_action = feature_column.crossed_column(
        [likes_num_buckets, participant1_action, participant1_focus], hash_bucket_size=130)
    self.sparse["likes_x_participant1_focus_n_action"] = feature_column.indicator_column(
        likes_x_participant1_focus_n_action)
    likes_x_participant2_focus_n_action = feature_column.crossed_column(
        [likes_num_buckets, participant2_action, participant2_focus], hash_bucket_size=130)
    self.sparse["likes_x_participant2_focus_n_action"] = feature_column.indicator_column(
        likes_x_participant2_focus_n_action)
    likes_x_participant3_focus_n_action = feature_column.crossed_column(
        [likes_num_buckets, participant3_action, participant3_focus], hash_bucket_size=130)
    self.sparse["likes_x_participant3_focus_n_action"] = feature_column.indicator_column(
        likes_x_participant3_focus_n_action)
    likes_x_participant4_focus_n_action = feature_column.crossed_column(
        [likes_num_buckets, participant4_action, participant4_focus], hash_bucket_size=130)
    self.sparse["likes_x_participant4_focus_n_action"] = feature_column.indicator_column(
        likes_x_participant4_focus_n_action)
    likes_x_participant5_focus_n_action = feature_column.crossed_column(
        [likes_num_buckets, participant5_action, participant5_focus], hash_bucket_size=130)
    self.sparse["likes_x_participant5_focus_n_action"] = feature_column.indicator_column(
        likes_x_participant5_focus_n_action)
def _dense_fc(dense_feature: DenseFeature):
    return fc.numeric_column(key=dense_feature.feature_name)
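# Usage sketch (illustrative): the SparseFeature and DenseFeature types are not
# defined in this excerpt; here they are assumed to be simple named tuples that
# carry a feature_name, which the helpers above map to feature columns.
import collections

SparseFeature = collections.namedtuple("SparseFeature", ["feature_name"])  # assumed shape
DenseFeature = collections.namedtuple("DenseFeature", ["feature_name"])    # assumed shape

columns = [
    _sparse_fc(SparseFeature("item_id")),  # hypothetical feature names
    _dense_fc(DenseFeature("price")),
]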
def create_feature_columns(df, which_lanes):
    transformed_df = df.copy()
    all_lanes = which_lanes.copy()
    champion_list = champ_info.id.tolist()
    feature_columns = []

    good_numeric = ["{}_good_numeric".format(x) for x in all_lanes]
    for header in good_numeric:
        feature_columns.append(feature_column.numeric_column(header, shape=(22,)))

    bad_numeric = ["{}_bad_numeric".format(x) for x in all_lanes]
    for header in bad_numeric:
        feature_columns.append(feature_column.numeric_column(header, shape=(3,)))

    good_categ = ["{}_good_categ".format(x) for x in all_lanes]
    for header in good_categ:
        feature_columns.append(feature_column.numeric_column(header, shape=(4,)))

    bad_categ = ["{}_bad_categ".format(x) for x in all_lanes]
    for header in bad_categ:
        feature_columns.append(feature_column.numeric_column(header, shape=(2,)))

    # Cross the two teams' champion picks for each role
    # (TOP, JUNGLE, MID, ADC, SUPPORT), e.g. TOP100_champ x TOP200_champ.
    for role in ["TOP", "JUNGLE", "MID", "ADC", "SUPPORT"]:
        team100 = feature_column.categorical_column_with_vocabulary_list(
            "{}100_champ".format(role), champion_list)
        team200 = feature_column.categorical_column_with_vocabulary_list(
            "{}200_champ".format(role), champion_list)
        crossed = feature_column.crossed_column([team100, team200], hash_bucket_size=1000)
        feature_columns.append(feature_column.indicator_column(crossed))

    return feature_columns
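# Quick sketch (illustrative, not from the original source): a numeric_column with
# shape=(22,) expects each example to supply a length-22 vector under that key,
# which DenseFeatures passes through as 22 dense inputs.
layer = tf.keras.layers.DenseFeatures(
    [feature_column.numeric_column("TOP_good_numeric", shape=(22,))])
batch = {"TOP_good_numeric": tf.random.uniform((4, 22))}  # 4 examples, 22 values each
print(layer(batch).shape)  # (4, 22)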
# Configure the train_inpf to iterate over the data twice:
import functools

train_inpf = functools.partial(census_dataset.input_fn, train_file,
                               num_epochs=2, shuffle=True, batch_size=64)
test_inpf = functools.partial(census_dataset.input_fn, test_file,
                              num_epochs=1, shuffle=False, batch_size=64)

## Selecting and Engineering Features for the Model
### Base Feature Columns
#### Numericals
age = fc.numeric_column('age')

# Train and evaluate a model using only the age feature:
classifier = tf.estimator.LinearClassifier(feature_columns=[age])
classifier.train(train_inpf)
result = classifier.evaluate(test_inpf)
clear_output()  # used for display in a notebook
print(result)

# We define a NumericColumn for each continuous feature column that we want to use in the model:
education_num = tf.feature_column.numeric_column('education_num')
capital_gain = tf.feature_column.numeric_column('capital_gain')
capital_loss = tf.feature_column.numeric_column('capital_loss')
hours_per_week = tf.feature_column.numeric_column('hours_per_week')
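# A hedged follow-up sketch (assumes the same census_dataset input functions as above):
# retrain the linear model on all of the continuous columns just defined and compare
# against the age-only baseline.
my_numeric_columns = [age, education_num, capital_gain, capital_loss, hours_per_week]
classifier = tf.estimator.LinearClassifier(feature_columns=my_numeric_columns)
classifier.train(train_inpf)
print(classifier.evaluate(test_inpf))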
    ds = ds.batch(batch_size)
    return ds

example_batch = next(iter(df_to_dataset(dataframe)))[0]

# A utility method to create a feature column
# and to transform a batch of data
def demo(feature_column):
    feature_layer = layers.DenseFeatures(feature_column)
    print(feature_layer(example_batch).numpy())

print('Preprocessing age ==========================>')
age = feature_column.numeric_column("age")
demo(age)

age_buckets = feature_column.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
print('age buckets ====================>')
demo(age_buckets)

# Note: age_buckets describes a transformation, not transformed data, so it cannot be
# wrapped in a pandas Series/DataFrame or joined back onto the original dataframe;
# the bucketing is only applied by DenseFeatures when a batch flows through it.
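# If the goal was to materialize bucket ids as a pandas column, a sketch using plain
# pandas (the boundaries mirror the bucketized_column above; the age_bucket column
# name is illustrative):
import pandas as pd

bucket_boundaries = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
dataframe['age_bucket'] = pd.cut(
    dataframe['age'],
    bins=[-float('inf')] + bucket_boundaries + [float('inf')],
    labels=False, right=False)  # integer bucket ids 0..10, same edges as the feature column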
def create_feature_columns():
    # user features
    bids = fc.categorical_column_with_hash_bucket("behaviorBids", 10240, dtype=tf.int64)
    c1ids = fc.categorical_column_with_hash_bucket("behaviorC1ids", 100, dtype=tf.int64)
    cids = fc.categorical_column_with_hash_bucket("behaviorCids", 10240, dtype=tf.int64)
    sids = fc.categorical_column_with_hash_bucket("behaviorSids", 10240, dtype=tf.int64)
    pids = fc.categorical_column_with_hash_bucket("behaviorPids", 1000000, dtype=tf.int64)
    bids_weighted = fc.weighted_categorical_column(bids, "bidWeights")
    c1ids_weighted = fc.weighted_categorical_column(c1ids, "c1idWeights")
    cids_weighted = fc.weighted_categorical_column(cids, "cidWeights")
    sids_weighted = fc.weighted_categorical_column(sids, "sidWeights")
    pids_weighted = fc.weighted_categorical_column(pids, "pidWeights")

    # item features
    pid = fc.categorical_column_with_hash_bucket("productId", 1000000, dtype=tf.int64)
    sid = fc.categorical_column_with_hash_bucket("sellerId", 10240, dtype=tf.int64)
    bid = fc.categorical_column_with_hash_bucket("brandId", 10240, dtype=tf.int64)
    c1id = fc.categorical_column_with_hash_bucket("cate1Id", 100, dtype=tf.int64)
    cid = fc.categorical_column_with_hash_bucket("cateId", 10240, dtype=tf.int64)

    # context features
    matchScore = fc.numeric_column("matchScore", default_value=0.0)
    popScore = fc.numeric_column("popScore", default_value=0.0)
    brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0)
    cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0)
    catePrefer = fc.numeric_column("catePrefer", default_value=0.0)
    sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0)
    matchType = fc.indicator_column(
        fc.categorical_column_with_identity("matchType", 9, default_value=0))
    position = fc.indicator_column(
        fc.categorical_column_with_identity("position", 201, default_value=200))
    triggerNum = fc.indicator_column(
        fc.categorical_column_with_identity("triggerNum", 51, default_value=50))
    triggerRank = fc.indicator_column(
        fc.categorical_column_with_identity("triggerRank", 51, default_value=50))
    sceneType = fc.indicator_column(
        fc.categorical_column_with_identity("type", 2, default_value=0))
    hour = fc.indicator_column(
        fc.categorical_column_with_identity("hour", 24, default_value=0))
    phoneBrand = fc.indicator_column(
        fc.categorical_column_with_hash_bucket("phoneBrand", 1000))
    phoneResolution = fc.indicator_column(
        fc.categorical_column_with_hash_bucket("phoneResolution", 500))
    phoneOs = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))
    tab = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("tab", [
            "ALL", "TongZhuang", "XieBao", "MuYing", "NvZhuang", "MeiZhuang",
            "JuJia", "MeiShi"
        ], default_value=0))

    # Shared embeddings: each weighted behavior-sequence column shares one embedding
    # table with the corresponding item-side id column.
    pid_embed = fc.shared_embedding_columns(
        [pids_weighted, pid], 64, combiner='sum', shared_embedding_collection_name="pid")
    bid_embed = fc.shared_embedding_columns(
        [bids_weighted, bid], 32, combiner='sum', shared_embedding_collection_name="bid")
    cid_embed = fc.shared_embedding_columns(
        [cids_weighted, cid], 32, combiner='sum', shared_embedding_collection_name="cid")
    c1id_embed = fc.shared_embedding_columns(
        [c1ids_weighted, c1id], 10, combiner='sum', shared_embedding_collection_name="c1id")
    sid_embed = fc.shared_embedding_columns(
        [sids_weighted, sid], 32, combiner='sum', shared_embedding_collection_name="sid")

    global my_feature_columns
    my_feature_columns = [
        matchScore, matchType, position, triggerNum, triggerRank, sceneType, hour,
        phoneBrand, phoneResolution, phoneOs, tab, popScore,
        sellerPrefer, brandPrefer, cate2Prefer, catePrefer
    ]
    my_feature_columns += pid_embed
    my_feature_columns += sid_embed
    my_feature_columns += bid_embed
    my_feature_columns += cid_embed
    my_feature_columns += c1id_embed
    print("feature columns:", my_feature_columns)
    return my_feature_columns
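# A minimal consumption sketch (not from the original source): it assumes an input_fn
# that yields the feature names used above; the model type and hidden-unit sizes are
# illustrative, not the original author's choices.
estimator = tf.estimator.DNNClassifier(
    feature_columns=create_feature_columns(),
    hidden_units=[256, 128, 64],
    n_classes=2)
# estimator.train(input_fn=train_input_fn)  # train_input_fn is assumed/hypothetical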
print('Every feature:', list(feature_batch.keys()))
print('A batch of ages:', feature_batch['Age'])
print('A batch of targets:', label_batch)

# We will use this batch to demonstrate several types of feature columns
example_batch = next(iter(train_ds))[0]

# A utility method to create a feature column
# and to transform a batch of data
def demo(feature_column):
    feature_layer = layers.DenseFeatures(feature_column)
    print(feature_layer(example_batch).numpy())

# Columns: Age, Gender, Total_Bilirubin, Direct_Bilirubin, Alkaline_Phosphotase,
# Alamine_Aminotransferase, Aspartate_Aminotransferase, Total_Protiens, Albumin,
# Albumin_and_Globulin_Ratio, Dataset
age = feature_column.numeric_column('Age')
print("Demo AGE")
demo(age)

ronaldo = feature_column.numeric_column('Age')  # same column, demonstrated a second time
print('Demo Ronaldo')
demo(ronaldo)

age_buckets = feature_column.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 75, 85, 90])
print('Demo AGE buckets')
demo(age_buckets)

gender = feature_column.categorical_column_with_vocabulary_list(
    'Gender', ['Male', 'Female'])
gender_one_hot = feature_column.indicator_column(gender)
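# Sketch of the natural next step (illustrative, not from the original source): run the
# one-hot Gender column through the same demo helper, and optionally cross it with the
# age buckets (14 buckets x 2 genders = 28 combinations, so hash_bucket_size=30).
print('Demo Gender one-hot')
demo(gender_one_hot)

age_x_gender = feature_column.crossed_column([age_buckets, gender], hash_bucket_size=30)
demo(feature_column.indicator_column(age_x_gender))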
def official_census_feature_columns_config_demo():
    # Categorical columns with known vocabularies.
    gender = fc.categorical_column_with_vocabulary_list(
        'gender', ['Female', 'Male'])
    education = fc.categorical_column_with_vocabulary_list(
        'education', [
            'Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
            'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
            '5th-6th', '10th', '1st-4th', 'Preschool', '12th'
        ])
    marital_status = fc.categorical_column_with_vocabulary_list(
        'marital_status', [
            'Married-civ-spouse', 'Divorced', 'Married-spouse-absent',
            'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed'
        ])
    relationship = fc.categorical_column_with_vocabulary_list(
        'relationship', [
            'Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried',
            'Other-relative'
        ])
    workclass = fc.categorical_column_with_vocabulary_list(
        'workclass', [
            'Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov',
            'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'
        ])

    # To show an example of hashing:
    native_country = fc.categorical_column_with_hash_bucket(
        'native_country', hash_bucket_size=1000)
    occupation = fc.categorical_column_with_hash_bucket(
        'occupation', hash_bucket_size=1000)

    # Continuous feature columns.
    age = fc.numeric_column('age')
    education_num = fc.numeric_column('education_num')
    capital_gain = fc.numeric_column('capital_gain')
    capital_loss = fc.numeric_column('capital_loss')
    hours_per_week = fc.numeric_column('hours_per_week')

    # Bucketized transformations.
    age_buckets = fc.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

    # Wide columns and deep columns.
    base_columns = [
        gender, education, marital_status, relationship, workclass,
        occupation, native_country, age_buckets
    ]
    crossed_columns = [
        fc.crossed_column(['education', 'occupation'], hash_bucket_size=1000),
        fc.crossed_column([age_buckets, 'education', 'occupation'], hash_bucket_size=1000),
        fc.crossed_column(['native_country', 'occupation'], hash_bucket_size=1000)
    ]
    feature_columns = [
        fc.indicator_column(workclass),
        fc.indicator_column(education),
        fc.indicator_column(gender),
        fc.indicator_column(relationship),
        fc.embedding_column(native_country, dimension=32),
        fc.embedding_column(occupation, dimension=32),
        age,
        education_num,
        capital_gain,
        capital_loss,
        hours_per_week,
    ]
    return feature_columns, base_columns, crossed_columns
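# A hedged usage sketch (not from the original source): the three column groups map
# naturally onto a wide & deep estimator, with the wide side taking the base and
# crossed columns and the deep side taking the dense/embedded columns; hidden-unit
# sizes are illustrative.
deep_columns, base_columns, crossed_columns = official_census_feature_columns_config_demo()
model = tf.estimator.DNNLinearCombinedClassifier(
    linear_feature_columns=base_columns + crossed_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[100, 50])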