def make_columns():
    """Build the feature columns for the model.

    Returns:
        A 4-tuple of lists of feature columns:
        (embedding_columns, order_columns, spacetime_columns, user_columns).

    NOTE(review): relies on module-level name lists (user_null_columns,
    user_float32_columns, int64_columns, float32_columns) and on vocabulary
    files ('user_id', 'partner_car_type_id', ...) resolvable from the
    working directory -- confirm against the caller's setup.
    """
    user_id = fc.embedding_column(
        fc.categorical_column_with_vocabulary_file(
            'user_id', vocabulary_file='user_id', dtype=tf.string,
            num_oov_buckets=4),
        dimension=6)
    partner_ids = fc.categorical_column_with_vocabulary_file(
        'reserve_partner_car_type_id',
        vocabulary_file='partner_car_type_id',
        dtype=tf.string, num_oov_buckets=1)
    partner_ids_embedding = fc.embedding_column(partner_ids, dimension=3)
    dayofweek = fc.embedding_column(
        fc.categorical_column_with_vocabulary_list(
            'dayofweek', [str(d) for d in range(0, 8)], dtype=tf.string,
            num_oov_buckets=1),
        dimension=3)
    timeSlice = fc.embedding_column(
        fc.categorical_column_with_vocabulary_file(
            'timeSlice', vocabulary_file='timeSlice', dtype=tf.string,
            num_oov_buckets=1),
        dimension=3)
    sHexID = fc.embedding_column(
        fc.categorical_column_with_vocabulary_file(
            'sHexID', vocabulary_file='sHexID', num_oov_buckets=1),
        dimension=6)
    # Fix: removed the eHexID embedding column -- it was constructed but
    # never added to any returned list (dead code).

    order_columns = [fc.numeric_column('dist')]
    user_columns = [fc.numeric_column(c)
                    for c in user_null_columns + user_float32_columns]
    spacetime_columns = [fc.numeric_column(c)
                         for c in int64_columns + float32_columns]
    embedding_columns = [user_id, partner_ids_embedding, dayofweek,
                         timeSlice, sHexID]
    return embedding_columns, order_columns, spacetime_columns, user_columns
def _base():
    """Assemble the base feature columns for the census-income model.

    Returns a list mixing raw numeric columns with one-hot (indicator)
    encodings of the categorical fields, in a fixed order:
    [education_num, capital_gain, capital_loss, hours_per_week,
     relationship, education, race, occupation].
    """
    numeric = [fc.numeric_column(name) for name in
               ('education_num', 'capital_gain', 'capital_loss',
                'hours_per_week')]

    def one_hot_from_file(name, path):
        # Vocabulary-file-backed categorical column, one-hot encoded.
        return fc.indicator_column(
            fc.categorical_column_with_vocabulary_file(
                name, vocabulary_file=path))

    categorical = [
        one_hot_from_file('relationship', 'data/relationship'),
        one_hot_from_file('education', 'data/education'),
        one_hot_from_file('race', 'data/race'),
        # 'occupation' is hashed into 20 buckets instead of using a file.
        fc.indicator_column(
            fc.categorical_column_with_hash_bucket('occupation', 20)),
    ]
    return numeric + categorical
def categorical_column(key, vocabulary_size=None, vocabulary_list=None,
                       vocabulary_file=None, num_oov_buckets=0):
    """Create a categorical feature column from one of three vocabulary sources.

    Exactly one of vocabulary_size / vocabulary_list / vocabulary_file should
    be supplied; they are checked in that priority order.

    Args:
        key: feature name.
        vocabulary_size: if set, build an identity column over [0, size).
        vocabulary_list: if set, build a column from an in-memory string list.
        vocabulary_file: if set, build a column from a vocabulary file.
        num_oov_buckets: number of out-of-vocabulary buckets (list/file only).

    Returns:
        A TF categorical feature column.

    Raises:
        ValueError: if no vocabulary source is provided.
    """
    if vocabulary_size:
        return feature_column.categorical_column_with_identity(
            key, vocabulary_size)
    if vocabulary_list:
        assert isinstance(vocabulary_list[0], six.string_types), \
            "Vocabulary must be sequence of string"
        # BUG FIX: num_oov_buckets must be passed by keyword -- the third
        # positional parameter of categorical_column_with_vocabulary_list
        # is `dtype`, so the original silently passed the bucket count as
        # the dtype.
        return feature_column.categorical_column_with_vocabulary_list(
            key, vocabulary_list, num_oov_buckets=num_oov_buckets)
    if vocabulary_file:
        # BUG FIX: same issue -- the third positional parameter of
        # categorical_column_with_vocabulary_file is `vocabulary_size`,
        # not num_oov_buckets.
        return feature_column.categorical_column_with_vocabulary_file(
            key, vocabulary_file, num_oov_buckets=num_oov_buckets)
    # Previously fell through and returned None implicitly, which surfaced
    # as a confusing failure far from the call site.
    raise ValueError(
        "categorical_column(%r): one of vocabulary_size, vocabulary_list "
        "or vocabulary_file must be provided" % (key,))
def make_columns_with_normalizer_with_file():
    """Build feature columns, min-max normalizing float features using
    per-column statistics loaded from summary.json.

    Returns:
        A 4-tuple of lists of feature columns:
        (embedding_columns, order_columns, spacetime_columns, user_columns).

    NOTE(review): relies on module-level names (normalizer_fn,
    user_null_columns, user_float32_columns, int64_columns, float32_columns,
    json) and on vocabulary files resolvable from the working directory.
    """
    import pandas as pd
    with open('summary.json') as fp:
        # summary.json maps column -> stats; transpose so columns index the
        # DataFrame and summary[col]['min'] / summary[col]['max'] work.
        summary = pd.DataFrame(json.load(fp)).T

    def _minmax_column(name):
        # BUG FIX: the original built the normalizer lambdas inside a list
        # comprehension, closing over the loop variable `c` by reference
        # (late binding) -- every float32 column ended up normalized with
        # the LAST column's min/max. Freezing lo/hi here binds the correct
        # per-column bounds at construction time.
        lo = float(summary[name]['min'])
        hi = float(summary[name]['max'])
        return fc.numeric_column(
            name, normalizer_fn=lambda v: normalizer_fn(v, lo, hi))

    user_id = fc.embedding_column(
        fc.categorical_column_with_vocabulary_file(
            'user_id', vocabulary_file='user_id', dtype=tf.string,
            num_oov_buckets=4),
        dimension=6)
    partner_ids = fc.categorical_column_with_vocabulary_file(
        'reserve_partner_car_type_id',
        vocabulary_file='partner_car_type_id',
        dtype=tf.string, num_oov_buckets=1)
    partner_ids_embedding = fc.embedding_column(partner_ids, dimension=3)
    dayofweek = fc.embedding_column(
        fc.categorical_column_with_vocabulary_list(
            'dayofweek', [str(d) for d in range(0, 8)], dtype=tf.string,
            num_oov_buckets=1),
        dimension=3)
    timeSlice = fc.embedding_column(
        fc.categorical_column_with_vocabulary_file(
            'timeSlice', vocabulary_file='timeSlice', dtype=tf.string,
            num_oov_buckets=1),
        dimension=3)
    sHexID = fc.embedding_column(
        fc.categorical_column_with_vocabulary_file(
            'sHexID', vocabulary_file='sHexID', num_oov_buckets=1),
        dimension=6)
    # Fix: removed the eHexID embedding column -- constructed but never
    # added to any returned list (dead code).

    order_columns = [_minmax_column('dist')]
    user_columns = [fc.numeric_column(c)
                    for c in user_null_columns + user_float32_columns]
    # TODO: summary info for the int64 columns as well.
    spacetime_columns = [fc.numeric_column(c) for c in int64_columns]
    spacetime_columns += [_minmax_column(c) for c in float32_columns]
    embedding_columns = [user_id, partner_ids_embedding, dayofweek,
                         timeSlice, sHexID]
    return embedding_columns, order_columns, spacetime_columns, user_columns
def create_tf_categorical_feature_cols(categorical_col_list,
                                       vocab_dir='./diabetes_vocab/',
                                       num_oov_buckets=0):
    '''
    Build one-hot (indicator) TF feature columns from per-field vocabulary
    files named "<field>_vocab.txt" under vocab_dir.

    categorical_col_list: list, categorical field list that will be
        transformed with TF feature column
    vocab_dir: string, the path where the vocabulary text files are located
    num_oov_buckets: int, optional number of out-of-vocabulary buckets per
        column (default 0, preserving the previous behavior)
    return:
        output_tf_list: list of TF feature columns
    '''
    output_tf_list = []
    for c in categorical_col_list:
        vocab_file_path = os.path.join(vocab_dir, c + "_vocab.txt")
        cat = feature_column.categorical_column_with_vocabulary_file(
            c, vocab_file_path, num_oov_buckets=num_oov_buckets)
        # Indicator (one-hot / multi-hot) wrapping of the categorical column.
        output_tf_list.append(feature_column.indicator_column(cat))
    return output_tf_list
# Script fragment: appends hotel-booking feature columns to a
# `feature_columns` list defined earlier (resStatus_one_hot is also built
# before this chunk).
feature_columns.append(resStatus_one_hot)

# One-hot encode arrival month against a fixed month-name vocabulary.
arrivalMonth = feature_column.categorical_column_with_vocabulary_list(
    'ArrivalDateMonth', [
        'January', 'February', 'March', 'April', 'May', 'June', 'July',
        'August', 'September', 'October', 'November', 'December'
    ])
arrivalMonth_one_hot = feature_column.indicator_column(arrivalMonth)
feature_columns.append(arrivalMonth_one_hot)

# Country vocabulary file path is taken from the command line (sys.argv[3]);
# no OOV buckets, so unseen countries map to default_value=None's behavior.
countryNames = feature_column.categorical_column_with_vocabulary_file(
    'Country', sys.argv[3], vocabulary_size=None, dtype=tf.dtypes.string,
    default_value=None, num_oov_buckets=0)
countryNames_one_hot = feature_column.indicator_column(countryNames)
feature_columns.append(countryNames_one_hot)

# Numeric feature headers. NOTE(review): this chunk is truncated -- the
# list and the loop body continue past the end of the visible source.
for header in [
    'LeadTime', 'ArrivalDateWeekNumber', 'StaysInWeekendNights',
    'StaysInWeekNights', 'Adults', 'Children', 'Babies',
    'PreviousCancellations', 'PreviousBookingsNotCanceled',
    'BookingChanges', 'DaysInWaitingList', 'RequiredCarParkingSpaces',
    'TotalOfSpecialRequests', 'ADR', 'ArrivalDateDayOfMonth',
    'ArrivalDateYear', 'Agent', 'Company'
#-*- coding:utf-8 -*- #定义feature_columns import tensorflow as tf from tensorflow import feature_column as fc #-----------------------用户特征列----------------------------- province = fc.indicator_column( fc.categorical_column_with_vocabulary_file('province', 'resource/province')) region = fc.indicator_column( fc.categorical_column_with_vocabulary_list( 'region', ['东北', '华中', '华东', '华北', '西北', '华南', '西南'])) city = fc.indicator_column( fc.categorical_column_with_vocabulary_file('city', 'resource/city')) city_level = fc.indicator_column( fc.categorical_column_with_vocabulary_list( 'city_level', ['一线城市', '新一线城市', '二线城市', '三线城市', '四线城市', '五线城市'])) browser = fc.indicator_column( fc.categorical_column_with_vocabulary_list('browser', [0, 1])) os = fc.indicator_column( fc.categorical_column_with_vocabulary_list( 'os', ['Android', 'android', 'devtools', 'unknown', 'iPhone', 'ios'])) ipv_7d_type = fc.indicator_column( fc.categorical_column_with_vocabulary_list('ipv_7d_type', [1, 2, 3, 4])) ipv_15d_type = fc.indicator_column( fc.categorical_column_with_vocabulary_list('ipv_15d_type', [1, 2, 3, 4])) ipv_30d_type = fc.indicator_column( fc.categorical_column_with_vocabulary_list('ipv_30d_type', [1, 2, 3, 4])) ipv_60d_type = fc.indicator_column( fc.categorical_column_with_vocabulary_list('ipv_60d_type', [1, 2, 3, 4]))
# Script fragment (near-duplicate of the sys.argv variant elsewhere in this
# file): appends hotel-booking feature columns to a `feature_columns` list
# defined earlier.
feature_columns.append(resStatus_one_hot)

# One-hot encode arrival month against a fixed month-name vocabulary.
arrivalMonth = feature_column.categorical_column_with_vocabulary_list(
    'ArrivalDateMonth', [
        'January', 'February', 'March', 'April', 'May', 'June', 'July',
        'August', 'September', 'October', 'November', 'December'
    ])
arrivalMonth_one_hot = feature_column.indicator_column(arrivalMonth)
feature_columns.append(arrivalMonth_one_hot)

# NOTE(review): hard-coded absolute Windows path to the vocabulary file --
# this only runs on the author's machine; should be a CLI arg or config.
countryNames = feature_column.categorical_column_with_vocabulary_file(
    'Country', 'E:/stech/Documents/Uni/4thYear/Honours/CountryNames.csv',
    vocabulary_size=None, dtype=tf.dtypes.string, default_value=None,
    num_oov_buckets=0)
countryNames_one_hot = feature_column.indicator_column(countryNames)
feature_columns.append(countryNames_one_hot)

# Numeric feature headers. NOTE(review): this chunk is truncated -- the
# list and the loop body continue past the end of the visible source.
for header in [
    'LeadTime', 'ArrivalDateWeekNumber', 'StaysInWeekendNights',
    'StaysInWeekNights', 'Adults', 'Children', 'Babies',
    'PreviousCancellations', 'PreviousBookingsNotCanceled',
    'BookingChanges', 'DaysInWaitingList', 'RequiredCarParkingSpaces',
    'TotalOfSpecialRequests', 'ADR', 'ArrivalDateDayOfMonth',
    'ArrivalDateYear', 'Agent', 'Company'