def transform(inputs, NUMERIC_COLS, STRING_COLS, nbuckets):
    # Pass-through columns
    transformed = inputs.copy()
    del transformed['pickup_datetime']

    feature_columns = {
        colname: fc.numeric_column(colname)
        for colname in NUMERIC_COLS
    }

    # Scale longitude from range [-78, -70] to [0, 1]
    for lon_col in ['pickup_longitude', 'dropoff_longitude']:
        transformed[lon_col] = layers.Lambda(
            lambda x: (x + 78) / 8.0,
            name='scale_{}'.format(lon_col))(inputs[lon_col])

    # Scale latitude from range [37, 45] to [0, 1]
    for lat_col in ['pickup_latitude', 'dropoff_latitude']:
        transformed[lat_col] = layers.Lambda(
            lambda x: (x - 37) / 8.0,
            name='scale_{}'.format(lat_col))(inputs[lat_col])

    # Add Euclidean distance (no need to be accurate: the NN will calibrate it)
    transformed['euclidean'] = layers.Lambda(euclidean, name='euclidean')([
        inputs['pickup_longitude'], inputs['pickup_latitude'],
        inputs['dropoff_longitude'], inputs['dropoff_latitude']
    ])
    feature_columns['euclidean'] = fc.numeric_column('euclidean')

    # Hour of day from a timestamp of the form '2010-02-08 09:17:00+00:00'
    transformed['hourofday'] = layers.Lambda(
        lambda x: tf.strings.to_number(
            tf.strings.substr(x, 11, 2), out_type=tf.dtypes.int32),
        name='hourofday')(inputs['pickup_datetime'])
    feature_columns['hourofday'] = fc.indicator_column(
        fc.categorical_column_with_identity('hourofday', num_buckets=24))

    # Bucketize the scaled lat/lon, then cross pickup and dropoff cells
    latbuckets = np.linspace(0, 1, nbuckets).tolist()
    lonbuckets = np.linspace(0, 1, nbuckets).tolist()
    b_plat = fc.bucketized_column(feature_columns['pickup_latitude'], latbuckets)
    b_dlat = fc.bucketized_column(feature_columns['dropoff_latitude'], latbuckets)
    b_plon = fc.bucketized_column(feature_columns['pickup_longitude'], lonbuckets)
    b_dlon = fc.bucketized_column(feature_columns['dropoff_longitude'], lonbuckets)
    ploc = fc.crossed_column([b_plat, b_plon], nbuckets * nbuckets)
    dloc = fc.crossed_column([b_dlat, b_dlon], nbuckets * nbuckets)
    pd_pair = fc.crossed_column([ploc, dloc], nbuckets**4)
    feature_columns['pickup_and_dropoff'] = fc.embedding_column(pd_pair, 100)

    return transformed, feature_columns
def build_features(statistics):
    pu_location_id = fc.categorical_column_with_identity(key='PULocationID',
                                                         num_buckets=265)
    do_location_id = fc.categorical_column_with_identity(key='DOLocationID',
                                                         num_buckets=265)
    day_of_week = fc.categorical_column_with_identity(key='day_of_week',
                                                      num_buckets=7)
    weekend = fc.categorical_column_with_identity(key='weekend', num_buckets=2)

    speed_buckets = fc.bucketized_column(
        fc.numeric_column('speed'), boundaries=[10, 20, 30, 40, 50, 60, 70])
    distance_buckets = fc.bucketized_column(
        fc.numeric_column('trip_distance'),
        boundaries=[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
    duration_buckets = fc.bucketized_column(
        fc.numeric_column('duration'),
        boundaries=[500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500])
    fare_buckets = fc.bucketized_column(
        fc.numeric_column('fare_amount'),
        boundaries=[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
    passenger_buckets = fc.bucketized_column(
        fc.numeric_column('passenger_count'), boundaries=[1, 3, 5, 7, 9])

    location = fc.crossed_column([pu_location_id, do_location_id],
                                 hash_bucket_size=1000)
    cross_all = fc.crossed_column([
        location, speed_buckets, distance_buckets, duration_buckets,
        fare_buckets, passenger_buckets
    ], hash_bucket_size=1000)

    categorical_columns = [
        fc.embedding_column(pu_location_id, dimension=32),
        fc.embedding_column(do_location_id, dimension=32),
        fc.indicator_column(day_of_week),
        fc.indicator_column(weekend)
    ]
    numeric_columns = [
        custom_numeric_column('passenger_count', statistics),
        custom_numeric_column('trip_distance', statistics),
        custom_numeric_column('fare_amount', statistics),
        custom_numeric_column('extra', statistics),
        custom_numeric_column('mta_tax', statistics),
        custom_numeric_column('tolls_amount', statistics),
        custom_numeric_column('improvement_surcharge', statistics),
        custom_numeric_column('duration', statistics),
        custom_numeric_column('speed', statistics)
    ]

    dnn_feature_columns = numeric_columns + categorical_columns
    linear_feature_columns = [location, cross_all]
    return dnn_feature_columns, linear_feature_columns
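The two column sets returned above map naturally onto a wide-and-deep estimator. A minimal sketch follows; the `statistics` object and `train_input_fn` are assumptions, not from the source:

# Hedged sketch: feed the linear (wide) and DNN (deep) columns to a canned
# wide-and-deep estimator; train_input_fn is an assumed tf.data input function.
dnn_cols, linear_cols = build_features(statistics)
model = tf.estimator.DNNLinearCombinedRegressor(
    linear_feature_columns=linear_cols,
    dnn_feature_columns=dnn_cols,
    dnn_hidden_units=[64, 32])
model.train(input_fn=train_input_fn)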
def test_crossed_column():
    """Test of crossed_column."""
    # Source data
    features = {
        'price': [['A'], ['B'], ['C']],  # 0, 1, 2
        'color': [['R'], ['G'], ['B']]   # 0, 1, 2
    }
    # categorical_column
    price = feature_column.categorical_column_with_vocabulary_list(
        'price', ['A', 'B', 'C', 'D'])
    color = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'])
    # crossed_column produces a sparse representation
    p_x_c = feature_column.crossed_column([price, color], 16)
    # Dense representation
    p_x_c_identity = feature_column.indicator_column(p_x_c)
    # Connect the crossed_column to the source data
    p_x_c_identity_dense_tensor = feature_column.input_layer(
        features, [p_x_c_identity])
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([p_x_c_identity_dense_tensor]))
def _make_crossed(self):
    """Makes crossed features for both the Wide and the Deep network.

    Returns:
        Tuple (crossed columns for Wide, embedded crossed columns for Deep)
    """
    # Crossed columns
    f_crossed_for_wide = []
    f_crossed_for_deep = []
    for to_cross in self.CROSSED:
        keys = []
        bck_size = 1
        for (key, bck, bnd) in to_cross:
            keys.append(self._prepare_for_crossing(key, bck, bnd))
            bck_size *= bck

        # We can't go crazy on the dim for crossed_column so use a min.
        # **0.25 is a rule of thumb for bucket size vs dimension.
        t_crossed = tfc.crossed_column(keys, min(bck_size, 10000))
        t_dimension = int(bck_size**0.25)
        f_crossed_for_wide.append(t_crossed)
        f_crossed_for_deep.append(tfc.embedding_column(t_crossed, t_dimension))
    return f_crossed_for_wide, f_crossed_for_deep
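A quick sanity check of the `bck_size ** 0.25` rule of thumb used above; the sample bucket sizes are illustrative, not from the source:

# Embedding dimension grows as the fourth root of the crossed vocabulary size.
for bck_size in (100, 10_000, 1_000_000):
    print(bck_size, int(bck_size ** 0.25))  # prints 3, 10, 31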
def _build_census_wide_columns(numeric_range=None):
    base_columns, cross_columns = [], []
    for col in ALI_DISPLAY_ADS_CONFIG['wide_muti_hot_cols']:
        base_columns.append(
            fc.indicator_column(
                fc.categorical_column_with_hash_bucket(
                    col,
                    hash_bucket_size=1000
                    if ALI_DISPLAY_ADS_CONFIG['vocab_size'][col] <= 1000
                    else ALI_DISPLAY_ADS_CONFIG['vocab_size'][col] + 10000)))
    for col in ALI_DISPLAY_ADS_CONFIG['wide_bucket_cols']:
        base_columns.append(
            fc.bucketized_column(
                fc.numeric_column(col),
                boundaries=list(
                    np.linspace(numeric_range[col][0], numeric_range[col][1],
                                1000))))
    for col in ALI_DISPLAY_ADS_CONFIG['wide_cross_cols']:
        cross_columns.append(
            fc.indicator_column(
                fc.crossed_column([col[0], col[1]], hash_bucket_size=10000)))
    feature_columns = base_columns + cross_columns
    feat_field_size = len(feature_columns)
    return feature_columns, feat_field_size
def data_preprocessing(self):
    """
    batch_size = 5  # Use a small batch size for this example.
    train_ds = self.df_to_dataset(self.train, batch_size=batch_size)
    val_ds = self.df_to_dataset(self.val, shuffle=False, batch_size=batch_size)
    test_ds = self.df_to_dataset(self.test, shuffle=False, batch_size=batch_size)

    for feature_batch, label_batch in train_ds.take(1):
        print('All features:', list(feature_batch.keys()))
        print('Batch of the age feature:', feature_batch['age'])
        print('Batch of targets:', label_batch)

    # Build a sample batch to try out the feature columns.
    self.example_batch = next(iter(train_ds))[0]
    age = feature_column.numeric_column("age")
    self.demo(age)
    """
    feature_columns = []

    # Numeric columns
    for header in [
            'age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca'
    ]:
        feature_columns.append(feature_column.numeric_column(header))

    # Bucketized column
    age = feature_column.numeric_column("age")
    age_buckets = feature_column.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    feature_columns.append(age_buckets)

    # Categorical column
    thal = feature_column.categorical_column_with_vocabulary_list(
        'thal', ['fixed', 'normal', 'reversible'])
    thal_one_hot = feature_column.indicator_column(thal)
    feature_columns.append(thal_one_hot)

    # Embedding column
    thal_embedding = feature_column.embedding_column(thal, dimension=8)
    feature_columns.append(thal_embedding)

    # Crossed feature column
    crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                    hash_bucket_size=1000)
    crossed_feature = feature_column.indicator_column(crossed_feature)
    feature_columns.append(crossed_feature)

    self.feature_layer = layers.DenseFeatures(feature_columns)

    batch_size = 32
    self.train_ds = self.df_to_dataset(self.train, batch_size=batch_size)
    self.val_ds = self.df_to_dataset(self.val, shuffle=False,
                                     batch_size=batch_size)
    self.test_ds = self.df_to_dataset(self.test, shuffle=False,
                                      batch_size=batch_size)
def create_feature_columns(dataset, embed_size=32, hash_size=10000):
    n_users = dataset.user.nunique()
    n_items = dataset.item.nunique()
    genre_list = dataset.genre1.unique()

    users = fc.categorical_column_with_vocabulary_list("user",
                                                       np.arange(n_users),
                                                       default_value=-1,
                                                       dtype=tf.int64)
    items = fc.categorical_column_with_vocabulary_list("item",
                                                       np.arange(n_items),
                                                       default_value=-1,
                                                       dtype=tf.int64)
    gender = fc.categorical_column_with_vocabulary_list("gender", ["M", "F"])
    age = fc.categorical_column_with_vocabulary_list(
        "age", [1, 18, 25, 35, 45, 50, 56], dtype=tf.int64)
    occupation = fc.categorical_column_with_vocabulary_list("occupation",
                                                            np.arange(21),
                                                            dtype=tf.int64)
    genre1 = fc.categorical_column_with_vocabulary_list("genre1", genre_list)
    genre2 = fc.categorical_column_with_vocabulary_list("genre2", genre_list)
    genre3 = fc.categorical_column_with_vocabulary_list("genre3", genre_list)

    wide_cols = [
        users, items, gender, age, occupation, genre1, genre2, genre3,
        fc.crossed_column([gender, age, occupation],
                          hash_bucket_size=hash_size),
        fc.crossed_column([age, genre1], hash_bucket_size=hash_size)
    ]

    embed_cols = [users, items, age, occupation]
    deep_cols = list()
    for col in embed_cols:
        deep_cols.append(fc.embedding_column(col, embed_size))

    shared_embed_cols = [genre1, genre2, genre3]
    deep_cols.extend(fc.shared_embedding_columns(shared_embed_cols, embed_size))
    deep_cols.append(fc.indicator_column(gender))

    label = fc.numeric_column("label", default_value=0.0, dtype=tf.float32)
    feat_columns = [label]
    feat_columns += wide_cols
    feat_columns += deep_cols
    feat_spec = fc.make_parse_example_spec(feat_columns)
    return wide_cols, deep_cols, feat_spec
def create_feature_layer(df):
    week = feature_column.numeric_column("Week")
    week = feature_column.bucketized_column(week, boundaries=list(range(1, 53)))

    day = feature_column.numeric_column("Day")
    day = feature_column.bucketized_column(day, boundaries=list(range(1, 8)))

    year = feature_column.numeric_column("Year")
    year = feature_column.bucketized_column(year,
                                            boundaries=list(range(2013, 2017)))

    hour = feature_column.numeric_column("std_hour")
    hour = feature_column.bucketized_column(hour, boundaries=list(range(0, 24)))

    arrival = feature_column.categorical_column_with_vocabulary_list(
        "Arrival", vocabulary_list=pd.Series.unique(df.Arrival).tolist())
    airline = feature_column.categorical_column_with_vocabulary_list(
        "Airline", vocabulary_list=pd.Series.unique(df.Airline).tolist())
    flight_no = feature_column.categorical_column_with_vocabulary_list(
        "flight_no", vocabulary_list=pd.Series.unique(df.flight_no).tolist())

    arrival_one_hot = feature_column.indicator_column(arrival)
    airline_one_hot = feature_column.indicator_column(airline)
    flight_no_one_hot = feature_column.indicator_column(flight_no)

    arrival_length = len(pd.Series.unique(df.Arrival).tolist())
    arrival_and_week = feature_column.crossed_column(
        [arrival, week], hash_bucket_size=(arrival_length * 52))
    arrival_and_week = feature_column.indicator_column(arrival_and_week)

    airline_length = len(pd.Series.unique(df.Airline).tolist())
    year_and_airline = feature_column.crossed_column(
        [year, airline], hash_bucket_size=(airline_length * 4))
    year_and_airline = feature_column.indicator_column(year_and_airline)

    feature_columns = [
        week, arrival_one_hot, airline_one_hot, flight_no_one_hot, hour,
        arrival_and_week, year, year_and_airline
    ]
    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
    return feature_layer
def crossed_feature_columns(self, columns_crossed, nameOfLayer, bucket_size=10):
    crossed_feature = feature_column.crossed_column(
        columns_crossed, hash_bucket_size=bucket_size)
    crossed_feature = feature_column.indicator_column(crossed_feature)
    self.sparse_columns[nameOfLayer] = crossed_feature
    return crossed_feature
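A usage sketch for the method above, assuming an enclosing object (`builder` is a hypothetical stand-in) that maintains the `sparse_columns` dict:

# Hypothetical columns: cross a bucketized age with a city vocabulary column.
age_buckets = feature_column.bucketized_column(
    feature_column.numeric_column('age'), boundaries=[25, 40, 60])
city = feature_column.categorical_column_with_vocabulary_list(
    'city', ['NY', 'SF', 'LA'])
cross = builder.crossed_feature_columns([age_buckets, city], 'age_x_city',
                                        bucket_size=50)
# The indicator-wrapped cross is also cached in builder.sparse_columns['age_x_city'].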
def get_feature_columns(dataframe):
    """Creates feature columns from pd.DataFrame."""
    feature_columns = []
    feature_layer_inputs = {}

    # numeric cols
    for col_name in ['PhotoAmt', 'Fee', 'Age']:
        feature_columns.append(feature_column.numeric_column(col_name))
        feature_layer_inputs[col_name] = tf.keras.Input(shape=(1,),
                                                        name=col_name)

    # bucketized cols
    age = feature_column.numeric_column('Age')
    age_buckets = feature_column.bucketized_column(age,
                                                   boundaries=[1, 2, 3, 4, 5])
    feature_columns.append(age_buckets)

    # indicator_columns
    indicator_column_names = [
        'Type', 'Color1', 'Color2', 'Gender', 'MaturitySize', 'FurLength',
        'Vaccinated', 'Sterilized', 'Health'
    ]
    for col_name in indicator_column_names:
        categorical_column = feature_column.categorical_column_with_vocabulary_list(
            col_name, dataframe[col_name].unique())
        indicator_column = feature_column.indicator_column(categorical_column)
        feature_columns.append(indicator_column)
        feature_layer_inputs[col_name] = tf.keras.Input(shape=(1,),
                                                        name=col_name,
                                                        dtype=tf.string)

    # embedding columns
    breed1 = feature_column.categorical_column_with_vocabulary_list(
        'Breed1', dataframe.Breed1.unique())
    breed1_embedding = feature_column.embedding_column(breed1, dimension=16)
    feature_columns.append(breed1_embedding)
    feature_layer_inputs['Breed1'] = tf.keras.Input(shape=(1,),
                                                    name='Breed1',
                                                    dtype=tf.string)

    # crossed columns
    animal_type = feature_column.categorical_column_with_vocabulary_list(
        'Type', ['Cat', 'Dog'])
    feature_columns.append(feature_column.indicator_column(animal_type))
    age_type_feature = feature_column.crossed_column(
        [age_buckets, animal_type], hash_bucket_size=100)
    feature_columns.append(feature_column.indicator_column(age_type_feature))
    feature_layer_inputs['Type'] = tf.keras.Input(shape=(1,),
                                                  name='Type',
                                                  dtype=tf.string)

    return feature_columns, feature_layer_inputs
def classify_data(batch_size=5):
    from tensorflow import feature_column
    from tensorflow.keras import layers
    from sklearn.model_selection import train_test_split

    URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
    dataframe = pd.read_csv(URL)
    tr, te = train_test_split(dataframe, test_size=0.2)
    tr, va = train_test_split(tr, test_size=0.2)
    print(len(tr), len(va), len(te))

    def df_to_dataset(dataframe, shuffle=True, batch_size=32):
        # Copy first, then pop the label from the copy; popping from the
        # original would mutate it and leave 'target' among the features.
        dataframe = dataframe.copy()
        labels = dataframe.pop('target')
        ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
        if shuffle:
            ds = ds.shuffle(buffer_size=len(dataframe))
        # Batch unconditionally, so non-shuffled datasets are batched too.
        return ds.batch(batch_size)

    tr_ds = df_to_dataset(tr, batch_size=batch_size)
    va_ds = df_to_dataset(va, shuffle=False, batch_size=batch_size)
    te_ds = df_to_dataset(te, shuffle=False, batch_size=batch_size)

    feature_columns = []
    for header in [
            'age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca'
    ]:
        feature_columns.append(feature_column.numeric_column(header))
    age = feature_column.numeric_column('age')
    age_buckets = feature_column.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    feature_columns.append(age_buckets)
    thal = feature_column.categorical_column_with_vocabulary_list(
        'thal', ['fixed', 'normal', 'reversible'])
    feature_columns.append(feature_column.indicator_column(thal))
    feature_columns.append(feature_column.embedding_column(thal, dimension=8))
    crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                    hash_bucket_size=1000)
    feature_columns.append(feature_column.indicator_column(crossed_feature))

    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
    model = tf.keras.Sequential([
        feature_layer,
        layers.Dense(128, activation='relu'),
        layers.Dense(128, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.fit(tr_ds, validation_data=va_ds, epochs=5)
    loss, accuracy = model.evaluate(te_ds)
    print(accuracy)
def cross_all_columns():
    all_names = categorical_names + integer_names  # avoid shadowing built-in all()
    fcs = []
    fck = {}
    # There is probably a cleaner way to do this:
    # record a key indicating a pair has been crossed;
    # if not yet crossed, append the cross and mark it done.
    for n1 in all_names:
        for n2 in all_names:
            if n1 == n2:
                continue  # don't cross a column with itself
            k1 = "%s%s" % (n1, n2)
            k2 = "%s%s" % (n2, n1)
            if fck.get(k1) is None and fck.get(k2) is None:
                # hash_bucket_size must be an int, not the float 1e6
                fcs.append(crossed_column([n1, n2], int(1e6)))
                fck[k1] = True
                fck[k2] = True
    return fcs
def build_feature_columns():
    age = feature_column.numeric_column('age')
    age_bucket = feature_column.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    workclass = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list('workclass', [
            'Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov',
            'Local-gov', 'State-gov', 'Without-pay', 'Never-worked'
        ]))
    fnlwgt = feature_column.numeric_column('fnlwgt')
    education = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list('education', [
            'Bachelors', 'Some-college', '11th', 'HS-grad', 'Prof-school',
            'Assoc-acdm', 'Assoc-voc', '9th', '7th-8th', '12th', 'Masters',
            '1st-4th', '10th', 'Doctorate', '5th-6th', 'Preschool'
        ]))
    education_num = feature_column.numeric_column('education_num')
    marital_status = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list(
            'marital_status', [
                'Married-civ-spouse', 'Divorced', 'Never-married', 'Separated',
                'Widowed', 'Married-spouse-absent', 'Married-AF-spouse'
            ]))
    occupation = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list('occupation', [
            'Tech-support', 'Craft-repair', 'Other-service', 'Sales',
            'Exec-managerial', 'Prof-specialty', 'Handlers-cleaners',
            'Machine-op-inspct', 'Adm-clerical', 'Farming-fishing',
            'Transport-moving', 'Priv-house-serv', 'Protective-serv',
            'Armed-Forces'
        ]))
    relationship = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list(
            'relationship', [
                'Wife', 'Own-child', 'Husband', 'Not-in-family',
                'Other-relative', 'Unmarried'
            ]))
    race = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list('race', [
            'White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other',
            'Black'
        ]))
    gender = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list(
            'gender', ['Female', 'Male']))
    capital_gain = feature_column.numeric_column('capital_gain')
    capital_loss = feature_column.numeric_column('capital_loss')
    hours_per_week = feature_column.numeric_column('hours_per_week')
    native_country = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list(
            'native_country', [
                'United-States', 'Cambodia', 'England', 'Puerto-Rico',
                'Canada', 'Germany', 'Outlying-US(Guam-USVI-etc)', 'India',
                'Japan', 'Greece', 'South', 'China', 'Cuba', 'Iran',
                'Honduras', 'Philippines', 'Italy', 'Poland', 'Jamaica',
                'Vietnam', 'Mexico', 'Portugal', 'Ireland', 'France',
                'Dominican-Republic', 'Laos', 'Ecuador', 'Taiwan', 'Haiti',
                'Columbia', 'Hungary', 'Guatemala', 'Nicaragua', 'Scotland',
                'Thailand', 'Yugoslavia', 'El-Salvador', 'Trinadad&Tobago',
                'Peru', 'Hong', 'Holand-Netherlands'
            ]))

    race_gender = feature_column.indicator_column(
        feature_column.crossed_column([
            feature_column.categorical_column_with_vocabulary_list('race', [
                'White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other',
                'Black'
            ]),
            feature_column.categorical_column_with_vocabulary_list(
                'gender', ['Female', 'Male'])
        ], hash_bucket_size=10))

    # Note: an earlier, smaller wide/deep assignment was dead code
    # (immediately overwritten) and has been dropped.
    wide = [
        age_bucket, workclass, fnlwgt, education, education_num, occupation,
        relationship, race, gender, capital_gain, capital_loss,
        hours_per_week, native_country, race_gender
    ]
    deep = [
        age, workclass, fnlwgt, education, education_num, occupation,
        relationship, race, gender, capital_gain, capital_loss,
        hours_per_week, native_country
    ]
    return (wide, deep)
def transform(self, output_tensors):
    input_tensor_name = self.parameters.get("input_tensor")
    output_tensor_name = self.parameters.get("output_tensor")
    # dict.has_key() is Python 2 only; use the `in` operator instead.
    if "hash_bucket_size" in self.parameters:
        hash_bucket_size = self.parameters.get("hash_bucket_size")
    else:
        msg = "parameters error: crossed_column requires hash_bucket_size"
        logger.error(msg)
        raise ParametersError(msg)

    column_names = input_tensor_name.split(",")
    columns = []
    for index in range(len(column_names)):
        input_tensor = output_tensors.get(column_names[index])
        columns.append(input_tensor)
    # combiner = self.parameters.get("combiner") if "combiner" in self.parameters else "mean"
    output_tensor = fc.crossed_column(keys=columns,
                                      hash_bucket_size=hash_bucket_size)
    output_tensors[output_tensor_name] = output_tensor
def test_crossed_column():
    features = {
        'price': [['A', 'A'], ['B', 'D'], ['C', 'A']],
        'color': [['R', 'R'], ['G', 'G'], ['B', 'B']]
    }
    price = feature_column.categorical_column_with_vocabulary_list(
        'price', ['A', 'B', 'C', 'D'])
    color = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'])
    p_x_c = feature_column.crossed_column([price, color], 16)
    # indicator_column turns the sparse crossed ids into a multi-hot dense tensor
    p_x_c_identity = feature_column.indicator_column(p_x_c)
    p_x_c_identity_dense_tensor = feature_column.input_layer(
        features, [p_x_c_identity])
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([p_x_c_identity_dense_tensor]))
def build_model():
    feature_columns = []
    for header in [
            'age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca'
    ]:
        feature_columns.append(feature_column.numeric_column(header))

    age = feature_column.numeric_column("age")
    age_buckets = feature_column.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    feature_columns.append(age_buckets)

    thal = feature_column.categorical_column_with_vocabulary_list(
        'thal', ['fixed', 'normal', 'reversible'])
    thal_one_hot = feature_column.indicator_column(thal)
    feature_columns.append(thal_one_hot)

    thal_embedding = feature_column.embedding_column(thal, dimension=8)
    feature_columns.append(thal_embedding)

    crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                    hash_bucket_size=1000)
    crossed_feature = feature_column.indicator_column(crossed_feature)
    feature_columns.append(crossed_feature)

    feature_layer = keras.layers.DenseFeatures(feature_columns)
    model = tf.keras.Sequential([
        feature_layer,
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'],
                  run_eagerly=True)
    return model
def build(self, columns_dict):
    """
    Build the crossed_column.
    :param columns_dict: already-built columns, {name: column}
    :return: self
    """
    if not self.keys:
        columns = [columns_dict[k] for k in self.key_names]
        invalid_cols = [
            f for f in columns if not isinstance(f, CategoricalColumn)
        ]
        if invalid_cols:
            raise ValueError(
                "{} are not CategoricalColumn".format(invalid_cols))
        self.keys = [c.get_input_column() for c in columns]
    if not self._feature_column:
        self._feature_column = fc.crossed_column(self.keys,
                                                 self.hash_bucket_size)
    return self
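A usage sketch for the builder above; the `CrossFeature` constructor name and the wrapper objects in `columns_dict` are hypothetical stand-ins:

# Assumed: each value in columns_dict is a wrapper that passes the
# CategoricalColumn isinstance check and exposes get_input_column().
cross = CrossFeature(key_names=['education', 'occupation'],
                     hash_bucket_size=1000)  # hypothetical constructor
cross = cross.build(columns_dict)  # validates keys, then builds fc.crossed_column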
def test_crossed_column():
    # 1. Input features
    features = {
        'price': [['A'], ['B'], ['C'], ['C']],
        'color': [['R'], ['G'], ['B'], ['B']]
    }
    # 2. Feature columns (sparse)
    price = feature_column.categorical_column_with_vocabulary_list(
        'price', ['A', 'B', 'C', 'D'])
    color = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'])
    p_x_c = feature_column.crossed_column([price, color], 16)
    # 2. Feature columns (dense)
    p_x_c_identity = feature_column.indicator_column(p_x_c)
    # 3. Feature tensor
    p_x_c_identity_dense_tensor = feature_column.input_layer(
        features, [p_x_c_identity])
    with tf.Session() as session:
        # session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([p_x_c_identity_dense_tensor]))
age_buckets = feature_column.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
feature_columns.append(age_buckets)

# indicator cols
thal = feature_column.categorical_column_with_vocabulary_list(
    'thal', ['fixed', 'normal', 'reversible'])
thal_one_hot = feature_column.indicator_column(thal)
feature_columns.append(thal_one_hot)

# embedding cols
thal_embedding = feature_column.embedding_column(thal, dimension=8)
feature_columns.append(thal_embedding)

# crossed cols
crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                hash_bucket_size=1000)
crossed_feature = feature_column.indicator_column(crossed_feature)
feature_columns.append(crossed_feature)

feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

model = tf.keras.Sequential([
    feature_layer,
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)
])
    'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
    '5th-6th', '10th', '1st-4th', 'Preschool', '12th'
])
marital_status = fc.categorical_column_with_vocabulary_list(
    'marital_status', [
        'Married-civ-spouse', 'Divorced', 'Married-spouse-absent',
        'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed'
    ])
workclass = fc.categorical_column_with_vocabulary_list('workclass', [
    'Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov', 'Local-gov',
    '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'
])

categorical_columns = [
    relationship, occupation, education, marital_status, workclass
]

# crossed
education_x_occupation = fc.crossed_column(['education', 'occupation'],
                                           hash_bucket_size=1000)
crossed_columns = [education_x_occupation]

# train
classifier = tf.estimator.LinearClassifier(
    feature_columns=numeric_columns + categorical_columns + crossed_columns,
    optimizer=tf.train.FtrlOptimizer(learning_rate=0.1,
                                     l1_regularization_strength=0.1,
                                     l2_regularization_strength=0.1))
classifier.train(get_train_dataset)
result = classifier.evaluate(get_test_dataset)
pprint(result)
def input_template_feed_keras(Xtrain, cols_type_received, cols_ref, **kw):
    """ Create a sparse data structure in Keras to plug into a model:
        no data, just virtual data.
        https://github.com/GoogleCloudPlatform/data-science-on-gcp/blob/master/09_cloudml/flights_model_tf2.ipynb
    :return:
    """
    from tensorflow.feature_column import (categorical_column_with_hash_bucket,
                                           numeric_column, embedding_column,
                                           bucketized_column, crossed_column,
                                           indicator_column)
    if len(cols_ref) <= 1:
        return Xtrain

    dict_sparse, dict_dense = {}, {}
    for cols_groupname in cols_ref:
        assert cols_groupname in cols_type_received, "Error missing colgroup in config data_pars[cols_model_type] "

        if cols_groupname == "cols_sparse":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                m_bucket = min(500, int(Xtrain[coli].nunique()))
                dict_sparse[coli] = categorical_column_with_hash_bucket(
                    coli, hash_bucket_size=m_bucket)

        if cols_groupname == "cols_dense":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                dict_dense[coli] = numeric_column(coli)

        if cols_groupname == "cols_cross":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                m_bucketi = min(500, int(Xtrain[coli[0]].nunique()))
                m_bucketj = min(500, int(Xtrain[coli[1]].nunique()))
                # crossed_column expects a list of keys as its first argument
                dict_sparse[coli[0] + "-" + coli[1]] = crossed_column(
                    [coli[0], coli[1]], m_bucketi * m_bucketj)

        if cols_groupname == "cols_discretize":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                # use the column's own min/max, not the built-ins min/max
                bucket_list = np.linspace(Xtrain[coli].min(),
                                          Xtrain[coli].max(), 100).tolist()
                dict_sparse[coli + "_bin"] = bucketized_column(
                    numeric_column(coli), bucket_list)

    ### Embed the (still categorical) sparse columns first, since
    ### embedding_column requires a categorical column, not an indicator one
    dict_embed = {
        'em_{}'.format(colname): embedding_column(col, 10)
        for colname, col in dict_sparse.items()
    }

    #### one-hot encode the sparse columns
    dict_sparse = {
        colname: indicator_column(col)
        for colname, col in dict_sparse.items()
    }

    dict_dnn = {**dict_embed, **dict_dense}
    dict_linear = {**dict_sparse, **dict_dense}
    return (dict_linear, dict_dnn)
animal_type = feature_column.categorical_column_with_vocabulary_list(
    'Type', ['Cat', 'Dog'])
animal_type_one_hot = feature_column.indicator_column(animal_type)
demo(animal_type_one_hot)

# Notice the input to the embedding column is the categorical column we
# previously created
breed1 = feature_column.categorical_column_with_vocabulary_list(
    'Breed1', dataframe.Breed1.unique())
breed1_embedding = feature_column.embedding_column(breed1, dimension=8)
demo(breed1_embedding)

# Hashed feature columns
breed1_hashed = feature_column.categorical_column_with_hash_bucket(
    'Breed1', hash_bucket_size=10)
demo(feature_column.indicator_column(breed1_hashed))

# Crossed feature columns
crossed_feature = feature_column.crossed_column([age_buckets, animal_type],
                                                hash_bucket_size=10)
demo(feature_column.indicator_column(crossed_feature))
# The line above dies with:
#   OverflowError: Python int too large to convert to C long
# How can this be fixed?

# Choose which columns to use
feature_columns = []

# numeric cols
for header in ['PhotoAmt', 'Fee', 'Age']:
    feature_columns.append(feature_column.numeric_column(header))

# bucketized cols
age = feature_column.numeric_column('Age')
age_buckets = feature_column.bucketized_column(age,
                                               boundaries=[1, 2, 3, 4, 5])
feature_columns.append(age_buckets)
# sex_fare_cross = feature_column.crossed_column([sex, fare_buckets], hash_bucket_size=1000)
# feature_columns.append(feature_column.indicator_column(sex_fare_cross))

# fare_pclass_cross = feature_column.crossed_column([fare_buckets, Pclass], hash_bucket_size=1000)
# feature_columns.append(feature_column.indicator_column(fare_pclass_cross))

# embarked_fare_cross = feature_column.crossed_column([fare_buckets, embarked], hash_bucket_size=100)
# feature_columns.append(feature_column.indicator_column(embarked_fare_cross))

# age_sib_cross = feature_column.crossed_column([age_buckets, sib_buckets], hash_bucket_size=1000)
# feature_columns.append(feature_column.indicator_column(age_sib_cross))

# age_parch_cross = feature_column.crossed_column([age_buckets, parch_buckets], hash_bucket_size=1000)
# feature_columns.append(feature_column.indicator_column(age_parch_cross))

# Note: despite the name, this crosses sex with parch_buckets, not Pclass.
sex_pclass_cross = feature_column.crossed_column([sex, parch_buckets],
                                                 hash_bucket_size=1000)
feature_columns.append(feature_column.indicator_column(sex_pclass_cross))

# create feature layer
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

# %%
# Train Model

# Training settings
BATCH_SIZE = 100
EPOCS = 50
LEARNING_RATE = 0.0001
L2 = 1e-8

train_ds = df_to_dataset(train, True, BATCH_SIZE, label)
test_ds = df_to_dataset(test, False, BATCH_SIZE, label)
submission_ds = df_to_dataset(submission_df, False, BATCH_SIZE,
                              has_label=False)
        850])
feature_columns_container.append(fico_num)

institutions = feature_column.categorical_column_with_vocabulary_list(
    'institutionName', [
        'Bank of America', 'Toronto Dominion Bank', 'Citizens Bank',
        'Webster Bank', 'CHASE Bank', 'Citigroup', 'Capital One',
        'HSBC Bank USA', 'State Street Corporation', 'MUFG Union Bank',
        'Wells Fargo & Co.', 'Barclays', 'New York Community Bank',
        'CIT Group', 'Santander Bank', 'Royal Bank of Scotland',
        'First Rand Bank', 'Budapest Bank'
    ])
institutions_pos = feature_column.indicator_column(institutions)
feature_columns_container.append(institutions_pos)

crossed_feat = feature_column.crossed_column([age, fico_num],
                                             hash_bucket_size=1000)
crossed_feat = feature_column.indicator_column(crossed_feat)
feature_columns_container.append(crossed_feat)

########### EXAMPLES ###########
# numeric column
# age = feature_column.numeric_column("age")

# categorical column with vocabulary list
# thal = feature_column.categorical_column_with_vocabulary_list(
#     'thal', ['fixed', 'normal', 'reversible'])

# bucketized column
# age_buckets = feature_column.bucketized_column(
#     age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
def get_dataset_tuple_keras(Xtrain, cols_type_received, cols_ref, **kw):
    """ Create a sparse data structure from dataframe data to feed Keras.
        https://github.com/GoogleCloudPlatform/data-science-on-gcp/blob/master/09_cloudml/flights_model_tf2.ipynb
    :return:
    """
    from tensorflow.feature_column import (categorical_column_with_hash_bucket,
                                           numeric_column, embedding_column,
                                           bucketized_column, crossed_column,
                                           indicator_column)
    if len(cols_ref) <= 1:
        return Xtrain

    dict_sparse, dict_dense = {}, {}
    for cols_groupname in cols_ref:
        assert cols_groupname in cols_type_received, "Error missing colgroup in config data_pars[cols_model_type] "

        if cols_groupname == "cols_sparse":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                m_bucket = min(500, int(Xtrain[coli].nunique()))
                dict_sparse[coli] = categorical_column_with_hash_bucket(
                    coli, hash_bucket_size=m_bucket)

        if cols_groupname == "cols_dense":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                dict_dense[coli] = numeric_column(coli)

        if cols_groupname == "cols_cross":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                m_bucketi = min(500, int(Xtrain[coli[0]].nunique()))
                m_bucketj = min(500, int(Xtrain[coli[1]].nunique()))
                # crossed_column expects a list of keys as its first argument
                dict_sparse[coli[0] + "-" + coli[1]] = crossed_column(
                    [coli[0], coli[1]], m_bucketi * m_bucketj)

        if cols_groupname == "cols_discretize":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                # use the column's own min/max, not the built-ins min/max
                bucket_list = np.linspace(Xtrain[coli].min(),
                                          Xtrain[coli].max(), 100).tolist()
                dict_sparse[coli + "_bin"] = bucketized_column(
                    numeric_column(coli), bucket_list)

    ### Embed the (still categorical) sparse columns first, since
    ### embedding_column requires a categorical column, not an indicator one
    dict_embed = {
        'em_{}'.format(colname): embedding_column(col, 10)
        for colname, col in dict_sparse.items()
    }

    #### one-hot encode the sparse columns
    dict_sparse = {
        colname: indicator_column(col)
        for colname, col in dict_sparse.items()
    }

    dict_dense2 = {**dict_dense, **dict_embed}
    X_tuple = (dict_sparse, dict_dense, dict_dense2)
    return X_tuple


import tensorflow as tf

NBUCKETS = 10

real = {
    colname: tf.feature_column.numeric_column(colname)
    for colname in colnumeric
}
inputs = {
    colname: tf.keras.layers.Input(name=colname, shape=(), dtype='float32')
    for colname in real.keys()
}
sparse = {
    'carrier': tf.feature_column.categorical_column_with_vocabulary_list(
        'carrier',
        vocabulary_list='AS,VX,F9,UA,US,WN,HA,EV,MQ,DL,OO,B6,NK,AA'.split(',')),
    'origin': tf.feature_column.categorical_column_with_hash_bucket(
        'origin', hash_bucket_size=1000),
    'dest': tf.feature_column.categorical_column_with_hash_bucket(
        'dest', hash_bucket_size=1000)
}
inputs.update({
    colname: tf.keras.layers.Input(name=colname, shape=(), dtype='string')
    for colname in sparse.keys()
})

latbuckets = np.linspace(20.0, 50.0, NBUCKETS).tolist()     # USA
lonbuckets = np.linspace(-120.0, -70.0, NBUCKETS).tolist()  # USA
disc = {}
disc.update({
    'd_{}'.format(key): tf.feature_column.bucketized_column(real[key],
                                                            latbuckets)
    for key in ['dep_lat', 'arr_lat']
})
disc.update({
    'd_{}'.format(key): tf.feature_column.bucketized_column(real[key],
                                                            lonbuckets)
    for key in ['dep_lon', 'arr_lon']
})

# cross columns that make sense in combination
sparse['dep_loc'] = tf.feature_column.crossed_column(
    [disc['d_dep_lat'], disc['d_dep_lon']], NBUCKETS * NBUCKETS)
sparse['arr_loc'] = tf.feature_column.crossed_column(
    [disc['d_arr_lat'], disc['d_arr_lon']], NBUCKETS * NBUCKETS)
sparse['dep_arr'] = tf.feature_column.crossed_column(
    [sparse['dep_loc'], sparse['arr_loc']], NBUCKETS**4)
# sparse['ori_dest'] = tf.feature_column.crossed_column(['origin', 'dest'], hash_bucket_size=1000)

# embed all the sparse columns
embed = {
    'embed_{}'.format(colname): tf.feature_column.embedding_column(col, 10)
    for colname, col in sparse.items()
}
real.update(embed)

# one-hot encode the sparse columns
sparse = {
    colname: tf.feature_column.indicator_column(col)
    for colname, col in sparse.items()
}


def wide_and_deep_classifier(inputs, linear_feature_columns,
                             dnn_feature_columns, dnn_hidden_units):
    deep = tf.keras.layers.DenseFeatures(dnn_feature_columns,
                                         name='deep_inputs')(inputs)
    layers = [int(x) for x in dnn_hidden_units.split(',')]
    for layerno, numnodes in enumerate(layers):
        deep = tf.keras.layers.Dense(numnodes,
                                     activation='relu',
                                     name='dnn_{}'.format(layerno + 1))(deep)
    wide = tf.keras.layers.DenseFeatures(linear_feature_columns,
                                         name='wide_inputs')(inputs)
    both = tf.keras.layers.concatenate([deep, wide], name='both')
    output = tf.keras.layers.Dense(1, activation='sigmoid', name='pred')(both)
    model = tf.keras.Model(inputs, output)
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


# Must be a comma-separated string: the function above splits on ','.
DNN_HIDDEN_UNITS = '10'
model = wide_and_deep_classifier(inputs,
                                 linear_feature_columns=sparse.values(),
                                 dnn_feature_columns=real.values(),
                                 dnn_hidden_units=DNN_HIDDEN_UNITS)
tf.keras.utils.plot_model(model,
                          'flights_model.png',
                          show_shapes=False,
                          rankdir='LR')
X_tuple = (sparse, real, real)
return X_tuple
def tf_data_create_sparse(cols_type_received: dict = {
        'cols_sparse': ['col1', 'col2'],
        'cols_dense': ['cola', 'colb']
}, cols_ref: list = ['cols_sparse', 'cols_dense'],
                          Xtrain: pd.DataFrame = None, **kw):
    """ Create a sparse data structure in Keras to plug into a model:
        no data, just virtual data.
        https://github.com/GoogleCloudPlatform/data-science-on-gcp/blob/master/09_cloudml/flights_model_tf2.ipynb
    :return:
    """
    import tensorflow
    from tensorflow.feature_column import (categorical_column_with_hash_bucket,
                                           numeric_column, embedding_column,
                                           bucketized_column, crossed_column,
                                           indicator_column)

    ### Unique values (key must match the 'cols_sparse' group name):
    col_unique = {}
    if Xtrain is not None:
        for coli in cols_type_received['cols_sparse']:
            col_unique[coli] = int(Xtrain[coli].nunique())

    dict_cat_sparse, dict_dense = {}, {}
    for cols_groupname in cols_ref:
        assert cols_groupname in cols_type_received, "Error missing colgroup in config data_pars[cols_model_type] "

        if cols_groupname == "cols_sparse":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                m_bucket = min(500, col_unique.get(coli, 500))
                dict_cat_sparse[coli] = categorical_column_with_hash_bucket(
                    coli, hash_bucket_size=m_bucket)

        if cols_groupname == "cols_dense":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                dict_dense[coli] = numeric_column(coli)

        if cols_groupname == "cols_cross":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                # look up each member of the pair, not the pair itself
                m_bucketi = min(500, col_unique.get(coli[0], 500))
                m_bucketj = min(500, col_unique.get(coli[1], 500))
                # crossed_column expects a list of keys as its first argument
                dict_cat_sparse[coli[0] + "-" + coli[1]] = crossed_column(
                    [coli[0], coli[1]], m_bucketi * m_bucketj)

        if cols_groupname == "cols_discretize":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                # use the column's own min/max, not the built-ins min/max
                bucket_list = np.linspace(Xtrain[coli].min(),
                                          Xtrain[coli].max(), 100).tolist()
                dict_cat_sparse[coli + "_bin"] = bucketized_column(
                    numeric_column(coli), bucket_list)

    ### Embed the (still categorical) sparse columns first, since
    ### embedding_column requires a categorical column, not an indicator one
    dict_cat_embed = {
        'em_{}'.format(colname): embedding_column(col, 10)
        for colname, col in dict_cat_sparse.items()
    }

    #### one-hot encode the sparse columns
    dict_cat_sparse = {
        colname: indicator_column(col)
        for colname, col in dict_cat_sparse.items()
    }

    #### To customize:
    # dict_dnn    = {**dict_cat_embed, **dict_dense}
    # dict_linear = {**dict_cat_sparse, **dict_dense}
    return dict_cat_sparse, dict_cat_embed, dict_dense
def _combination():
    education_occupation = fc.crossed_column(['education', 'occupation'], 300)
    education_occupation = fc.indicator_column(education_occupation)
    return [education_occupation]
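Crossed columns like the one returned above are typically consumed by a linear model, as in the LinearClassifier snippet earlier; a minimal sketch, with `train_input_fn` assumed:

classifier = tf.estimator.LinearClassifier(feature_columns=_combination())
classifier.train(input_fn=train_input_fn)  # train_input_fn is assumed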
def official_census_feature_columns_config_demo():
    # categorical_column
    gender = fc.categorical_column_with_vocabulary_list(
        "gender", ["Female", "Male"])
    education = fc.categorical_column_with_vocabulary_list(
        "education", [
            "Bachelors", "HS-grad", "11th", "Masters", "9th", "Some-college",
            "Assoc-acdm", "Assoc-voc", "7th-8th", "Doctorate", "Prof-school",
            "5th-6th", "10th", "1st-4th", "Preschool", "12th"
        ])
    marital_status = fc.categorical_column_with_vocabulary_list(
        "marital_status", [
            "Married-civ-spouse", "Divorced", "Married-spouse-absent",
            "Never-married", "Separated", "Married-AF-spouse", "Widowed"
        ])
    relationship = fc.categorical_column_with_vocabulary_list(
        "relationship", [
            "Husband", "Not-in-family", "Wife", "Own-child", "Unmarried",
            "Other-relative"
        ])
    workclass = fc.categorical_column_with_vocabulary_list(
        "workclass", [
            "Self-emp-not-inc", "Private", "State-gov", "Federal-gov",
            "Local-gov", "?", "Self-emp-inc", "Without-pay", "Never-worked"
        ])

    # To show an example of hashing:
    native_country = fc.categorical_column_with_hash_bucket(
        "native_country", hash_bucket_size=1000)
    occupation = fc.categorical_column_with_hash_bucket("occupation",
                                                        hash_bucket_size=1000)

    # Continuous feature columns.
    age = fc.numeric_column("age")
    education_num = fc.numeric_column("education_num")
    capital_gain = fc.numeric_column("capital_gain")
    capital_loss = fc.numeric_column("capital_loss")
    hours_per_week = fc.numeric_column("hours_per_week")

    # Bucketized transformations.
    age_buckets = fc.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

    # Wide columns and deep columns.
    base_columns = [
        gender, education, marital_status, relationship, workclass,
        occupation, native_country, age_buckets
    ]
    crossed_columns = [
        fc.crossed_column(['education', 'occupation'], hash_bucket_size=1000),
        fc.crossed_column([age_buckets, 'education', 'occupation'],
                          hash_bucket_size=1000),
        fc.crossed_column(['native_country', 'occupation'],
                          hash_bucket_size=1000)
    ]
    feature_columns = [
        fc.indicator_column(workclass),
        fc.indicator_column(education),
        fc.indicator_column(gender),
        fc.indicator_column(relationship),
        fc.embedding_column(native_country, dimension=32),
        fc.embedding_column(occupation, dimension=32),
        age,
        education_num,
        capital_gain,
        capital_loss,
        hours_per_week,
    ]
    return feature_columns, base_columns, crossed_columns
# ________________________________MODEL___________________________________
# ______wide model start ________

# distance bucket input
distance_bucket_input = Input(shape=(1,), dtype='int64',
                              name='distance_bucket')
unique_distance_buckets = list(range(10))
distance_bucket_column = feature_column.categorical_column_with_vocabulary_list(
    'distance_bucket_input', unique_distance_buckets)

# is city search input
is_city_search_input = Input(shape=(1,), dtype='int64', name='is_city_search')
unique_is_city_search = [0, 1]
is_city_search_column = feature_column.categorical_column_with_vocabulary_list(
    'is_city_search_input', unique_is_city_search)

# interaction features between distance bucket and city search input
distance_city_cross_feature = feature_column.crossed_column(
    [distance_bucket_column, is_city_search_column],
    hash_bucket_size=20,
    hash_key=42)
distance_city_cross_indicator_feature = feature_column.indicator_column(
    distance_city_cross_feature)
distance_city_cross_dense = layers.DenseFeatures(
    distance_city_cross_indicator_feature)({
        'distance_bucket_input': distance_bucket_input,
        'is_city_search_input': is_city_search_input
    })

# create input for hotel type
hotel_type_input = Input(shape=(1,), dtype='int64', name='hotel_type')
hotel_type = feature_column.categorical_column_with_vocabulary_list(
    'hotel_type', unique_hotel_type)

# create hotel type click stream inputs and interaction with candidate hotel type
clicked_hotel_type_input_sparse = {}
crossed_hotel_type_dense = {}
for i in CLICKED_HOTEL_TYPE_COLUMNS:
    clicked_hotel_type_input_sparse[i] = Input(shape=(1,), dtype='int64',
                                               name=i)
    categorical_col = feature_column.categorical_column_with_vocabulary_list(
def build_census_wide_columns():
    n_range = get_census_numeric_feat_range()
    base_columns = [
        fc.bucketized_column(
            fc.numeric_column('age'),
            boundaries=list(np.linspace(n_range['age'][0],
                                        n_range['age'][1], 1000))),
        fc.bucketized_column(
            fc.numeric_column('education_num'),
            boundaries=list(np.linspace(n_range['education_num'][0],
                                        n_range['education_num'][1], 1000))),
        fc.bucketized_column(
            fc.numeric_column('capital_gain'),
            boundaries=list(np.linspace(n_range['capital_gain'][0],
                                        n_range['capital_gain'][1], 1000))),
        fc.bucketized_column(
            fc.numeric_column('capital_loss'),
            boundaries=list(np.linspace(n_range['capital_loss'][0],
                                        n_range['capital_loss'][1], 1000))),
        fc.bucketized_column(
            fc.numeric_column('hours_per_week'),
            boundaries=list(np.linspace(n_range['hours_per_week'][0],
                                        n_range['hours_per_week'][1], 1000))),
        fc.indicator_column(
            fc.categorical_column_with_hash_bucket('gender',
                                                   hash_bucket_size=1000)),
        fc.indicator_column(
            fc.categorical_column_with_hash_bucket('education',
                                                   hash_bucket_size=1000)),
        fc.indicator_column(
            fc.categorical_column_with_hash_bucket('marital_status',
                                                   hash_bucket_size=1000)),
        fc.indicator_column(
            fc.categorical_column_with_hash_bucket('relationship',
                                                   hash_bucket_size=1000)),
        fc.indicator_column(
            fc.categorical_column_with_hash_bucket('workclass',
                                                   hash_bucket_size=1000)),
        fc.indicator_column(
            fc.categorical_column_with_hash_bucket('native_country',
                                                   hash_bucket_size=1000)),
        fc.indicator_column(
            fc.categorical_column_with_hash_bucket('occupation',
                                                   hash_bucket_size=1000))
    ]
    age_buckets = fc.bucketized_column(
        fc.numeric_column("age"),
        boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    cross_columns = [
        fc.indicator_column(
            fc.crossed_column(["education", "occupation"],
                              hash_bucket_size=1000)),
        fc.indicator_column(
            fc.crossed_column(["native_country", "occupation"],
                              hash_bucket_size=1000)),
        fc.indicator_column(
            fc.crossed_column([age_buckets, "education", "occupation"],
                              hash_bucket_size=1000))
    ]
    feature_columns = base_columns + cross_columns
    feat_field_size = len(feature_columns)
    return feature_columns, feat_field_size
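Every column returned above is already dense (bucketized or indicator-wrapped), so the whole set can feed a single DenseFeatures layer; a minimal sketch, with the `features` dict of input tensors assumed:

feature_columns, feat_field_size = build_census_wide_columns()
wide_inputs = tf.keras.layers.DenseFeatures(feature_columns)
# e.g. logits for a wide/linear head, given a `features` dict of tensors:
# logits = tf.keras.layers.Dense(1)(wide_inputs(features))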