def transform(inputs, NUMERIC_COLS, STRING_COLS, nbuckets):
    # Pass-through columns
    transformed = inputs.copy()
    del transformed['pickup_datetime']

    feature_columns = {
        colname: fc.numeric_column(colname)
        for colname in NUMERIC_COLS
    }

    # Scaling longitude from range [-78, -70] to [0, 1]
    for lon_col in ['pickup_longitude', 'dropoff_longitude']:
        transformed[lon_col] = Lambda(
            lambda x: (x + 78) / 8.0,
            name='scale_{}'.format(lon_col))(inputs[lon_col])

    # Scaling latitude from range [37, 45] to [0, 1]
    for lat_col in ['pickup_latitude', 'dropoff_latitude']:
        transformed[lat_col] = Lambda(
            lambda x: (x - 37) / 8.0,
            name='scale_{}'.format(lat_col))(inputs[lat_col])

    # Adding Euclidean dist (no need to be accurate: NN will calibrate it)
    transformed['euclidean'] = Lambda(euclidean, name='euclidean')([
        inputs['pickup_longitude'], inputs['pickup_latitude'],
        inputs['dropoff_longitude'], inputs['dropoff_latitude']
    ])
    feature_columns['euclidean'] = fc.numeric_column('euclidean')

    # Hour of day from timestamp of form '2010-02-08 09:17:00+00:00'
    transformed['hourofday'] = Lambda(
        lambda x: tf.strings.to_number(
            tf.strings.substr(x, 11, 2), out_type=tf.dtypes.int32),
        name='hourofday')(inputs['pickup_datetime'])
    feature_columns['hourofday'] = fc.indicator_column(
        fc.categorical_column_with_identity('hourofday', num_buckets=24))

    latbuckets = np.linspace(0, 1, nbuckets).tolist()
    lonbuckets = np.linspace(0, 1, nbuckets).tolist()
    b_plat = fc.bucketized_column(feature_columns['pickup_latitude'], latbuckets)
    b_dlat = fc.bucketized_column(feature_columns['dropoff_latitude'], latbuckets)
    b_plon = fc.bucketized_column(feature_columns['pickup_longitude'], lonbuckets)
    b_dlon = fc.bucketized_column(feature_columns['dropoff_longitude'], lonbuckets)
    ploc = fc.crossed_column([b_plat, b_plon], nbuckets * nbuckets)
    dloc = fc.crossed_column([b_dlat, b_dlon], nbuckets * nbuckets)
    pd_pair = fc.crossed_column([ploc, dloc], nbuckets ** 4)
    feature_columns['pickup_and_dropoff'] = fc.embedding_column(pd_pair, 100)

    return transformed, feature_columns
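# A minimal usage sketch for the transform() above (not part of the original
# snippet): build_model_sketch, the hidden-layer sizes, and the output name
# 'fare' are assumptions; it only illustrates how the returned (transformed,
# feature_columns) pair feeds a Keras DenseFeatures layer. It assumes
# STRING_COLS includes 'pickup_datetime'.
def build_model_sketch(NUMERIC_COLS, STRING_COLS, nbuckets=10):
    # One Keras Input per raw CSV column, keyed by column name
    inputs = {
        colname: tf.keras.layers.Input(name=colname, shape=(), dtype='float32')
        for colname in NUMERIC_COLS
    }
    inputs.update({
        colname: tf.keras.layers.Input(name=colname, shape=(), dtype='string')
        for colname in STRING_COLS
    })
    transformed, feature_columns = transform(inputs, NUMERIC_COLS, STRING_COLS,
                                             nbuckets)
    # DenseFeatures consumes the transformed tensors via the feature columns
    dnn_inputs = tf.keras.layers.DenseFeatures(feature_columns.values())(transformed)
    h1 = tf.keras.layers.Dense(32, activation='relu')(dnn_inputs)
    output = tf.keras.layers.Dense(1, name='fare')(h1)
    return tf.keras.Model(inputs=list(inputs.values()), outputs=output)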
def test_bucketized_column():
    sample = {
        'price': [[5.], [16], [25], [36]],
        'time': [[2.], [6], [8], [15]]
    }

    price_column = feature_column.numeric_column('price')
    bucket_price = feature_column.bucketized_column(price_column,
                                                    [10, 20, 30, 40])
    price_bucket_tensor = feature_column.input_layer(sample, [bucket_price])

    time_column = feature_column.numeric_column('time')
    bucket_time = feature_column.bucketized_column(time_column, [5, 10, 12])
    time_bucket_tensor = feature_column.input_layer(sample, [bucket_time])

    with tf.Session() as session:
        print(session.run([price_bucket_tensor, time_bucket_tensor]))
def build_features(statistics):
    pu_location_id = fc.categorical_column_with_identity(key='PULocationID',
                                                         num_buckets=265)
    do_location_id = fc.categorical_column_with_identity(key='DOLocationID',
                                                         num_buckets=265)
    day_of_week = fc.categorical_column_with_identity(key='day_of_week',
                                                      num_buckets=7)
    weekend = fc.categorical_column_with_identity(key='weekend', num_buckets=2)
    speed_buckets = fc.bucketized_column(
        fc.numeric_column('speed'), boundaries=[10, 20, 30, 40, 50, 60, 70])
    distance_buckets = fc.bucketized_column(
        fc.numeric_column('trip_distance'),
        boundaries=[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
    duration_buckets = fc.bucketized_column(
        fc.numeric_column('duration'),
        boundaries=[500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500])
    fare_buckets = fc.bucketized_column(
        fc.numeric_column('fare_amount'),
        boundaries=[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
    passenger_buckets = fc.bucketized_column(
        fc.numeric_column('passenger_count'), boundaries=[1, 3, 5, 7, 9])

    location = fc.crossed_column([pu_location_id, do_location_id],
                                 hash_bucket_size=1000)
    cross_all = fc.crossed_column([
        location, speed_buckets, distance_buckets, duration_buckets,
        fare_buckets, passenger_buckets
    ], hash_bucket_size=1000)

    categorical_columns = [
        fc.embedding_column(pu_location_id, dimension=32),
        fc.embedding_column(do_location_id, dimension=32),
        fc.indicator_column(day_of_week),
        fc.indicator_column(weekend)
    ]
    numeric_columns = [
        custom_numeric_column('passenger_count', statistics),
        custom_numeric_column('trip_distance', statistics),
        custom_numeric_column('fare_amount', statistics),
        custom_numeric_column('extra', statistics),
        custom_numeric_column('mta_tax', statistics),
        custom_numeric_column('tolls_amount', statistics),
        custom_numeric_column('improvement_surcharge', statistics),
        custom_numeric_column('duration', statistics),
        custom_numeric_column('speed', statistics)
    ]
    dnn_feature_columns = numeric_columns + categorical_columns
    linear_feature_columns = [location, cross_all]
    return dnn_feature_columns, linear_feature_columns
def get_item_feature_columns(business_vocab_list, item_type_dict):
    items_feature_columns = []
    bucketized_boundary = {'stars': [2.5, 4]}
    embedding_size = {"categories": 8, "city": 4}
    for k, v in business_vocab_list.items():
        if k in ['review_count']:
            col = numeric_column(k, default_value=0, dtype=item_type_dict[k])
        elif k in ['stars']:
            col = bucketized_column(
                numeric_column(k, default_value=0, dtype=item_type_dict[k]),
                bucketized_boundary[k])
        elif k in ['categories', 'city']:
            col = embedding_column(
                categorical_column_with_vocabulary_list(
                    k, sorted(v), default_value=-1, dtype=item_type_dict[k]),
                dimension=embedding_size[k])
        else:
            col = indicator_column(
                categorical_column_with_vocabulary_list(
                    k, sorted(v), default_value=-1, dtype=item_type_dict[k]))
        items_feature_columns.append(col)
    return items_feature_columns
def define_feature_columns(dataframe):
    print("Defining feature columns...")
    feature_columns = []

    # Create embedding column for name IDs
    name_id = feature_column.categorical_column_with_vocabulary_list(
        'nconst', dataframe.nconst.unique())
    # Dimension set to 30 (approximately fourth root of the number of unique name IDs)
    name_id_embedding = feature_column.embedding_column(name_id, dimension=30)
    feature_columns.append(name_id_embedding)

    # Create indicator columns for category and genres
    indicator_column_names = ['category', 'genres']
    for col_name in indicator_column_names:
        categorical_column = feature_column.categorical_column_with_vocabulary_list(
            col_name, dataframe[col_name].unique())
        indicator_column = feature_column.indicator_column(categorical_column)
        feature_columns.append(indicator_column)

    # Create bucketized column for startYear (a.k.a. release date)
    start_year_numeric = feature_column.numeric_column('startYear')
    start_year_bucket = feature_column.bucketized_column(
        start_year_numeric,
        boundaries=[1927, 1940, 1950, 1960, 1970, 1980, 1990, 1995, 2000,
                    2005, 2010, 2015])
    feature_columns.append(start_year_bucket)

    print("Feature columns defined")
    return feature_columns
def bucketized_columns(self, columnsBoundaries):
    for key, value in columnsBoundaries.items():
        col = feature_column.numeric_column(key)
        col_buckets = feature_column.bucketized_column(col, boundaries=value)
        self.sparse_columns[key] = col_buckets
    return col_buckets
def _get_tf_feature_cols(dataframe: pd.DataFrame):
    feature_columns = []

    # numeric cols
    for header in ['PhotoAmt', 'Fee', 'Age']:
        feature_columns.append(feature_column.numeric_column(header))

    # bucketized cols
    age = feature_column.numeric_column('Age')
    age_buckets = feature_column.bucketized_column(
        age, boundaries=[1, 2, 3, 4, 5])
    feature_columns.append(age_buckets)

    # indicator_columns
    indicator_column_names = [
        'Type', 'Color1', 'Color2', 'Gender', 'MaturitySize', 'FurLength',
        'Vaccinated', 'Sterilized', 'Health'
    ]
    for col_name in indicator_column_names:
        categorical_column = feature_column.categorical_column_with_vocabulary_list(
            col_name, dataframe[col_name].unique())
        indicator_column = feature_column.indicator_column(categorical_column)
        feature_columns.append(indicator_column)

    # embedding columns
    breed1 = feature_column.categorical_column_with_vocabulary_list(
        'Breed1', dataframe.Breed1.unique())
    breed1_embedding = feature_column.embedding_column(breed1, dimension=8)
    feature_columns.append(breed1_embedding)

    return feature_columns
def _prepare_for_crossing(self, key_name, num_bck, boundaries):
    """Prepares features for crossing.

    Whether they're continuous or categorical matters, and whether we have
    the whole dictionary or not.

    Args:
      key_name: A string representing the name of the feature
      num_bck: How many buckets to use when we know # of distinct values
      boundaries: Range used for boundaries when bucketizing

    Returns:
      key name
    """
    key = None
    if key_name in self.continuous.keys():
        if boundaries is not None:
            # Note that cont[key_name] is a source column
            key = tfc.bucketized_column(self.continuous[key_name], boundaries)
        else:
            # We can count all the values in the dataset. Ex: boolean.
            # Note that key_name is a string
            key = tfc.categorical_column_with_identity(key_name, num_bck)
    elif key_name in self.categorical.keys():
        # It is also possible to use the categorical column instead of the
        # column name. i.e key = cat[key_name]
        key = key_name
    else:
        key = key_name
    return key
def _build_census_wide_columns(numeric_range=None):
    base_columns, cross_columns = [], []
    for col in ALI_DISPLAY_ADS_CONFIG['wide_muti_hot_cols']:
        base_columns.append(
            fc.indicator_column(
                fc.categorical_column_with_hash_bucket(
                    col,
                    hash_bucket_size=1000
                    if ALI_DISPLAY_ADS_CONFIG['vocab_size'][col] <= 1000
                    else ALI_DISPLAY_ADS_CONFIG['vocab_size'][col] + 10000)))
    for col in ALI_DISPLAY_ADS_CONFIG['wide_bucket_cols']:
        base_columns.append(
            fc.bucketized_column(
                fc.numeric_column(col),
                boundaries=list(
                    np.linspace(numeric_range[col][0], numeric_range[col][1],
                                1000))))
    for col in ALI_DISPLAY_ADS_CONFIG['wide_cross_cols']:
        cross_columns.append(
            fc.indicator_column(
                fc.crossed_column([col[0], col[1]], hash_bucket_size=10000)))
    feature_columns = base_columns + cross_columns
    feat_field_size = len(feature_columns)
    return feature_columns, feat_field_size
def create_feature_columns(note_emb_size=10, note_user_emb_size=6):
    # First create the categorical columns
    creator_ids = fc.categorical_column_with_hash_bucket("last_note_creators",
                                                         hash_bucket_size=2000,
                                                         dtype=tf.string)
    note_ids = fc.categorical_column_with_hash_bucket("last_note_ids", 20000,
                                                      dtype=tf.int64)
    creator_id = fc.categorical_column_with_hash_bucket("note_open_id", 2000)
    note_id = fc.categorical_column_with_hash_bucket("note_id", 20000,
                                                     dtype=tf.int64)

    video_duration = fc.numeric_column("note_video_duration")
    video_duration_bucket = fc.bucketized_column(source_column=video_duration,
                                                 boundaries=[5, 10, 30, 60])

    note_emb = fc.shared_embedding_columns([note_ids, note_id], note_emb_size,
                                           combiner='sum')
    creator_emb = fc.shared_embedding_columns([creator_ids, creator_id],
                                              note_user_emb_size,
                                              combiner='sum')

    my_feature_columns = note_emb + creator_emb + [video_duration_bucket]
    print("*" * 100)
    print("feature columns:")
    for i in my_feature_columns:
        print(i)
    print("*" * 100)
    return my_feature_columns
def _add_bucketed_columns(columns, features, feature_table, vocabulary):
    for f in features:
        assert f in feature_table
        # If this is a fixed-length list feature
        if feature_table[f].feature_spec.is_list and feature_table[
                f].feature_spec.fixed:
            size = feature_table[f].feature_spec.size
            if feature_table[f].feature_spec.dtype == "int":
                numeric_col = fc.numeric_column(f, shape=(size, ),
                                                dtype=tf.int64,
                                                default_value=0)
            else:
                numeric_col = fc.numeric_column(f, shape=(size, ),
                                                default_value=0)
        # If this is not a list feature
        else:
            if feature_table[f].feature_spec.dtype == "int":
                numeric_col = fc.numeric_column(f, dtype=tf.int64,
                                                default_value=0)
            else:
                numeric_col = fc.numeric_column(f, default_value=0)
        bucketed_col = fc.bucketized_column(numeric_col,
                                            boundaries=BUCKET_BOUNDARIES[f])
        embedding_col = fc.embedding_column(bucketed_col,
                                            feature_table[f].emb_width,
                                            combiner='sqrtn')
        columns.append(embedding_col)
def test_bucketized_column():
    price = {'price': [[5.], [15.], [25.], [35.]]}  # 4 sample rows, shape = [4, 1]
    price_column = feature_column.numeric_column('price')
    bucket_price = feature_column.bucketized_column(price_column,
                                                    [10, 20, 30, 40])
    price_bucket_tensor = feature_column.input_layer(price, [bucket_price])
    with tf.Session() as session:
        print(session.run([price_bucket_tensor]))
def pratise():
    d = {'x': [[32], [16], [38], [98]]}
    cd = feature_column.numeric_column('x')
    bcd = feature_column.bucketized_column(cd, [10, 20, 40, 60])
    fcd = feature_column.input_layer(d, [bcd])
    with tf.Session() as sess:
        print(sess.run(fcd))
def data_preprocessing(self):
    """
    batch_size = 5  # A small batch size is used for this example.
    train_ds = self.df_to_dataset(self.train, batch_size=batch_size)
    val_ds = self.df_to_dataset(self.val, shuffle=False, batch_size=batch_size)
    test_ds = self.df_to_dataset(self.test, shuffle=False, batch_size=batch_size)

    for feature_batch, label_batch in train_ds.take(1):
        print('All features:', list(feature_batch.keys()))
        print('A batch of the age feature:', feature_batch['age'])
        print('A batch of targets:', label_batch)

    # Create a sample batch to try out the feature columns.
    self.example_batch = next(iter(train_ds))[0]
    age = feature_column.numeric_column("age")
    self.demo(age)
    """
    feature_columns = []

    # Numeric columns
    for header in [
            'age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca'
    ]:
        feature_columns.append(feature_column.numeric_column(header))

    # Bucketized column
    age = feature_column.numeric_column("age")
    age_buckets = feature_column.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    feature_columns.append(age_buckets)

    # Categorical column
    thal = feature_column.categorical_column_with_vocabulary_list(
        'thal', ['fixed', 'normal', 'reversible'])
    thal_one_hot = feature_column.indicator_column(thal)
    feature_columns.append(thal_one_hot)

    # Embedding column
    thal_embedding = feature_column.embedding_column(thal, dimension=8)
    feature_columns.append(thal_embedding)

    # Crossed feature column
    crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                    hash_bucket_size=1000)
    crossed_feature = feature_column.indicator_column(crossed_feature)
    feature_columns.append(crossed_feature)

    self.feature_layer = layers.DenseFeatures(feature_columns)

    batch_size = 32
    self.train_ds = self.df_to_dataset(self.train, batch_size=batch_size)
    self.val_ds = self.df_to_dataset(self.val, shuffle=False,
                                     batch_size=batch_size)
    self.test_ds = self.df_to_dataset(self.test, shuffle=False,
                                      batch_size=batch_size)
def create_feature_layer(df):
    week = feature_column.numeric_column("Week")
    boundaries = []
    for i in range(1, 53):
        boundaries.append(i)
    week = feature_column.bucketized_column(week, boundaries=boundaries)

    day = feature_column.numeric_column("Day")
    boundaries = []
    for i in range(1, 8):
        boundaries.append(i)
    day = feature_column.bucketized_column(day, boundaries=boundaries)

    year = feature_column.numeric_column("Year")
    boundaries = []
    for i in range(2013, 2017):
        boundaries.append(i)
    year = feature_column.bucketized_column(year, boundaries=boundaries)

    hour = feature_column.numeric_column("std_hour")
    boundaries = []
    for i in range(0, 24):
        boundaries.append(i)
    hour = feature_column.bucketized_column(hour, boundaries=boundaries)

    arrival = feature_column.categorical_column_with_vocabulary_list(
        "Arrival", vocabulary_list=pd.Series.unique(df.Arrival).tolist())
    airline = feature_column.categorical_column_with_vocabulary_list(
        "Airline", vocabulary_list=pd.Series.unique(df.Airline).tolist())
    flight_no = feature_column.categorical_column_with_vocabulary_list(
        "flight_no", vocabulary_list=pd.Series.unique(df.flight_no).tolist())

    arrival_one_hot = feature_column.indicator_column(arrival)
    airline_one_hot = feature_column.indicator_column(airline)
    flight_no_one_hot = feature_column.indicator_column(flight_no)

    arrival_length = len(pd.Series.unique(df.Arrival).tolist())
    arrival_and_week = feature_column.crossed_column(
        [arrival, week], hash_bucket_size=(arrival_length * 52))
    arrival_and_week = feature_column.indicator_column(arrival_and_week)

    airline_length = len(pd.Series.unique(df.Airline).tolist())
    year_and_airline = feature_column.crossed_column(
        [year, airline], hash_bucket_size=(airline_length * 4))
    year_and_airline = feature_column.indicator_column(year_and_airline)

    feature_columns = []
    feature_columns = feature_columns + [
        week, arrival_one_hot, airline_one_hot, flight_no_one_hot, hour,
        arrival_and_week, year, year_and_airline
    ]

    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
    return feature_layer
def test_bucketized_column():
    # 1. Input features
    price = {'price': [[15.], [5.], [35.], [25.]]}
    # 2. Feature columns (Dense)
    price_column = feature_column.numeric_column('price')
    # 2. Feature columns (Dense): bucketized_column is both Dense and
    # Categorical
    bucket_price = feature_column.bucketized_column(price_column, [10, 20, 30])
    # 3. Feature tensor
    price_bucket_tensor = feature_column.input_layer(price, [bucket_price])
    with tf.Session() as session:
        print(session.run([price_bucket_tensor]))
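# For reference (a sketch added here, not part of the original test):
# boundaries [10, 20, 30] generate four buckets (-inf, 10), [10, 20),
# [20, 30), [30, +inf), each one-hot encoded per row, so the inputs
# [[15.], [5.], [35.], [25.]] are expected to print
# [array([[0., 1., 0., 0.],
#         [1., 0., 0., 0.],
#         [0., 0., 0., 1.],
#         [0., 0., 1., 0.]], dtype=float32)]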
def get_feature_columns(dataframe):
    """Creates feature columns from pd.DataFrame."""
    feature_columns = []
    feature_layer_inputs = {}

    # numeric cols
    for col_name in ['PhotoAmt', 'Fee', 'Age']:
        feature_columns.append(feature_column.numeric_column(col_name))
        feature_layer_inputs[col_name] = tf.keras.Input(shape=(1, ),
                                                        name=col_name)

    # bucketized cols
    age = feature_column.numeric_column('Age')
    age_buckets = feature_column.bucketized_column(age,
                                                   boundaries=[1, 2, 3, 4, 5])
    feature_columns.append(age_buckets)

    # indicator_columns
    indicator_column_names = [
        'Type', 'Color1', 'Color2', 'Gender', 'MaturitySize', 'FurLength',
        'Vaccinated', 'Sterilized', 'Health'
    ]
    for col_name in indicator_column_names:
        categorical_column = feature_column.categorical_column_with_vocabulary_list(
            col_name, dataframe[col_name].unique())
        indicator_column = feature_column.indicator_column(categorical_column)
        feature_columns.append(indicator_column)
        feature_layer_inputs[col_name] = tf.keras.Input(shape=(1, ),
                                                        name=col_name,
                                                        dtype=tf.string)

    # embedding columns
    breed1 = feature_column.categorical_column_with_vocabulary_list(
        'Breed1', dataframe.Breed1.unique())
    breed1_embedding = feature_column.embedding_column(breed1, dimension=16)
    feature_columns.append(breed1_embedding)
    feature_layer_inputs['Breed1'] = tf.keras.Input(shape=(1, ),
                                                    name='Breed1',
                                                    dtype=tf.string)

    # crossed columns
    animal_type = feature_column.categorical_column_with_vocabulary_list(
        'Type', ['Cat', 'Dog'])
    feature_columns.append(feature_column.indicator_column(animal_type))
    age_type_feature = feature_column.crossed_column(
        [age_buckets, animal_type], hash_bucket_size=100)
    feature_columns.append(feature_column.indicator_column(age_type_feature))
    feature_layer_inputs['Type'] = tf.keras.Input(shape=(1, ),
                                                  name='Type',
                                                  dtype=tf.string)

    return feature_columns, feature_layer_inputs
def test_elasticsearch_io_dataset_training():
    """Test the functionality of the ElasticsearchIODataset by training a
    tf.keras model on the structured data.
    """
    BATCH_SIZE = 2
    dataset = tfio.experimental.elasticsearch.ElasticsearchIODataset(
        nodes=[NODE], index=INDEX, doc_type=DOC_TYPE, headers=HEADERS)
    dataset = dataset.map(lambda v: (v, v.pop("survived")))
    dataset = dataset.batch(BATCH_SIZE)

    assert issubclass(type(dataset), tf.data.Dataset)

    feature_columns = []

    # Numeric column
    fare_column = feature_column.numeric_column("fare")
    feature_columns.append(fare_column)

    # Bucketized column
    age = feature_column.numeric_column("age")
    age_buckets = feature_column.bucketized_column(age, boundaries=[10, 30])
    feature_columns.append(age_buckets)

    # Categorical column
    gender = feature_column.categorical_column_with_vocabulary_list(
        "gender", ["Male", "Female"])
    gender_indicator = feature_column.indicator_column(gender)
    feature_columns.append(gender_indicator)

    # Convert the feature columns into a tf.keras layer
    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

    # Build the model
    model = tf.keras.Sequential([
        feature_layer,
        layers.Dense(128, activation="relu"),
        layers.Dense(128, activation="relu"),
        layers.Dropout(0.1),
        layers.Dense(1),
    ])

    # Compile the model
    model.compile(
        optimizer="adam",
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=["accuracy"],
    )

    # train the model
    model.fit(dataset, epochs=5)
def test_train_model():
    """Test the dataset by training a tf.keras model"""
    dataset = tfio.experimental.mongodb.MongoDBIODataset(
        uri=URI, database=DATABASE, collection=COLLECTION)
    dataset = dataset.map(
        lambda x: tfio.experimental.serialization.decode_json(x, specs=SPECS))
    dataset = dataset.map(lambda v: (v, v.pop("survived")))
    dataset = dataset.batch(BATCH_SIZE)

    assert issubclass(type(dataset), tf.data.Dataset)

    feature_columns = []

    # Numeric column
    fare_column = feature_column.numeric_column("fare")
    feature_columns.append(fare_column)

    # Bucketized column
    age = feature_column.numeric_column("age")
    age_buckets = feature_column.bucketized_column(age, boundaries=[10, 30])
    feature_columns.append(age_buckets)

    # Categorical column
    gender = feature_column.categorical_column_with_vocabulary_list(
        "gender", ["Male", "Female"])
    gender_indicator = feature_column.indicator_column(gender)
    feature_columns.append(gender_indicator)

    # Convert the feature columns into a tf.keras layer
    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

    # Build the model
    model = tf.keras.Sequential([
        feature_layer,
        layers.Dense(128, activation="relu"),
        layers.Dense(128, activation="relu"),
        layers.Dropout(0.1),
        layers.Dense(1),
    ])

    # Compile the model
    model.compile(
        optimizer="adam",
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=["accuracy"],
    )

    # train the model
    model.fit(dataset, epochs=5)
def build_model(genres, traits, num_cat=10):
    """
    :param genres: list of genre names
    :param traits: list of personality trait names
    :param num_cat: int, number of categories for classification
    :return:
    """
    # ------------------------------------------------------------------
    # load training set data
    # train, val, test = split_data(training_set)
    # create a small batch for test purposes prior to build
    # train_ds_demo = df_to_dataset(train, batch_size=5)
    # for feature_batch, label_batch in train_ds_demo.take(1):
    #     print('Every feature:', list(feature_batch.keys()))
    #     print('A batch of Openness:', feature_batch['Openness'])
    #     print('A batch of BinProbs:', label_batch)
    # ------------------------------------------------------------------
    # build model features
    # create bucketized personality trait columns
    bounds = list(np.linspace(0, 100, 51))
    trait_buckets = []
    for trait in traits:
        trait_feat = feature_column.numeric_column(trait)
        bucketized_feat = feature_column.bucketized_column(trait_feat,
                                                           boundaries=bounds)
        trait_buckets.append(bucketized_feat)

    # create categorical genre columns
    genre_feat = feature_column.categorical_column_with_vocabulary_list(
        'genreName', genres)
    genre_one_hot = feature_column.indicator_column(genre_feat)

    feature_columns = trait_buckets + [genre_one_hot]
    feature_layer = DenseFeatures(feature_columns)

    model = tf.keras.Sequential([
        feature_layer,
        Dense(128, activation='relu'),
        Dense(128, activation='relu'),
        Dense(num_cat)
    ])
    model = compile_model(model)
    return model
def __init__(self, name, params):
    super(MLP, self).__init__()
    self.model_name = name
    self.params = params
    num_features = [
        feature_column.bucketized_column(
            feature_column.numeric_column(str(i)),
            boundaries=[
                j / (num_bin_size[i] - 1) for j in range(num_bin_size[i] - 1)
            ]) for i in range(8)
    ]
    if name == "MLP_FSIW":
        print("using elapse feature")
        num_features.append(feature_column.numeric_column("elapse"))
    cate_features = [
        feature_column.embedding_column(
            feature_column.categorical_column_with_hash_bucket(
                str(i), hash_bucket_size=cate_bin_size[i - 8]),
            dimension=8) for i in range(8, 17)
    ]

    all_features = num_features + cate_features
    self.feature_layer = tf.keras.layers.DenseFeatures(all_features)

    self.fc1 = layers.Dense(256,
                            activation=tf.nn.leaky_relu,
                            kernel_regularizer=regularizers.l2(params["l2_reg"]))
    self.bn1 = layers.BatchNormalization()
    self.fc2 = layers.Dense(256,
                            activation=tf.nn.leaky_relu,
                            kernel_regularizer=regularizers.l2(params["l2_reg"]))
    self.bn2 = layers.BatchNormalization()
    self.fc3 = layers.Dense(128,
                            activation=tf.nn.leaky_relu,
                            kernel_regularizer=regularizers.l2(params["l2_reg"]))
    self.bn3 = layers.BatchNormalization()

    print("build model {}".format(name))
    if self.model_name == "MLP_EXP_DELAY":
        self.fc4 = layers.Dense(2)
    elif self.model_name == "MLP_tn_dp":
        self.fc4 = layers.Dense(2)
    elif self.model_name in ["MLP_SIG", "MLP_FSIW"]:
        self.fc4 = layers.Dense(1)
    else:
        raise ValueError("model name {} not exist".format(name))
def test_bucketized_column():
    data = {
        'price': [[5.], [15.], [25.], [35.]],
        'price2': [[5.], [15.], [25.], [35.]]
    }  # 4 sample rows
    price_column = feature_column.numeric_column('price')
    price_column2 = feature_column.numeric_column('price2')
    print(price_column)
    bucket_price = feature_column.bucketized_column(price_column,
                                                    [0, 10, 20, 30, 40])
    bucket_price2 = feature_column.bucketized_column(price_column2,
                                                     [0, 10, 20, 30, 40])
    print(bucket_price)
    price_bucket_tensor = feature_column.input_layer(
        data, [bucket_price, bucket_price2])
    print(type(price_bucket_tensor))
    with tf.Session() as session:
        print(session.run([price_bucket_tensor]))
def classify_data(batch_size=5):
    from tensorflow import feature_column
    from tensorflow.keras import layers
    from sklearn.model_selection import train_test_split

    URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
    dataframe = pd.read_csv(URL)
    tr, te = train_test_split(dataframe, test_size=0.2)
    tr, va = train_test_split(tr, test_size=0.2)
    print(len(tr), len(va), len(te))

    def df_to_dataset(dataframe, shuffle=True, batch_size=32):
        # Copy first so the caller's frame is not mutated, then pop the label
        dataframe = dataframe.copy()
        labels = dataframe.pop('target')
        ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
        if shuffle:
            ds = ds.shuffle(buffer_size=len(dataframe))
        # Batch outside the shuffle branch so unshuffled datasets are batched too
        ds = ds.batch(batch_size)
        return ds

    tr_ds = df_to_dataset(tr, batch_size=batch_size)
    va_ds = df_to_dataset(va, shuffle=False, batch_size=batch_size)
    te_ds = df_to_dataset(te, shuffle=False, batch_size=batch_size)

    feature_columns = []
    for header in [
            'age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca'
    ]:
        feature_columns.append(feature_column.numeric_column(header))

    age = feature_column.numeric_column('age')
    age_buckets = feature_column.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    feature_columns.append(age_buckets)

    thal = feature_column.categorical_column_with_vocabulary_list(
        'thal', ['fixed', 'normal', 'reversible'])
    feature_columns.append(feature_column.indicator_column(thal))
    feature_columns.append(feature_column.embedding_column(thal, dimension=8))

    crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                    hash_bucket_size=1000)
    feature_columns.append(feature_column.indicator_column(crossed_feature))

    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
    model = tf.keras.Sequential([
        feature_layer,
        layers.Dense(128, activation='relu'),
        layers.Dense(128, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.fit(tr_ds, validation_data=va_ds, epochs=5)
    loss, accuracy = model.evaluate(te_ds)
    print(accuracy)
def create_features_columns(self):
    # Vector features
    user_vector = fc.numeric_column(key="user_vector", shape=(128, ),
                                    default_value=[0.0] * 128,
                                    dtype=tf.float32)
    item_vector = fc.numeric_column(key="item_vector", shape=(128, ),
                                    default_value=[0.0] * 128,
                                    dtype=tf.float32)

    # Bucketized features
    age = fc.numeric_column(key="age", shape=(1, ), default_value=[0],
                            dtype=tf.int64)
    age = fc.bucketized_column(age,
                               boundaries=[0, 10, 20, 30, 40, 50, 60, 70, 80])
    age = fc.embedding_column(age, dimension=32, combiner='mean')

    # Categorical features
    city = fc.categorical_column_with_identity(key="city", num_buckets=1000,
                                               default_value=0)
    city = fc.embedding_column(city, dimension=32, combiner='mean')

    # Hash features
    device_id = fc.categorical_column_with_hash_bucket(
        key="device_id", hash_bucket_size=1000000, dtype=tf.int64)
    device_id = fc.embedding_column(device_id, dimension=32, combiner='mean')
    item_id = fc.categorical_column_with_hash_bucket(
        key="item_id", hash_bucket_size=10000, dtype=tf.int64)
    item_id = fc.embedding_column(item_id, dimension=32, combiner='mean')

    self.user_columns["user_vector"] = user_vector
    self.user_columns["age"] = age
    self.user_columns["city"] = city
    self.user_columns["device_id"] = device_id
    self.item_columns["item_vector"] = item_vector
    self.item_columns["item_id"] = item_id

    self.feature_spec = tf.feature_column.make_parse_example_spec(
        list(self.user_columns.values()) + list(self.item_columns.values()))

    return self
def build_feature_columns():
    age = feature_column.numeric_column('age')
    age_bucket = feature_column.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    workclass = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list(
            'workclass',
            ['Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov',
             'Local-gov', 'State-gov', 'Without-pay', 'Never-worked']))
    fnlwgt = feature_column.numeric_column('fnlwgt')
    education = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list(
            'education',
            ['Bachelors', 'Some-college', '11th', 'HS-grad', 'Prof-school',
             'Assoc-acdm', 'Assoc-voc', '9th', '7th-8th', '12th', 'Masters',
             '1st-4th', '10th', 'Doctorate', '5th-6th', 'Preschool']))
    education_num = feature_column.numeric_column('education_num')
    marital_status = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list(
            'marital_status',
            ['Married-civ-spouse', 'Divorced', 'Never-married', 'Separated',
             'Widowed', 'Married-spouse-absent', 'Married-AF-spouse']))
    occupation = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list(
            'occupation',
            ['Tech-support', 'Craft-repair', 'Other-service', 'Sales',
             'Exec-managerial', 'Prof-specialty', 'Handlers-cleaners',
             'Machine-op-inspct', 'Adm-clerical', 'Farming-fishing',
             'Transport-moving', 'Priv-house-serv', 'Protective-serv',
             'Armed-Forces']))
    relationship = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list(
            'relationship',
            ['Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative',
             'Unmarried']))
    race = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list(
            'race',
            ['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other',
             'Black']))
    gender = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list(
            'gender', ['Female', 'Male']))
    capital_gain = feature_column.numeric_column('capital_gain')
    capital_loss = feature_column.numeric_column('capital_loss')
    hours_per_week = feature_column.numeric_column('hours_per_week')
    native_country = feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list(
            'native_country',
            ['United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada',
             'Germany', 'Outlying-US(Guam-USVI-etc)', 'India', 'Japan',
             'Greece', 'South', 'China', 'Cuba', 'Iran', 'Honduras',
             'Philippines', 'Italy', 'Poland', 'Jamaica', 'Vietnam', 'Mexico',
             'Portugal', 'Ireland', 'France', 'Dominican-Republic', 'Laos',
             'Ecuador', 'Taiwan', 'Haiti', 'Columbia', 'Hungary', 'Guatemala',
             'Nicaragua', 'Scotland', 'Thailand', 'Yugoslavia', 'El-Salvador',
             'Trinadad&Tobago', 'Peru', 'Hong', 'Holand-Netherlands']))

    race_gender = feature_column.indicator_column(
        feature_column.crossed_column([
            feature_column.categorical_column_with_vocabulary_list(
                'race',
                ['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other',
                 'Black']),
            feature_column.categorical_column_with_vocabulary_list(
                'gender', ['Female', 'Male'])
        ], hash_bucket_size=10))

    wide = [age_bucket, workclass, fnlwgt, education, education_num,
            occupation, relationship, race, gender, capital_gain,
            capital_loss, hours_per_week, native_country, race_gender]
    deep = [age, workclass, fnlwgt, education, education_num, occupation,
            relationship, race, gender, capital_gain, capital_loss,
            hours_per_week, native_country]

    return (wide, deep)
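# A minimal usage sketch for the wide/deep column lists above (the estimator
# setup and hidden-unit sizes are assumptions, not part of the original
# snippet):
wide_columns, deep_columns = build_feature_columns()
estimator = tf.estimator.DNNLinearCombinedClassifier(
    linear_feature_columns=wide_columns,  # wide part: bucketized / crossed columns
    dnn_feature_columns=deep_columns,     # deep part: numeric + indicator columns
    dnn_hidden_units=[128, 64])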
def create_feature_columns(note_emb_size=10, note_user_emb_size=6):
    # First create the categorical columns
    creator_ids = fc.categorical_column_with_hash_bucket("last_note_creators",
                                                         hash_bucket_size=2000,
                                                         dtype=tf.string)
    note_ids = fc.categorical_column_with_hash_bucket("last_note_ids", 20000,
                                                      dtype=tf.int64)
    creator_id = fc.categorical_column_with_hash_bucket("note_open_id", 2000)
    note_id = fc.categorical_column_with_hash_bucket("note_id", 20000,
                                                     dtype=tf.int64)

    video_duration = fc.numeric_column("note_video_duration")
    video_duration_bucket = fc.bucketized_column(source_column=video_duration,
                                                 boundaries=[5, 10, 30, 60])

    note_emb = fc.shared_embedding_columns([note_ids, note_id], note_emb_size,
                                           combiner='sum')
    creator_emb = fc.shared_embedding_columns([creator_ids, creator_id],
                                              note_user_emb_size,
                                              combiner='sum')

    # phoneBrandId = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
    # phoneBrand = fc.embedding_column(phoneBrandId, 20)
    # phoneResolutionId = fc.categorical_column_with_hash_bucket("phoneResolution", 500)
    # phoneResolution = fc.embedding_column(phoneResolutionId, 10)
    # phoneOs = fc.indicator_column(
    #     fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))
    # gender = fc.indicator_column(fc.categorical_column_with_identity("gender", num_buckets=3, default_value=0))
    # city_id = fc.categorical_column_with_hash_bucket("city", 700)
    # city = fc.embedding_column(city_id, 16)
    # hour = fc.indicator_column(fc.categorical_column_with_identity("hour", 24, default_value=0))

    my_feature_columns = note_emb + creator_emb + [video_duration_bucket]
    print("*" * 100)
    print("feature columns:")
    for i in my_feature_columns:
        print(i)
    print("*" * 100)
    return my_feature_columns
def _build_census_deep_columns(emb_dim=8, numeric_range=None):
    feature_columns = []
    for col in ALI_DISPLAY_ADS_CONFIG['deep_emb_cols']:
        feature_columns.append(
            fc.embedding_column(
                fc.categorical_column_with_hash_bucket(
                    col,
                    hash_bucket_size=1000
                    if ALI_DISPLAY_ADS_CONFIG['vocab_size'][col] <= 1000
                    else ALI_DISPLAY_ADS_CONFIG['vocab_size'][col] + 10000),
                dimension=emb_dim))
    for col in ALI_DISPLAY_ADS_CONFIG['deep_bucket_emb_cols']:
        feature_columns.append(
            fc.embedding_column(
                fc.bucketized_column(
                    fc.numeric_column(col),
                    boundaries=list(
                        np.linspace(numeric_range[col][0],
                                    numeric_range[col][1], 1000))),
                dimension=emb_dim))
    feat_field_size = len(feature_columns)
    return feature_columns, feat_field_size
def feature_json_parse():
    feature_json = open('test.json', 'r').read()
    feature_json = demjson.decode(feature_json)
    feature_columns = []
    for feature_line in feature_json['tensorTransform']:
        feature_type_name = feature_line['name']
        feature_para = feature_line['parameters']
        if feature_type_name == 'NumericColumn':
            feature_columns.append(
                feature_column.numeric_column(feature_para['input_tensor']))
        elif feature_type_name == 'BucketizedColumn':
            feature = feature_column.numeric_column(
                feature_para['input_tensor'])
            feature_columns.append(
                feature_column.bucketized_column(
                    feature, boundaries=feature_para['boundaries']))
        else:
            print(feature_type_name)
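# For reference, a minimal 'test.json' this parser would accept (a sketch
# inferred from the keys used above; the real file may carry more fields):
# {
#   "tensorTransform": [
#     {"name": "NumericColumn", "parameters": {"input_tensor": "age"}},
#     {"name": "BucketizedColumn",
#      "parameters": {"input_tensor": "age", "boundaries": [18, 30, 45, 60]}}
#   ]
# }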
def build_model():
    feature_columns = []
    for header in [
            'age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca'
    ]:
        feature_columns.append(feature_column.numeric_column(header))

    age = feature_column.numeric_column("age")
    age_buckets = feature_column.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    feature_columns.append(age_buckets)

    thal = feature_column.categorical_column_with_vocabulary_list(
        'thal', ['fixed', 'normal', 'reversible'])
    thal_one_hot = feature_column.indicator_column(thal)
    feature_columns.append(thal_one_hot)

    thal_embedding = feature_column.embedding_column(thal, dimension=8)
    feature_columns.append(thal_embedding)

    crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                    hash_bucket_size=1000)
    crossed_feature = feature_column.indicator_column(crossed_feature)
    feature_columns.append(crossed_feature)

    feature_layer = keras.layers.DenseFeatures(feature_columns)
    model = tf.keras.Sequential([
        feature_layer,
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'],
                  run_eagerly=True)
    return model
def transform(self, output_tensors):
    input_tensor_name = self.parameters.get("input_tensor")
    output_tensor_name = self.parameters.get("output_tensor")

    if "boundaries" in self.parameters:
        boundaries = self.parameters.get("boundaries")
        if not isinstance(boundaries, list):
            # Parse a string like "np.linspace(0.0,100.0,10.0)" into a boundary list
            boundaries = str(boundaries).replace(' ', '')
            pattern = re.compile(
                r'np.linspace\(([0-9]+\.[0-9]+),([0-9]+\.[0-9]+),([0-9]+\.[0-9]+)\)')
            result = pattern.findall(boundaries)
            boundaries = list(
                np.linspace(float(result[0][0]), float(result[0][1]),
                            int(float(result[0][2]))))
    else:
        msg = "parameters error, bucketized_column requires boundaries"
        logger.error(msg)
        raise ParametersError(msg)

    print("input_tensor_name:", input_tensor_name)
    input_tensor = output_tensors.get(input_tensor_name)
    output_tensors[output_tensor_name] = fc.bucketized_column(
        source_column=input_tensor, boundaries=boundaries)