def get_feature_columns():
    '''Build and return the feature columns.'''
    dnn_feature_columns = list()
    linear_feature_columns = list()
    # DNN features
    user_cate = fc.categorical_column_with_hash_bucket("userid", 40000, tf.int64)
    feed_cate = fc.categorical_column_with_hash_bucket("feedid", 240000, tf.int64)
    author_cate = fc.categorical_column_with_hash_bucket("authorid", 40000, tf.int64)
    bgm_singer_cate = fc.categorical_column_with_hash_bucket("bgm_singer_id", 40000, tf.int64)
    bgm_song_cate = fc.categorical_column_with_hash_bucket("bgm_song_id", 60000, tf.int64)
    user_embedding = fc.embedding_column(user_cate, FLAGS.embed_dim, max_norm=FLAGS.embed_l2)
    feed_embedding = fc.embedding_column(feed_cate, FLAGS.embed_dim, max_norm=FLAGS.embed_l2)
    author_embedding = fc.embedding_column(author_cate, FLAGS.embed_dim, max_norm=FLAGS.embed_l2)
    bgm_singer_embedding = fc.embedding_column(bgm_singer_cate, FLAGS.embed_dim)
    bgm_song_embedding = fc.embedding_column(bgm_song_cate, FLAGS.embed_dim)
    dnn_feature_columns.append(user_embedding)
    dnn_feature_columns.append(feed_embedding)
    dnn_feature_columns.append(author_embedding)
    dnn_feature_columns.append(bgm_singer_embedding)
    dnn_feature_columns.append(bgm_song_embedding)
    # Linear features
    video_seconds = fc.numeric_column("videoplayseconds", default_value=0.0)
    device = fc.numeric_column("device", default_value=0.0)
    linear_feature_columns.append(video_seconds)
    linear_feature_columns.append(device)
    # Behavior-count statistics features
    for b in FEA_COLUMN_LIST:
        feed_b = fc.numeric_column(b + "sum", default_value=0.0)
        linear_feature_columns.append(feed_b)
        user_b = fc.numeric_column(b + "sum_user", default_value=0.0)
        linear_feature_columns.append(user_b)
    return dnn_feature_columns, linear_feature_columns
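# Usage sketch (not from the original source): the two lists above map directly
# onto a wide & deep estimator; `train_input_fn` is a hypothetical input function.
import tensorflow as tf

dnn_cols, linear_cols = get_feature_columns()
estimator = tf.estimator.DNNLinearCombinedClassifier(
    linear_feature_columns=linear_cols,  # wide part: raw numeric statistics
    dnn_feature_columns=dnn_cols,        # deep part: hashed ID embeddings
    dnn_hidden_units=[256, 128, 64])
# estimator.train(input_fn=train_input_fn)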
def create_features_columns(self):
    userID = fc.embedding_column(
        fc.categorical_column_with_hash_bucket(
            key="userID", hash_bucket_size=FLAGS.user_did_size, dtype=tf.int64),
        dimension=FLAGS.embed_size,
        initializer=tf.uniform_unit_scaling_initializer(factor=1e-5, seed=1, dtype=tf.float32))
    itemID = fc.embedding_column(
        fc.categorical_column_with_hash_bucket(
            key="itemID", hash_bucket_size=FLAGS.item_uuid_size, dtype=tf.int64),
        dimension=FLAGS.embed_size,
        initializer=tf.uniform_unit_scaling_initializer(factor=1e-5, seed=1, dtype=tf.float32))
    self.all_columns["userID"] = userID
    self.all_columns["itemID"] = itemID
    self.feature_spec = tf.feature_column.make_parse_example_spec(self.all_columns.values())
    return self
def create_feature_columns():
    # User features
    phoneBrandId = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
    phoneResolutionId = fc.categorical_column_with_hash_bucket("phoneResolution", 500)
    phoneBrand = fc.embedding_column(phoneBrandId, 20)
    phoneResolution = fc.embedding_column(phoneResolutionId, 10)
    phoneOs = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))
    # Context features
    matchScore = fc.numeric_column("matchScore", default_value=0.0)
    popScore = fc.numeric_column("popScore", default_value=0.0)
    brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0, normalizer_fn=truncate)
    cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0, normalizer_fn=truncate)
    catePrefer = fc.numeric_column("catePrefer", default_value=0.0, normalizer_fn=truncate)
    sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0, normalizer_fn=truncate)
    matchType = fc.indicator_column(fc.categorical_column_with_identity("matchType", 9, default_value=0))
    position = fc.indicator_column(fc.categorical_column_with_identity("position", 201, default_value=200))
    triggerNum = fc.indicator_column(fc.categorical_column_with_identity("triggerNum", 51, default_value=50))
    triggerRank = fc.indicator_column(fc.categorical_column_with_identity("triggerRank", 51, default_value=50))
    sceneType = fc.indicator_column(fc.categorical_column_with_identity("type", 2, default_value=0))
    hour = fc.indicator_column(fc.categorical_column_with_identity("hour", 24, default_value=0))
    global my_feature_columns
    my_feature_columns = [matchScore, matchType, position, triggerNum, triggerRank,
                          sceneType, hour, phoneBrand, phoneResolution, phoneOs,
                          popScore, sellerPrefer, brandPrefer, cate2Prefer, catePrefer]
    print("feature columns:", my_feature_columns)
    return my_feature_columns
def make_feature_layer(self):
    feature_cols = []
    for col in self.numeric_column:
        feature_cols.append(feature_column.numeric_column(col))
    for col in self.categorical_column_num:
        unique_count = self.data[col].nunique()
        feat_cols = feature_column.embedding_column(
            feature_column.categorical_column_with_hash_bucket(
                col, hash_bucket_size=int(3 * unique_count)),
            dimension=1)
        feature_cols.append(feat_cols)
    for col in self.categorical_column_text:
        unique_count = self.data[col].nunique()
        feat_cols = feature_column.embedding_column(
            feature_column.categorical_column_with_hash_bucket(
                col, hash_bucket_size=int(3 * unique_count)),
            dimension=1)
        feature_cols.append(feat_cols)
    for col in self.bool_column:
        feat_cols = feature_column.embedding_column(
            feature_column.categorical_column_with_hash_bucket(
                col, hash_bucket_size=3),
            dimension=1)
        feature_cols.append(feat_cols)
    return feature_cols
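# Usage sketch, assuming TF 2.x Keras; `builder` is a hypothetical instance of
# the class that defines make_feature_layer above.
import tensorflow as tf

feature_cols = builder.make_feature_layer()
model = tf.keras.Sequential([
    tf.keras.layers.DenseFeatures(feature_cols),  # dict of raw features -> dense tensor
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])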
def build_ama_ele_columns():
    feature_columns = [
        fc.embedding_column(fc.categorical_column_with_hash_bucket(
            'user_id', hash_bucket_size=200000), dimension=32),
        fc.embedding_column(fc.categorical_column_with_hash_bucket(
            'item_id', hash_bucket_size=1000), dimension=32),
        # fc.embedding_column(fc.categorical_column_with_hash_bucket('seq', hash_bucket_size=200000), dimension=32),
        # fc.embedding_column(fc.categorical_column_with_hash_bucket('seq_cate', hash_bucket_size=200000), dimension=32),
    ]
    feat_field_size = len(feature_columns)
    return feature_columns, feat_field_size
def create_feature_columns(train_data):
    n_users = train_data.user.nunique()
    users = fc.categorical_column_with_vocabulary_list(
        "user", np.arange(n_users), default_value=-1, dtype=tf.int64)
    gender = fc.categorical_column_with_vocabulary_list("gender", ["M", "F"])
    age = fc.categorical_column_with_vocabulary_list(
        "age", [1, 18, 25, 35, 45, 50, 56], dtype=tf.int64)
    occupation = fc.categorical_column_with_vocabulary_list(
        "occupation", np.arange(21), dtype=tf.int64)
    all_feature_cols = [fc.embedding_column(users, 32),
                        fc.indicator_column(gender),
                        fc.embedding_column(age, 32),
                        fc.embedding_column(occupation, 32)]
    return all_feature_cols
def build_model_columns(embedding_size):
    linear_feature_columns = []
    embedding_feature_columns = []
    u_id = feature_column.categorical_column_with_hash_bucket('u_id', 500000, dtype=tf.dtypes.int64)
    u_id_embedded = feature_column.embedding_column(u_id, embedding_size)
    linear_feature_columns.append(feature_column.indicator_column(u_id))
    embedding_feature_columns.append(u_id_embedded)
    i_id = feature_column.categorical_column_with_hash_bucket('i_id', 100000, dtype=tf.dtypes.int64)
    i_id_embedded = feature_column.embedding_column(i_id, embedding_size)
    linear_feature_columns.append(feature_column.indicator_column(i_id))
    embedding_feature_columns.append(i_id_embedded)
    return linear_feature_columns, embedding_feature_columns
def build_features(statistics):
    pu_location_id = fc.categorical_column_with_identity(key='PULocationID', num_buckets=265)
    do_location_id = fc.categorical_column_with_identity(key='DOLocationID', num_buckets=265)
    day_of_week = fc.categorical_column_with_identity(key='day_of_week', num_buckets=7)
    weekend = fc.categorical_column_with_identity(key='weekend', num_buckets=2)
    speed_buckets = fc.bucketized_column(
        fc.numeric_column('speed'), boundaries=[10, 20, 30, 40, 50, 60, 70])
    distance_buckets = fc.bucketized_column(
        fc.numeric_column('trip_distance'),
        boundaries=[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
    duration_buckets = fc.bucketized_column(
        fc.numeric_column('duration'),
        boundaries=[500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500])
    fare_buckets = fc.bucketized_column(
        fc.numeric_column('fare_amount'),
        boundaries=[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
    passenger_buckets = fc.bucketized_column(
        fc.numeric_column('passenger_count'), boundaries=[1, 3, 5, 7, 9])
    location = fc.crossed_column([pu_location_id, do_location_id], hash_bucket_size=1000)
    cross_all = fc.crossed_column([
        location, speed_buckets, distance_buckets, duration_buckets,
        fare_buckets, passenger_buckets
    ], hash_bucket_size=1000)
    categorical_columns = [
        fc.embedding_column(pu_location_id, dimension=32),
        fc.embedding_column(do_location_id, dimension=32),
        fc.indicator_column(day_of_week),
        fc.indicator_column(weekend)
    ]
    numeric_columns = [
        custom_numeric_column('passenger_count', statistics),
        custom_numeric_column('trip_distance', statistics),
        custom_numeric_column('fare_amount', statistics),
        custom_numeric_column('extra', statistics),
        custom_numeric_column('mta_tax', statistics),
        custom_numeric_column('tolls_amount', statistics),
        custom_numeric_column('improvement_surcharge', statistics),
        custom_numeric_column('duration', statistics),
        custom_numeric_column('speed', statistics)
    ]
    dnn_feature_columns = numeric_columns + categorical_columns
    linear_feature_columns = [location, cross_all]
    return dnn_feature_columns, linear_feature_columns
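# `custom_numeric_column` is not defined in this snippet. A plausible
# implementation (an assumption, not the original): z-score normalization
# using per-column mean/std pulled from the `statistics` argument.
def custom_numeric_column(key, statistics):
    mean = statistics[key]['mean']  # hypothetical layout of `statistics`
    std = statistics[key]['std']
    return fc.numeric_column(
        key, default_value=0.0,
        normalizer_fn=lambda x: (x - mean) / std)  # standardize the raw value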
def create_features_columns(self):
    # Dense vector features
    user_vector = fc.numeric_column(key="user_vector", shape=(128,),
                                    default_value=[0.0] * 128, dtype=tf.float32)
    item_vector = fc.numeric_column(key="item_vector", shape=(128,),
                                    default_value=[0.0] * 128, dtype=tf.float32)
    # Bucketized features
    age = fc.numeric_column(key="age", shape=(1,), default_value=[0], dtype=tf.int64)
    age = fc.bucketized_column(age, boundaries=[0, 10, 20, 30, 40, 50, 60, 70, 80])
    age = fc.embedding_column(age, dimension=32, combiner='mean')
    # Categorical features
    city = fc.categorical_column_with_identity(key="city", num_buckets=1000, default_value=0)
    city = fc.embedding_column(city, dimension=32, combiner='mean')
    # Hashed features
    device_id = fc.categorical_column_with_hash_bucket(
        key="device_id", hash_bucket_size=1000000, dtype=tf.int64)
    device_id = fc.embedding_column(device_id, dimension=32, combiner='mean')
    item_id = fc.categorical_column_with_hash_bucket(
        key="item_id", hash_bucket_size=10000, dtype=tf.int64)
    item_id = fc.embedding_column(item_id, dimension=32, combiner='mean')
    self.user_columns["user_vector"] = user_vector
    self.user_columns["age"] = age
    self.user_columns["city"] = city
    self.user_columns["device_id"] = device_id
    self.item_columns["item_vector"] = item_vector
    self.item_columns["item_id"] = item_id
    self.feature_spec = tf.feature_column.make_parse_example_spec(
        list(self.user_columns.values()) + list(self.item_columns.values()))
    return self
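# Usage sketch (an assumption, not from the source): the `feature_spec` built
# above can parse serialized tf.Example protos, e.g. in a serving input_fn.
# `model` stands in for a hypothetical instance of the enclosing class.
serialized = tf.placeholder(tf.string, shape=[None], name="examples")
parsed = tf.io.parse_example(serialized, model.feature_spec)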
def define_feature_columns(dataframe):
    print("Defining feature columns...")
    feature_columns = []

    # Create an embedding column for name IDs.
    name_id = feature_column.categorical_column_with_vocabulary_list(
        'nconst', dataframe.nconst.unique())
    # Dimension set to 30 (approximately the fourth root of the number of unique name IDs).
    name_id_embedding = feature_column.embedding_column(name_id, dimension=30)
    feature_columns.append(name_id_embedding)

    # Create indicator columns for category and genres.
    indicator_column_names = ['category', 'genres']
    for col_name in indicator_column_names:
        categorical_column = feature_column.categorical_column_with_vocabulary_list(
            col_name, dataframe[col_name].unique())
        indicator_column = feature_column.indicator_column(categorical_column)
        feature_columns.append(indicator_column)

    # Create a bucketized column for startYear (a.k.a. release date).
    start_year_numeric = feature_column.numeric_column('startYear')
    start_year_bucket = feature_column.bucketized_column(
        start_year_numeric,
        boundaries=[1927, 1940, 1950, 1960, 1970, 1980, 1990, 1995, 2000, 2005, 2010, 2015])
    feature_columns.append(start_year_bucket)

    print("Feature columns defined")
    return feature_columns
def test_multi_value_embedding():
    color_data = {
        'color': [['G', 'G'], ['G', 'B'], ['B', 'B'], ['G', 'R'], ['R', 'R'], ['B', 'R']]
    }
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    color_embedding = feature_column.embedding_column(color_column, 7)
    color_embedding_dense_tensor = feature_column.input_layer(color_data, [color_embedding])

    builder = _LazyBuilder(color_data)
    color_column_tensor = color_column._get_sparse_tensors(builder)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('embedding' + '-' * 40)
        print(session.run([color_embedding_dense_tensor]))
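# The test above relies on TF 1.x internals (_LazyBuilder, input_layer,
# sessions). A rough TF 2.x equivalent of the same multi-value embedding
# lookup, eager by default (a sketch, not part of the original source):
import tensorflow as tf
from tensorflow import feature_column

data = {'color': [['G', 'G'], ['G', 'B'], ['B', 'B']]}
col = feature_column.categorical_column_with_vocabulary_list(
    'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
emb = feature_column.embedding_column(col, 7)
dense = tf.keras.layers.DenseFeatures([emb])(data)
print(dense)  # shape (3, 7): mean of the two token embeddings per row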
def transform(self, output_tensors):
    input_tensor_name = self.parameters.get("input_tensor")
    output_tensor_name = self.parameters.get("output_tensor")
    if "dimension" in self.parameters:
        dimension = self.parameters.get("dimension")
    else:
        msg = "parameters error: embedding_column requires a dimension"
        logger.error(msg)
        raise ParametersError(msg)
    input_tensor = output_tensors.get(input_tensor_name)
    ckpt_to_load_from = None
    tensor_name_in_ckpt = None
    if "ckpt_to_load_from" in self.parameters and "tensor_name_in_ckpt" in self.parameters:
        ckpt_to_load_from = self.parameters.get("ckpt_to_load_from")
        tensor_name_in_ckpt = self.parameters.get("tensor_name_in_ckpt")
    combiner = self.parameters.get("combiner", "mean")
    output_tensor = fc.embedding_column(
        categorical_column=input_tensor,
        dimension=dimension,
        combiner=combiner,
        ckpt_to_load_from=ckpt_to_load_from,
        tensor_name_in_ckpt=tensor_name_in_ckpt)
    output_tensors[output_tensor_name] = output_tensor
def fc_transform(feature_name, hash_bucket_size, dtype=tf.string):
    f = feature_column.categorical_column_with_hash_bucket(
        feature_name, hash_bucket_size=hash_bucket_size, dtype=dtype)
    f1 = feature_column.embedding_column(f, 4)
    # The embedding is trainable, so its variables must be initialized in main().
    # DenseFeatures turns feature_column objects into a dense tensor; every
    # feature_column must go through this step.
    feature_layer = tf.keras.layers.DenseFeatures([f1])  # subclasses Layer, so call() is provided
    return feature_layer
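# Short usage sketch for fc_transform, assuming TF 2.x eager mode;
# 'tag' is a hypothetical feature name.
layer = fc_transform('tag', hash_bucket_size=100)
dense = layer({'tag': tf.constant([['news'], ['sports']])})
print(dense.shape)  # (2, 4): one 4-d embedding per example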
def _get_tf_feature_cols(dataframe: pd.DataFrame):
    feature_columns = []

    # numeric cols
    for header in ['PhotoAmt', 'Fee', 'Age']:
        feature_columns.append(feature_column.numeric_column(header))

    # bucketized cols
    age = feature_column.numeric_column('Age')
    age_buckets = feature_column.bucketized_column(age, boundaries=[1, 2, 3, 4, 5])
    feature_columns.append(age_buckets)

    # indicator columns
    indicator_column_names = [
        'Type', 'Color1', 'Color2', 'Gender', 'MaturitySize',
        'FurLength', 'Vaccinated', 'Sterilized', 'Health'
    ]
    for col_name in indicator_column_names:
        categorical_column = feature_column.categorical_column_with_vocabulary_list(
            col_name, dataframe[col_name].unique())
        indicator_column = feature_column.indicator_column(categorical_column)
        feature_columns.append(indicator_column)

    # embedding columns
    breed1 = feature_column.categorical_column_with_vocabulary_list(
        'Breed1', dataframe.Breed1.unique())
    breed1_embedding = feature_column.embedding_column(breed1, dimension=8)
    feature_columns.append(breed1_embedding)

    return feature_columns
def _add_weighted_embedding_columns(columns, features, feature_table, vocabulary):
    for f in features:
        assert f in feature_table
        weighted_column = fc.weighted_categorical_column(
            fc.categorical_column_with_vocabulary_list(f, vocabulary.vocab[f]),
            f + _WEIGHTED_SUFFIX)
        emb_weighted_column = fc.embedding_column(
            weighted_column, feature_table[f].emb_width, combiner='sqrtn')
        columns.append(emb_weighted_column)
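# For reference, a weighted categorical column expects a parallel weight
# feature in the input dict; a minimal sketch with hypothetical feature
# names, assuming _WEIGHTED_SUFFIX == '_weight':
features = {
    'genre': [['action', 'comedy']],
    'genre_weight': [[0.8, 0.2]],  # per-token weights, same shape as 'genre'
}
col = fc.weighted_categorical_column(
    fc.categorical_column_with_vocabulary_list('genre', ['action', 'comedy']),
    'genre_weight')
emb = fc.embedding_column(col, 8, combiner='sqrtn')  # weighted sum / sqrt(sum of squared weights)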
def _add_bucketed_columns(columns, features, feature_table, vocabulary):
    for f in features:
        assert f in feature_table
        # Fixed-length list features
        if feature_table[f].feature_spec.is_list and feature_table[f].feature_spec.fixed:
            size = feature_table[f].feature_spec.size
            if feature_table[f].feature_spec.dtype == "int":
                numeric_col = fc.numeric_column(f, shape=(size,), dtype=tf.int64, default_value=0)
            else:
                numeric_col = fc.numeric_column(f, shape=(size,), default_value=0)
        # Scalar (non-list) features
        else:
            if feature_table[f].feature_spec.dtype == "int":
                numeric_col = fc.numeric_column(f, dtype=tf.int64, default_value=0)
            else:
                numeric_col = fc.numeric_column(f, default_value=0)
        bucketed_col = fc.bucketized_column(numeric_col, boundaries=BUCKET_BOUNDARIES[f])
        embedding_col = fc.embedding_column(bucketed_col, feature_table[f].emb_width, combiner='sqrtn')
        columns.append(embedding_col)
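# `BUCKET_BOUNDARIES` is a module-level dict not shown here; a hypothetical
# layout keyed by feature name (the actual boundaries are unknown):
BUCKET_BOUNDARIES = {
    'age': [18, 25, 35, 45, 55, 65],
    'watch_time': [10, 60, 300, 1800],
}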
def test_embedding():
    tf.set_random_seed(1)
    # Source data: 4 example rows.
    color_data = {'color': [['R', 'G'], ['G', 'A'], ['B', 'B'], ['A', 'A']]}
    builder = _LazyBuilder(color_data)
    # To embed a feature, first express the source column as a
    # categorical_column; this only declares the column, no data yet.
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    # Bind the data source, expressing it as a (sparse) tensor.
    color_column_tensor = color_column._get_sparse_tensors(builder)
    # Build the embedding_column: first argument is the categorical_column,
    # second is the embedding dimension.
    color_embedding_column = feature_column.embedding_column(color_column, 4, combiner='sum')
    # input_layer(data, columns) connects the data source to the
    # embedding_column and yields a dense tensor.
    color_embedding_dense_tensor = feature_column.input_layer(color_data, [color_embedding_column])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))
        print('embedding' + '_' * 40)
        print(session.run([color_embedding_dense_tensor]))
def input_feature_columns(ngram_vocab, ngram_dimension, ngram_oov=1, ngram_combiner='sum'):
    ngram_categorical_column = sequence_categorical_column_with_vocabulary_list(
        key='ngrams',
        vocabulary_list=ngram_vocab,
        dtype=tf.string,
        num_oov_buckets=ngram_oov,
    )
    ngram_embedding_column = core_columns.embedding_column(
        categorical_column=ngram_categorical_column,
        dimension=ngram_dimension,
        combiner=ngram_combiner,
    )
    return [
        ngram_embedding_column,
        contrib_columns.sequence_numeric_column('word_length'),
        contrib_columns.sequence_numeric_column('is_no_case'),
        contrib_columns.sequence_numeric_column('is_lower_case'),
        contrib_columns.sequence_numeric_column('is_upper_case'),
        contrib_columns.sequence_numeric_column('is_title_case'),
        contrib_columns.sequence_numeric_column('is_mixed_case'),
    ]
def fc_transform(feature_name, hash_bucket_size, dtype=tf.string):
    f = feature_column.categorical_column_with_hash_bucket(
        feature_name, hash_bucket_size=hash_bucket_size, dtype=dtype)
    f1 = feature_column.embedding_column(f, 4)
    feature_layer = tf.keras.layers.DenseFeatures([f1])  # turns feature_column objects into a tensor
    return feature_layer
def test_embedding():
    tf.set_random_seed(1)
    # 1. Input features
    color_data = {'color': [['R', 'G'], ['G', 'A'], ['B', 'B'], ['A', 'A']]}
    builder = _LazyBuilder(color_data)
    # 2. Feature columns (sparse)
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    color_column_tensor = color_column._get_sparse_tensors(builder)
    with tf.Session() as session:
        # session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))
    # 2. Feature columns (dense)
    color_embedding = feature_column.embedding_column(color_column, 4, combiner='sum')
    # 3. Feature tensor
    color_embedding_dense_tensor = feature_column.input_layer(color_data, [color_embedding])
    with tf.Session() as session:
        # Embedding needs variables (weights) to do the lookup.
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('embedding' + '_' * 40)
        print(session.run([color_embedding_dense_tensor]))
def get_item_feature_columns(business_vocab_list, item_type_dict):
    items_feature_columns = []
    bucketized_boundary = {'stars': [2.5, 4]}
    embedding_size = {"categories": 8, "city": 4}
    for k, v in business_vocab_list.items():
        if k in ['review_count']:
            col = numeric_column(k, default_value=0, dtype=item_type_dict[k])
        elif k in ['stars']:
            col = bucketized_column(
                numeric_column(k, default_value=0, dtype=item_type_dict[k]),
                bucketized_boundary[k])
        elif k in ['categories', 'city']:
            col = embedding_column(
                categorical_column_with_vocabulary_list(
                    k, sorted(v), default_value=-1, dtype=item_type_dict[k]),
                dimension=embedding_size[k])
        else:
            col = indicator_column(
                categorical_column_with_vocabulary_list(
                    k, sorted(v), default_value=-1, dtype=item_type_dict[k]))
        items_feature_columns.append(col)
    return items_feature_columns
def _make_crossed(self):
    """Makes crossed features for both the Wide and the Deep network.

    Returns:
        Tuple (crossed columns for Wide, crossed embedding columns for Deep).
    """
    f_crossed_for_wide = []
    f_crossed_for_deep = []
    for to_cross in self.CROSSED:
        keys = []
        bck_size = 1
        for (key, bck, bnd) in to_cross:
            keys.append(self._prepare_for_crossing(key, bck, bnd))
            bck_size *= bck
        # We can't go crazy on the dimension for crossed_column, so cap it at 10000.
        # **0.25 is a rule of thumb for bucket size vs. embedding dimension.
        t_crossed = tfc.crossed_column(keys, min(bck_size, 10000))
        t_dimension = int(bck_size**0.25)
        f_crossed_for_wide.append(t_crossed)
        f_crossed_for_deep.append(tfc.embedding_column(t_crossed, t_dimension))
    return f_crossed_for_wide, f_crossed_for_deep
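# `_prepare_for_crossing` is not shown in this snippet. A plausible version
# (an assumption, not the original): bucketize numeric keys with the
# boundaries `bnd`, otherwise hash raw categorical keys into `bck` buckets.
def _prepare_for_crossing(self, key, bck, bnd):
    if bnd:  # numeric feature: bucketize with the provided boundaries
        return tfc.bucketized_column(tfc.numeric_column(key), boundaries=bnd)
    return tfc.categorical_column_with_hash_bucket(key, bck)  # categorical feature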
def hash_embedding(self, hash_bucket, embedding_dim, name):
    cate_feature = feature_column.categorical_column_with_hash_bucket(
        name, hash_bucket, dtype=tf.string)
    emb_col = feature_column.embedding_column(
        cate_feature, dimension=embedding_dim, combiner='mean')
    ind_col = feature_column.indicator_column(cate_feature)
    return emb_col, ind_col
def test_weighted_categorical_feature_embedding():
    # 6 example rows.
    color_data = {
        'color': [['R', 'R'], ['G', 'G'], ['B', 'B'], ['G', 'R'], ['G', 'B'], ['B', 'R']],
        'weight': [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.3, 0.2], [0.4, 0.3], [0.4, 0.6]]
    }
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    color_embedding = feature_column.embedding_column(color_column, 7, combiner="sum")
    color_embedding_dense_tensor = feature_column.input_layer(color_data, [color_embedding])

    color_weight_categorical_column = feature_column.weighted_categorical_column(
        color_column, 'weight')
    color_embedding_weighted = feature_column.embedding_column(
        color_weight_categorical_column, 7, combiner="sum")
    color_embedding_dense_tensor_2 = feature_column.input_layer(
        color_data, [color_embedding_weighted])

    builder = _LazyBuilder(color_data)
    color_column_tensor = color_column._get_sparse_tensors(builder)
    # _get_sparse_tensors returns a pair (id_tensor, weight_tensor).
    color_weighted_tensor = color_weight_categorical_column._get_sparse_tensors(builder)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))
        print("color column weight:")
        print(color_column_tensor.weight_tensor)
        print("color column weighted categorical, weight:")
        print(session.run([color_weighted_tensor.id_tensor]))
        print(session.run([color_weighted_tensor.weight_tensor]))

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('embedding' + '-' * 40)
        print(session.run([color_embedding_dense_tensor]))
        print('embedding weighted categorical column')
        print(session.run([color_embedding_dense_tensor_2]))
def data_preprocessing(self):
    """
    batch_size = 5  # Use a small batch size for this example.
    train_ds = self.df_to_dataset(self.train, batch_size=batch_size)
    val_ds = self.df_to_dataset(self.val, shuffle=False, batch_size=batch_size)
    test_ds = self.df_to_dataset(self.test, shuffle=False, batch_size=batch_size)

    for feature_batch, label_batch in train_ds.take(1):
        print('All features:', list(feature_batch.keys()))
        print('Batch of age feature:', feature_batch['age'])
        print('Batch of targets:', label_batch)

    # Build a sample batch to try out the feature columns.
    self.example_batch = next(iter(train_ds))[0]
    age = feature_column.numeric_column("age")
    self.demo(age)
    """
    feature_columns = []

    # Numeric columns
    for header in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca']:
        feature_columns.append(feature_column.numeric_column(header))

    # Bucketized column
    age = feature_column.numeric_column("age")
    age_buckets = feature_column.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    feature_columns.append(age_buckets)

    # Categorical column
    thal = feature_column.categorical_column_with_vocabulary_list(
        'thal', ['fixed', 'normal', 'reversible'])
    thal_one_hot = feature_column.indicator_column(thal)
    feature_columns.append(thal_one_hot)

    # Embedding column
    thal_embedding = feature_column.embedding_column(thal, dimension=8)
    feature_columns.append(thal_embedding)

    # Crossed feature column
    crossed_feature = feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000)
    crossed_feature = feature_column.indicator_column(crossed_feature)
    feature_columns.append(crossed_feature)

    self.feature_layer = layers.DenseFeatures(feature_columns)

    batch_size = 32
    self.train_ds = self.df_to_dataset(self.train, batch_size=batch_size)
    self.val_ds = self.df_to_dataset(self.val, shuffle=False, batch_size=batch_size)
    self.test_ds = self.df_to_dataset(self.test, shuffle=False, batch_size=batch_size)
def _add_embedding_columns(columns, features, feature_table, vocabulary):
    for f in features:
        assert f in feature_table
        cate_col = fc.categorical_column_with_vocabulary_list(f, vocabulary.vocab[f])
        column = fc.embedding_column(cate_col, feature_table[f].emb_width, combiner='sqrtn')
        columns.append(column)
def embedding_column(key, dimension, vocabulary_size=None, vocabulary_file=None,
                     vocabulary_list=None, num_oov_buckets=0):
    return feature_column.embedding_column(
        categorical_column(key, vocabulary_size, vocabulary_list,
                           vocabulary_file, num_oov_buckets),
        dimension=dimension)
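# `categorical_column` is a dispatch helper not shown above; a plausible
# sketch (an assumption) that picks the column type by which argument is set:
def categorical_column(key, vocabulary_size=None, vocabulary_list=None,
                       vocabulary_file=None, num_oov_buckets=0):
    if vocabulary_list is not None:
        return feature_column.categorical_column_with_vocabulary_list(
            key, vocabulary_list, num_oov_buckets=num_oov_buckets)
    if vocabulary_file is not None:
        return feature_column.categorical_column_with_vocabulary_file(
            key, vocabulary_file, vocabulary_size=vocabulary_size,
            num_oov_buckets=num_oov_buckets)
    return feature_column.categorical_column_with_identity(key, num_buckets=vocabulary_size)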
def embeddings_columns(self, coldim_dict):
    for col_name, dimension in coldim_dict.items():
        # embCol = feature_column.categorical_column_with_vocabulary_list(col_name, colunique)
        bucket_size = dimension * dimension
        embCol = feature_column.categorical_column_with_hash_bucket(
            col_name, hash_bucket_size=bucket_size)
        embedding = feature_column.embedding_column(embCol, dimension=dimension)
        self.real_columns[col_name] = embedding
    return embedding
def _build_census_deep_columns(emb_dim=8, numeric_range=None):
    feature_columns = []
    for col in ALI_DISPLAY_ADS_CONFIG['deep_emb_cols']:
        feature_columns.append(
            fc.embedding_column(
                fc.categorical_column_with_hash_bucket(
                    col,
                    hash_bucket_size=1000
                    if ALI_DISPLAY_ADS_CONFIG['vocab_size'][col] <= 1000
                    else ALI_DISPLAY_ADS_CONFIG['vocab_size'][col] + 10000),
                dimension=emb_dim))
    for col in ALI_DISPLAY_ADS_CONFIG['deep_bucket_emb_cols']:
        feature_columns.append(
            fc.embedding_column(
                fc.bucketized_column(
                    fc.numeric_column(col),
                    boundaries=list(np.linspace(numeric_range[col][0],
                                                numeric_range[col][1], 1000))),
                dimension=emb_dim))
    feat_field_size = len(feature_columns)
    return feature_columns, feat_field_size
def transform(inputs, NUMERIC_COLS, STRING_COLS, nbuckets):
    # Pass-through columns
    transformed = inputs.copy()
    del transformed['pickup_datetime']

    feature_columns = {
        colname: fc.numeric_column(colname) for colname in NUMERIC_COLS
    }

    # Scale longitude from range [-78, -70] to [0, 1]
    for lon_col in ['pickup_longitude', 'dropoff_longitude']:
        transformed[lon_col] = layers.Lambda(
            lambda x: (x + 78) / 8.0,
            name='scale_{}'.format(lon_col))(inputs[lon_col])

    # Scale latitude from range [37, 45] to [0, 1]
    for lat_col in ['pickup_latitude', 'dropoff_latitude']:
        transformed[lat_col] = layers.Lambda(
            lambda x: (x - 37) / 8.0,
            name='scale_{}'.format(lat_col))(inputs[lat_col])

    # Add Euclidean distance (no need to be accurate: the NN will calibrate it)
    transformed['euclidean'] = layers.Lambda(euclidean, name='euclidean')([
        inputs['pickup_longitude'], inputs['pickup_latitude'],
        inputs['dropoff_longitude'], inputs['dropoff_latitude']
    ])
    feature_columns['euclidean'] = fc.numeric_column('euclidean')

    # Hour of day from a timestamp of the form '2010-02-08 09:17:00+00:00'
    transformed['hourofday'] = layers.Lambda(
        lambda x: tf.strings.to_number(tf.strings.substr(x, 11, 2),
                                       out_type=tf.dtypes.int32),
        name='hourofday')(inputs['pickup_datetime'])
    feature_columns['hourofday'] = fc.indicator_column(
        fc.categorical_column_with_identity('hourofday', num_buckets=24))

    latbuckets = np.linspace(0, 1, nbuckets).tolist()
    lonbuckets = np.linspace(0, 1, nbuckets).tolist()
    b_plat = fc.bucketized_column(feature_columns['pickup_latitude'], latbuckets)
    b_dlat = fc.bucketized_column(feature_columns['dropoff_latitude'], latbuckets)
    b_plon = fc.bucketized_column(feature_columns['pickup_longitude'], lonbuckets)
    b_dlon = fc.bucketized_column(feature_columns['dropoff_longitude'], lonbuckets)
    ploc = fc.crossed_column([b_plat, b_plon], nbuckets * nbuckets)
    dloc = fc.crossed_column([b_dlat, b_dlon], nbuckets * nbuckets)
    pd_pair = fc.crossed_column([ploc, dloc], nbuckets**4)
    feature_columns['pickup_and_dropoff'] = fc.embedding_column(pd_pair, 100)

    return transformed, feature_columns
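# A minimal wiring sketch (assumption: TF 2.x Keras functional API) of how
# the returned (transformed, feature_columns) pair is typically consumed:
dnn_inputs = tf.keras.layers.DenseFeatures(feature_columns.values())(transformed)
h = tf.keras.layers.Dense(32, activation='relu')(dnn_inputs)
output = tf.keras.layers.Dense(1, name='fare')(h)
model = tf.keras.Model(inputs=list(inputs.values()), outputs=output)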