def _build_census_wide_columns(numeric_range=None):
    base_columns, cross_columns = [], []
    # Multi-hot categorical columns, hashed; bucket size grows with vocab size.
    for col in ALI_DISPLAY_ADS_CONFIG['wide_muti_hot_cols']:
        vocab_size = ALI_DISPLAY_ADS_CONFIG['vocab_size'][col]
        base_columns.append(
            fc.indicator_column(
                fc.categorical_column_with_hash_bucket(
                    col,
                    hash_bucket_size=1000 if vocab_size <= 1000
                    else vocab_size + 10000)))
    # Numeric columns bucketized over their observed value range.
    for col in ALI_DISPLAY_ADS_CONFIG['wide_bucket_cols']:
        base_columns.append(
            fc.bucketized_column(
                fc.numeric_column(col),
                boundaries=list(
                    np.linspace(numeric_range[col][0],
                                numeric_range[col][1], 1000))))
    # Pairwise feature crosses, hashed and one-hot encoded.
    for col in ALI_DISPLAY_ADS_CONFIG['wide_cross_cols']:
        cross_columns.append(
            fc.indicator_column(
                fc.crossed_column([col[0], col[1]],
                                  hash_bucket_size=10000)))
    feature_columns = base_columns + cross_columns
    feat_field_size = len(feature_columns)
    return feature_columns, feat_field_size
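A minimal usage sketch (not part of the original project): the returned wide columns can feed a linear estimator directly. The numeric_range dict and its 'price' key are hypothetical.

# Hypothetical usage: numeric_range maps each bucketized column name to
# its (min, max) value range.
feature_columns, feat_field_size = _build_census_wide_columns(
    numeric_range={'price': (0.0, 100.0)})
model = tf.estimator.LinearClassifier(feature_columns=feature_columns)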
Example #2
def create_feature_layer() -> tf.keras.layers.DenseFeatures:
    # Feature column for height
    feature_height = feature_column.numeric_column("Groesse")

    # Feature column for weight
    feature_weight = feature_column.numeric_column("Gewicht")

    # Feature column for age
    feature_age = feature_column.numeric_column("Alter")

    # Category column for gender
    feature_gender = feature_column.categorical_column_with_vocabulary_list(
        'Geschlecht', ['w', 'm'])
    feature_gender_one_hot = feature_column.indicator_column(feature_gender)

    # Category column for activities
    feature_activities = feature_column.categorical_column_with_vocabulary_list(
        'Betaetigung', ['keinSport', 'Kraftsport', 'Ausdauersport'])
    feature_activities_one_hot = feature_column.indicator_column(
        feature_activities)

    feature_columns = [
        feature_height, feature_weight, feature_age, feature_gender_one_hot,
        feature_activities_one_hot
    ]

    return tf.keras.layers.DenseFeatures(feature_columns)
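A quick smoke test, sketched with made-up values, showing how the returned DenseFeatures layer maps a dict of raw feature tensors to a single dense batch (3 numeric features plus 2-way and 3-way one-hots = 8 values):

# Sketch: call the feature layer on one hypothetical example.
feature_layer = create_feature_layer()
example = {
    "Groesse": tf.constant([180.0]),
    "Gewicht": tf.constant([75.0]),
    "Alter": tf.constant([30.0]),
    "Geschlecht": tf.constant(["m"]),
    "Betaetigung": tf.constant(["Kraftsport"]),
}
print(feature_layer(example))  # dense tensor of shape (1, 8)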
Example #3
def transform(inputs, num_cols, cat_cols):
    print("Inputs before features transformation: {}".format(inputs.keys()))

    # Pass-through columns
    transformed = inputs.copy()

    feature_columns = {
        colname: tf.feature_column.numeric_column(colname)
        for colname in num_cols
    }

    # Add Euclidean distance
    transformed['euclidean'] = layers.Lambda(euclidean, name='euclidean')([
        inputs['pickuplon'], inputs['pickuplat'], inputs['dropofflon'],
        inputs['dropofflat']
    ])
    feature_columns['euclidean'] = fc.numeric_column('euclidean')

    # Shift 'dayofweek' feature to a value range of 0-6
    transformed['dayofweek'] = transformed['dayofweek'] - 1

    # Create categorical columns (wrapped in indicator columns)
    feature_columns['hourofday'] = fc.indicator_column(
        fc.categorical_column_with_identity('hourofday', 24))
    feature_columns['dayofweek'] = fc.indicator_column(
        fc.categorical_column_with_identity('dayofweek', 7))

    print("Transformed features: {}".format(transformed.keys()))
    print("Feature columns: {}".format(feature_columns.keys()))
    return transformed, feature_columns
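The euclidean helper wrapped by the Lambda layer above is not shown in this snippet; a plausible implementation (an assumption, not the original code) computes the straight-line distance from the four coordinate tensors:

# Hypothetical definition of the euclidean helper used by the Lambda
# layer: planar distance between pickup and dropoff coordinates.
def euclidean(params):
    lon1, lat1, lon2, lat2 = params
    londiff = lon2 - lon1
    latdiff = lat2 - lat1
    return tf.sqrt(londiff * londiff + latdiff * latdiff)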
Example #4
    def data_preprocessing(self):
        """
        batch_size = 5  # use a small batch size for this example
        train_ds = self.df_to_dataset(self.train, batch_size=batch_size)
        val_ds = self.df_to_dataset(self.val, shuffle=False, batch_size=batch_size)
        test_ds = self.df_to_dataset(self.test, shuffle=False, batch_size=batch_size)

        for feature_batch, label_batch in train_ds.take(1):
            print('All features:', list(feature_batch.keys()))
            print('A batch of the age feature:', feature_batch['age'])
            print('A batch of targets:', label_batch)

        # Build a sample batch to try out the feature columns.
        self.example_batch = next(iter(train_ds))[0]

        age = feature_column.numeric_column("age")
        self.demo(age)
        """
        feature_columns = []

        # Numeric columns
        for header in [
                'age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca'
        ]:
            feature_columns.append(feature_column.numeric_column(header))

        # Bucketized column
        age = feature_column.numeric_column("age")
        age_buckets = feature_column.bucketized_column(
            age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
        feature_columns.append(age_buckets)

        # Categorical column (one-hot)
        thal = feature_column.categorical_column_with_vocabulary_list(
            'thal', ['fixed', 'normal', 'reversible'])
        thal_one_hot = feature_column.indicator_column(thal)
        feature_columns.append(thal_one_hot)

        # Embedding column
        thal_embedding = feature_column.embedding_column(thal, dimension=8)
        feature_columns.append(thal_embedding)

        # Crossed feature column
        crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                        hash_bucket_size=1000)
        crossed_feature = feature_column.indicator_column(crossed_feature)
        feature_columns.append(crossed_feature)

        self.feature_layer = layers.DenseFeatures(feature_columns)

        batch_size = 32
        self.train_ds = self.df_to_dataset(self.train, batch_size=batch_size)
        self.val_ds = self.df_to_dataset(self.val,
                                         shuffle=False,
                                         batch_size=batch_size)
        self.test_ds = self.df_to_dataset(self.test,
                                          shuffle=False,
                                          batch_size=batch_size)
Example #5
File: esmm.py  Project: lh0730/esmm-1
def create_feature_columns():
  # user feature
  bids = fc.categorical_column_with_hash_bucket("behaviorBids", 10240, dtype=tf.int64)
  c1ids = fc.categorical_column_with_hash_bucket("behaviorC1ids", 100, dtype=tf.int64)
  cids = fc.categorical_column_with_hash_bucket("behaviorCids", 10240, dtype=tf.int64)
  sids = fc.categorical_column_with_hash_bucket("behaviorSids", 10240, dtype=tf.int64)
  pids = fc.categorical_column_with_hash_bucket("behaviorPids", 1000000, dtype=tf.int64)
  bids_weighted = fc.weighted_categorical_column(bids, "bidWeights")
  c1ids_weighted = fc.weighted_categorical_column(c1ids, "c1idWeights")
  cids_weighted = fc.weighted_categorical_column(cids, "cidWeights")
  sids_weighted = fc.weighted_categorical_column(sids, "sidWeights")
  pids_weighted = fc.weighted_categorical_column(pids, "pidWeights")

  # item feature
  pid = fc.categorical_column_with_hash_bucket("productId", 1000000, dtype=tf.int64)
  sid = fc.categorical_column_with_hash_bucket("sellerId", 10240, dtype=tf.int64)
  bid = fc.categorical_column_with_hash_bucket("brandId", 10240, dtype=tf.int64)
  c1id = fc.categorical_column_with_hash_bucket("cate1Id", 100, dtype=tf.int64)
  cid = fc.categorical_column_with_hash_bucket("cateId", 10240, dtype=tf.int64)

  # context feature
  matchScore = fc.numeric_column("matchScore", default_value=0.0)
  popScore = fc.numeric_column("popScore", default_value=0.0)
  brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0)
  cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0)
  catePrefer = fc.numeric_column("catePrefer", default_value=0.0)
  sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0)
  matchType = fc.indicator_column(fc.categorical_column_with_identity("matchType", 9, default_value=0))
  position = fc.indicator_column(fc.categorical_column_with_identity("position", 201, default_value=200))
  triggerNum = fc.indicator_column(fc.categorical_column_with_identity("triggerNum", 51, default_value=50))
  triggerRank = fc.indicator_column(fc.categorical_column_with_identity("triggerRank", 51, default_value=50))
  sceneType = fc.indicator_column(fc.categorical_column_with_identity("type", 2, default_value=0))
  hour = fc.indicator_column(fc.categorical_column_with_identity("hour", 24, default_value=0))
  phoneBrand = fc.indicator_column(fc.categorical_column_with_hash_bucket("phoneBrand", 1000))
  phoneResolution = fc.indicator_column(fc.categorical_column_with_hash_bucket("phoneResolution", 500))
  phoneOs = fc.indicator_column(
    fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))
  tab = fc.indicator_column(fc.categorical_column_with_vocabulary_list("tab",
        ["ALL", "TongZhuang", "XieBao", "MuYing", "NvZhuang", "MeiZhuang", "JuJia", "MeiShi"], default_value=0))

  pid_embed = fc.shared_embedding_columns([pids_weighted, pid], 64, combiner='sum', shared_embedding_collection_name="pid")
  bid_embed = fc.shared_embedding_columns([bids_weighted, bid], 32, combiner='sum', shared_embedding_collection_name="bid")
  cid_embed = fc.shared_embedding_columns([cids_weighted, cid], 32, combiner='sum', shared_embedding_collection_name="cid")
  c1id_embed = fc.shared_embedding_columns([c1ids_weighted, c1id], 10, combiner='sum', shared_embedding_collection_name="c1id")
  sid_embed = fc.shared_embedding_columns([sids_weighted, sid], 32, combiner='sum', shared_embedding_collection_name="sid")
  global my_feature_columns
  my_feature_columns = [matchScore, matchType, position, triggerNum, triggerRank, sceneType, hour, phoneBrand, phoneResolution,
             phoneOs, tab, popScore, sellerPrefer, brandPrefer, cate2Prefer, catePrefer]
  my_feature_columns += pid_embed
  my_feature_columns += sid_embed
  my_feature_columns += bid_embed
  my_feature_columns += cid_embed
  my_feature_columns += c1id_embed
  print("feature columns:", my_feature_columns)
  return my_feature_columns
Example #6
def get_feature_columns(dataframe):
    """Creates feature columns from pd.DataFrame."""
    feature_columns = []
    feature_layer_inputs = {}

    # numeric cols
    for col_name in ['PhotoAmt', 'Fee', 'Age']:
        feature_columns.append(feature_column.numeric_column(col_name))
        feature_layer_inputs[col_name] = tf.keras.Input(shape=(1, ),
                                                        name=col_name)

    # bucketized cols
    age = feature_column.numeric_column('Age')
    age_buckets = feature_column.bucketized_column(age,
                                                   boundaries=[1, 2, 3, 4, 5])
    feature_columns.append(age_buckets)

    # indicator_columns
    indicator_column_names = [
        'Type', 'Color1', 'Color2', 'Gender', 'MaturitySize', 'FurLength',
        'Vaccinated', 'Sterilized', 'Health'
    ]
    for col_name in indicator_column_names:
        categorical_column = feature_column.categorical_column_with_vocabulary_list(
            col_name, dataframe[col_name].unique())
        indicator_column = feature_column.indicator_column(categorical_column)
        feature_columns.append(indicator_column)
        feature_layer_inputs[col_name] = tf.keras.Input(shape=(1, ),
                                                        name=col_name,
                                                        dtype=tf.string)

    # embedding columns
    breed1 = feature_column.categorical_column_with_vocabulary_list(
        'Breed1', dataframe.Breed1.unique())
    breed1_embedding = feature_column.embedding_column(breed1, dimension=16)
    feature_columns.append(breed1_embedding)
    feature_layer_inputs['Breed1'] = tf.keras.Input(shape=(1, ),
                                                    name='Breed1',
                                                    dtype=tf.string)

    # crossed columns
    animal_type = feature_column.categorical_column_with_vocabulary_list(
        'Type', ['Cat', 'Dog'])
    feature_columns.append(feature_column.indicator_column(animal_type))
    age_type_feature = feature_column.crossed_column(
        [age_buckets, animal_type], hash_bucket_size=100)
    feature_columns.append(feature_column.indicator_column(age_type_feature))
    feature_layer_inputs['Type'] = tf.keras.Input(shape=(1, ),
                                                  name='Type',
                                                  dtype=tf.string)

    return feature_columns, feature_layer_inputs
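A sketch of the intended usage (assumed, not from the original source): the columns and the Input dict wire together into a functional Keras model.

# Assumed usage: DenseFeatures consumes the dict of Keras Inputs.
feature_columns, feature_layer_inputs = get_feature_columns(dataframe)
x = tf.keras.layers.DenseFeatures(feature_columns)(feature_layer_inputs)
x = tf.keras.layers.Dense(128, activation='relu')(x)
output = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model = tf.keras.Model(inputs=list(feature_layer_inputs.values()),
                       outputs=output)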
Example #7
def classify_data(batch_size=5):
    from tensorflow import feature_column
    from tensorflow.keras import layers
    from sklearn.model_selection import train_test_split
    URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
    dataframe = pd.read_csv(URL)
    tr, te = train_test_split(dataframe, test_size=0.2)
    tr, va = train_test_split(tr, test_size=0.2)
    print(len(tr), len(va), len(te))

    def df_to_dataset(dataframe, shuffle=True, batch_size=32):
        # Copy first, then pop the label so the feature dict does not
        # retain the 'target' column.
        dataframe = dataframe.copy()
        labels = dataframe.pop('target')
        ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
        if shuffle:
            ds = ds.shuffle(buffer_size=len(dataframe))
        return ds.batch(batch_size)

    tr_ds = df_to_dataset(tr, batch_size=batch_size)
    va_ds = df_to_dataset(va, shuffle=False, batch_size=batch_size)
    te_ds = df_to_dataset(te, shuffle=False, batch_size=batch_size)
    feature_columns = []
    for header in [
            'age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca'
    ]:
        feature_columns.append(feature_column.numeric_column(header))
    age = feature_column.numeric_column('age')
    age_buckets = feature_column.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    feature_columns.append(age_buckets)
    thal = feature_column.categorical_column_with_vocabulary_list(
        'thal', ['fixed', 'normal', 'reversible'])
    feature_columns.append(feature_column.indicator_column(thal))
    feature_columns.append(feature_column.embedding_column(thal, dimension=8))
    crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                    hash_bucket_size=1000)
    feature_columns.append(feature_column.indicator_column(crossed_feature))
    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
    model = tf.keras.Sequential([
        feature_layer,
        layers.Dense(128, activation='relu'),
        layers.Dense(128, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.fit(tr_ds, validation_data=va_ds, epochs=5)
    loss, accuracy = model.evaluate(te_ds)
    print(accuracy)
Example #8
def create_user_feature_columns():
  gender = fc.indicator_column(fc.categorical_column_with_identity("gender", num_buckets=3, default_value=0))
  age_class = fc.indicator_column(fc.categorical_column_with_identity("age_class", num_buckets=7, default_value=0))
  has_baby = fc.indicator_column(fc.categorical_column_with_identity("has_baby", num_buckets=2, default_value=0))
  baby_gender = fc.indicator_column(fc.categorical_column_with_identity("baby_gender", num_buckets=3, default_value=0))
  baby_age = fc.indicator_column(fc.categorical_column_with_identity("baby_age", num_buckets=7, default_value=0))
  grade = fc.indicator_column(fc.categorical_column_with_identity("grade", num_buckets=7, default_value=0))
  rfm_type = fc.indicator_column(fc.categorical_column_with_identity("bi_rfm_type", num_buckets=12, default_value=0))
  cate1_price_prefer = fc.indicator_column(fc.categorical_column_with_identity("cate1_price_prefer", num_buckets=6, default_value=0))
  cate2_price_prefer = fc.indicator_column(fc.categorical_column_with_identity("cate2_price_prefer", num_buckets=6, default_value=0))
  cate3_price_prefer = fc.indicator_column(fc.categorical_column_with_identity("cate3_price_prefer", num_buckets=6, default_value=0))
  city_id = fc.categorical_column_with_hash_bucket("city", 700)
  city = fc.shared_embedding_columns([city_id], 16)
  cols = [gender, age_class, has_baby, baby_gender, baby_age, grade, rfm_type, cate1_price_prefer, cate2_price_prefer, cate3_price_prefer]
  return cols + city
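Note that fc.shared_embedding_columns returns a list with one embedding column per input column, which is why city is concatenated onto cols rather than appended.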
Example #9
def build_model_columns(embedding_size):
    linear_feature_columns = []
    embedding_feature_columns = []

    u_id = feature_column.categorical_column_with_hash_bucket('u_id', 500000, dtype=tf.dtypes.int64)
    u_id_embedded = feature_column.embedding_column(u_id, embedding_size)
    linear_feature_columns.append(feature_column.indicator_column(u_id))
    embedding_feature_columns.append(u_id_embedded)

    i_id = feature_column.categorical_column_with_hash_bucket('i_id', 100000, dtype=tf.dtypes.int64)
    i_id_embedded = feature_column.embedding_column(i_id, embedding_size)
    linear_feature_columns.append(feature_column.indicator_column(i_id))
    embedding_feature_columns.append(i_id_embedded)

    return linear_feature_columns, embedding_feature_columns
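A sketch of how the two column sets are typically consumed (assumed usage): the indicator columns feed the wide part and the embeddings feed the deep part of a wide-and-deep model.

# Assumed usage with the canned wide-and-deep estimator; hidden-unit
# sizes are illustrative.
linear_cols, embedding_cols = build_model_columns(embedding_size=32)
model = tf.estimator.DNNLinearCombinedClassifier(
    linear_feature_columns=linear_cols,
    dnn_feature_columns=embedding_cols,
    dnn_hidden_units=[128, 64])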
Example #10
def build_features(statistics):
    pu_location_id = fc.categorical_column_with_identity(key='PULocationID',
                                                         num_buckets=265)
    do_location_id = fc.categorical_column_with_identity(key='DOLocationID',
                                                         num_buckets=265)
    day_of_week = fc.categorical_column_with_identity(key='day_of_week',
                                                      num_buckets=7)
    weekend = fc.categorical_column_with_identity(key='weekend', num_buckets=2)
    speed_buckets = fc.bucketized_column(
        fc.numeric_column('speed'), boundaries=[10, 20, 30, 40, 50, 60, 70])
    distance_buckets = fc.bucketized_column(
        fc.numeric_column('trip_distance'),
        boundaries=[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
    duration_buckets = fc.bucketized_column(
        fc.numeric_column('duration'),
        boundaries=[500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500])
    fare_buckets = fc.bucketized_column(
        fc.numeric_column('fare_amount'),
        boundaries=[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
    passenger_buckets = fc.bucketized_column(
        fc.numeric_column('passenger_count'), boundaries=[1, 3, 5, 7, 9])
    location = fc.crossed_column([pu_location_id, do_location_id],
                                 hash_bucket_size=1000)
    cross_all = fc.crossed_column(
        [location, speed_buckets, distance_buckets, duration_buckets,
         fare_buckets, passenger_buckets],
        hash_bucket_size=1000)
    categorical_columns = [
        fc.embedding_column(pu_location_id, dimension=32),
        fc.embedding_column(do_location_id, dimension=32),
        fc.indicator_column(day_of_week),
        fc.indicator_column(weekend)
    ]
    numeric_columns = [
        custom_numeric_column('passenger_count', statistics),
        custom_numeric_column('trip_distance', statistics),
        custom_numeric_column('fare_amount', statistics),
        custom_numeric_column('extra', statistics),
        custom_numeric_column('mta_tax', statistics),
        custom_numeric_column('tolls_amount', statistics),
        custom_numeric_column('improvement_surcharge', statistics),
        custom_numeric_column('duration', statistics),
        custom_numeric_column('speed', statistics)
    ]
    dnn_feature_columns = numeric_columns + categorical_columns
    linear_feature_columns = [location, cross_all]
    return dnn_feature_columns, linear_feature_columns
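custom_numeric_column is not defined in this snippet; a plausible definition (an assumption) z-score normalizes each feature with precomputed statistics:

# Hypothetical helper: numeric column normalized with precomputed
# per-feature mean and standard deviation; the 'mean'/'stddev' keys of
# the statistics dict are assumptions.
def custom_numeric_column(key, statistics):
    mean = statistics[key]['mean']
    std = statistics[key]['stddev']
    return fc.numeric_column(key, normalizer_fn=lambda x: (x - mean) / std)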
Example #11
def test_categorical_column_with_hash_bucket():
    # 1. Input features
    color_data = {'color': [[2], [5], [-1], [0]]}
    builder = _LazyBuilder(color_data)
    # 2. Feature columns (Sparse)
    color_column = feature_column.categorical_column_with_hash_bucket(
        'color', 7, dtype=tf.int32)
    color_column_tensor = color_column._get_sparse_tensors(builder)
    with tf.Session() as session:
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))

    # 2. Feature columns (Dense)
    # Convert the Categorical Column to Dense Column
    color_column_identity = feature_column.indicator_column(color_column)
    # 3. Feature tensor
    color_dense_tensor = feature_column.input_layer(color_data,
                                                    [color_column_identity])

    with tf.Session() as session:
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([color_dense_tensor]))
Example #12
def test_categorical_column_with_vocabulary_list():

    color_data = {
        'color': [['R', 'R'], ['G', 'R'], ['B', 'G'], ['A', 'A']]
    }  # 4 sample rows

    builder = _LazyBuilder(color_data)

    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)

    color_column_tensor = color_column._get_sparse_tensors(builder)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))

    # Convert the sparse column to dense, i.e. one-hot form (multi-hot
    # here, since each row holds two tokens)
    color_column_identity = feature_column.indicator_column(color_column)

    color_dense_tensor = feature_column.input_layer(color_data,
                                                    [color_column_identity])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([color_dense_tensor]))
Example #13
def build_feature_layer():
    feature_columns = []

    report_id = feature_column.categorical_column_with_vocabulary_list('report_id', [1, 2, 3, 4, 5])
    report_id_one_hot = feature_column.indicator_column(report_id)
    feature_columns.append(report_id_one_hot)

    feature_columns.append(feature_column.numeric_column('report_params'))

    day_part = feature_column.categorical_column_with_vocabulary_list('day_part', [1, 2, 3])
    day_part_one_hot = feature_column.indicator_column(day_part)
    feature_columns.append(day_part_one_hot)
    
    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
    
    return feature_layer
Example #14
def test_crossed_column():
    """Test of crossed_column."""
    # Source data
    features = {
        'price': [['A'], ['B'], ['C']],  # 0,1,2
        'color': [['R'], ['G'], ['B']]  # 0,1,2
    }
    # categorical_column
    price = feature_column.categorical_column_with_vocabulary_list(
        'price', ['A', 'B', 'C', 'D'])
    color = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'])

    # crossed_column produces a sparse representation
    p_x_c = feature_column.crossed_column([price, color], 16)

    # Dense representation
    p_x_c_identity = feature_column.indicator_column(p_x_c)

    # input_layer connects the source data to the crossed column
    p_x_c_identity_dense_tensor = feature_column.input_layer(
        features, [p_x_c_identity])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([p_x_c_identity_dense_tensor]))
Example #15
def test_weighted_cate_column():
    # NOTE: id='' marks a missing value; its weight must be 0, otherwise the
    # id and weight lists have mismatched lengths and an error is raised.
    # NOTE: weights must be float; passing int raises an error.
    x_values = {
        'id': [[b'a', b'z', b'a', b'c'], [b'b', b'', b'd', b'b']],
        'weight': [[1.0, 2.0, -3.0, 4.0], [5.0, 0.0, 7.0, -8.0]]
    }
    builder = _LazyBuilder(x_values)  # lazy representation of input

    # ================== define ops
    sparse_id_featcol = feature_column.categorical_column_with_vocabulary_list(
        'id', ['a', 'b', 'c'], dtype=tf.string, default_value=-1)
    sparse_featcol = feature_column.weighted_categorical_column(
        categorical_column=sparse_id_featcol, weight_feature_key='weight')
    x_sparse_tensor = sparse_featcol._get_sparse_tensors(builder)

    # indicator_column turns the sparse tensor into dense multi-hot-encoded
    # (MHE) format, shape=[batch_size, #tokens]; each entry is the sum of
    # all weights with which that token occurs
    dense_featcol = feature_column.indicator_column(sparse_featcol)
    x_dense_tensor = feature_column.input_layer(x_values, [dense_featcol])

    # ================== run
    with tf.Session() as sess:
        # the lookup tables must be initialized, otherwise an error is raised
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())

        id_sparse_value, weight_sparse_value = sess.run(
            [x_sparse_tensor.id_tensor, x_sparse_tensor.weight_tensor])

        print("************************* sparse id tensor")
        # the sparse id_tensor keeps the shape of the original input,
        # [batch_size, max_tokens_per_example]=[2,4]
        # SparseTensorValue(indices=array(
        #       [[0, 0],
        #        [0, 1],
        #        [0, 2],
        #        [0, 3],
        #        [1, 0],
        #        [1, 2],
        #        [1, 3]]), values=array([ 0, -1,  0,  2,  1, -1,  1]), dense_shape=array([2, 4]))
        print(id_sparse_value)

        print("************************* sparse weight tensor")
        # the sparse weight_tensor keeps the shape of the original input,
        # [batch_size, max_tokens_per_example]=[2,4]
        # SparseTensorValue(indices=array(
        #       [[0, 0],
        #        [0, 1],
        #        [0, 2],
        #        [0, 3],
        #        [1, 0],
        #        [1, 2],
        #        [1, 3]]), values=array([ 1.,  2., -3.,  4.,  5.,  7., -8.], dtype=float32), dense_shape=array([2, 4]))
        print(weight_sparse_value)

        print("************************* dense MHE tensor")
        # indicator_column converts the sparse tensor to a dense tensor in MHE
        # fashion, shape=[batch_size, total_tokens_in_vocab]; each value is
        # the sum of all weights with which that token occurs
        # [[-2.  0.  4.]
        #  [ 0. -3.  0.]]
        print(sess.run(x_dense_tensor))
Example #16
def get_unique_categories_and_append(key):
    # Note: relies on the module-level `df` and `feature_columns` objects.
    col = df[key]
    arr = col.to_numpy()
    unique_arr = np.unique(arr)
    feat_col = feature_column.categorical_column_with_vocabulary_list(key, unique_arr)
    one_hot = feature_column.indicator_column(feat_col)
    feature_columns.append(one_hot)
Example #17
    def tf_inputs_dataframe(self, batch_size=1, buffer_size=1000):
        dataframe = read_csv(
            os.path.join(os.path.dirname(self.json_filename),
                         self.data_description["csv"]))
        labels_name = 'ga_edd'
        y_name = labels_name

        for column_name in dataframe.columns:
            if column_name.startswith('_'):
                dataframe.pop(column_name)

        for header in [
                'fl_1', 'bp_1', 'hc_1', 'ac_1', 'mom_age_edd', 'mom_weight_lb',
                'mom_height_in'
        ]:
            r = max(dataframe[header]) - min(dataframe[header])
            dataframe[header] = (dataframe[header] -
                                 min(dataframe[header])) / r

        dataframe = dataframe[(dataframe[y_name] != '.')
                              & (notna(dataframe[y_name])) &
                              (notnull(dataframe[y_name]))].copy()
        dataframe = dataframe.astype({y_name: 'int32'})

        feature_columns = []
        feature_names = []
        num_channels = 0
        for header in [
                'fl_1', 'bp_1', 'hc_1', 'ac_1', 'mom_age_edd', 'mom_weight_lb',
                'mom_height_in'
        ]:
            feature_columns.append(feature_column.numeric_column(header))
            feature_names.append(header)
            num_channels += 1

        num_identity = 2
        for header in [
                'hiv', 'current_smoker', 'former_smoker', 'chronic_htn',
                'preg_induced_htn', 'diabetes', 'gest_diabetes'
        ]:
            col = feature_column.categorical_column_with_identity(
                header, num_identity)
            col = feature_column.indicator_column(col)
            feature_columns.append(col)
            feature_names.append(header)
            num_channels += num_identity

        self.num_channels = num_channels

        feature_layer = tf.keras.layers.DenseFeatures(
            feature_columns=feature_columns)
        dataframe = dataframe.copy()
        labels = dataframe.pop(labels_name)

        dataset = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
        dataset = dataset.shuffle(buffer_size=buffer_size)
        dataset = dataset.batch(batch_size)
        dataset = dataset.map(lambda x, y: (feature_layer(x), y))

        return dataset
Example #18
def get_item_feature_columns(business_vocab_list, item_type_dict):

    items_feature_columns = []

    bucketized_boundary = {'stars': [2.5, 4]}
    embedding_size = {"categories": 8, "city": 4}

    for k, v in business_vocab_list.items():

        if k in ['review_count']:
            col = numeric_column(k, default_value=0, dtype=item_type_dict[k])
        elif k in ['stars']:
            col = bucketized_column(
                numeric_column(k, default_value=0, dtype=item_type_dict[k]),
                bucketized_boundary[k])
        elif k in ['categories', 'city']:
            col = embedding_column(categorical_column_with_vocabulary_list(
                k, sorted(v), default_value=-1, dtype=item_type_dict[k]),
                                   dimension=embedding_size[k])
        else:
            col = indicator_column(
                categorical_column_with_vocabulary_list(
                    k, sorted(v), default_value=-1, dtype=item_type_dict[k]))

        items_feature_columns.append(col)

    return items_feature_columns
Example #19
    def _generate_cat_column(name, data, vocab_threshold=50, bucket_size=100):
        """Generate a feature column from a categorical string data set

        Parameters
        ----------
        name : str
            Name of categorical columns
        data : np.ndarray | list
            String data array
        vocab_threshold : int
            Number of unique entries in the data array below which this
            will use a vocabulary list, above which a hash bucket will be used.
        bucket_size : int
            Hash bucket size.

        Returns
        -------
        f_col : IndicatorColumn
            Categorical feature column.
        """

        n_unique = len(set(data))

        if n_unique < vocab_threshold:
            f_col = feature_column.categorical_column_with_vocabulary_list(
                name, list(set(data)))
        else:
            f_col = feature_column.categorical_column_with_hash_bucket(
                name, bucket_size)

        f_col = feature_column.indicator_column(f_col)

        return f_col
Example #20
def define_feature_columns(dataframe):

    print("Defining feature columns...")
    feature_columns = []

    # Create embedding column for name IDs
    name_id = feature_column.categorical_column_with_vocabulary_list(
        'nconst', dataframe.nconst.unique())
    # Dimension set to 30 (approximately fourth root of the number of unique name IDs)
    name_id_embedding = feature_column.embedding_column(name_id, dimension=30)
    feature_columns.append(name_id_embedding)

    # Create indicator columns for category and genres
    indicator_column_names = ['category', 'genres']
    for col_name in indicator_column_names:
        categorical_column = feature_column.categorical_column_with_vocabulary_list(
            col_name, dataframe[col_name].unique())
        indicator_column = feature_column.indicator_column(categorical_column)
        feature_columns.append(indicator_column)

    # Create bucketized column for startYear (a.k.a. release date)
    start_year_numeric = feature_column.numeric_column('startYear')
    start_year_bucket = feature_column.bucketized_column(
        start_year_numeric, boundaries=[1927, 1940, 1950, 1960, 1970, 1980, 1990, 1995, 2000, 2005, 2010, 2015])
    feature_columns.append(start_year_bucket)

    print("Feature columns defined")
    return feature_columns
Example #21
def test_weighted_categorical_column():
    # 1. Input features
    color_data = {
        'color': [['R'], ['G'], ['B'], ['A']],
        'weight': [[1.0], [2.0], [4.0], [8.0]]
    }
    # 2. Feature columns (Sparse)
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    # 2. Feature columns (Sparse)
    color_weight_categorical_column \
        = feature_column.weighted_categorical_column(color_column, 'weight')
    builder = _LazyBuilder(color_data)
    id_tensor, weight = color_weight_categorical_column._get_sparse_tensors(
        builder)

    with tf.Session() as session:
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('weighted categorical' + '-' * 40)
        print(session.run([id_tensor]))
        print('-' * 40)
        print(session.run([weight]))

    # 2. Feature columns (Dense)
    weighted_column = feature_column.indicator_column(
        color_weight_categorical_column)
    # 3. Feature tensor
    weighted_column_dense_tensor = feature_column.input_layer(
        color_data, [weighted_column])
    with tf.Session() as session:
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([weighted_column_dense_tensor]))
Example #22
def test_categorical_column_with_hash_bucket():
    # Source data
    color_data = {'color': [[2], [5], [-1], [0]]}  # 4 sample rows, shape=[4,1]
    builder = _LazyBuilder(color_data)

    # categorical_column
    color_column = feature_column.categorical_column_with_hash_bucket(
        'color', 7, dtype=tf.int32)

    # tensor
    color_column_tensor = color_column._get_sparse_tensors(builder)  # sparse representation
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))

    # indicator_column converts the sparse column to dense, i.e. one-hot
    # (multi-hot in general)
    color_column_identity = feature_column.indicator_column(color_column)

    # input_layer connects the data source with the declared columns to
    # produce a new tensor
    color_dense_tensor = feature_column.input_layer(color_data,
                                                    [color_column_identity])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([color_dense_tensor]))
Example #23
    def _get_tf_feature_cols(dataframe: pd.DataFrame):
        feature_columns = []

        # numeric cols
        for header in ['PhotoAmt', 'Fee', 'Age']:
            feature_columns.append(feature_column.numeric_column(header))

        # bucketized cols
        age = feature_column.numeric_column('Age')
        age_buckets = feature_column.bucketized_column(
            age, boundaries=[1, 2, 3, 4, 5])
        feature_columns.append(age_buckets)

        # indicator_columns
        indicator_column_names = [
            'Type', 'Color1', 'Color2', 'Gender', 'MaturitySize', 'FurLength',
            'Vaccinated', 'Sterilized', 'Health'
        ]
        for col_name in indicator_column_names:
            categorical_column = feature_column.categorical_column_with_vocabulary_list(
                col_name, dataframe[col_name].unique())
            indicator_column = feature_column.indicator_column(
                categorical_column)
            feature_columns.append(indicator_column)

        # embedding columns
        breed1 = feature_column.categorical_column_with_vocabulary_list(
            'Breed1', dataframe.Breed1.unique())
        breed1_embedding = feature_column.embedding_column(breed1, dimension=8)
        feature_columns.append(breed1_embedding)
        return feature_columns
Example #24
    def hash_embedding(self, hash_bucket, embedding_dim, name):
        cate_feature = feature_column.categorical_column_with_hash_bucket(
            name, hash_bucket, dtype=tf.string)
        emb_col = feature_column.embedding_column(cate_feature,
                                                  dimension=embedding_dim,
                                                  combiner='mean')
        ind_col = feature_column.indicator_column(cate_feature)
        return emb_col, ind_col
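Returning the pair is handy for wide-and-deep setups: the embedding column can feed the deep tower while the indicator column over the same hashed feature feeds the wide tower.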
Example #25
    def hashed_columns(self, hashed_columns_dict):
        ### Independence
        for col_name, bucket_size in hashed_columns_dict.items():
            hashed_col = feature_column.categorical_column_with_hash_bucket(
                col_name, hash_bucket_size=bucket_size)
            hashed_feature = feature_column.indicator_column(hashed_col)
            self.sparse_columns[col_name] = hashed_feature
        # Only the most recently created column is returned; all of them
        # are stored in self.sparse_columns.
        return hashed_feature
Example #26
def create_feature_columns():
  # user feature
  phoneBrandId = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
  phoneResolutionId = fc.categorical_column_with_hash_bucket("phoneResolution", 500)
  phoneBrand = fc.embedding_column(phoneBrandId, 20)
  phoneResolution = fc.embedding_column(phoneResolutionId, 10)
  phoneOs = fc.indicator_column(
    fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))
  # context feature
  matchScore = fc.numeric_column("matchScore", default_value=0.0)
  popScore = fc.numeric_column("popScore", default_value=0.0)
  brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0, normalizer_fn=truncate)
  cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0, normalizer_fn=truncate)
  catePrefer = fc.numeric_column("catePrefer", default_value=0.0, normalizer_fn=truncate)
  sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0, normalizer_fn=truncate)
  matchType = fc.indicator_column(fc.categorical_column_with_identity("matchType", 9, default_value=0))
  position = fc.indicator_column(fc.categorical_column_with_identity("position", 201, default_value=200))
  triggerNum = fc.indicator_column(fc.categorical_column_with_identity("triggerNum", 51, default_value=50))
  triggerRank = fc.indicator_column(fc.categorical_column_with_identity("triggerRank", 51, default_value=50))
  sceneType = fc.indicator_column(fc.categorical_column_with_identity("type", 2, default_value=0))
  hour = fc.indicator_column(fc.categorical_column_with_identity("hour", 24, default_value=0))

  global my_feature_columns
  my_feature_columns = [matchScore, matchType, position, triggerNum, triggerRank, sceneType, hour, phoneBrand,
                        phoneResolution, phoneOs, popScore, sellerPrefer, brandPrefer, cate2Prefer, catePrefer]
  print("feature columns:", my_feature_columns)
  return my_feature_columns
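truncate is referenced as a normalizer_fn in this snippet (and in Example #30) but not defined; a plausible definition (an assumption) clips the preference scores to a fixed range:

# Hypothetical normalizer: clip preference scores before they reach
# the model; the [0, 1] range is illustrative.
def truncate(x):
    return tf.clip_by_value(x, 0.0, 1.0)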
Example #27
def create_feature_layer(df):
    # Bucketize the ordinal time features over their full value ranges.
    week = feature_column.numeric_column("Week")
    week = feature_column.bucketized_column(week, boundaries=list(range(1, 53)))
    day = feature_column.numeric_column("Day")
    day = feature_column.bucketized_column(day, boundaries=list(range(1, 8)))
    year = feature_column.numeric_column("Year")
    year = feature_column.bucketized_column(year, boundaries=list(range(2013, 2017)))
    hour = feature_column.numeric_column("std_hour")
    hour = feature_column.bucketized_column(hour, boundaries=list(range(0, 24)))
    arrival = feature_column.categorical_column_with_vocabulary_list(
        "Arrival", vocabulary_list=pd.Series.unique(df.Arrival).tolist())
    airline = feature_column.categorical_column_with_vocabulary_list(
        "Airline", vocabulary_list=pd.Series.unique(df.Airline).tolist())
    flight_no = feature_column.categorical_column_with_vocabulary_list(
        "flight_no", vocabulary_list=pd.Series.unique(df.flight_no).tolist())
    arrival_one_hot = feature_column.indicator_column(arrival)
    airline_one_hot = feature_column.indicator_column(airline)
    flight_no_one_hot = feature_column.indicator_column(flight_no)
    arrival_length = len(pd.Series.unique(df.Arrival).tolist())
    arrival_and_week = feature_column.crossed_column(
        [arrival, week], hash_bucket_size=(arrival_length * 52))
    arrival_and_week = feature_column.indicator_column(arrival_and_week)
    airline_length = len(pd.Series.unique(df.Airline).tolist())
    year_and_airline = feature_column.crossed_column(
        [year, airline], hash_bucket_size=(airline_length * 4))
    year_and_airline = feature_column.indicator_column(year_and_airline)
    feature_columns = [
        week, arrival_one_hot, airline_one_hot, flight_no_one_hot, hour,
        arrival_and_week, year, year_and_airline
    ]
    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
    return feature_layer
Example #28
  def _base():
    education_num = fc.numeric_column('education_num')
    capital_gain = fc.numeric_column('capital_gain')
    capital_loss = fc.numeric_column('capital_loss')
    hours_per_week = fc.numeric_column('hours_per_week')

    # categorical columns from vocabulary files, wrapped as indicator columns
    relationship = fc.categorical_column_with_vocabulary_file(
        'relationship', vocabulary_file='data/relationship')
    relationship = fc.indicator_column(relationship)

    education = fc.categorical_column_with_vocabulary_file(
        'education', vocabulary_file='data/education')
    education = fc.indicator_column(education)

    race = fc.categorical_column_with_vocabulary_file(
        'race', vocabulary_file='data/race')
    race = fc.indicator_column(race)

    occupation = fc.indicator_column(
        fc.categorical_column_with_hash_bucket('occupation', 20))
    return [education_num, capital_gain, capital_loss, hours_per_week,
            relationship, education, race, occupation]
Example #29
    def crossed_feature_columns(self,
                                columns_crossed,
                                nameOfLayer,
                                bucket_size=10):
        crossed_feature = feature_column.crossed_column(
            columns_crossed, hash_bucket_size=bucket_size)
        crossed_feature = feature_column.indicator_column(crossed_feature)
        self.sparse_columns[nameOfLayer] = crossed_feature
        return crossed_feature
Example #30
def create_deep_feature_columns():
  phoneOs = fc.indicator_column(
    fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))
  # context feature
  matchScore = fc.numeric_column("matchScore", default_value=0.0)
  popScore = fc.numeric_column("popScore", default_value=0.0)
  brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0, normalizer_fn=truncate)
  cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0, normalizer_fn=truncate)
  catePrefer = fc.numeric_column("catePrefer", default_value=0.0, normalizer_fn=truncate)
  sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0, normalizer_fn=truncate)
  matchType = fc.indicator_column(fc.categorical_column_with_identity("matchType", 9, default_value=0))
  triggerNum = fc.indicator_column(fc.categorical_column_with_identity("triggerNum", 41, default_value=40))
  triggerRank = fc.indicator_column(fc.categorical_column_with_identity("triggerRank", 41, default_value=40))
  sceneType = fc.indicator_column(fc.categorical_column_with_identity("type", 2, default_value=0))

  columns = [matchScore, matchType, triggerNum, triggerRank, sceneType,
             phoneOs, popScore, sellerPrefer, brandPrefer, cate2Prefer, catePrefer]
  print("deep feature columns:", columns)
  return columns