def test_crossed_column():
    """ crossed column测试 """
    #源数据
    featrues = {
        'price': [['A'], ['B'], ['C']],  # 0,1,2
        'color': [['R'], ['G'], ['B']]  # 0,1,2
    }
    # categorical_column
    price = feature_column.categorical_column_with_vocabulary_list(
        'price', ['A', 'B', 'C', 'D'])
    color = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'])

    # crossed_column produces a sparse representation
    p_x_c = feature_column.crossed_column([price, color], 16)

    # Dense representation
    p_x_c_indicator = feature_column.indicator_column(p_x_c)

    # input_layer connects the crossed column to the source data
    p_x_c_indicator_dense_tensor = feature_column.input_layer(
        features, [p_x_c_indicator])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
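        # Expected shape: [3, 16] (batch_size x hash_bucket_size); each row is the
        # multi-hot of the hashed (price, color) cross for that example.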
        print(session.run([p_x_c_indicator_dense_tensor]))
Example #2
def get_item_feature_columns(business_vocab_list, item_type_dict):

    items_feature_columns = []

    bucketized_boundary = {'stars': [2.5, 4]}
    embedding_size = {"categories": 8, "city": 4}

    for k, v in business_vocab_list.items():

        if k in ['review_count']:
            col = numeric_column(k, default_value=0, dtype=item_type_dict[k])
        elif k in ['stars']:
            col = bucketized_column(
                numeric_column(k, default_value=0, dtype=item_type_dict[k]),
                bucketized_boundary[k])
        elif k in ['categories', 'city']:
            col = embedding_column(categorical_column_with_vocabulary_list(
                k, sorted(v), default_value=-1, dtype=item_type_dict[k]),
                                   dimension=embedding_size[k])
        else:
            col = indicator_column(
                categorical_column_with_vocabulary_list(
                    k, sorted(v), default_value=-1, dtype=item_type_dict[k]))

        items_feature_columns.append(col)

    return items_feature_columns
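A minimal usage sketch with hypothetical inputs (the real vocab lists and dtypes would come from the business data):

item_type_dict = {'review_count': tf.int64, 'stars': tf.float32, 'city': tf.string}
business_vocab_list = {'review_count': [], 'stars': [], 'city': ['Phoenix', 'Las Vegas']}
item_cols = get_item_feature_columns(business_vocab_list, item_type_dict)
# -> [numeric 'review_count', bucketized 'stars', 4-dim 'city' embedding]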
Example #3
def create_feature_layer() -> tf.keras.layers.DenseFeatures:
    # Feature column for height
    feature_height = feature_column.numeric_column("Groesse")

    # Feature column for weight
    feature_weight = feature_column.numeric_column("Gewicht")

    # Feature column for age
    feature_age = feature_column.numeric_column("Alter")

    # Category column for gender
    feature_gender = feature_column.categorical_column_with_vocabulary_list(
        'Geschlecht', ['w', 'm'])
    feature_gender_one_hot = feature_column.indicator_column(feature_gender)

    # Category column for activities
    feature_activities = feature_column.categorical_column_with_vocabulary_list(
        'Betaetigung', ['keinSport', 'Kraftsport', 'Ausdauersport'])
    feature_activities_one_hot = feature_column.indicator_column(
        feature_activities)

    feature_columns = [
        feature_height, feature_weight, feature_age, feature_gender_one_hot,
        feature_activities_one_hot
    ]

    return tf.keras.layers.DenseFeatures(feature_columns)
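A minimal sketch of applying the layer (TF2 eager execution; the values are made up):

layer = create_feature_layer()
batch = {
    'Groesse': tf.constant([[180.0]]),
    'Gewicht': tf.constant([[75.0]]),
    'Alter': tf.constant([[30.0]]),
    'Geschlecht': tf.constant([['m']]),
    'Betaetigung': tf.constant([['Kraftsport']]),
}
print(layer(batch))  # shape [1, 8]: 3 numeric values + 2-wide gender + 3-wide activity one-hots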
Example #4
    def _get_tf_feature_cols(dataframe: pd.DataFrame):
        feature_columns = []

        # numeric cols
        for header in ['PhotoAmt', 'Fee', 'Age']:
            feature_columns.append(feature_column.numeric_column(header))

        # bucketized cols
        age = feature_column.numeric_column('Age')
        age_buckets = feature_column.bucketized_column(
            age, boundaries=[1, 2, 3, 4, 5])
        feature_columns.append(age_buckets)

        # indicator_columns
        indicator_column_names = [
            'Type', 'Color1', 'Color2', 'Gender', 'MaturitySize', 'FurLength',
            'Vaccinated', 'Sterilized', 'Health'
        ]
        for col_name in indicator_column_names:
            categorical_column = feature_column.categorical_column_with_vocabulary_list(
                col_name, dataframe[col_name].unique())
            indicator_column = feature_column.indicator_column(
                categorical_column)
            feature_columns.append(indicator_column)

        # embedding columns
        breed1 = feature_column.categorical_column_with_vocabulary_list(
            'Breed1', dataframe.Breed1.unique())
        breed1_embedding = feature_column.embedding_column(breed1, dimension=8)
        feature_columns.append(breed1_embedding)
        return feature_columns
Example #5
def define_feature_columns(dataframe):

    print("Defining feature columns...")
    feature_columns = []

    # Create embedding column for name IDs
    name_id = feature_column.categorical_column_with_vocabulary_list(
        'nconst', dataframe.nconst.unique())
    # Dimension set to 30 (approximately fourth root of the number of unique name IDs)
    name_id_embedding = feature_column.embedding_column(name_id, dimension=30)
    feature_columns.append(name_id_embedding)

    # Create indicator columns for category and genres
    indicator_column_names = ['category', 'genres']
    for col_name in indicator_column_names:
        categorical_column = feature_column.categorical_column_with_vocabulary_list(
            col_name, dataframe[col_name].unique())
        indicator_column = feature_column.indicator_column(categorical_column)
        feature_columns.append(indicator_column)

    # Create bucketized column for startYear (a.k.a. release date)
    start_year_numeric = feature_column.numeric_column('startYear')
    start_year_bucket = feature_column.bucketized_column(
        start_year_numeric, boundaries=[1927, 1940, 1950, 1960, 1970, 1980, 1990, 1995, 2000, 2005, 2010, 2015])
    feature_columns.append(start_year_bucket)

    print("Feature columns defined")
    return feature_columns
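A minimal sketch with a toy DataFrame (hypothetical IMDb-style values):

import pandas as pd

toy_df = pd.DataFrame({
    'nconst': ['nm0000001', 'nm0000002'],
    'category': ['actor', 'director'],
    'genres': ['Drama', 'Comedy'],
    'startYear': [1994, 2001],
})
feature_columns = define_feature_columns(toy_df)  # embedding + two indicators + bucketized year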
Example #6
def create_feature_columns():
  # user feature
  bids = fc.categorical_column_with_hash_bucket("behaviorBids", 10240, dtype=tf.int64)
  c1ids = fc.categorical_column_with_hash_bucket("behaviorC1ids", 100, dtype=tf.int64)
  cids = fc.categorical_column_with_hash_bucket("behaviorCids", 10240, dtype=tf.int64)
  sids = fc.categorical_column_with_hash_bucket("behaviorSids", 10240, dtype=tf.int64)
  pids = fc.categorical_column_with_hash_bucket("behaviorPids", 1000000, dtype=tf.int64)
  bids_weighted = fc.weighted_categorical_column(bids, "bidWeights")
  c1ids_weighted = fc.weighted_categorical_column(c1ids, "c1idWeights")
  cids_weighted = fc.weighted_categorical_column(cids, "cidWeights")
  sids_weighted = fc.weighted_categorical_column(sids, "sidWeights")
  pids_weighted = fc.weighted_categorical_column(pids, "pidWeights")

  # item feature
  pid = fc.categorical_column_with_hash_bucket("productId", 1000000, dtype=tf.int64)
  sid = fc.categorical_column_with_hash_bucket("sellerId", 10240, dtype=tf.int64)
  bid = fc.categorical_column_with_hash_bucket("brandId", 10240, dtype=tf.int64)
  c1id = fc.categorical_column_with_hash_bucket("cate1Id", 100, dtype=tf.int64)
  cid = fc.categorical_column_with_hash_bucket("cateId", 10240, dtype=tf.int64)

  # context feature
  matchScore = fc.numeric_column("matchScore", default_value=0.0)
  popScore = fc.numeric_column("popScore", default_value=0.0)
  brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0)
  cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0)
  catePrefer = fc.numeric_column("catePrefer", default_value=0.0)
  sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0)
  matchType = fc.indicator_column(fc.categorical_column_with_identity("matchType", 9, default_value=0))
  position = fc.indicator_column(fc.categorical_column_with_identity("position", 201, default_value=200))
  triggerNum = fc.indicator_column(fc.categorical_column_with_identity("triggerNum", 51, default_value=50))
  triggerRank = fc.indicator_column(fc.categorical_column_with_identity("triggerRank", 51, default_value=50))
  sceneType = fc.indicator_column(fc.categorical_column_with_identity("type", 2, default_value=0))
  hour = fc.indicator_column(fc.categorical_column_with_identity("hour", 24, default_value=0))
  phoneBrand = fc.indicator_column(fc.categorical_column_with_hash_bucket("phoneBrand", 1000))
  phoneResolution = fc.indicator_column(fc.categorical_column_with_hash_bucket("phoneResolution", 500))
  phoneOs = fc.indicator_column(
    fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))
  tab = fc.indicator_column(fc.categorical_column_with_vocabulary_list("tab",
        ["ALL", "TongZhuang", "XieBao", "MuYing", "NvZhuang", "MeiZhuang", "JuJia", "MeiShi"], default_value=0))

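  # shared_embedding_columns makes the weighted behavior-sequence column and the single
  # item-id column share one embedding table, so user-side and item-side ids are
  # embedded in the same vector space (the collection name groups the shared variables).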
  pid_embed = fc.shared_embedding_columns([pids_weighted, pid], 64, combiner='sum', shared_embedding_collection_name="pid")
  bid_embed = fc.shared_embedding_columns([bids_weighted, bid], 32, combiner='sum', shared_embedding_collection_name="bid")
  cid_embed = fc.shared_embedding_columns([cids_weighted, cid], 32, combiner='sum', shared_embedding_collection_name="cid")
  c1id_embed = fc.shared_embedding_columns([c1ids_weighted, c1id], 10, combiner='sum', shared_embedding_collection_name="c1id")
  sid_embed = fc.shared_embedding_columns([sids_weighted, sid], 32, combiner='sum', shared_embedding_collection_name="sid")
  global my_feature_columns
  my_feature_columns = [matchScore, matchType, position, triggerNum, triggerRank, sceneType, hour, phoneBrand, phoneResolution,
             phoneOs, tab, popScore, sellerPrefer, brandPrefer, cate2Prefer, catePrefer]
  my_feature_columns += pid_embed
  my_feature_columns += sid_embed
  my_feature_columns += bid_embed
  my_feature_columns += cid_embed
  my_feature_columns += c1id_embed
  print("feature columns:", my_feature_columns)
  return my_feature_columns
Example #7
def add_feature_columns(train_df):
    feature_columns = []

    feature_columns.append(tf.feature_column.numeric_column('day_of_year'))
    feature_columns.append(tf.feature_column.numeric_column('hour_of_day'))

    # boundaries_day_of_year = list(np.arange(int(min(train_df['day_of_year'])), int(max(train_df['day_of_year'])), 1))
    # bucketized_day_of_year = tf.feature_column.bucketized_column(tf.feature_column.numeric_column("day_of_year"), boundaries_day_of_year)
    # boundaries_hour = list(np.arange(int(min(train_df['hour'])), int(max(train_df['hour'])), 1))
    # bucketized_hour = tf.feature_column.bucketized_column(tf.feature_column.numeric_column("hour"), boundaries_hour)
    # latitude_x_longitude = tf.feature_column.crossed_column([bucketized_day_of_year, bucketized_hour], hash_bucket_size=2200)
    # crossed_feature = tf.feature_column.indicator_column(latitude_x_longitude)
    # feature_columns.append(crossed_feature)

    wind = tf.feature_column.numeric_column("wind_speed")
    feature_columns.append(wind)

    wind = tf.feature_column.numeric_column("wind_direction")
    feature_columns.append(wind)

    humidity = tf.feature_column.numeric_column("humidity")
    feature_columns.append(humidity)

    temperature = tf.feature_column.numeric_column("temperature")
    feature_columns.append(temperature)

    temperature = tf.feature_column.numeric_column("increased_traffic")
    feature_columns.append(temperature)

    print(train_df['weather_code'].unique())
    weather_current = tf.feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list(
            "weather_code", train_df['weather_code'].unique()))
    feature_columns.append(weather_current)

    print(train_df['past_weather_code'].unique())
    weather_past = tf.feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list(
            "past_weather_code", train_df['past_weather_code'].unique()))
    feature_columns.append(weather_past)

    # pm10_last = tf.feature_column.numeric_column("pm10_last")
    # feature_columns.append(pm10_last)
    #
    # pm10_last = tf.feature_column.numeric_column("pm25_last")
    # feature_columns.append(pm10_last)
    #
    # pm10_last2 = tf.feature_column.numeric_column("pm10_last2")
    # feature_columns.append(pm10_last2)
    #
    # pm25_last2 = tf.feature_column.numeric_column("pm25_last2")
    # feature_columns.append(pm25_last2)

    return feature_columns
Example #8
def get_feature_columns(dataframe):
    """Creates feature columns from pd.DataFrame."""
    feature_columns = []
    feature_layer_inputs = {}

    # numeric cols
    for col_name in ['PhotoAmt', 'Fee', 'Age']:
        feature_columns.append(feature_column.numeric_column(col_name))
        feature_layer_inputs[col_name] = tf.keras.Input(shape=(1, ),
                                                        name=col_name)

    # bucketized cols
    age = feature_column.numeric_column('Age')
    age_buckets = feature_column.bucketized_column(age,
                                                   boundaries=[1, 2, 3, 4, 5])
    feature_columns.append(age_buckets)

    # indicator_columns
    indicator_column_names = [
        'Type', 'Color1', 'Color2', 'Gender', 'MaturitySize', 'FurLength',
        'Vaccinated', 'Sterilized', 'Health'
    ]
    for col_name in indicator_column_names:
        categorical_column = feature_column.categorical_column_with_vocabulary_list(
            col_name, dataframe[col_name].unique())
        indicator_column = feature_column.indicator_column(categorical_column)
        feature_columns.append(indicator_column)
        feature_layer_inputs[col_name] = tf.keras.Input(shape=(1, ),
                                                        name=col_name,
                                                        dtype=tf.string)

    # embedding columns
    breed1 = feature_column.categorical_column_with_vocabulary_list(
        'Breed1', dataframe.Breed1.unique())
    breed1_embedding = feature_column.embedding_column(breed1, dimension=16)
    feature_columns.append(breed1_embedding)
    feature_layer_inputs['Breed1'] = tf.keras.Input(shape=(1, ),
                                                    name='Breed1',
                                                    dtype=tf.string)

    # crossed columns
    animal_type = feature_column.categorical_column_with_vocabulary_list(
        'Type', ['Cat', 'Dog'])
    feature_columns.append(feature_column.indicator_column(animal_type))
    age_type_feature = feature_column.crossed_column(
        [age_buckets, animal_type], hash_bucket_size=100)
    feature_columns.append(feature_column.indicator_column(age_type_feature))
    feature_layer_inputs['Type'] = tf.keras.Input(shape=(1, ),
                                                  name='Type',
                                                  dtype=tf.string)

    return feature_columns, feature_layer_inputs
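A minimal sketch of wiring the returned columns and inputs into a Keras functional model (df is assumed to be a PetFinder-style DataFrame):

feature_columns, feature_layer_inputs = get_feature_columns(df)
x = tf.keras.layers.DenseFeatures(feature_columns)(feature_layer_inputs)
output = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model = tf.keras.Model(inputs=list(feature_layer_inputs.values()), outputs=output)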
Example #9
def create_feature_columns(train_data):
    n_users = train_data.user.nunique()
    users = fc.categorical_column_with_vocabulary_list("user", np.arange(n_users), default_value=-1, dtype=tf.int64)
    gender = fc.categorical_column_with_vocabulary_list("gender", ["M", "F"])
    age = fc.categorical_column_with_vocabulary_list("age", [1, 18, 25, 35, 45, 50, 56], dtype=tf.int64)
    occupation = fc.categorical_column_with_vocabulary_list("occupation", np.arange(21), dtype=tf.int64)

    all_feature_cols = [fc.embedding_column(users, 32),
                        fc.indicator_column(gender),
                        fc.embedding_column(age, 32),
                        fc.embedding_column(occupation, 32)]

    return all_feature_cols
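A minimal sketch with a toy MovieLens-style frame (values are made up; only the user column is actually inspected):

import pandas as pd

toy = pd.DataFrame({'user': [0, 1, 2, 1], 'gender': ['M', 'F', 'M', 'F'],
                    'age': [18, 25, 1, 45], 'occupation': [4, 7, 0, 20]})
cols = create_feature_columns(toy)  # user/age/occupation embeddings + gender indicator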
Example #10
    def _generate_cat_column(name, data, vocab_threshold=50, bucket_size=100):
        """Generate a feature column from a categorical string data set

        Parameters
        ----------
        name : str
            Name of the categorical column
        data : np.ndarray | list
            String data array
        vocab_threshold : int
            Number of unique entries in the data array below which this
            will use a vocabulary list, above which a hash bucket will be used.
        bucket_size : int
            Hash bucket size.

        Returns
        -------
        f_col : IndicatorColumn
            Categorical feature column.
        """

        n_unique = len(set(data))

        if n_unique < vocab_threshold:
            f_col = feature_column.categorical_column_with_vocabulary_list(
                name, list(set(data)))
        else:
            f_col = feature_column.categorical_column_with_hash_bucket(
                name, bucket_size)

        f_col = feature_column.indicator_column(f_col)

        return f_col
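A minimal usage sketch (hypothetical data; calling it as a free function):

color_col = _generate_cat_column('color', ['red', 'green', 'blue', 'red'])  # 3 unique values -> vocabulary list
id_col = _generate_cat_column('id', [str(i) for i in range(500)])           # 500 unique values -> hash bucket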
Example #11
def test_weighted_categorical_column():
    # 1. Input features
    color_data = {
        'color': [['R'], ['G'], ['B'], ['A']],
        'weight': [[1.0], [2.0], [4.0], [8.0]]
    }
    # 2. Feature columns (Sparse)
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    # 2. Feature columns (Sparse, weighted)
    color_weight_categorical_column = feature_column.weighted_categorical_column(
        color_column, 'weight')
    builder = _LazyBuilder(color_data)
    id_tensor, weight = color_weight_categorical_column._get_sparse_tensors(
        builder)

    with tf.Session() as session:
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('weighted categorical' + '-' * 40)
        print(session.run([id_tensor]))
        print('-' * 40)
        print(session.run([weight]))

    # 2. Feature columns (Dense)
    weighted_column = feature_column.indicator_column(
        color_weight_categorical_column)
    # 3. Feature tensor
    weighted_column_dense_tensor = feature_column.input_layer(
        color_data, [weighted_column])
    with tf.Session() as session:
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([weighted_column_dense_tensor]))
Example #12
def test_embedding():
    tf.set_random_seed(1)
    # 1. Input features
    color_data = {'color': [['R', 'G'], ['G', 'A'], ['B', 'B'], ['A', 'A']]}
    builder = _LazyBuilder(color_data)
    # 2. Feature columns (Sparse)
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    color_column_tensor = color_column._get_sparse_tensors(builder)
    with tf.Session() as session:
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))

    # 2. Feature columns (Dense)
    color_embedding = feature_column.embedding_column(color_column,
                                                      4,
                                                      combiner='sum')
    # 3. Feature tensor
    color_embedding_dense_tensor = feature_column.input_layer(
        color_data, [color_embedding])

    with tf.Session() as session:
        # Embedding needs variables (weights) to do the embedding
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('embedding' + '_' * 40)
        print(session.run([color_embedding_dense_tensor]))
Example #13
def test_embedding():
    tf.set_random_seed(1)
    # Source data
    color_data = {
        'color': [['R', 'G'], ['G', 'A'], ['B', 'B'], ['A', 'A']]
    }  # 4 sample rows
    builder = _LazyBuilder(color_data)

    # To use an embedding, first declare the source column as a categorical_column (this only declares the column; no data is attached yet)
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    # Express the data source as a (sparse) tensor
    color_column_tensor = color_column._get_sparse_tensors(builder)

    # Build the embedding_column: the first argument is the categorical_column, the second is the embedding dimension
    color_embedding_column = feature_column.embedding_column(color_column,
                                                             4,
                                                             combiner='sum')

    # Convert to a dense tensor: input_layer(data, columns) connects the data source to the embedding_column
    color_embedding_dense_tensor = feature_column.input_layer(
        color_data, [color_embedding_column])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))
        print('embedding' + '_' * 40)
        print(session.run([color_embedding_dense_tensor]))
Example #14
def make_columns():
  user_id = fc.embedding_column(fc.categorical_column_with_vocabulary_file(
    'user_id', vocabulary_file='user_id', dtype=tf.string, num_oov_buckets=4), dimension=6)

  partner_ids = fc.categorical_column_with_vocabulary_file(
    'reserve_partner_car_type_id', vocabulary_file='partner_car_type_id', dtype=tf.string, num_oov_buckets=1)
  partner_ids_embedding = fc.embedding_column(partner_ids, dimension=3)
  # partner_ids_embedding = fc.indicator_column(partner_ids)

  dayofweek = fc.embedding_column(fc.categorical_column_with_vocabulary_list(
    'dayofweek', [str(d) for d in range(0, 8)], dtype=tf.string, num_oov_buckets=1), dimension=3)
  timeSlice = fc.embedding_column(fc.categorical_column_with_vocabulary_file(
    'timeSlice', vocabulary_file='timeSlice', dtype=tf.string, num_oov_buckets=1), dimension=3)

  sHexID = fc.embedding_column(fc.categorical_column_with_vocabulary_file(
    'sHexID', vocabulary_file='sHexID', num_oov_buckets=1), dimension=6)
  eHexID = fc.embedding_column(fc.categorical_column_with_vocabulary_file(
    'eHexID', vocabulary_file='eHexID', num_oov_buckets=1), dimension=6)
  order_columns = [
    fc.numeric_column('dist')
    ]

  user_columns = [fc.numeric_column(c) for c in user_null_columns + user_float32_columns]
  spacetime_columns = [fc.numeric_column(c) for c in int64_columns + float32_columns]

  embedding_columns = [user_id, partner_ids_embedding, dayofweek, timeSlice, sHexID]
  # embedding_columns = [dayofweek]
  # return embedding_columns + order_columns, embedding_columns + order_columns + spacetime_columns
  return embedding_columns, order_columns, spacetime_columns, user_columns
Example #15
def test_weighted_categorical_column():
    color_data = {
        'color': [['R'], ['G'], ['B'], ['A']],
        'weight': [[1.0], [2.0], [4.0], [8.0]]
    }  # 4 sample rows

    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)

    color_weight_categorical_column = feature_column.weighted_categorical_column(
        color_column, 'weight')

    builder = _LazyBuilder(color_data)

    with tf.Session() as session:
        id_tensor, weight = color_weight_categorical_column._get_sparse_tensors(
            builder)

        session.run(tf.global_variables_initializer())

        session.run(tf.tables_initializer())

        print('weighted categorical' + '-' * 40)

        print(session.run([id_tensor]))
        print('-' * 40)
        print(session.run([weight]))
Example #16
def participant_focus(part_f):
    participant_focus = feature_column.categorical_column_with_vocabulary_list(
        part_f, [
            'engineering', 'sales', 'marketing', 'management',
            'financial', 'other', 'none'
        ])
    return participant_focus
Example #17
def _add_weighted_embedding_columns(columns, features, feature_table, vocabulary):
    for f in features:
        assert f in feature_table
        weighted_column = fc.weighted_categorical_column(
            fc.categorical_column_with_vocabulary_list(f, vocabulary.vocab[f]), f + _WEIGHTED_SUFFIX)
        emb_weighted_column = fc.embedding_column(weighted_column, feature_table[f].emb_width, combiner='sqrtn')
        columns.append(emb_weighted_column)
Example #18
def test_weighted_cate_column():
    # NOTE: id='' marks a missing value; its weight must be 0, otherwise the id and
    # weight lists differ in length and an error is raised.
    # NOTE: weights must be floats; passing ints raises an error.
    x_values = {
        'id': [[b'a', b'z', b'a', b'c'], [b'b', b'', b'd', b'b']],
        'weight': [[1.0, 2.0, -3.0, 4.0], [5.0, 0.0, 7.0, -8.0]]
    }
    builder = _LazyBuilder(x_values)  # lazy representation of input

    # ================== define ops
    sparse_id_featcol = feature_column.categorical_column_with_vocabulary_list(
        'id', ['a', 'b', 'c'], dtype=tf.string, default_value=-1)
    sparse_featcol = feature_column.weighted_categorical_column(
        categorical_column=sparse_id_featcol, weight_feature_key='weight')
    x_sparse_tensor = sparse_featcol._get_sparse_tensors(builder)

    # indicator_column converts the sparse tensor into a dense multi-hot encoding (MHE), shape=[batch_size, #tokens],
    # where each entry is the sum of all weights with which that token occurs
    dense_featcol = feature_column.indicator_column(sparse_featcol)
    x_dense_tensor = feature_column.input_layer(x_values, [dense_featcol])

    # ================== run
    with tf.Session() as sess:
        # The lookup table must be initialized, otherwise an error is raised
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())

        id_sparse_value, weight_sparse_value = sess.run(
            [x_sparse_tensor.id_tensor, x_sparse_tensor.weight_tensor])

        print("************************* sparse id tensor")
        # the sparse id_tensor keeps the shape of the raw input, [batch_size, max_tokens_per_example]=[2,4]
        # SparseTensorValue(indices=array(
        #       [[0, 0],
        #        [0, 1],
        #        [0, 2],
        #        [0, 3],
        #        [1, 0],
        #        [1, 2],
        #        [1, 3]]), values=array([ 0, -1,  0,  2,  1, -1,  1]), dense_shape=array([2, 4]))
        print(id_sparse_value)

        print("************************* sparse weight tensor")
        # the sparse weight_tensor keeps the shape of the raw input, [batch_size, max_tokens_per_example]=[2,4]
        # SparseTensorValue(indices=array(
        #       [[0, 0],
        #        [0, 1],
        #        [0, 2],
        #        [0, 3],
        #        [1, 0],
        #        [1, 2],
        #        [1, 3]]), values=array([ 1.,  2., -3.,  4.,  5.,  7., -8.], dtype=float32), dense_shape=array([2, 4]))
        print(weight_sparse_value)

        print("************************* dense MHE tensor")
        # indicator_column converts the sparse tensor into a dense MHE tensor, shape=[batch_size, total_tokens_in_vocab],
        # where each value is the sum of all weights with which that token occurs
        # [[-2.  0.  4.]
        #  [ 0. -3.  0.]]
        print(sess.run(x_dense_tensor))
Example #19
def build_feature_layer():
    feature_columns = []

    report_id = feature_column.categorical_column_with_vocabulary_list('report_id', [1, 2, 3, 4, 5])
    report_id_one_hot = feature_column.indicator_column(report_id)
    feature_columns.append(report_id_one_hot)

    feature_columns.append(feature_column.numeric_column('report_params'))

    day_part = feature_column.categorical_column_with_vocabulary_list('day_part', [1, 2, 3])
    day_part_one_hot = feature_column.indicator_column(day_part)
    feature_columns.append(day_part_one_hot)
    
    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
    
    return feature_layer
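A minimal sketch of applying the layer (TF2 eager execution; the values are made up):

layer = build_feature_layer()
batch = {'report_id': tf.constant([[3]]),
         'report_params': tf.constant([[0.5]]),
         'day_part': tf.constant([[2]])}
print(layer(batch))  # shape [1, 9]: 5-wide report_id + report_params + 3-wide day_part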
Example #20
    def transform(self, output_tensors):
        input_tensor_name = self.parameters.get("input_tensor")
        output_tensor_name = self.parameters.get("output_tensor")
        if "vocabulary_list" in self.parameters:
            vocabulary_list = self.parameters.get("vocabulary_list")
            if isinstance(vocabulary_list, list):
                vocabulary_list = tuple(vocabulary_list)
            if isinstance(vocabulary_list, str):
                vocabulary_list = tuple(vocabulary_list.split(','))

        else:
            msg = "parameters error, sparse_column_with_keys must need keys"
            logger.error(msg)
            raise ParametersError(msg)


        # combiner = self.parameters.get("combiner", 'sum')
        dtype = self.get_value_tf_type("dtype") if self.get_value_tf_type("dtype") is not None else tf.string
        # default_value = self.parameters.get("default_value",None)
        num_oov_buckets = 1

        output_tensors[output_tensor_name] = fc.categorical_column_with_vocabulary_list(
            key = input_tensor_name,
            vocabulary_list = vocabulary_list,
            dtype=dtype,
            # default_value=default_value,
            num_oov_buckets=num_oov_buckets
        )
Example #21
def test_linear_model():

    features = {
        'price': [[1.0], [5.0], [10.0]],
        'color': [['R'], ['G'], ['B']]
    }

    price_column = feature_column.numeric_column('price')
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'])
    prediction = feature_column.linear_model(features,
                                             [price_column, color_column])

    bias = get_linear_model_bias()
    price_var = get_linear_model_column_var(price_column)
    color_var = get_linear_model_column_var(color_column)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(tf.tables_initializer())

        sess.run(bias.assign([7.0]))
        sess.run(price_var.assign([[10.0]]))
        sess.run(color_var.assign([[2.0], [2.0], [2.0]]))
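        # With these weights each prediction is bias + 10*price + 2 (the weight of the
        # one color present per row): [[19.], [59.], [109.]]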

        prediction_result = sess.run([prediction])

        print(prediction_result)
Example #22
def create_feature_columns():
  # user feature
  phoneBrandId = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
  phoneResolutionId = fc.categorical_column_with_hash_bucket("phoneResolution", 500)
  phoneBrand = fc.embedding_column(phoneBrandId, 20)
  phoneResolution = fc.embedding_column(phoneResolutionId, 10)
  phoneOs = fc.indicator_column(
    fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))
  # context feature
  matchScore = fc.numeric_column("matchScore", default_value=0.0)
  popScore = fc.numeric_column("popScore", default_value=0.0)
  brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0, normalizer_fn=truncate)
  cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0, normalizer_fn=truncate)
  catePrefer = fc.numeric_column("catePrefer", default_value=0.0, normalizer_fn=truncate)
  sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0, normalizer_fn=truncate)
  matchType = fc.indicator_column(fc.categorical_column_with_identity("matchType", 9, default_value=0))
  position = fc.indicator_column(fc.categorical_column_with_identity("position", 201, default_value=200))
  triggerNum = fc.indicator_column(fc.categorical_column_with_identity("triggerNum", 51, default_value=50))
  triggerRank = fc.indicator_column(fc.categorical_column_with_identity("triggerRank", 51, default_value=50))
  sceneType = fc.indicator_column(fc.categorical_column_with_identity("type", 2, default_value=0))
  hour = fc.indicator_column(fc.categorical_column_with_identity("hour", 24, default_value=0))

  global my_feature_columns
  my_feature_columns = [matchScore, matchType, position, triggerNum, triggerRank, sceneType, hour, phoneBrand,
                        phoneResolution, phoneOs, popScore, sellerPrefer, brandPrefer, cate2Prefer, catePrefer]
  print("feature columns:", my_feature_columns)
  return my_feature_columns
Example #23
def get_unique_categories_and_append(key):
    col = df[key]
    arr = col.to_numpy()
    unique_arr = np.unique(arr)
    feat_col = feature_column.categorical_column_with_vocabulary_list(key, unique_arr)
    one_hot = feature_column.indicator_column(feat_col)
    feature_columns.append(one_hot)
Example #24
def test_multi_value_embedding():
    color_data = {
        'color': [['G', 'G'], ['G', 'B'], ['B', 'B'], ['G', 'R'], ['R', 'R'],
                  ['B', 'R']]
    }

    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)

    color_embedding = feature_column.embedding_column(color_column, 7)
    color_embedding_dense_tensor = feature_column.input_layer(
        color_data, [color_embedding])
    builder = _LazyBuilder(color_data)
    color_column_tensor = color_column._get_sparse_tensors(builder)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('embedding' + '-' * 40)
        print(session.run([color_embedding_dense_tensor]))
Example #25
def test_categorical_column_with_vocabulary_list():

    color_data = {
        'color': [['R', 'R'], ['G', 'R'], ['B', 'G'], ['A', 'A']]
    }  # 4 sample rows

    builder = _LazyBuilder(color_data)

    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)

    color_column_tensor = color_column._get_sparse_tensors(builder)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())

        session.run(tf.tables_initializer())

        print(session.run([color_column_tensor.id_tensor]))

    # Convert the sparse column to a dense one-hot form (multi-hot here, since each row has two tokens)
    color_indicator = feature_column.indicator_column(color_column)

    color_dense_tensor = feature_column.input_layer(color_data,
                                                    [color_indicator])
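    # Expected rows (token counts over the vocab [R, G, B]; OOV ids of -1 are dropped):
    # ['R','R'] -> [2,0,0], ['G','R'] -> [1,1,0], ['B','G'] -> [0,1,1], ['A','A'] -> [0,0,0]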

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())

        session.run(tf.tables_initializer())

        print('use input_layer' + '_' * 40)
        print(session.run([color_dense_tensor]))
Example #26
def build_feature_columns():
    age = feature_column.numeric_column('age')
    age_bucket = feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    workclass = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('workclass',
                ['Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov', 
                'Local-gov', 'State-gov', 'Without-pay', 'Never-worked']))
    fnlwgt = feature_column.numeric_column('fnlwgt')
    education = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('education',
                ['Bachelors', 'Some-college', '11th', 'HS-grad', 'Prof-school',
                 'Assoc-acdm', 'Assoc-voc', '9th', '7th-8th', '12th', 'Masters',
                 '1st-4th', '10th', 'Doctorate', '5th-6th', 'Preschool']))
    education_num = feature_column.numeric_column('education_num')
    marital_status = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('marital_status',
                ['Married-civ-spouse', 'Divorced', 'Never-married', 'Separated', 'Widowed',
                 'Married-spouse-absent', 'Married-AF-spouse']))
    occupation = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('occupation',
                ['Tech-support', 'Craft-repair', 'Other-service', 'Sales', 'Exec-managerial',
                 'Prof-specialty', 'Handlers-cleaners', 'Machine-op-inspct', 'Adm-clerical', 
                 'Farming-fishing', 'Transport-moving', 'Priv-house-serv', 'Protective-serv', 
                 'Armed-Forces']))
    relationship = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('relationship',
                ['Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative', 'Unmarried']))
    race = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('race', 
                ['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black']))
    gender = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('gender', 
                ['Female', 'Male']))    
    capital_gain = feature_column.numeric_column('capital_gain') 
    capital_loss = feature_column.numeric_column('capital_loss')
    hours_per_week = feature_column.numeric_column('hours_per_week')
    native_country = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('native_country',
                ['United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada', 'Germany',
                 'Outlying-US(Guam-USVI-etc)', 'India', 'Japan', 'Greece', 'South', 'China', 
                 'Cuba', 'Iran', 'Honduras', 'Philippines', 'Italy', 'Poland', 'Jamaica', 'Vietnam',
                 'Mexico', 'Portugal', 'Ireland', 'France', 'Dominican-Republic', 'Laos', 'Ecuador',
                 'Taiwan', 'Haiti', 'Columbia', 'Hungary', 'Guatemala', 'Nicaragua', 'Scotland',
                 'Thailand', 'Yugoslavia', 'El-Salvador', 'Trinadad&Tobago', 'Peru', 'Hong', 
                 'Holand-Netherlands']))
    race_gender = feature_column.indicator_column(feature_column.crossed_column([
        feature_column.categorical_column_with_vocabulary_list('race', ['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black']),
        feature_column.categorical_column_with_vocabulary_list('gender', ['Female', 'Male']) ], hash_bucket_size=10))

    wide = [age_bucket, workclass, fnlwgt, education, education_num, occupation, relationship, race, gender, capital_gain, capital_loss, hours_per_week, native_country, race_gender]    
    deep = [age, workclass, fnlwgt, education, education_num, occupation, relationship, race, gender, capital_gain, capital_loss, hours_per_week, native_country]
    return (wide, deep)
Example #27
    def data_preprocessing(self):
        """
        batch_size = 5  # A small batch size is used for the example.
        train_ds = self.df_to_dataset(self.train, batch_size=batch_size)
        val_ds = self.df_to_dataset(self.val, shuffle=False, batch_size=batch_size)
        test_ds = self.df_to_dataset(self.test, shuffle=False, batch_size=batch_size)

        for feature_batch, label_batch in train_ds.take(1):
            print('All features:', list(feature_batch.keys()))
            print('A batch of ages:', feature_batch['age'])
            print('A batch of targets:', label_batch)

        # Create an example batch to try out the feature columns.
        self.example_batch = next(iter(train_ds))[0]

        age = feature_column.numeric_column("age")
        self.demo(age)
        """
        feature_columns = []

        # Numeric columns
        for header in [
                'age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca'
        ]:
            feature_columns.append(feature_column.numeric_column(header))

        # Bucketized columns
        age = feature_column.numeric_column("age")
        age_buckets = feature_column.bucketized_column(
            age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
        feature_columns.append(age_buckets)

        # Categorical columns
        thal = feature_column.categorical_column_with_vocabulary_list(
            'thal', ['fixed', 'normal', 'reversible'])
        thal_one_hot = feature_column.indicator_column(thal)
        feature_columns.append(thal_one_hot)

        # Embedding columns
        thal_embedding = feature_column.embedding_column(thal, dimension=8)
        feature_columns.append(thal_embedding)

        # Crossed feature columns
        crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                        hash_bucket_size=1000)
        crossed_feature = feature_column.indicator_column(crossed_feature)
        feature_columns.append(crossed_feature)

        self.feature_layer = layers.DenseFeatures(feature_columns)

        batch_size = 32
        self.train_ds = self.df_to_dataset(self.train, batch_size=batch_size)
        self.val_ds = self.df_to_dataset(self.val,
                                         shuffle=False,
                                         batch_size=batch_size)
        self.test_ds = self.df_to_dataset(self.test,
                                          shuffle=False,
                                          batch_size=batch_size)
Example #28
def create_feature_layer(df):
    week = feature_column.numeric_column("Week")
    boundaries = []
    for i in range(1, 53):
        boundaries.append(i)
    week = feature_column.bucketized_column(week, boundaries=boundaries)
    day = feature_column.numeric_column("Day")
    boundaries = []
    for i in range(1, 8):
        boundaries.append(i)
    day = feature_column.bucketized_column(day, boundaries=boundaries)
    year = feature_column.numeric_column("Year")
    boundaries = []
    for i in range(2013, 2017):
        boundaries.append(i)
    year = feature_column.bucketized_column(year, boundaries=boundaries)
    hour = feature_column.numeric_column("std_hour")
    boundaries = []
    for i in range(0, 24):
        boundaries.append(i)
    hour = feature_column.bucketized_column(hour, boundaries=boundaries)
    arrival = feature_column.categorical_column_with_vocabulary_list(
        "Arrival", vocabulary_list=pd.Series.unique(df.Arrival).tolist())
    airline = feature_column.categorical_column_with_vocabulary_list(
        "Airline", vocabulary_list=pd.Series.unique(df.Airline).tolist())
    flight_no = feature_column.categorical_column_with_vocabulary_list(
        "flight_no", vocabulary_list=pd.Series.unique(df.flight_no).tolist())
    arrival_one_hot = feature_column.indicator_column(arrival)
    airline_one_hot = feature_column.indicator_column(airline)
    flight_no_one_hot = feature_column.indicator_column(flight_no)
    arrival_length = len(pd.Series.unique(df.Arrival).tolist())
    arrival_and_week = feature_column.crossed_column(
        [arrival, week], hash_bucket_size=(arrival_length * 52))
    arrival_and_week = feature_column.indicator_column(arrival_and_week)
    airline_length = len(pd.Series.unique(df.Airline).tolist())
    year_and_airline = feature_column.crossed_column(
        [year, airline], hash_bucket_size=(airline_length * 4))
    year_and_airline = feature_column.indicator_column(year_and_airline)
    feature_columns = []
    feature_columns = feature_columns + [
        week, arrival_one_hot, airline_one_hot, flight_no_one_hot, hour,
        arrival_and_week, year, year_and_airline
    ]
    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
    return feature_layer
Example #29
def _add_embedding_columns(columns, features, feature_table, vocabulary):
    for f in features:
        assert f in feature_table
        cate_col = fc.categorical_column_with_vocabulary_list(
            f, vocabulary.vocab[f])
        column = fc.embedding_column(cate_col,
                                     feature_table[f].emb_width,
                                     combiner='sqrtn')
        columns.append(column)
Example #30
def test_crossed_column():
    features = {
        'price': [['A', 'A'], ['B', 'D'], ['C', 'A']],
        'color': [['R', 'R'], ['G', 'G'], ['B', 'B']]
    }

    price = feature_column.categorical_column_with_vocabulary_list(
        'price', ['A', 'B', 'C', 'D'])
    color = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'])
    p_x_c = feature_column.crossed_column([price, color], 16)
    p_x_c_indicator = feature_column.indicator_column(p_x_c)  # multi-hot over the 16 hash buckets
    p_x_c_indicator_dense_tensor = feature_column.input_layer(
        features, [p_x_c_indicator])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([p_x_c_indicator_dense_tensor]))