def get_feature_columns():
    '''
    Build the feature columns for the DNN and the linear part of the model.

    Returns:
        A ``(dnn_feature_columns, linear_feature_columns)`` tuple of lists.
    '''
    def hashed_id(key, bucket_count):
        # Every id feature is an int64 value hashed into a fixed bucket count.
        return fc.categorical_column_with_hash_bucket(key, bucket_count, tf.int64)

    # DNN features: one embedding of width FLAGS.embed_dim per id feature.
    # Only the user / feed / author embeddings pass max_norm.
    dnn_feature_columns = [
        fc.embedding_column(hashed_id("userid", 40000), FLAGS.embed_dim,
                            max_norm=FLAGS.embed_l2),
        fc.embedding_column(hashed_id("feedid", 240000), FLAGS.embed_dim,
                            max_norm=FLAGS.embed_l2),
        fc.embedding_column(hashed_id("authorid", 40000), FLAGS.embed_dim,
                            max_norm=FLAGS.embed_l2),
        fc.embedding_column(hashed_id("bgm_singer_id", 40000), FLAGS.embed_dim),
        fc.embedding_column(hashed_id("bgm_song_id", 60000), FLAGS.embed_dim),
    ]

    # Linear features
    linear_feature_columns = [
        fc.numeric_column("videoplayseconds", default_value=0.0),
        fc.numeric_column("device", default_value=0.0),
    ]
    # Behaviour statistics: each entry of FEA_COLUMN_LIST contributes a
    # "<name>sum" column followed by a "<name>sum_user" column.
    for action in FEA_COLUMN_LIST:
        for suffix in ("sum", "sum_user"):
            linear_feature_columns.append(
                fc.numeric_column(action + suffix, default_value=0.0))
    return dnn_feature_columns, linear_feature_columns
Exemplo n.º 2
0
    def create_features_columns(self):
        """Register the userID / itemID embedding columns and build the parse spec.

        Both ids are int64 features hashed into ``FLAGS.user_did_size`` and
        ``FLAGS.item_uuid_size`` buckets respectively and embedded into
        ``FLAGS.embed_size`` dimensions. The columns are stored in
        ``self.all_columns`` and the derived spec in ``self.feature_spec``.

        Returns:
            ``self``, so calls can be chained.
        """

        def hashed_embedding(feature_key, bucket_count):
            hashed = fc.categorical_column_with_hash_bucket(
                key=feature_key, hash_bucket_size=bucket_count, dtype=tf.int64)
            # Each column gets its own initializer instance, same fixed seed.
            init = tf.uniform_unit_scaling_initializer(
                factor=1e-5, seed=1, dtype=tf.float32)
            return fc.embedding_column(
                hashed, dimension=FLAGS.embed_size, initializer=init)

        user_column = hashed_embedding("userID", FLAGS.user_did_size)
        item_column = hashed_embedding("itemID", FLAGS.item_uuid_size)

        self.all_columns["userID"] = user_column
        self.all_columns["itemID"] = item_column
        self.feature_spec = tf.feature_column.make_parse_example_spec(
            self.all_columns.values())
        return self
Exemplo n.º 3
0
def create_feature_columns():
  """Build the model's feature columns and publish them module-wide.

  The list is assigned to the global ``my_feature_columns``, printed, and
  returned.
  """
  global my_feature_columns

  def one_hot_identity(key, num_buckets, fallback):
    # One-hot encode an integer id feature with the given default value.
    return fc.indicator_column(
      fc.categorical_column_with_identity(key, num_buckets, default_value=fallback))

  def truncated_score(key):
    # Numeric feature normalized through the file's `truncate` function.
    return fc.numeric_column(key, default_value=0.0, normalizer_fn=truncate)

  # user features
  phone_brand = fc.embedding_column(
    fc.categorical_column_with_hash_bucket("phoneBrand", 1000), 20)
  phone_resolution = fc.embedding_column(
    fc.categorical_column_with_hash_bucket("phoneResolution", 500), 10)
  phone_os = fc.indicator_column(
    fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))

  # context features, listed in the order the model receives them
  my_feature_columns = [
    fc.numeric_column("matchScore", default_value=0.0),
    one_hot_identity("matchType", 9, 0),
    one_hot_identity("position", 201, 200),
    one_hot_identity("triggerNum", 51, 50),
    one_hot_identity("triggerRank", 51, 50),
    one_hot_identity("type", 2, 0),
    one_hot_identity("hour", 24, 0),
    phone_brand,
    phone_resolution,
    phone_os,
    fc.numeric_column("popScore", default_value=0.0),
    truncated_score("sellerPrefer"),
    truncated_score("brandPrefer"),
    truncated_score("cate2Prefer"),
    truncated_score("catePrefer"),
  ]
  print("feature columns:", my_feature_columns)
  return my_feature_columns
Exemplo n.º 4
0
    def make_feature_layer(self):
        """Build the feature columns for all configured column groups.

        Columns in ``self.numeric_column`` become plain numeric columns.
        Columns in ``self.categorical_column_num`` and
        ``self.categorical_column_text`` are hashed into ``3 * nunique``
        buckets (cardinality read from ``self.data``) and embedded into one
        dimension. Columns in ``self.bool_column`` are hashed into a fixed
        3 buckets and embedded into one dimension.

        Returns:
            list: feature columns in the order numeric, categorical (num),
            categorical (text), bool.
        """

        def hashed_embedding(col, bucket_count):
            # Shared construction for every hashed, 1-dimensional embedding.
            return feature_column.embedding_column(
                feature_column.categorical_column_with_hash_bucket(
                    col, hash_bucket_size=bucket_count),
                dimension=1)

        feature_cols = [
            feature_column.numeric_column(col) for col in self.numeric_column
        ]

        for group in (self.categorical_column_num,
                      self.categorical_column_text):
            for col in group:
                # Bucket count is three times the observed cardinality
                # (presumably to limit hash collisions).
                unique_count = self.data[col].nunique()
                feature_cols.append(
                    hashed_embedding(col, int(3 * unique_count)))

        # Bool columns use a fixed bucket count, so the cardinality lookup the
        # original performed (and discarded) is not needed.
        feature_cols.extend(
            hashed_embedding(col, 3) for col in self.bool_column)
        return feature_cols
def build_ama_ele_columns():
    """Build the 32-dimensional embedding columns for ``user_id`` and ``item_id``.

    Returns:
        tuple: ``(feature_columns, feat_field_size)`` where
        ``feat_field_size`` is the number of columns in the list.
    """
    # (feature key, hash bucket count). The 'seq' and 'seq_cate' features
    # (200000 buckets each) are currently disabled.
    hashed_features = (
        ('user_id', 200000),
        ('item_id', 1000),
    )
    feature_columns = [
        fc.embedding_column(
            fc.categorical_column_with_hash_bucket(key, hash_bucket_size=buckets),
            dimension=32)
        for key, buckets in hashed_features
    ]
    return feature_columns, len(feature_columns)
Exemplo n.º 6
0
def create_feature_columns(train_data):
    """Build the user-profile feature columns.

    Args:
        train_data: object whose ``user`` attribute supports ``nunique()``;
            the user vocabulary is ``0 .. nunique - 1``.

    Returns:
        list: embedding columns (width 32) for user, age and occupation, and
        an indicator column for gender, in the order user, gender, age,
        occupation.
    """
    vocab_column = fc.categorical_column_with_vocabulary_list

    user_count = train_data.user.nunique()
    user_col = vocab_column("user", np.arange(user_count), default_value=-1, dtype=tf.int64)
    gender_col = vocab_column("gender", ["M", "F"])
    age_col = vocab_column("age", [1, 18, 25, 35, 45, 50, 56], dtype=tf.int64)
    occupation_col = vocab_column("occupation", np.arange(21), dtype=tf.int64)

    return [
        fc.embedding_column(user_col, 32),
        fc.indicator_column(gender_col),
        fc.embedding_column(age_col, 32),
        fc.embedding_column(occupation_col, 32),
    ]
Exemplo n.º 7
0
def build_model_columns(embedding_size):
    """Build linear (indicator) and embedding columns for ``u_id`` and ``i_id``.

    Args:
        embedding_size: dimension of each embedding column.

    Returns:
        tuple: ``(linear_feature_columns, embedding_feature_columns)``; both
        lists hold the ``u_id`` column first and the ``i_id`` column second.
    """
    linear_feature_columns = []
    embedding_feature_columns = []

    # (feature key, hash bucket count)
    for key, bucket_count in (('u_id', 500000), ('i_id', 100000)):
        hashed = feature_column.categorical_column_with_hash_bucket(
            key, bucket_count, dtype=tf.dtypes.int64)
        embedded = feature_column.embedding_column(hashed, embedding_size)
        linear_feature_columns.append(feature_column.indicator_column(hashed))
        embedding_feature_columns.append(embedded)

    return linear_feature_columns, embedding_feature_columns
Exemplo n.º 8
0
def build_features(statistics):
    """Build the DNN and linear feature columns for the trip features.

    Args:
        statistics: passed unchanged to ``custom_numeric_column`` for every
            numeric feature.

    Returns:
        tuple: ``(dnn_feature_columns, linear_feature_columns)``. The DNN list
        holds the numeric columns followed by the categorical ones; the linear
        list holds the pickup/dropoff cross and the cross of all buckets.
    """

    def identity(key, num_buckets):
        return fc.categorical_column_with_identity(key=key, num_buckets=num_buckets)

    def bucketize(key, boundaries):
        return fc.bucketized_column(fc.numeric_column(key), boundaries=boundaries)

    pickup = identity('PULocationID', 265)
    dropoff = identity('DOLocationID', 265)
    day_of_week = identity('day_of_week', 7)
    weekend = identity('weekend', 2)

    # Trip distance and fare share the same boundaries: 2, 4, ..., 30.
    even_to_30 = list(range(2, 31, 2))
    speed_buckets = bucketize('speed', [10, 20, 30, 40, 50, 60, 70])
    distance_buckets = bucketize('trip_distance', even_to_30)
    duration_buckets = bucketize(
        'duration', [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500])
    fare_buckets = bucketize('fare_amount', even_to_30)
    passenger_buckets = bucketize('passenger_count', [1, 3, 5, 7, 9])

    location = fc.crossed_column([pickup, dropoff], hash_bucket_size=1000)
    cross_all = fc.crossed_column(
        [location, speed_buckets, distance_buckets, duration_buckets,
         fare_buckets, passenger_buckets],
        hash_bucket_size=1000)

    numeric_names = (
        'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax',
        'tolls_amount', 'improvement_surcharge', 'duration', 'speed',
    )
    numeric_columns = [
        custom_numeric_column(name, statistics) for name in numeric_names
    ]
    categorical_columns = [
        fc.embedding_column(pickup, dimension=32),
        fc.embedding_column(dropoff, dimension=32),
        fc.indicator_column(day_of_week),
        fc.indicator_column(weekend),
    ]

    return numeric_columns + categorical_columns, [location, cross_all]
Exemplo n.º 9
0
    def create_features_columns(self):
        """Create the user-side and item-side feature columns and the parse spec.

        User columns (stored in ``self.user_columns``): ``user_vector``,
        ``age``, ``city``, ``device_id``. Item columns (stored in
        ``self.item_columns``): ``item_vector``, ``item_id``. The spec for
        parsing all of them is stored in ``self.feature_spec``.

        Returns:
            ``self``, so calls can be chained.
        """
        # Vector features: 128-element float vectors, zero-filled by default.
        user_vector = fc.numeric_column(key="user_vector",
                                        shape=(128, ),
                                        default_value=[0.0] * 128,
                                        dtype=tf.float32)
        item_vector = fc.numeric_column(key="item_vector",
                                        shape=(128, ),
                                        default_value=[0.0] * 128,
                                        dtype=tf.float32)

        # Bucketized feature: the raw age is bucketized, then embedded.
        age_raw = fc.numeric_column(key="age",
                                    shape=(1, ),
                                    default_value=[0],
                                    dtype=tf.int64)
        # The original referenced an undefined name here (`input_fc`); the
        # column being bucketized is the age numeric column.
        age_bucket = fc.bucketized_column(
            age_raw, boundaries=[0, 10, 20, 30, 40, 50, 60, 70, 80])
        age = fc.embedding_column(age_bucket, dimension=32, combiner='mean')

        # Categorical (identity) feature
        city_ids = fc.categorical_column_with_identity(key="city",
                                                       num_buckets=1000,
                                                       default_value=0)
        city = fc.embedding_column(city_ids, dimension=32, combiner='mean')

        # Hashed features
        device_hash = fc.categorical_column_with_hash_bucket(
            key="device_id", hash_bucket_size=1000000, dtype=tf.int64)
        device_id = fc.embedding_column(device_hash,
                                        dimension=32,
                                        combiner='mean')

        item_hash = fc.categorical_column_with_hash_bucket(
            key="item_id", hash_bucket_size=10000, dtype=tf.int64)
        # Embed the item_id hash column; the original embedded the device_id
        # embedding column here by mistake.
        item_id = fc.embedding_column(item_hash, dimension=32, combiner='mean')

        self.user_columns["user_vector"] = user_vector
        self.user_columns["age"] = age
        self.user_columns["city"] = city
        self.user_columns["device_id"] = device_id
        self.item_columns["item_vector"] = item_vector
        self.item_columns["item_id"] = item_id

        # dict views cannot be added together on Python 3; materialize them.
        self.feature_spec = tf.feature_column.make_parse_example_spec(
            list(self.user_columns.values()) +
            list(self.item_columns.values()))

        return self
def define_feature_columns(dataframe):
    """Build the feature columns from the vocabularies found in ``dataframe``.

    Args:
        dataframe: table providing the ``nconst``, ``category`` and ``genres``
            columns, whose unique values form the vocabularies.

    Returns:
        list: the ``nconst`` embedding, the ``category`` and ``genres``
        indicator columns, and the bucketized ``startYear`` column, in that
        order.
    """
    print("Defining feature columns...")
    vocab_column = feature_column.categorical_column_with_vocabulary_list

    # Embedding column for name IDs. Dimension 30 is approximately the fourth
    # root of the number of unique name IDs.
    name_ids = vocab_column('nconst', dataframe.nconst.unique())
    feature_columns = [feature_column.embedding_column(name_ids, dimension=30)]

    # Indicator (one-hot) columns for category and genres.
    feature_columns.extend(
        feature_column.indicator_column(
            vocab_column(name, dataframe[name].unique()))
        for name in ('category', 'genres'))

    # Bucketized column for startYear (a.k.a. release date).
    year_boundaries = [1927, 1940, 1950, 1960, 1970, 1980, 1990, 1995, 2000,
                       2005, 2010, 2015]
    feature_columns.append(
        feature_column.bucketized_column(
            feature_column.numeric_column('startYear'),
            boundaries=year_boundaries))

    print("Feature columns defined")
    return feature_columns
Exemplo n.º 11
0
def test_multi_value_embedding():
    """Print the sparse ids and the 7-dimensional embedding of a two-valued feature.

    Each of the six samples carries two 'color' values. The ids are evaluated
    in one session and the dense embedding in a second one.
    """
    features = {
        'color': [['G', 'G'], ['G', 'B'], ['B', 'B'], ['G', 'R'], ['R', 'R'],
                  ['B', 'R']]
    }

    vocab_col = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)

    embedding_col = feature_column.embedding_column(vocab_col, 7)
    dense_embedding = feature_column.input_layer(features, [embedding_col])
    sparse_tensors = vocab_col._get_sparse_tensors(_LazyBuilder(features))

    # (banner printed before the value, tensor to evaluate)
    stages = (
        (None, sparse_tensors.id_tensor),
        ('embeding' + '-' * 40, dense_embedding),
    )
    for banner, fetch in stages:
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.tables_initializer())
            if banner is not None:
                print(banner)
            print(sess.run([fetch]))
Exemplo n.º 12
0
 def transform(self, output_tensors):
     """Wrap a categorical column in an embedding column.

     Configuration is read from ``self.parameters``:
       * ``input_tensor``: key of the source column in ``output_tensors``.
       * ``output_tensor``: key under which the result is stored.
       * ``dimension`` (required): embedding dimension.
       * ``combiner`` (optional): defaults to ``"mean"``.
       * ``ckpt_to_load_from`` / ``tensor_name_in_ckpt`` (optional): only
         used when both are present.

     Args:
         output_tensors: mapping of names to columns; read for the input
             and updated in place with the new embedding column.

     Raises:
         ParametersError: if ``dimension`` is missing.
     """
     params = self.parameters
     input_tensor_name = params.get("input_tensor")
     output_tensor_name = params.get("output_tensor")
     # `dict.has_key` no longer exists on Python 3; `in` works on both.
     if "dimension" not in params:
         msg = "parameters error, embedding_column must need dimension"
         logger.error(msg)
         raise ParametersError(msg)
     dimension = params.get("dimension")
     input_tensor = output_tensors.get(input_tensor_name)
     # Checkpoint warm-start is all-or-nothing: both settings are required.
     ckpt_to_load_from = None
     tensor_name_in_ckpt = None
     if "ckpt_to_load_from" in params and "tensor_name_in_ckpt" in params:
         ckpt_to_load_from = params.get("ckpt_to_load_from")
         tensor_name_in_ckpt = params.get("tensor_name_in_ckpt")
     combiner = params.get("combiner", "mean")
     output_tensor = fc.embedding_column(
         categorical_column=input_tensor,
         dimension=dimension,
         combiner=combiner,
         ckpt_to_load_from=ckpt_to_load_from,
         tensor_name_in_ckpt=tensor_name_in_ckpt)
     output_tensors[output_tensor_name] = output_tensor
Exemplo n.º 13
0
def fc_transform(feature_name, hash_bucket_size, dtype=tf.string):
    """Return a ``DenseFeatures`` layer embedding a hashed feature into 4 dimensions.

    Args:
        feature_name: key of the input feature.
        hash_bucket_size: number of hash buckets for the feature.
        dtype: dtype of the raw feature (string by default).
    """
    hashed = feature_column.categorical_column_with_hash_bucket(
        feature_name, hash_bucket_size=hash_bucket_size, dtype=dtype)
    # The embedding is trainable, so its variables have to be initialized by
    # the caller (in main).
    embedded = feature_column.embedding_column(hashed, 4)
    # DenseFeatures turns feature_column objects into a dense tensor; it is a
    # Layer subclass, so the returned object is callable.
    return tf.keras.layers.DenseFeatures([embedded])
Exemplo n.º 14
0
    def _get_tf_feature_cols(dataframe: pd.DataFrame):
        """Build the feature columns, taking vocabularies from ``dataframe``.

        Returns:
            list: numeric columns (PhotoAmt, Fee, Age), the bucketized Age
            column, one indicator column per categorical attribute, and the
            8-dimensional Breed1 embedding, in that order.
        """
        vocab_column = feature_column.categorical_column_with_vocabulary_list

        # numeric columns
        feature_columns = [
            feature_column.numeric_column(name)
            for name in ('PhotoAmt', 'Fee', 'Age')
        ]

        # bucketized column (Age is also kept as a raw numeric column above)
        feature_columns.append(
            feature_column.bucketized_column(
                feature_column.numeric_column('Age'),
                boundaries=[1, 2, 3, 4, 5]))

        # indicator columns: vocabulary is the set of values seen in the data
        one_hot_names = (
            'Type', 'Color1', 'Color2', 'Gender', 'MaturitySize', 'FurLength',
            'Vaccinated', 'Sterilized', 'Health',
        )
        for name in one_hot_names:
            vocab = vocab_column(name, dataframe[name].unique())
            feature_columns.append(feature_column.indicator_column(vocab))

        # embedding column
        breed_vocab = vocab_column('Breed1', dataframe.Breed1.unique())
        feature_columns.append(
            feature_column.embedding_column(breed_vocab, dimension=8))
        return feature_columns
Exemplo n.º 15
0
def _add_weighted_embedding_columns(columns, features, feature_table, vocabulary):
    for f in features:
        assert f in feature_table
        weighted_column = fc.weighted_categorical_column(
            fc.categorical_column_with_vocabulary_list(f, vocabulary.vocab[f]), f + _WEIGHTED_SUFFIX)
        emb_weighted_column = fc.embedding_column(weighted_column, feature_table[f].emb_width, combiner='sqrtn')
        columns.append(emb_weighted_column)
Exemplo n.º 16
0
def _add_bucketed_columns(columns, features, feature_table, vocabulary):
    for f in features:
        assert f in feature_table
        # 如果是fixed_len的list特征
        if feature_table[f].feature_spec.is_list and feature_table[
                f].feature_spec.fixed:
            size = feature_table[f].feature_spec.size
            if feature_table[f].feature_spec.dtype == "int":
                numeric_col = fc.numeric_column(f,
                                                shape=(size, ),
                                                dtype=tf.int64,
                                                default_value=0)
            else:
                numeric_col = fc.numeric_column(f,
                                                shape=(size, ),
                                                default_value=0)
        # 如果不是list特征
        else:
            if feature_table[f].feature_spec.dtype == "int":
                numeric_col = fc.numeric_column(f,
                                                dtype=tf.int64,
                                                default_value=0)
            else:
                numeric_col = fc.numeric_column(f, default_value=0)
        bucketed_col = fc.bucketized_column(numeric_col,
                                            boundaries=BUCKET_BOUNDARIES[f])
        embedding_col = fc.embedding_column(bucketed_col,
                                            feature_table[f].emb_width,
                                            combiner='sqrtn')
        columns.append(embedding_col)
def test_embedding():
    """Print the sparse ids and the 'sum'-combined embedding of a 'color' feature.

    The graph is seeded with ``tf.set_random_seed(1)``; the statements are
    kept in their original order because the graph is built incrementally.
    """
    tf.set_random_seed(1)
    # Source data
    color_data = {
        'color': [['R', 'G'], ['G', 'A'], ['B', 'B'], ['A', 'A']]
    }  # 4 sample rows
    builder = _LazyBuilder(color_data)

    # To obtain an embedding, the source column is first expressed as a
    # categorical_column. This is only a declaration; no data is attached yet.
    # 'A' is not in the vocabulary, so default_value (-1) applies to it.
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    # Express the source data as a (sparse) tensor.
    color_column_tensor = color_column._get_sparse_tensors(builder)

    # Build the embedding_column: the first argument is the
    # categorical_column, the second is the embedding dimension.
    color_embedding_column = feature_column.embedding_column(color_column,
                                                             4,
                                                             combiner='sum')

    # Convert to a dense tensor: input_layer(data, columns) connects the
    # source data with the embedding_column.
    color_embeding_dense_tensor = feature_column.input_layer(
        color_data, [color_embedding_column])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))
        print('embeding' + '_' * 40)
        print(session.run([color_embeding_dense_tensor]))
Exemplo n.º 18
0
def input_feature_columns(ngram_vocab,
                          ngram_dimension,
                          ngram_oov=1,
                          ngram_combiner='sum'):
    """Build the sequence feature columns: an n-gram embedding plus word-shape flags.

    Args:
        ngram_vocab: vocabulary list for the 'ngrams' feature.
        ngram_dimension: embedding dimension for the n-grams.
        ngram_oov: number of out-of-vocabulary buckets.
        ngram_combiner: combiner passed to the embedding column.

    Returns:
        list: the n-gram embedding column followed by one sequence numeric
        column per word-shape feature.
    """
    ngram_ids = sequence_categorical_column_with_vocabulary_list(
        key='ngrams',
        vocabulary_list=ngram_vocab,
        dtype=tf.string,
        num_oov_buckets=ngram_oov,
    )
    columns = [
        core_columns.embedding_column(
            categorical_column=ngram_ids,
            dimension=ngram_dimension,
            combiner=ngram_combiner,
        )
    ]

    word_shape_keys = (
        'word_length',
        'is_no_case',
        'is_lower_case',
        'is_upper_case',
        'is_title_case',
        'is_mixed_case',
    )
    columns.extend(
        contrib_columns.sequence_numeric_column(key)
        for key in word_shape_keys)
    return columns
def fc_transform(feature_name, hash_bucket_size, dtype=tf.string):
    """Build a ``DenseFeatures`` layer for one hashed, 4-dimensional embedding.

    Args:
        feature_name: key of the input feature.
        hash_bucket_size: number of hash buckets for the feature.
        dtype: dtype of the raw feature (string by default).
    """
    embedded_feature = feature_column.embedding_column(
        feature_column.categorical_column_with_hash_bucket(
            feature_name, hash_bucket_size=hash_bucket_size, dtype=dtype),
        4)
    # DenseFeatures converts the feature_column object into a tensor.
    return tf.keras.layers.DenseFeatures([embedded_feature])
Exemplo n.º 20
0
def test_embedding():
    """Print the sparse ids, then the 'sum'-combined embedding, of a 'color' feature.

    The ids are evaluated in a first session before the embedding column is
    created; the dense embedding is evaluated in a second session. The graph
    is seeded with ``tf.set_random_seed(1)``, so the statement order is kept.
    """
    tf.set_random_seed(1)
    # 1. Input features: 4 samples with two color values each.
    color_data = {'color': [['R', 'G'], ['G', 'A'], ['B', 'B'], ['A', 'A']]}
    builder = _LazyBuilder(color_data)
    # 2. Feature columns (Sparse). 'A' is not in the vocabulary, so
    #    default_value (-1) applies to it.
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    color_column_tensor = color_column._get_sparse_tensors(builder)
    with tf.Session() as session:
        # Only the tables initializer is run here; the variables initializer
        # below is left commented out.
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))

    # 2. Feature columns (Dense)
    color_embedding = feature_column.embedding_column(color_column,
                                                      4,
                                                      combiner='sum')
    # 3. Feature tensor
    color_embedding_dense_tensor = feature_column.input_layer(
        color_data, [color_embedding])

    with tf.Session() as session:
        # Embedding needs variables (weights) to do the embedding
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('embedding' + '_' * 40)
        print(session.run([color_embedding_dense_tensor]))
Exemplo n.º 21
0
def get_item_feature_columns(business_vocab_list, item_type_dict):
    """Build one feature column per entry of ``business_vocab_list``.

    Args:
        business_vocab_list: mapping from feature name to its vocabulary.
        item_type_dict: mapping from feature name to its dtype.

    Returns:
        list: one column per feature, in the mapping's iteration order:
        numeric for ``review_count``, bucketized for ``stars``, embedding for
        ``categories`` / ``city``, and an indicator column for anything else.
    """
    bucketized_boundary = {'stars': [2.5, 4]}
    embedding_size = {"categories": 8, "city": 4}

    def vocabulary_column(name, vocab):
        # Vocabulary-backed categorical column over the sorted vocabulary.
        return categorical_column_with_vocabulary_list(
            name, sorted(vocab), default_value=-1, dtype=item_type_dict[name])

    def build_column(name, vocab):
        if name == 'review_count':
            return numeric_column(
                name, default_value=0, dtype=item_type_dict[name])
        if name == 'stars':
            return bucketized_column(
                numeric_column(
                    name, default_value=0, dtype=item_type_dict[name]),
                bucketized_boundary[name])
        if name in embedding_size:
            return embedding_column(vocabulary_column(name, vocab),
                                    dimension=embedding_size[name])
        return indicator_column(vocabulary_column(name, vocab))

    return [
        build_column(name, vocab)
        for name, vocab in business_vocab_list.items()
    ]
Exemplo n.º 22
0
    def _make_crossed(self):
        """Makes crossed features for both Wide or Deep network.

    Returns:
      Tuple (crossed columns for Wide, its dimension)
    """
        # Crossed columns
        f_crossed_for_wide = []
        f_crossed_for_deep = []
        for to_cross in self.CROSSED:
            keys = []
            bck_size = 1
            for (key, bck, bnd) in to_cross:
                keys.append(self._prepare_for_crossing(key, bck, bnd))
                bck_size *= bck

            # We can't go crazy on the dim for crossed_column so use a min
            # **0.25 is a rule of thumb for bucket size vs dimension
            t_crossed = tfc.crossed_column(keys, min(bck_size, 10000))
            t_dimension = int(bck_size**0.25)
            f_crossed_for_wide.append(t_crossed)
            f_crossed_for_deep.append(
                tfc.embedding_column(t_crossed, t_dimension))

        return f_crossed_for_wide, f_crossed_for_deep
Exemplo n.º 23
0
 def hash_embedding(self, hash_bucket, embedding_dim, name):
     """Return an embedding column and an indicator column for a hashed string feature.

     Args:
         hash_bucket: number of hash buckets.
         embedding_dim: dimension of the embedding column.
         name: key of the string feature.

     Returns:
         tuple: ``(embedding_column, indicator_column)`` built on the same
         hashed categorical column.
     """
     hashed = feature_column.categorical_column_with_hash_bucket(
         name, hash_bucket, dtype=tf.string)
     return (
         feature_column.embedding_column(
             hashed, dimension=embedding_dim, combiner='mean'),
         feature_column.indicator_column(hashed),
     )
Exemplo n.º 24
0
def test_weighted_categorical_feature_embedding():
    """Compare plain and weighted 'sum' embeddings of a two-valued 'color' feature.

    Prints the sparse ids and weight tensors in one session, then both dense
    7-dimensional embeddings in a second session.
    """
    features = {
        'color': [['R', 'R'], ['G', 'G'], ['B', 'B'], ['G', 'R'], ['G', 'B'],
                  ['B', 'R']],
        'weight': [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.3, 0.2], [0.4, 0.3],
                   [0.4, 0.6]]
    }  # 6 sample rows

    plain_col = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)

    plain_embedding = feature_column.embedding_column(
        plain_col, 7, combiner="sum")
    plain_dense = feature_column.input_layer(features, [plain_embedding])

    # Same categorical column, with per-value weights read from 'weight'.
    weighted_col = feature_column.weighted_categorical_column(
        plain_col, 'weight')
    weighted_embedding = feature_column.embedding_column(
        weighted_col, 7, combiner="sum")
    weighted_dense = feature_column.input_layer(features,
                                                [weighted_embedding])

    lazy_builder = _LazyBuilder(features)
    plain_sparse = plain_col._get_sparse_tensors(lazy_builder)
    # A pair (id_tensor, weight_tensor).
    weighted_sparse = weighted_col._get_sparse_tensors(lazy_builder)

    def initialise(sess):
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())

    with tf.Session() as sess:
        initialise(sess)
        print(sess.run([plain_sparse.id_tensor]))
        print("color column weight:")
        print(plain_sparse.weight_tensor)
        print("color column weighted categorical,  weight:")
        print(sess.run([weighted_sparse.id_tensor]))
        print(sess.run([weighted_sparse.weight_tensor]))

    with tf.Session() as sess:
        initialise(sess)
        print('embeding' + '-' * 40)
        print(sess.run([plain_dense]))
        print('embeding weighted categorical column')
        print(sess.run([weighted_dense]))
Exemplo n.º 25
0
    def data_preprocessing(self):
        """Build the feature layer and the train / validation / test datasets.

        Sets ``self.feature_layer`` (a ``DenseFeatures`` layer over all
        feature columns) and ``self.train_ds``, ``self.val_ds`` and
        ``self.test_ds`` (built with ``self.df_to_dataset``, batch size 32;
        validation and test are not shuffled).
        """
        # Numeric columns
        numeric_names = ('age', 'trestbps', 'chol', 'thalach', 'oldpeak',
                         'slope', 'ca')
        feature_columns = [
            feature_column.numeric_column(name) for name in numeric_names
        ]

        # Bucketized column
        age_buckets = feature_column.bucketized_column(
            feature_column.numeric_column("age"),
            boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
        feature_columns.append(age_buckets)

        # Categorical column, used both one-hot encoded and embedded
        thal = feature_column.categorical_column_with_vocabulary_list(
            'thal', ['fixed', 'normal', 'reversible'])
        feature_columns.append(feature_column.indicator_column(thal))

        # Embedding column
        feature_columns.append(
            feature_column.embedding_column(thal, dimension=8))

        # Crossed feature column (age bucket x thal), one-hot encoded
        age_thal_cross = feature_column.crossed_column(
            [age_buckets, thal], hash_bucket_size=1000)
        feature_columns.append(feature_column.indicator_column(age_thal_cross))

        self.feature_layer = layers.DenseFeatures(feature_columns)

        batch_size = 32
        self.train_ds = self.df_to_dataset(self.train, batch_size=batch_size)
        # Evaluation splits keep their row order.
        eval_options = {"shuffle": False, "batch_size": batch_size}
        self.val_ds = self.df_to_dataset(self.val, **eval_options)
        self.test_ds = self.df_to_dataset(self.test, **eval_options)
Exemplo n.º 26
0
def _add_embedding_columns(columns, features, feature_table, vocabulary):
    for f in features:
        assert f in feature_table
        cate_col = fc.categorical_column_with_vocabulary_list(
            f, vocabulary.vocab[f])
        column = fc.embedding_column(cate_col,
                                     feature_table[f].emb_width,
                                     combiner='sqrtn')
        columns.append(column)
Exemplo n.º 27
0
def embedding_column(key,
                     dimension,
                     vocabulary_size=None,
                     vocabulary_file=None,
                     vocabulary_list=None,
                     num_oov_buckets=0):
    """Build an embedding column on top of a categorical column for ``key``.

    The categorical column is produced by ``categorical_column`` from the
    vocabulary arguments and is then wrapped with
    ``feature_column.embedding_column`` using ``dimension``.
    """
    # NOTE(review): the vocabulary arguments are forwarded positionally as
    # (vocabulary_size, vocabulary_list, vocabulary_file), i.e. list before
    # file, which differs from this function's own parameter order.
    # Confirm this matches the signature of categorical_column.
    source_column = categorical_column(key, vocabulary_size, vocabulary_list,
                                       vocabulary_file, num_oov_buckets)
    return feature_column.embedding_column(source_column,
                                           dimension=dimension)
Exemplo n.º 28
0
 def embeddings_columns(self, coldim_dict):
     """Register a hashed embedding column for every entry of ``coldim_dict``.

     For each ``(column name, dimension)`` pair, a hash-bucket categorical
     column with ``dimension * dimension`` buckets is created, wrapped in an
     embedding column of size ``dimension`` and stored in
     ``self.real_columns`` under the column name.

     Args:
         coldim_dict: mapping from column name to embedding dimension.

     Returns:
         The embedding column built for the last entry of ``coldim_dict``,
         or ``None`` when ``coldim_dict`` is empty. All columns are
         available through ``self.real_columns``.
     """
     # Start from None so an empty mapping returns cleanly instead of
     # raising UnboundLocalError at the final return.
     embedding = None
     for col_name, dimension in coldim_dict.items():
         hashed = feature_column.categorical_column_with_hash_bucket(
             col_name, hash_bucket_size=dimension * dimension)
         embedding = feature_column.embedding_column(hashed,
                                                     dimension=dimension)
         self.real_columns[col_name] = embedding
     return embedding
 def _build_census_deep_columns(emb_dim=8, numeric_range=None):
     """Build the embedding columns listed in ``ALI_DISPLAY_ADS_CONFIG``.

     Args:
         emb_dim: embedding dimension used for every column.
         numeric_range: mapping from column name to a sequence whose items
             ``[0]`` and ``[1]`` are the endpoints of the bucket boundaries.
             It is indexed for every name in ``deep_bucket_emb_cols``, so it
             must be provided whenever that list is non-empty.

     Returns:
         A tuple ``(feature_columns, feat_field_size)`` where the second
         item is the number of columns built.
     """
     deep_columns = []

     # Categorical columns: hashed, then embedded.
     for name in ALI_DISPLAY_ADS_CONFIG['deep_emb_cols']:
         vocab_size = ALI_DISPLAY_ADS_CONFIG['vocab_size'][name]
         if vocab_size <= 1000:
             n_buckets = 1000
         else:
             n_buckets = vocab_size + 10000
         hashed = fc.categorical_column_with_hash_bucket(
             name, hash_bucket_size=n_buckets)
         deep_columns.append(fc.embedding_column(hashed, dimension=emb_dim))

     # Numeric columns: bucketized over 1000 evenly spaced boundaries,
     # then embedded.
     for name in ALI_DISPLAY_ADS_CONFIG['deep_bucket_emb_cols']:
         numeric = fc.numeric_column(name)
         edges = list(
             np.linspace(numeric_range[name][0], numeric_range[name][1],
                         1000))
         bucketized = fc.bucketized_column(numeric, boundaries=edges)
         deep_columns.append(
             fc.embedding_column(bucketized, dimension=emb_dim))

     return deep_columns, len(deep_columns)
Exemplo n.º 30
0
def transform(inputs, NUMERIC_COLS, STRING_COLS, nbuckets):
    """Build the transformed Keras inputs and their feature columns.

    Args:
        inputs: mapping from column name to model input. It must contain
            ``'pickup_datetime'`` and the four pickup/dropoff
            longitude/latitude entries.
        NUMERIC_COLS: names that get a numeric feature column. The four
            coordinate names must be among them, because their numeric
            columns are bucketized below.
        STRING_COLS: not used by this function.
        nbuckets: number of evenly spaced boundaries in [0, 1] used when
            bucketizing the scaled coordinates.

    Returns:
        A tuple ``(transformed, feature_columns)``: the transformed inputs
        (without ``'pickup_datetime'``) and a dict of feature columns
        keyed by name.
    """
    # Pass-through copy of the raw inputs, minus the timestamp.
    transformed = inputs.copy()
    transformed.pop('pickup_datetime')

    feature_columns = {}
    for numeric_name in NUMERIC_COLS:
        feature_columns[numeric_name] = fc.numeric_column(numeric_name)

    # Longitude: (x + 78) / 8 maps [-78, -70] onto [0, 1].
    for name in ('pickup_longitude', 'dropoff_longitude'):
        scale_lon = layers.Lambda(lambda x: (x + 78) / 8.0,
                                  name='scale_' + name)
        transformed[name] = scale_lon(inputs[name])

    # Latitude: (x - 37) / 8 maps [37, 45] onto [0, 1].
    for name in ('pickup_latitude', 'dropoff_latitude'):
        scale_lat = layers.Lambda(lambda x: (x - 37) / 8.0,
                                  name='scale_' + name)
        transformed[name] = scale_lat(inputs[name])

    # Euclidean distance, computed from the unscaled inputs.
    euclidean_layer = layers.Lambda(euclidean, name='euclidean')
    transformed['euclidean'] = euclidean_layer([
        inputs['pickup_longitude'], inputs['pickup_latitude'],
        inputs['dropoff_longitude'], inputs['dropoff_latitude']
    ])
    feature_columns['euclidean'] = fc.numeric_column('euclidean')

    # Hour of day: characters 11-12 of a timestamp shaped like
    # '2010-02-08 09:17:00+00:00', parsed as int32.
    hour_layer = layers.Lambda(
        lambda x: tf.strings.to_number(tf.strings.substr(x, 11, 2),
                                       out_type=tf.dtypes.int32),
        name='hourofday')
    transformed['hourofday'] = hour_layer(inputs['pickup_datetime'])
    hour_identity = fc.categorical_column_with_identity('hourofday',
                                                        num_buckets=24)
    feature_columns['hourofday'] = fc.indicator_column(hour_identity)

    # Bucketize the four coordinates on a shared [0, 1] grid.
    lat_edges = np.linspace(0, 1, nbuckets).tolist()
    lon_edges = np.linspace(0, 1, nbuckets).tolist()
    bucketized = {}
    for key, edges in (('pickup_latitude', lat_edges),
                       ('dropoff_latitude', lat_edges),
                       ('pickup_longitude', lon_edges),
                       ('dropoff_longitude', lon_edges)):
        bucketized[key] = fc.bucketized_column(feature_columns[key], edges)

    # Cross lat x lon for each end of the trip, then cross the two ends
    # and embed the result.
    pickup_cell = fc.crossed_column(
        [bucketized['pickup_latitude'], bucketized['pickup_longitude']],
        nbuckets * nbuckets)
    dropoff_cell = fc.crossed_column(
        [bucketized['dropoff_latitude'], bucketized['dropoff_longitude']],
        nbuckets * nbuckets)
    trip_pair = fc.crossed_column([pickup_cell, dropoff_cell], nbuckets**4)
    feature_columns['pickup_and_dropoff'] = fc.embedding_column(
        trip_pair, 100)

    return transformed, feature_columns