Example #1
]

# 2. Label-encode the sparse categorical features
for feature in sparse_categorical_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature])

# for feature in sparse_categorical_features:
#     print(data[feature].nunique())

# 3. Build SparseFeat columns for the single-valued categorical features
feature_columns = []
for feature in sparse_categorical_features:
    feature_columns.append(
        SparseFeat(feature,
                   data[feature].nunique(),
                   embedding_dim=4,
                   use_hash=False))


# 4. Handle multi-valued categorical features
# 4.1 Build the vocabulary
def get_table(data, feature_name, sep='|'):
    s = set()
    for line in data[feature_name]:
        s.update(str(line).split(sep))
    s.add("<pad>")
    return len(s), s


max_len, table = get_table(data, 'genres', sep='|')  # note: the first return value is the vocabulary size, not a sequence length
# 4.2 Map genre tokens to indices
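The snippet truncates right after announcing step 4.2; a minimal sketch of that step, assuming the vocabulary set returned above (helper and variable names are illustrative, not part of the original):

# hedged continuation sketch for step 4.2
from tensorflow.keras.preprocessing.sequence import pad_sequences

token2index = {token: idx for idx, token in enumerate(sorted(table))}

def encode_genres(line, sep='|'):
    # map each genre token to its index; unknown tokens fall back to the <pad> index
    return [token2index.get(token, token2index["<pad>"]) for token in str(line).split(sep)]

genres_index_list = [encode_genres(line) for line in data['genres']]
genres_maxlen = max(len(seq) for seq in genres_index_list)
genres_padded = pad_sequences(genres_index_list, maxlen=genres_maxlen,
                              padding='post', value=token2index["<pad>"])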
Example #2
                    tf.data.experimental\
                    .prefetch_to_device('/gpu:0', buffer_size=num_para))
else:
    D_train = D_train_r.shard(
        num_workers, worker_index).repeat().prefetch(buffer_size=num_para)
    D_valid = D_valid_r.shard(
        num_workers, worker_index).repeat().prefetch(buffer_size=num_para)

# %%
embedding_size = NNconfig_dic["embedding_size"]
sparse_feature_columns = []
varlen_feature_columns = []

sparse_feature_columns = [
    SparseFeat(feat,
               sparse_vcab_dic[feat] + 1,
               dtype=tf.int64,
               embedding_dim=embedding_size) for feat in sparse_f
]
varlen_feature_columns = [
    VarLenSparseFeat(SparseFeat(vfeat,
                                vocabulary_size=varlen_vcab_dic[vfeat] + 1,
                                dtype=tf.int64,
                                embedding_dim=embedding_size),
                     maxlen=varlen_maxlen_f[vfeat]) for vfeat in varlen_f
]

# %%
linear_feature_columns, dnn_feature_columns = \
    sparse_feature_columns + varlen_feature_columns, sparse_feature_columns + varlen_feature_columns

# %%
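A hedged continuation sketch for the cell above, assuming DeepFM is the intended model and that D_train / D_valid yield (feature-dict, label) batches; the config keys for step counts are assumptions:

# %%
from deepctr.models import DeepFM  # assumed import

model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
model.fit(D_train,
          steps_per_epoch=NNconfig_dic.get('steps_per_epoch', 1000),   # assumed config key
          validation_data=D_valid,
          validation_steps=NNconfig_dic.get('validation_steps', 100),  # assumed config key
          epochs=NNconfig_dic.get('epochs', 1))                        # assumed config key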
Example #3
sparse_features = ['uid', 'user_city', 'item_id', 'author_id', 'item_city', 'channel', 'music_id', 'device']
dense_features = ['time', 'duration_time']

data[sparse_features] = data[sparse_features].fillna('-1', )
data[dense_features] = data[dense_features].fillna(0,)

target = ['finish']
# target = ['like']

for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

sparse_feature_columns = [SparseFeat(feat, data[feat].nunique())  # (feature name, number of distinct values): name == feature name, dimension == vocabulary size, dtype == int32
                          for feat in sparse_features]
dense_feature_columns = [DenseFeat(feat, 1)  # (feature name, dimension == 1), dtype == float32
                         for feat in dense_features]
dnn_feature_columns = sparse_feature_columns + dense_feature_columns
linear_feature_columns = sparse_feature_columns + dense_feature_columns

## Redundant step: this helper builds an Input layer for every feature internally but does not return them, only the list of feature names, which could just as well be obtained by merging the two column lists above.
feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)


train, test = train_test_split(data, test_size=0.1)
train_model_input = [train[name] for name in feature_names]
test_model_input = [test[name] for name in feature_names]

#model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
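A minimal continuation sketch following the standard DeepCTR training flow (batch size, epochs and validation split are illustrative):

model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(train_model_input, train[target].values,
                    batch_size=4096, epochs=1, verbose=2, validation_split=0.1)
pred_ans = model.predict(test_model_input, batch_size=4096)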
Example #4
    key2index = {}
    # `split` (defined in the truncated part of this example) tokenizes each genres string and fills key2index
    genres_list = list(map(split, data['genres'].values))
    genres_length = np.array(list(map(len, genres_list)))
    max_len = max(genres_length)
    # Notice : padding=`post`
    genres_list = pad_sequences(
        genres_list,
        maxlen=max_len,
        padding='post',
    )

    # 2.count #unique features for each sparse field and generate feature config for sequence feature

    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique(), embedding_dim=4)
        for feat in sparse_features
    ]

    use_weighted_sequence = False
    if use_weighted_sequence:
        varlen_feature_columns = [
            VarLenSparseFeat(SparseFeat('genres',
                                        vocabulary_size=len(key2index) + 1,
                                        embedding_dim=4),
                             maxlen=max_len,
                             combiner='mean',
                             weight_name='genres_weight')
        ]  # Notice : value 0 is for padding for sequence input feature
    else:
        varlen_feature_columns = [
Example #5
def get_xy_fd(hash_flag=False):

    feature_columns = [
        SparseFeat('user', 3, hash_flag),
        SparseFeat('gender', 2, hash_flag),
        SparseFeat('item', 3 + 1, hash_flag),
        SparseFeat('item_gender', 2 + 1, hash_flag),
        DenseFeat('score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat('sess_0_item',
                         3 + 1,
                         4,
                         use_hash=hash_flag,
                         embedding_name='item'),
        VarLenSparseFeat('sess_0_item_gender',
                         2 + 1,
                         4,
                         use_hash=hash_flag,
                         embedding_name='item_gender')
    ]
    feature_columns += [
        VarLenSparseFeat('sess_1_item',
                         3 + 1,
                         4,
                         use_hash=hash_flag,
                         embedding_name='item'),
        VarLenSparseFeat('sess_1_item_gender',
                         2 + 1,
                         4,
                         use_hash=hash_flag,
                         embedding_name='item_gender')
    ]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    sess1_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [0, 0, 0, 0]])
    sess1_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [0, 0, 0, 0]])

    sess2_iid = np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
    sess2_igender = np.array([[1, 1, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0]])

    sess_number = np.array([2, 1, 0])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'sess_0_item': sess1_iid,
        'sess_0_item_gender': sess1_igender,
        'score': score,
        'sess_1_item': sess2_iid,
        'sess_1_item_gender': sess2_igender,
    }

    fixlen_feature_names = get_fixlen_feature_names(feature_columns)
    varlen_feature_names = get_varlen_feature_names(feature_columns)
    x = [feature_dict[name] for name in fixlen_feature_names
         ] + [feature_dict[name] for name in varlen_feature_names]

    x += [sess_number]

    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
Example #6
def get_xy_fd(hash_flag=False):

    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item', 3 + 1, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item_gender', 2 + 1, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('sess_0_item',
                                    3 + 1,
                                    embedding_dim=4,
                                    use_hash=hash_flag,
                                    embedding_name='item'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('sess_0_item_gender',
                                    2 + 1,
                                    embedding_dim=4,
                                    use_hash=hash_flag,
                                    embedding_name='item_gender'),
                         maxlen=4)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('sess_1_item',
                                    3 + 1,
                                    embedding_dim=4,
                                    use_hash=hash_flag,
                                    embedding_name='item'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('sess_1_item_gender',
                                    2 + 1,
                                    embedding_dim=4,
                                    use_hash=hash_flag,
                                    embedding_name='item_gender'),
                         maxlen=4)
    ]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    sess1_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [0, 0, 0, 0]])
    sess1_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [0, 0, 0, 0]])

    sess2_iid = np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
    sess2_igender = np.array([[1, 1, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0]])

    sess_number = np.array([2, 1, 0])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'sess_0_item': sess1_iid,
        'sess_0_item_gender': sess1_igender,
        'score': score,
        'sess_1_item': sess2_iid,
        'sess_1_item_gender': sess2_igender,
    }

    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    x["sess_length"] = sess_number
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
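A hedged usage sketch for the fixture above, assuming it targets DSIN and that the constructor accepts (dnn_feature_columns, sess_feature_list) plus a sess_max_count matching the two sessions built here:

from deepctr.models import DSIN  # assumed import path

x, y, feature_columns, behavior_feature_list = get_xy_fd(hash_flag=False)
model = DSIN(feature_columns, behavior_feature_list, sess_max_count=2)  # signature assumed
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(x, y, verbose=1, epochs=1, validation_split=0.5)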
Example #7
                    label=label)
    csv_to_tfrecord(val_filename,
                    output_filedir=os.path.join(save_dir, 'val_tfrecord'),
                    dense_feature_names=dense_feature_names,
                    sparse_feature_names=sparse_feature_names,
                    label=label)
    csv_to_tfrecord(test_filename,
                    output_filedir=os.path.join(save_dir, 'test_tfrecord'),
                    dense_feature_names=dense_feature_names,
                    sparse_feature_names=sparse_feature_names,
                    label=None)

    dense_feature_columns = [DenseFeat(feat) for feat in dense_feature_names]

    sparse_feature_columns = [
        SparseFeat(feat, vocab_dict[feat], embedding_dim=4)
        for feat in sparse_feature_names
    ]

    linear_feature_columns = dense_feature_columns + sparse_feature_columns
    dnn_feature_columns = dense_feature_columns + sparse_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    model = DeepFM(linear_feature_columns,
                   dnn_feature_columns,
                   dnn_hidden_units=[64, 64],
                   task='binary')

    model.compile(
Example #8
def NCF(user_feature_columns,
        item_feature_columns,
        user_gmf_embedding_dim=20,
        item_gmf_embedding_dim=20,
        user_mlp_embedding_dim=20,
        item_mlp_embedding_dim=20,
        dnn_use_bn=False,
        dnn_hidden_units=(64, 32),
        dnn_activation='relu',
        l2_reg_dnn=0,
        l2_reg_embedding=1e-6,
        dnn_dropout=0,
        init_std=0.0001,
        seed=1024):
    """Instantiates the NCF Model architecture.

    :param user_feature_columns: A dict mapping the user's feature names to their vocabulary sizes.
    :param item_feature_columns: A dict mapping the item's feature names to their vocabulary sizes.
    :param user_gmf_embedding_dim: int.
    :param item_gmf_embedding_dim: int.
    :param user_mlp_embedding_dim: int.
    :param item_mlp_embedding_dim: int.
    :param dnn_use_bn: bool. Whether to use BatchNormalization before activation in the deep net.
    :param dnn_hidden_units: list of positive integers or an empty list, the layer number and units in each layer of the deep net.
    :param dnn_activation: Activation function to use in the deep net.
    :param l2_reg_dnn: float. L2 regularizer strength applied to the DNN.
    :param l2_reg_embedding: float. L2 regularizer strength applied to the embedding vectors.
    :param dnn_dropout: float in [0,1), the probability of dropping a given DNN coordinate.
    :param init_std: float, the standard deviation used to initialize the embedding vectors.
    :param seed: integer, to use as random seed.
    :return: A Keras model instance.

    """

    # Align the GMF output widths of the user and item towers: scale both embedding dims so that
    # len(user_features) * user_dim == len(item_features) * item_dim (their least common multiple).
    user_dim = len(user_feature_columns) * user_gmf_embedding_dim
    item_dim = len(item_feature_columns) * item_gmf_embedding_dim
    dim = (user_dim * item_dim) // math.gcd(user_dim, item_dim)  # least common multiple
    user_gmf_embedding_dim = int(dim / len(user_feature_columns))
    item_gmf_embedding_dim = int(dim / len(item_feature_columns))

    # Generalized Matrix Factorization (GMF) Part
    user_gmf_feature_columns = [
        SparseFeat(feat,
                   vocabulary_size=size,
                   embedding_dim=user_gmf_embedding_dim)
        for feat, size in user_feature_columns.items()
    ]
    user_features = build_input_features(user_gmf_feature_columns)
    user_inputs_list = list(user_features.values())
    user_gmf_sparse_embedding_list, user_gmf_dense_value_list = input_from_feature_columns(
        user_features,
        user_gmf_feature_columns,
        l2_reg_embedding,
        init_std,
        seed,
        prefix='gmf_')
    user_gmf_input = combined_dnn_input(user_gmf_sparse_embedding_list, [])
    user_gmf_out = Lambda(lambda x: x,
                          name="user_gmf_embedding")(user_gmf_input)

    item_gmf_feature_columns = [
        SparseFeat(feat,
                   vocabulary_size=size,
                   embedding_dim=item_gmf_embedding_dim)
        for feat, size in item_feature_columns.items()
    ]
    item_features = build_input_features(item_gmf_feature_columns)
    item_inputs_list = list(item_features.values())
    item_gmf_sparse_embedding_list, item_gmf_dense_value_list = input_from_feature_columns(
        item_features,
        item_gmf_feature_columns,
        l2_reg_embedding,
        init_std,
        seed,
        prefix='gmf_')
    item_gmf_input = combined_dnn_input(item_gmf_sparse_embedding_list, [])
    item_gmf_out = Lambda(lambda x: x,
                          name="item_gmf_embedding")(item_gmf_input)

    gmf_out = Multiply()([user_gmf_out, item_gmf_out])

    # Multi-Layer Perceptron (MLP) Part
    user_mlp_feature_columns = [
        SparseFeat(feat,
                   vocabulary_size=size,
                   embedding_dim=user_mlp_embedding_dim)
        for feat, size in user_feature_columns.items()
    ]
    user_mlp_sparse_embedding_list, user_mlp_dense_value_list = input_from_feature_columns(
        user_features,
        user_mlp_feature_columns,
        l2_reg_embedding,
        init_std,
        seed,
        prefix='mlp_')
    user_mlp_input = combined_dnn_input(user_mlp_sparse_embedding_list,
                                        user_mlp_dense_value_list)
    user_mlp_out = Lambda(lambda x: x,
                          name="user_mlp_embedding")(user_mlp_input)

    item_mlp_feature_columns = [
        SparseFeat(feat,
                   vocabulary_size=size,
                   embedding_dim=item_mlp_embedding_dim)
        for feat, size in item_feature_columns.items()
    ]

    item_mlp_sparse_embedding_list, item_mlp_dense_value_list = input_from_feature_columns(
        item_features,
        item_mlp_feature_columns,
        l2_reg_embedding,
        init_std,
        seed,
        prefix='mlp_')
    item_mlp_input = combined_dnn_input(item_mlp_sparse_embedding_list,
                                        item_mlp_dense_value_list)
    item_mlp_out = Lambda(lambda x: x,
                          name="item_mlp_embedding")(item_mlp_input)

    mlp_input = Concatenate(axis=1)([user_mlp_out, item_mlp_out])
    mlp_out = DNN(dnn_hidden_units,
                  dnn_activation,
                  l2_reg_dnn,
                  dnn_dropout,
                  dnn_use_bn,
                  seed,
                  name="mlp_embedding")(mlp_input)

    # Fusion of GMF and MLP
    neumf_input = Concatenate(axis=1)([gmf_out, mlp_out])
    neumf_out = DNN(hidden_units=[1], activation='sigmoid')(neumf_input)
    output = Lambda(lambda x: x, name='neumf_out')(neumf_out)

    # output = PredictionLayer(task, False)(neumf_out)

    model = Model(inputs=user_inputs_list + item_inputs_list, outputs=output)

    return model
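A minimal usage sketch for the NCF factory defined above (feature names and vocabulary sizes are illustrative):

# hypothetical feature dicts: feature name -> vocabulary size
user_feature_columns = {'user_id': 1000, 'user_age': 8}
item_feature_columns = {'item_id': 5000}

model = NCF(user_feature_columns, item_feature_columns,
            user_gmf_embedding_dim=16, item_gmf_embedding_dim=16,
            dnn_hidden_units=(64, 32))
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])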
Example #9
def structural_feature(train, test):
    test['label'] = -1
    data = pd.concat([train, test], axis=0)
    '''Feature engineering >>>>>'''
    # data['year'] = data['date'].dt.year
    # data['month'] = data['date'].dt.month
    # data['day'] = data['date'].dt.day
    data['hour'] = data['date'].dt.hour
    del data['date']

    data['D1+D2'] = data['D1'] + data['D2']
    data['D1-D2'] = data['D1'] - data['D2']
    data['D1/D2'] = data['D1'] / data['D2']

    # data['A_sum'] = data['A1'] + data['A2'] + data['A3']
    data['B_sum'] = data['B1'] + data['B2'] + data['B3']
    # data['C_sum'] = data['C1'] + data['C2'] + data['C3']

    data['A_*'] = data['A1'] * data['A2'] * data['A3']
    data['B_*'] = data['B1'] * data['B2'] * data['B3']
    # data['C_*'] = data['C1'] * data['C2'] * data['C3']

    data['A_+'] = data['A1'] + data['A2'] + data['A3']
    data['B_+'] = data['B1'] + data['B2'] + data['B3']
    data['C_+'] = data['C1'] + data['C2'] + data['C3']

    normalization_columns = [
        'A1', 'A2', 'A3', 'B1', 'B2', 'B3', 'C1', 'C2', 'C3', 'E2', 'E3', 'E5',
        'E7', 'E9', 'E10', 'E13', 'E16', 'E17', 'E19', 'E21', 'E22'
    ]
    for column in normalization_columns:
        data[column] = (data[column] - data[column].min(axis=0)) / (
            data[column].max(axis=0) - data[column].min(axis=0))

    sparse_features = [
        'D1', 'D2', 'E4', 'E8', 'E11', 'E15', 'E18', 'E25', 'hour'
    ]
    dense_features = [
        'E1', 'E2', 'E3', 'E5', 'E6', 'E7', 'E9', 'E10', 'E12', 'E13', 'E14',
        'E16', 'E17', 'E19', 'E20', 'E21', 'E22', 'E23', 'E24',
        'A1', 'A2', 'A3', 'B1', 'B2', 'B3', 'C1', 'C2', 'C3'
    ]

    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )

    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field,and record dense feature field name

    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ] + [DenseFeat(
        feat,
        1,
    ) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)
    '''End of feature engineering <<<<'''

    train = data[data.label != -1].copy()
    test = data[data.label == -1].copy()
    del test['label']
    '''Move the label column to the end'''
    l = train['label']
    del train['label']
    train['label'] = l
    return train, test, feature_names, linear_feature_columns, dnn_feature_columns
Example #10
def main(args):

    if args.arch == 'xDeepFM':
        s = time.time()
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data',
                                'train_data')
        item = pd.read_csv(csv_file,
                           dtype={
                               'article_id': str,
                               'hh': int,
                               'gender': str,
                               'age_range': str,
                               'read_article_ids': str
                           },
                           sep='\t')
        label_data_path = os.path.join(
            DATASET_PATH, 'train',
            os.path.basename(os.path.normpath(csv_file)).split('_')[0] +
            '_label')
        label = pd.read_csv(label_data_path, dtype={'label': int}, sep='\t')
        item['label'] = label
        s = time.time()
        print(f'before test article preprocess : {len(item)}')

        sparse_features = [
            'article_id', 'hh', 'gender', 'age_range', 'len_bin'
        ]
        dense_features = ['image_feature', 'read_cnt_prob']
        target = ['label']

        ############################ make more feature !!!!!!! #################################
        ############## 1. read_article_ids len cnt -- user feature #################################################
        len_lis = []

        read_article_ids_all = item['read_article_ids'].tolist()
        for i in range(len(item)):
            li = read_article_ids_all[i]
            if type(li) == float:  # NaN: the user has no read history
                len_lis.append(0)
                continue
            len_li = len(li.split(','))
            len_lis.append(len_li)

        item['len'] = len_lis
        item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')

        id_to_artic = dict()
        artics = item['article_id'].tolist()

        ################ 2. read_cnt, total_cnt, prob_read_cnt --- article feature ####################################
        read_cnt = item[item['label'] == 1].groupby('article_id').agg(
            {'hh': 'count'})
        read_cnt = read_cnt.reset_index()
        read_cnt = read_cnt.rename(columns={'hh': 'read_cnt'})

        read_cnt_list = read_cnt['read_cnt'].tolist()
        read_cnt_artic_list = read_cnt['article_id'].tolist()
        print(f'len read_cnt : {len(read_cnt)}')
        print(read_cnt.head(3))

        total_cnt = item.groupby('article_id').agg({'hh': 'count'})
        total_cnt = total_cnt.reset_index()
        total_cnt = total_cnt.rename(columns={'hh': 'read_cnt'})  # note: named 'read_cnt' but holds the total impression count
        total_cnt_list = total_cnt['read_cnt'].tolist()
        total_cnt_artic_list = total_cnt['article_id'].tolist()
        print(f'len read_cnt : {len(total_cnt)}')
        print(total_cnt.head(3))

        # lit # test_article_ids list
        lit_cnt = []
        lit_total_cnt = []
        lit_cnt_prob = []
        lit = list(set(artics))
        lit.sort()
        print(lit[:10])
        print(f'len(lit):{len(lit)}')
        # note: these inner scans are O(len(lit) * len(read_cnt)); a dict {article_id: count} would be faster
        for i in range(len(lit)):
            # lit_cnt
            cur_artic = lit[i]
            if cur_artic not in read_cnt_artic_list:
                lit_cnt.append(0)
            else:
                for j in range(len(read_cnt_artic_list)):
                    if cur_artic == read_cnt_artic_list[j]:
                        lit_cnt.append(read_cnt_list[j])
                        break
            # lit_total_cnt
            if cur_artic not in total_cnt_artic_list:
                lit_total_cnt.append(0)
            else:
                for j in range(len(total_cnt_artic_list)):
                    if cur_artic == total_cnt_artic_list[j]:
                        lit_total_cnt.append(total_cnt_list[j])
                        break
            # lit_cnt_prob
            if lit_total_cnt[i] == 0:
                lit_cnt_prob.append(0)
            else:
                lit_cnt_prob.append(lit_cnt[i] / lit_total_cnt[i])
        print('--- read_cnt article feature completed ---')
        print(f'lit_cnt {len(lit_cnt)}')
        print(f'lit_total_cnt {len(lit_total_cnt)}')
        print(f'lit_cnt_prob {len(lit_cnt_prob)}')

        #### fea
        print('feature dict generate')
        file_list1 = os.listdir(DATASET_PATH)
        file_list2 = os.listdir(DATASET_PATH + '/train')
        file_list3 = os.listdir(DATASET_PATH + '/train/train_data')

        print(file_list1)
        print(file_list2)
        print(file_list3)
        resnet_feature_extractor(args.mode)

        print(file_list1)
        print(file_list2)
        print(file_list3)

        # One hot Encoding
        with open(os.path.join('train_image_features_50.pkl'), 'rb') as handle:
            image_feature_dict = pickle.load(handle)

        print('check artic feature')
        print(f"757518f4a3da : {image_feature_dict['757518f4a3da']}")

        lbe = LabelEncoder()
        lbe.fit(lit)
        item['article_id' + '_onehot'] = lbe.transform(item['article_id'])
        print(lbe.classes_)

        for feat in sparse_features[1:]:
            lbe = LabelEncoder()
            item[feat + '_onehot'] = lbe.fit_transform(
                item[feat])  # must also verify that the re-encoded labels stay consistent

        print(item.head(10))
        print('columns name : ', item.columns)
        fixlen_feature_columns = [SparseFeat('article_id', len(lit))]
        fixlen_feature_columns += [
            SparseFeat(feat, item[feat + '_onehot'].nunique())
            for feat in sparse_features[1:]
        ]
        fixlen_feature_columns += [
            DenseFeat('image_feature', len(image_feature_dict[artics[0]]))
        ]
        fixlen_feature_columns += [DenseFeat('read_cnt_prob', 1)]

        print(f'fixlen_feature_columns : {fixlen_feature_columns}')
        idx_artics_all = item['article_id' + '_onehot'].tolist()

        for i in range(len(artics)):
            idx_artic = idx_artics_all[i]
            if idx_artic not in id_to_artic.keys():
                id_to_artic[idx_artic] = artics[i]

        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns
        fixlen_feature_names = get_fixlen_feature_names(
            linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names
        model = xDeepFM(linear_feature_columns,
                        dnn_feature_columns,
                        task='binary')
        print('---model defined---')
        print(time.time() - s, 'seconds')

        ##### print need

        for artic in lit:
            print(artic, end=',')
        print()
        print('new')
        print()

        print(len(lit_cnt_prob))
        for prob in lit_cnt_prob:
            prob = round(prob, 4)
            print(prob, end=',')
        print()
        print('end')
        print('--------------')

    optimizer = tf.keras.optimizers.Adam(args.lr)
    s = time.time()

    # negative sampling
    item_pos = item[item['label'] == 1]
    item_neg = item[item['label'] == 0]

    dn_1 = item_neg.sample(n=3 * len(item_pos), random_state=42)
    dn_2 = item_neg.sample(n=3 * len(item_pos), random_state=20)
    dn_3 = item_neg.sample(n=3 * len(item_pos), random_state=7)
    dn_4 = item_neg.sample(n=3 * len(item_pos), random_state=33)
    dn_5 = item_neg.sample(n=3 * len(item_pos), random_state=41)

    dn_1.reset_index()

    data_1 = pd.concat([dn_1, item_pos]).sample(frac=1,
                                                random_state=42).reset_index()
    data_1_article_idxs = data_1['article_id_onehot'].tolist()
    data_1_article = data_1['article_id'].tolist()
    print(f'len data_1 : {len(data_1)}')
    print(data_1.head(5))
    li1 = []
    li2 = []
    li3 = []
    for i in range(len(data_1_article)):
        for j in range(len(lit_cnt_prob)):
            if data_1_article[i] == lit[j]:
                li3.append(lit_cnt_prob[j])
                break
    data_1['read_cnt_prob'] = li3
    print('---read_cnt_prob end---')
    ## preprocess append

    data_2 = pd.concat([dn_2, item_pos]).sample(frac=1,
                                                random_state=42).reset_index()
    data_3 = pd.concat([dn_3, item_pos]).sample(frac=1,
                                                random_state=42).reset_index()
    data_4 = pd.concat([dn_4, item_pos]).sample(frac=1,
                                                random_state=42).reset_index()
    data_5 = pd.concat([dn_5, item_pos]).sample(frac=1,
                                                random_state=42).reset_index()

    li = []
    for i in range(len(data_1_article_idxs)):
        image_feature = image_feature_dict[id_to_artic[data_1_article_idxs[i]]]
        li.append(image_feature)
    print(f'article_id : {data_1_article[0]}')
    print(f'article_image_feature : {image_feature_dict[data_1_article[0]]}')

    data_1['image_feature'] = li
    li = []
    print(f'finished data_1_image_feature : {time.time() - s} sec')

    if use_nsml:
        bind_nsml(model, optimizer, args.task)
    if args.pause:
        nsml.paused(scope=locals())

    if (args.mode == 'train') or args.dry_run:
        best_loss = 1000
        if args.dry_run:
            print('start dry-running...!')
            args.num_epochs = 1
        else:
            print('start training...!')

        model.compile(
            tf.keras.optimizers.Adam(args.lr),
            'mse',
            metrics=['accuracy'],
        )
        train_generator = data_generator(data_1)
        lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)

        # drop the checkpoint callback when doing k-fold
        save_cbk = CustomModelCheckpoint()

        history = model.fit_generator(train_generator,
                                      epochs=100,
                                      verbose=2,
                                      workers=8,
                                      steps_per_epoch=np.ceil(
                                          len(data_1) / 2048),
                                      callbacks=[lr_scheduler, save_cbk])
        print('again')
Example #11
                               scaler,
                               splits=1,
                               feats=feats,
                               batch_size=2048,
                               shuffle=False,
                               debug=False,
                               use_cache=False)

    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]

    # 2.count #unique features for each sparse field,and record dense feature field name

    fixlen_feature_columns = [
        SparseFeat(feat,
                   vocabulary_size=len(encoder.get_labels(feat)),
                   embedding_dim=4) for i, feat in enumerate(sparse_features)
    ] + [DenseFeat(
        feat,
        1,
    ) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    # 3.generate input data for model

    #train, test = train_test_split(data, test_size=0.2)
Example #12
def get_xy_fd(use_neg=False, hash_flag=False):
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item_id', 3 + 1, embedding_dim=8, use_hash=hash_flag),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('pay_score', 1)
    ]

    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=8,
                                    embedding_name='item_id'),
                         maxlen=4,
                         length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_cate_id',
                                    2 + 1,
                                    embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4,
                         length_name="seq_length")
    ]

    behavior_feature_list = ["item_id", "cate_id"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    cate_id = np.array([1, 2, 2])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]])

    behavior_length = np.array([3, 3, 2])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item_id': iid,
        'cate_id': cate_id,
        'hist_item_id': hist_iid,
        'hist_cate_id': hist_cate_id,
        'pay_score': score,
        "seq_length": behavior_length
    }

    if use_neg:
        feature_dict['neg_hist_item_id'] = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_cate_id'] = np.array([[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]])
        feature_columns += [
            VarLenSparseFeat(SparseFeat('neg_hist_item_id',
                                        vocabulary_size=3 + 1,
                                        embedding_dim=8,
                                        embedding_name='item_id'),
                             maxlen=4,
                             length_name="seq_length"),
            VarLenSparseFeat(SparseFeat('neg_hist_cate_id',
                                        2 + 1,
                                        embedding_dim=4,
                                        embedding_name='cate_id'),
                             maxlen=4,
                             length_name="seq_length")
        ]

    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
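A hedged usage sketch for the fixture above, following the standard DIN flow (with use_neg=True the same inputs would feed DIEN instead):

from deepctr.models import DIN

x, y, feature_columns, behavior_feature_list = get_xy_fd(use_neg=False)
model = DIN(feature_columns, behavior_feature_list)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(x, y, verbose=1, epochs=1, validation_split=0.5)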
Example #13
    item_profile = data[["movie_id"]].drop_duplicates('movie_id')

    user_profile.set_index("user_id", inplace=True)

    user_item_list = data.groupby("user_id")['movie_id'].apply(list)

    train_set, test_set = gen_data_set(data, negsample)

    train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)
    test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)

    # 2.count #unique features for each sparse field and generate feature config for sequence feature

    embedding_dim = 16

    user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
                            SparseFeat("gender", feature_max_idx['gender'], embedding_dim),
                            SparseFeat("age", feature_max_idx['age'], embedding_dim),
                            SparseFeat("occupation", feature_max_idx['occupation'], embedding_dim),
                            SparseFeat("zip", feature_max_idx['zip'], embedding_dim),
                            VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,
                                                        embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'),
                            ]

    item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]

    # 3.Define Model and train

    model = DSSM(user_feature_columns, item_feature_columns)  # FM(user_feature_columns,item_feature_columns)

    model.compile(optimizer='adagrad', loss="binary_crossentropy")
Example #14
File: main.py  Project: Xierry/DeepCTRCVR
        'pay_score':
        np.array([0.1, 0.2, 0.3, 0.2] * n_copy),
        'context':
        np.array([0, 1, 0, 1] * n_copy),
        # 'seq_length': np.array([3, 4, 2, 2])
    }

    y_ctr = np.array([1, 1, 1, 0] * n_copy)
    y_cvr = np.array([1, 0, 1, 0] * n_copy)
    y_ctcvr = np.array([1, 0, 1, 0] * n_copy)

    # User features
    user_feature_columns = [
        DenseFeat('pay_score', dimension=1),
        SparseFeat('user',
                   vocabulary_size=len(np.unique(X["user"])),
                   embedding_dim=embedding_dim),
        SparseFeat('gender',
                   vocabulary_size=len(np.unique(X["gender"])),
                   embedding_dim=embedding_dim),
        VarLenSparseFeat(  # value 0 is the padding index and is filtered out automatically
            SparseFeat('hist_item_id',
                       vocabulary_size=len(np.unique(X["hist_item_id"][0])),
                       embedding_dim=embedding_dim,
                       embedding_name='item_id'),
            maxlen=len(X["hist_item_id"][0]),
            combiner="max",  # "mean", "sum"
            length_name=None,  # length_name="seq_length"
        ),
        VarLenSparseFeat(
            SparseFeat('hist_cate_id',
Example #15
                       num_iteration=gbm.best_iteration,
                       pred_leaf=True)
print('Writing transformed training data')
transformed_training_matrix = np.zeros(
    [len(lgb_pred), len(lgb_pred[0]) * num_leaves],
    dtype=np.int64)  # N * num_trees * num_leaves
for i in range(0, len(lgb_pred)):
    temp = np.arange(len(lgb_pred[0])) * num_leaves + np.array(lgb_pred[i])
    transformed_training_matrix[i][temp] += 1

print('deep training...')

lgb_feat = pd.DataFrame(transformed_training_matrix.tolist())
lgb_feat.columns = [str(i) for i in lgb_feat.columns]
fixlen_feature_columns = [
    SparseFeat(feat, lgb_feat[feat].nunique()) for feat in lgb_feat.columns
]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns +
                                                dnn_feature_columns)
train_model_input = [lgb_feat[name] for name in fixlen_feature_names]
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile(
    "adam",
    loss=losses.mae,
    metrics=['accuracy', 'mse'],
)
history = model.fit(
    train_model_input,
    y_train.values,
Example #16
File: utils.py  Project: zwcdp/DeepMatch
def get_test_data(sample_size=1000,
                  embedding_size=4,
                  sparse_feature_num=1,
                  dense_feature_num=1,
                  sequence_feature=['sum', 'mean', 'max', 'weight'],
                  classification=True,
                  include_length=False,
                  hash_flag=False,
                  prefix='',
                  use_group=False):
    feature_columns = []
    model_input = {}

    if 'weight' in sequence_feature:
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + "weighted_seq",
                                        vocabulary_size=2,
                                        embedding_dim=embedding_size),
                             maxlen=3,
                             length_name=prefix + "weighted_seq" +
                             "_seq_length",
                             weight_name=prefix + "weight"))
        s_input, s_len_input = gen_sequence(2, 3, sample_size)

        model_input[prefix + "weighted_seq"] = s_input
        model_input[prefix + 'weight'] = np.random.randn(sample_size, 3, 1)
        model_input[prefix + "weighted_seq" + "_seq_length"] = s_len_input
        sequence_feature.pop(sequence_feature.index('weight'))

    for i in range(sparse_feature_num):
        if use_group:
            group_name = str(i % 3)
        else:
            group_name = DEFAULT_GROUP_NAME
        dim = np.random.randint(1, 10)
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i),
                       dim,
                       embedding_size,
                       use_hash=hash_flag,
                       dtype=tf.int32,
                       group_name=group_name))

    for i in range(dense_feature_num):
        feature_columns.append(
            DenseFeat(prefix + 'dense_feature_' + str(i), 1, dtype=tf.float32))
    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + 'sequence_' + mode,
                                        vocabulary_size=dim,
                                        embedding_dim=embedding_size),
                             maxlen=maxlen,
                             combiner=mode))

    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input[fc.name] = np.random.randint(0, fc.vocabulary_size,
                                                     sample_size)
        elif isinstance(fc, DenseFeat):
            model_input[fc.name] = np.random.random(sample_size)
        else:
            s_input, s_len_input = gen_sequence(fc.vocabulary_size, fc.maxlen,
                                                sample_size)
            model_input[fc.name] = s_input
            if include_length:
                fc.length_name = prefix + "sequence_" + str(i) + '_seq_length'
                model_input[prefix + "sequence_" + str(i) +
                            '_seq_length'] = s_len_input

    if classification:
        y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)

    return model_input, y, feature_columns
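A hedged usage sketch for the random-data helper above (the model choice and hyper-parameters are illustrative; the weighted-sequence path is skipped for simplicity):

from deepctr.models import DeepFM  # assumed model choice

model_input, y, feature_columns = get_test_data(sample_size=100,
                                                sparse_feature_num=2,
                                                dense_feature_num=2,
                                                sequence_feature=['mean'])
model = DeepFM(feature_columns, feature_columns, dnn_hidden_units=[8, 8])
model.compile('adam', 'binary_crossentropy')
model.fit(model_input, y, batch_size=32, epochs=1, verbose=0)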
Example #17
    'C17', 'C18', 'C19', 'C21'
]

train_features = feature1 + feature2 + feature3
for feature in train_features:
    encoder = LabelEncoder()
    train_data[feature] = encoder.fit_transform(train_data[feature])
target = ['click']

from deepctr.models import DeepFM
from deepctr.inputs import SparseFeat, get_feature_names

# Count the number of distinct values in each feature (used here to size the hashing space)
fixlen_feature_column1 = [
    SparseFeat(name=feature,
               vocabulary_size=int(train_data[feature].nunique() * 0.01),
               embedding_dim=4,
               use_hash=True) for feature in feature1
]
fixlen_feature_column2 = [
    SparseFeat(name=feature,
               vocabulary_size=int(train_data[feature].nunique() * 0.05),
               embedding_dim=4,
               use_hash=True) for feature in feature2
]

fixlen_feature_column3 = [
    SparseFeat(name=feature,
               vocabulary_size=train_data[feature].nunique(),
               embedding_dim=4,
               use_hash=False) for feature in feature3
]
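A minimal continuation sketch: merge the three column groups and derive the model input names (the training step would then follow the usual DeepFM flow suggested by the import above):

fixlen_feature_columns = fixlen_feature_column1 + fixlen_feature_column2 + fixlen_feature_column3
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)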
Example #18
for feat in sparse_features:
    label_enc = LabelEncoder()
    data[feat] = label_enc.fit_transform(data[feat])

feats = [i for i in data.columns if i != 'Rating']
X = data[feats]
y = data['Rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

sparse_features = [
    'UserID', 'MovieID', 'Gender', 'Occupation', 'day', 'weekday'
]
dense_features = ['hour', 'Age']

fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(),embedding_dim=4) for feat in sparse_features] + \
                         [DenseFeat(feat, 1) for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

model = DeepFM(linear_feature_columns, dnn_feature_columns, task='multiclass')  # note: with an MSE loss on ratings, task='regression' would be the more natural choice
model.compile('adam', 'mse', metrics=['accuracy'])

feature_names = get_feature_names(fixlen_feature_columns)

train_feed_dict = {name: X_train[name] for name in feature_names}
test_feed_dict = {name: X_test[name] for name in feature_names}

model.fit(train_feed_dict,
          y_train,
Example #19
    user_item_list = data.groupby("user_id")['movie_id'].apply(list)

    train_set, test_set = gen_data_set(data, negsample)

    train_model_input, train_label = gen_model_input(train_set, user_profile,
                                                     SEQ_LEN)
    test_model_input, test_label = gen_model_input(test_set, user_profile,
                                                   SEQ_LEN)

    # 2.count #unique features for each sparse field and generate feature config for sequence feature

    embedding_dim = 16

    user_feature_columns = [
        SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
        SparseFeat("gender", feature_max_idx['gender'], embedding_dim),
        SparseFeat("age", feature_max_idx['age'], embedding_dim),
        SparseFeat("occupation", feature_max_idx['occupation'], embedding_dim),
        SparseFeat("zip", feature_max_idx['zip'], embedding_dim),
        VarLenSparseFeat(
            SparseFeat('hist_movie_id',
                       feature_max_idx['movie_id'],
                       embedding_dim,
                       embedding_name="movie_id"), SEQ_LEN, 'mean',
            'hist_len'),
    ]

    item_feature_columns = [
        SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)
    ]
Example #20
def loadData(trainFile,
             testFile,
             embedding_dim,
             multivalue_len,
             multiClass=False):
    train = pd.read_csv(trainFile)
    test = pd.read_csv(testFile)

    ## 1. feature type declaration
    sparse_features = [
        "BaseAdGroupId", "Criteria", 'placementType', 'Week', 'IsRestrict',
        'IsNegative', 'AccountTimeZone', 'AccountCurrencyCode',
        'BiddingStrategyType', 'CampaignId', 'Month'
    ]

    dense_features = [
        'adClicks', 'adConversions', 'adCtr', 'adConversionRate',
        'adActiveViewImpressions', 'adActiveViewMeasurability',
        'adActiveViewMeasurableCost', 'adActiveViewViewability',
        'adImpressions', 'adActiveViewCpm', 'adAverageCpc', 'adAverageCpe',
        'adCpcBid', 'adActiveViewMeasurableImpressions', 'adActiveViewCtr',
        'adAverageCpm', 'adAverageCpv', 'adCost', 'plaClicks',
        'plaConversions', 'plaCtr', 'plaConversionRate',
        'plaActiveViewImpressions', 'plaActiveViewMeasurability',
        'plaActiveViewMeasurableCost', 'plaActiveViewViewability',
        'plaImpressions', 'plaCpcBid', 'plaActiveViewMeasurableImpressions',
        'plaActiveViewCtr', 'plaActiveViewCpm', 'plaAverageCpc',
        'plaAverageCpe', 'plaAverageCpm', 'plaAverageCpv', 'plaCost',
        'histListLen'
    ]
    multivalue_features = [
        'locationName', 'languageCode', 'hist_BaseAdGroupId'
    ]
    sparse_features = ["BaseAdGroupId", "Criteria", 'placementType']
    target = ['Ctr']

    # 2. Missing value process.
    train[sparse_features +
          multivalue_features] = train[sparse_features +
                                       multivalue_features].fillna('-1', )
    train[dense_features + target] = train[dense_features + target].fillna(0, )
    test[sparse_features +
         multivalue_features] = test[sparse_features +
                                     multivalue_features].fillna('-1', )
    test[dense_features + target] = test[dense_features + target].fillna(0, )

    train["BaseAdGroupId"] = train["BaseAdGroupId"].apply(lambda x: str(
        (int(x))))

    test["BaseAdGroupId"] = test["BaseAdGroupId"].apply(lambda x: str(
        (int(x))))

    # 3. sparse features transformation
    # fit the encoder on both splits so that train and test share a consistent encoding
    for feat in sparse_features:
        lbe = LabelEncoder()
        lbe.fit(pd.concat([train[feat], test[feat]], ignore_index=True))
        train[feat] = lbe.transform(train[feat])
        test[feat] = lbe.transform(test[feat])

    # 4. dense features transformation
    for numFeature in dense_features:
        train[numFeature] = train[numFeature].apply(
            lambda x: x if x < 2 else math.sqrt(math.log(x)))
        test[numFeature] = test[numFeature].apply(
            lambda x: x if x < 2 else math.sqrt(math.log(x)))

    # 5. multivalue features transformation
    # (exec creates one <feat>_train_list / <feat>_test_list variable per multi-value feature;
    #  the <feat>Dict vocabularies are assumed to be defined in the truncated part of the file)
    for feat in multivalue_features:
        exec(
            '{}_train_list = list([split(x,{}Dict) for x in train[feat].values])'
            .format(feat, feat, feat))
        exec(
            '{}_test_list = list([split(x,{}Dict) for x in test[feat].values])'
            .format(feat, feat, feat))

        exec('{}_length = np.array(list(map(len, {}_train_list)))'.format(
            feat, feat))
        exec('{}_maxlen = max({}_length)'.format(feat, feat))

        exec(
            '{}_train_list = pad_sequences({}_train_list, maxlen=multivalue_len, padding="post",)'
            .format(feat, feat, feat))
        exec(
            '{}_test_list = pad_sequences({}_test_list, maxlen=multivalue_len, padding="post",)'
            .format(feat, feat, feat))

    # 6. feature colums
    fixlen_feature_columns = [
        SparseFeat(feat,
                   vocabulary_size=(train[feat].append(
                       test[feat], ignore_index=True)).nunique(),
                   embedding_dim=embedding_dim)
        for i, feat in enumerate(sparse_features)
    ] + [DenseFeat(
        feat,
        1,
    ) for feat in dense_features]

    varlen_feature_columns = []
    for feat in multivalue_features:
        exec(
            'varlen_feature_columns.append(VarLenSparseFeat("{}", maxlen= multivalue_len,vocabulary_size=len({}Dict) + 1,embedding_dim=embedding_dim, combiner="mean",weight_name=None))'
            .format(str(feat), feat))

    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    # 7.generate input data for model
    train_model_input = {
        name: train[name]
        for name in sparse_features + dense_features
    }
    test_model_input = {
        name: test[name]
        for name in sparse_features + dense_features
    }

    for feat in multivalue_features:
        name = str(feat)
        exec('train_model_input["{}"] = {}_train_list'.format(name, feat))
        exec('test_model_input["{}"] = {}_test_list'.format(name, feat))

    behavior_feature_list = ["BaseAdGroupId"]
    return train_model_input, train, test_model_input, test, dnn_feature_columns, linear_feature_columns, behavior_feature_list
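The exec-based bookkeeping in steps 5-7 can be replaced by plain dictionaries; a hedged sketch under the same assumptions (the split helper and the per-feature <feat>Dict vocabularies exist at module level, looked up here via globals() to make the assumption explicit):

# alternative to the exec-based section above, keeping padded sequences in dicts
vocab_dicts = {feat: globals()[feat + 'Dict'] for feat in multivalue_features}

multivalue_train_lists, multivalue_test_lists = {}, {}
for feat in multivalue_features:
    multivalue_train_lists[feat] = pad_sequences(
        [split(x, vocab_dicts[feat]) for x in train[feat].values],
        maxlen=multivalue_len, padding='post')
    multivalue_test_lists[feat] = pad_sequences(
        [split(x, vocab_dicts[feat]) for x in test[feat].values],
        maxlen=multivalue_len, padding='post')

varlen_feature_columns = [
    VarLenSparseFeat(feat, maxlen=multivalue_len,
                     vocabulary_size=len(vocab_dicts[feat]) + 1,
                     embedding_dim=embedding_dim, combiner='mean', weight_name=None)
    for feat in multivalue_features
]

for feat in multivalue_features:
    train_model_input[feat] = multivalue_train_lists[feat]
    test_model_input[feat] = multivalue_test_lists[feat]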
Example #21
    dense_features = [f for f in dense_features if f != 'target']
    
    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['target']

    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field,and record dense feature field name

    fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique())
                           for feat in sparse_features] + [DenseFeat(feat, 1,)
                          for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    fixlen_feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3.generate input data for model

    train, test = train_test_split(data, test_size=0.33)
    train_model_input = [train[name] for name in fixlen_feature_names]

    test_model_input = [test[name] for name in fixlen_feature_names]

    # 4.Define Model,train,predict and evaluate
Example #22
File: flen.py  Project: losenineai/rs
                      site_id='context',
                      site_domain='context',
                      site_category='context',
                      app_id='item',
                      app_domain='item',
                      app_category='item',
                      device_model='user',
                      device_type='user',
                      device_conn_type='context',
                      hour='context',
                      device_id='user')

    fixlen_feature_columns = [
        SparseFeat(name,
                   vocabulary_size=data[name].nunique(),
                   embedding_dim=16,
                   use_hash=False,
                   dtype='int32',
                   group_name=field_info[name]) for name in sparse_features
    ]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    # 3.generate input data for model

    train, test = train_test_split(data, test_size=0.2)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
Example #23
    label_number.append(label_feature_number1)
    label_number.append(label_feature_number2)

    Y1 = data[target1].values
    Y2 = data[target2].values
    encoder = LabelEncoder()
    encoded_Y1 = encoder.fit_transform(Y1)
    encoded_Y2 = encoder.fit_transform(Y2)

    dummy_target1 = np_utils.to_categorical(encoded_Y1)
    dummy_target2 = np_utils.to_categorical(encoded_Y2)

    # 2.count #unique features for each sparse field,and record dense feature field name

    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ] + [DenseFeat(
        feat,
        1,
    ) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    # 3.generate input data for model
    train, test, target1_train, target1_test, target2_train, target2_test = train_test_split(
        data, dummy_target1, dummy_target2, test_size=0.4, random_state=0)
Example #24
    item_profile = data[["movie_id"]].drop_duplicates('movie_id')

    user_profile.set_index("user_id", inplace=True)
    #
    # user_item_list = data.groupby("user_id")['movie_id'].apply(list)

    train_set, test_set = gen_data_set_sdm(data, seq_short_len=SEQ_LEN_short, seq_prefer_len=SEQ_LEN_prefer)

    train_model_input, train_label = gen_model_input_sdm(train_set, user_profile, SEQ_LEN_short, SEQ_LEN_prefer)
    test_model_input, test_label = gen_model_input_sdm(test_set, user_profile, SEQ_LEN_short, SEQ_LEN_prefer)

    # 2.count #unique features for each sparse field and generate feature config for sequence feature

    embedding_dim = 32
    # for SDM, we must provide `VarLenSparseFeat`s named "prefer_xxx" and "short_xxx", together with their lengths
    user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], 16),
                            SparseFeat("gender", feature_max_idx['gender'], 16),
                            SparseFeat("age", feature_max_idx['age'], 16),
                            SparseFeat("occupation", feature_max_idx['occupation'], 16),
                            SparseFeat("zip", feature_max_idx['zip'], 16),
                            VarLenSparseFeat(SparseFeat('short_movie_id', feature_max_idx['movie_id'], embedding_dim,
                                                        embedding_name="movie_id"), SEQ_LEN_short, 'mean',
                                             'short_sess_length'),
                            VarLenSparseFeat(SparseFeat('prefer_movie_id', feature_max_idx['movie_id'], embedding_dim,
                                                        embedding_name="movie_id"), SEQ_LEN_prefer, 'mean',
                                             'prefer_sess_length'),
                            VarLenSparseFeat(SparseFeat('short_genres', feature_max_idx['genres'], embedding_dim,
                                                        embedding_name="genres"), SEQ_LEN_short, 'mean',
                                             'short_sess_length'),
                            VarLenSparseFeat(SparseFeat('prefer_genres', feature_max_idx['genres'], embedding_dim,
                                                        embedding_name="genres"), SEQ_LEN_prefer, 'mean',
Example #25
    'label': 't_location',
    'len': t_loc_len,
    'map': locmap,
    'weight': None
})
var_info.append({
    'label': 'rs_channel',
    'len': rs_channel_len,
    'map': rschannlemap,
    'weight': None
})

# define model
emb_size = 32
fixlen_feature_columns = [
    SparseFeat(feat, vocabulary_size=len(vocabs[feat]), embedding_dim=emb_size)
    for feat in sparse_features
] + [DenseFeat(
    feat,
    1,
) for feat in dense_features]
varlen_feature_columns = [
    VarLenSparseFeat(SparseFeat(item['label'],
                                vocabulary_size=len(item['map']) + 1,
                                embedding_dim=emb_size),
                     maxlen=item['len'],
                     combiner='mean',
                     weight_name=item['weight']) for item in var_info
]
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
Example #26
    data = pd.read_csv('./criteo_sample.txt')

    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]

    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['label']

    # 1.do simple Transformation for dense features
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.set hashing space for each sparse field,and record dense feature field name

    fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=1000, embedding_dim=4, use_hash=True,
                                         dtype='string')  # since the input is string
                              for feat in sparse_features] + [DenseFeat(feat, 1, )
                                                              for feat in dense_features]

    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns, )

    # 3.generate input data for model

    train, test = train_test_split(data, test_size=0.2)

    train_model_input = {name:train[name] for name in feature_names}
    test_model_input = {name:test[name] for name in feature_names}
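    # Hedged continuation sketch (not part of the original snippet): train DeepFM on the
    # hashed features and evaluate; the DeepFM and scikit-learn imports are assumed.
    from deepctr.models import DeepFM
    from sklearn.metrics import log_loss, roc_auc_score

    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
    model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'])
    model.fit(train_model_input, train[target].values,
              batch_size=256, epochs=10, verbose=2, validation_split=0.2)
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))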

示例#27
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from deepctr.models import DeepFM
from deepctr.inputs import SparseFeat, get_feature_names

# Load the data
# data = pd.read_csv("movielens_sample.txt")
data = pd.read_csv("movielens_sample_my.csv")

sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
target = ['rating']

# Label-encode the sparse features
for feature in sparse_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature])
# Count the number of distinct values of each sparse feature
fixlen_feature_columns = [
    SparseFeat(feature, data[feature].nunique()) for feature in sparse_features
]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# Split the data into training and test sets
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name: train[name].values for name in feature_names}
test_model_input = {name: test[name].values for name in feature_names}

# Train a DeepFM model for rating regression
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile(
    "adam",
    "mse",
示例#28
def run(data, ziel, line0, grid, loop):
    poi_feature_transfer = []
    print('++++', '\n', grid)
    for a in range(len(poi_feature)):
        poi_feature_transfer.append('poi_feature_%d'%a)
        data = data.rename(columns={poi_feature[a]: 'poi_feature_%d'%a})

    features = ['provname', 'prefname', 'cntyname', 'townname', 'villname', 'dispincm', 'urbcode_1', 'hauslvl'] + poi_feature_transfer
    sparse_features = []
    dense_features = []
    for f in features:
        if f not in x_category or x_category[f] == 1:
            dense_features.append(f)
        else:
            sparse_features.append(f)
    data[sparse_features] = data[sparse_features].fillna(-1)
    data[dense_features] = data[dense_features].fillna(0)

    y = []
    # ziel is the target column (e.g. villmean or income); line0 gives the thresholds used for binning
    y_limit = [np.min(data[ziel]) - 1] + line0 + [np.max(data[ziel])]
    for index, row in data.iterrows():
        for i in range(1, len(y_limit)):
            if y_limit[i - 1] < row[ziel] <= y_limit[i]:
                y.append(i-1)
                break
    data['income_0'] = y
    target = ['income_0']

    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field,and record dense feature field name
    fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique()) for feat in sparse_features] + \
                             [DenseFeat(feat, 1,)for feat in dense_features]
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    fixlen_feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3.generate input data for model
    train, test = train_test_split(data, test_size=0.2)

    # try to oversampling
   # (train_x,train_y)=over_sampling(train[features],train[ziel], 3)
   # train = (np.column_stack((train_x, train_y)))
    train_model_input = [train[name] for name in fixlen_feature_names]
    test_model_input = [test[name] for name in fixlen_feature_names]
    # 4. Define models, train, predict and evaluate
    (models, model_names, xlabel) = model_gridsearch(linear_feature_columns, dnn_feature_columns, grid)
    logloss, auc1, acc1, pre1, recall1, f11 = [], [], [], [], [], []
    print(ziel, line0, len(data))
    for name, model in zip(model_names, models):
        ll_avg, auc_avg = [], []
        for i in range(loop):
            model.compile("adam",'binary_crossentropy',
                          metrics=['binary_crossentropy'])
            history = model.fit(train_model_input, train[target].values,
                                batch_size=256, epochs=10, verbose=0, validation_split=0.2, )
            pred_ans = model.predict(test_model_input, batch_size=256)

            true = test[target].values
            '''
            f = open("pred.csv", 'a', encoding='utf_8_sig')
            f.write('%s\n'%(ziel))
            for i in range(len(pred_ans)):
                f.write('%s, %s\n' % (pred_ans[i],true[i] ))
            f.close()'''


            ll = round(log_loss(test[target].values, pred_ans), 4)
            auc = round(roc_auc_score(test[target].values, pred_ans), 4)
            #acc = round(accuracy_score(test[target].values, pred_ans.round()), 4)
            #pre = round(precision_score(test[target].values, pred_ans.round()), 4)
            #recall = round(recall_score(test[target].values, pred_ans.round()), 4)
            #f1 = round(f1_score(test[target].values, pred_ans.round(), average='weighted'),4)
            #spec = round(specificity_score(test[target].values, pred_ans.round(), average='weighted'),4)
            #sens = round(sensitivity_score(test[target].values, pred_ans.round(), average='weighted'),4)
            print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
            print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
            ll_avg.append(ll), auc_avg.append(auc)
        logloss.append(np.mean(ll_avg)), auc1.append(np.mean(auc_avg))#, acc1.append(acc), pre1.append(pre), recall1.append(recall), f11.append(f1)

        '''
        cm = confusion_matrix(test[target].values, pred_ans.round())
        cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        cm = []
        for m in range(len(line0)+1):
            cm.append([])
            for n in range(len(line0)+1):
                cm[m].append(round(cm_normalized[m][n],4))
        '''
        '''
        print(name)
        print("LogLoss", ll, end=' ')
        print("AUC", auc, end=' ')
        print("accuracy", acc, end=' ')
        #print("precision" , pre, end=' ')
        #print("recall", recall, end=' ')
        print("f1" , f1, end=' ')
        print("spec", spec, end=' ')
        print("sens" , sens, end=' ')
        print(cm)
        #f = open("DeepFM.csv", 'a', encoding='utf_8_sig')
        #f.write('%s,%s\n'%(ziel,line0))
        #f.write('%s, %s, %s, %s, %s, %s, %s,' % (name, ll, auc, acc, f1, spec, sens))
        #f.write('%s\n' % str(cm).replace(',',';'))
        #f.close()
        '''
    return (logloss, auc1, xlabel)
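# Hypothetical invocation sketch (not part of the original snippet). The exact format of
# `grid` depends on model_gridsearch, which is not shown, so the call is only illustrative:
# logloss, auc1, xlabel = run(data, ziel='income', line0=[20000], grid={'dnn_hidden_units': [(128, 128)]}, loop=3)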
示例#29
def test_PNN_avazu(data, train, test):

    print("\nTesting PNN on avazu dataset...\n")

    results_activation_function = {"auc": [], "logloss": [], "rmse": []}
    results_dropout = {"auc": [], "logloss": [], "rmse": []}
    results_number_of_neurons = {"auc": [], "logloss": [], "rmse": []}

    auc = 0
    logloss = 0
    rmse = 0

    features_labels = train.columns

    sparse_features_labels = features_labels[1:23]
    target_label = features_labels[0]

    dnn_feature_columns = [
        SparseFeat(
            feat,
            vocabulary_size=data[feat].nunique(),
            embedding_dim=4,
        ) for feat in sparse_features_labels
    ]

    feature_names = get_feature_names(dnn_feature_columns)

    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    true_y = test[target_label].values

    print("\t\t-- ACTIVATION FUNCTIONS --\t\t")
    for dnn_activation in dnn_activation_list:
        print("\nTesting {dnn_activation}...".format(
            dnn_activation=dnn_activation))

        # model = PNN(dnn_feature_columns, use_inner=False, use_outter=True, dnn_activation = dnn_activation, task='binary')
        model = PNN(dnn_feature_columns,
                    use_inner=True,
                    use_outter=False,
                    dnn_activation=dnn_activation,
                    task='binary')
        model.compile(
            "adam",
            "binary_crossentropy",
            metrics=['binary_crossentropy'],
        )
        model.fit(
            train_model_input,
            train[target_label].values,
            batch_size=256,
            epochs=10,
            verbose=0,
            validation_split=TEST_PROPORTION,
        )
        pred_y = model.predict(test_model_input, batch_size=256)

        auc = compute_auc(true_y, pred_y)
        logloss = compute_log_loss(true_y, pred_y)
        rmse = compute_rmse(true_y, pred_y)

        results_activation_function["auc"].append(auc)
        results_activation_function["logloss"].append(logloss)
        results_activation_function["rmse"].append(rmse)

    print("\t\t-- DROPOUT RATES --\t\t")
    for dnn_dropout in dnn_dropout_list:
        print("\nTesting {dnn_dropout}...".format(dnn_dropout=dnn_dropout))

        # model = PNN(dnn_feature_columns, use_inner=False, use_outter=True, dnn_dropout = dnn_dropout, task='binary')
        model = PNN(dnn_feature_columns,
                    use_inner=True,
                    use_outter=False,
                    dnn_dropout=dnn_dropout,
                    task='binary')
        model.compile(
            "adam",
            "binary_crossentropy",
            metrics=['binary_crossentropy'],
        )
        model.fit(
            train_model_input,
            train[target_label].values,
            batch_size=256,
            epochs=10,
            verbose=0,
            validation_split=TEST_PROPORTION,
        )
        pred_y = model.predict(test_model_input, batch_size=256)

        auc = compute_auc(true_y, pred_y)
        logloss = compute_log_loss(true_y, pred_y)
        rmse = compute_rmse(true_y, pred_y)

        results_dropout["auc"].append(auc)
        results_dropout["logloss"].append(logloss)
        results_dropout["rmse"].append(rmse)

    print("\t\t-- HIDDEN UNITS --\t\t")
    for dnn_hidden_units in dnn_hidden_units_list:
        print("\nTesting {dnn_hidden_units}...".format(
            dnn_hidden_units=dnn_hidden_units))

        # model = PNN(dnn_feature_columns, use_inner=False, use_outter=True, dnn_hidden_units = dnn_hidden_units, task='binary')
        model = PNN(dnn_feature_columns,
                    use_inner=True,
                    use_outter=False,
                    dnn_hidden_units=dnn_hidden_units,
                    task='binary')
        model.compile(
            "adam",
            "binary_crossentropy",
            metrics=['binary_crossentropy'],
        )
        model.fit(train_model_input,
                  train[target_label].values,
                  batch_size=256,
                  epochs=10,
                  verbose=0,
                  validation_split=TEST_PROPORTION)
        pred_y = model.predict(test_model_input, batch_size=256)

        auc = compute_auc(true_y, pred_y)
        logloss = compute_log_loss(true_y, pred_y)
        rmse = compute_rmse(true_y, pred_y)

        results_number_of_neurons["auc"].append(auc)
        results_number_of_neurons["logloss"].append(logloss)
        results_number_of_neurons["rmse"].append(rmse)

    if PLOT:
        # create_plots("OPNN", "avazu", results_activation_function, "Activation Function", "activation_func", dnn_activation_list)
        # create_plots("OPNN", "avazu", results_dropout, "Dropout Rate", "dropout", dnn_dropout_list)
        # create_plots("OPNN", "avazu", results_number_of_neurons, "Number of Neurons per layer", "nr_neurons", dnn_hidden_units_list)
        create_plots("PNN", "avazu", results_activation_function,
                     "Activation Function", "activation_func",
                     dnn_activation_list)
        create_plots("PNN", "avazu", results_dropout, "Dropout Rate",
                     "dropout", dnn_dropout_list)
        create_plots("PNN", "avazu", results_number_of_neurons,
                     "Number of Neurons per layer", "nr_neurons",
                     dnn_hidden_units_list)
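# Hedged helper sketch (not part of the original snippet): compute_auc, compute_log_loss
# and compute_rmse are used above but not shown; they are assumed to be thin wrappers
# around scikit-learn metrics, roughly as follows.
import numpy as np
from sklearn.metrics import roc_auc_score, log_loss, mean_squared_error

def compute_auc(true_y, pred_y):
    return round(roc_auc_score(true_y, pred_y), 4)

def compute_log_loss(true_y, pred_y):
    return round(log_loss(true_y, pred_y), 4)

def compute_rmse(true_y, pred_y):
    return round(float(np.sqrt(mean_squared_error(true_y, pred_y))), 4)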
示例#30
def _preprocess_movielens(df, **kw):
    multiple_value = kw.get('multiple_value')
    sparse_col = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
    target = ['rating']

    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_col:
        lbe = LabelEncoder()
        df[feat] = lbe.fit_transform(df[feat])

    if not multiple_value:
        # 2.count #unique features for each sparse field
        fixlen_cols = [SparseFeat(feat, df[feat].nunique(), embedding_dim=4) for feat in sparse_col]
        linear_cols = fixlen_cols
        dnn_cols = fixlen_cols
        train, test = train_test_split(df, test_size=0.2)
        ytrue = test[target].values
    else:
        ytrue = df[target].values
        hash_feature = kw.get('hash_feature', False)
        if not hash_feature:
            def split(x):
                key_ans = x.split('|')
                for key in key_ans:
                    if key not in key2index:
                        # Notice : input value 0 is a special "padding",so we do not use 0 to encode valid feature for sequence input
                        key2index[key] = len(key2index) + 1
                return list(map(lambda x: key2index[x], key_ans))

            # preprocess the sequence feature
            key2index = {}
            genres_list = list(map(split, df['genres'].values))
            genres_length = np.array(list(map(len, genres_list)))
            max_len = max(genres_length)
            # Notice : padding=`post`
            genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )
            fixlen_cols = [SparseFeat(feat, df[feat].nunique(), embedding_dim=4) for feat in sparse_col]

            use_weighted_sequence = False
            if use_weighted_sequence:
                varlen_cols = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
                    key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
                                                weight_name='genres_weight')]  # Notice : value 0 is for padding for sequence input feature
            else:
                varlen_cols = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
                    key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
                                                weight_name=None)]  # Notice : value 0 is for padding for sequence input feature

            linear_cols = fixlen_cols + varlen_cols
            dnn_cols = fixlen_cols + varlen_cols

            # generate input data for model
            model_input = {name: df[name] for name in sparse_col}  #
            model_input["genres"] = genres_list
            model_input["genres_weight"] = np.random.randn(df.shape[0], max_len, 1)


        else:
            df[sparse_col] = df[sparse_col].astype(str)

            # 1.Use hashing encoding on the fly for sparse features,and process sequence features
            genres_list = list(map(lambda x: x.split('|'), df['genres'].values))
            genres_length = np.array(list(map(len, genres_list)))
            max_len = max(genres_length)

            # Notice : padding=`post`
            genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', dtype=str, value=0)

            # 2.set hashing space for each sparse field and generate feature config for sequence feature
            fixlen_cols = [
                SparseFeat(feat, df[feat].nunique() * 5, embedding_dim=4, use_hash=True, dtype='string')
                for feat in sparse_col]
            varlen_cols = [
                VarLenSparseFeat(
                    SparseFeat('genres', vocabulary_size=100, embedding_dim=4, use_hash=True, dtype="string"),
                    maxlen=max_len, combiner='mean',
                )]  # Notice : value 0 is for padding for sequence input feature

            linear_cols = fixlen_cols + varlen_cols
            dnn_cols = fixlen_cols + varlen_cols
            feature_names = get_feature_names(linear_cols + dnn_cols)

            # 3.generate input data for model
            model_input = {name: df[name] for name in feature_names}
            model_input['genres'] = genres_list

        train, test = model_input, model_input

    return df, linear_cols, dnn_cols, train, test, target, ytrue
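# Hedged usage sketch (not part of the original function): feed the preprocessed columns
# into DeepFM. The file name "movielens_sample.txt", the regression setup and the
# get_feature_names import path (which varies across deepctr versions) are assumptions.
import pandas as pd
from deepctr.models import DeepFM
from deepctr.inputs import get_feature_names

raw = pd.read_csv("movielens_sample.txt")
df, linear_cols, dnn_cols, train, test, target, ytrue = _preprocess_movielens(raw, multiple_value=False)
feature_names = get_feature_names(linear_cols + dnn_cols)
train_input = {name: train[name] for name in feature_names}

model = DeepFM(linear_cols, dnn_cols, task='regression')
model.compile("adam", "mse", metrics=['mse'])
model.fit(train_input, train[target].values, batch_size=256, epochs=5, verbose=2, validation_split=0.2)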