def plot_nfm():
    """Build an NFM model on a slice of the Criteo features and plot it.

    The data comes from ``read_criteo_data()``; only the first three dense
    features and the first two sparse features are used. The resulting
    model graph is written to ``./imgs/NFM.png`` with layer shapes shown.
    """
    data, dense_features, sparse_features = read_criteo_data()
    dense_subset = dense_features[:3]
    sparse_subset = sparse_features[:2]

    def build_columns():
        # Sparse columns come first, followed by the dense columns.
        columns = [
            SparseFeat(name,
                       vocabulary_size=data[name].nunique(),
                       embedding_dim=4)
            for name in sparse_subset
        ]
        columns.extend(DenseFeat(name, 1) for name in dense_subset)
        return columns

    # The linear part and the DNN part each get their own (identically
    # built) list of feature columns.
    linear_feature_columns = build_columns()
    dnn_feature_columns = build_columns()

    # Instantiate the NFM model and render its graph.
    model = NFM(linear_feature_columns, dnn_feature_columns)
    keras.utils.plot_model(model, to_file="./imgs/NFM.png", show_shapes=True)
def plot_dien():
    """Build a DIEN model from the movie sample data and plot it.

    Reads the tab-separated, header-less file ``data/movie_sample.txt``,
    names its eight columns, declares the feature columns (including the
    ``hist_movie_id`` sequence and the ``neg_hist_movie_id`` negative-sample
    sequence), instantiates DIEN with ``use_neg_sample=True`` and writes the
    model graph to ``./imgs/DIEN.png`` with layer shapes shown.
    """
    # Load the samples; the file has no header row, so name the columns here.
    samples_data = pd.read_csv("data/movie_sample.txt", sep="\t", header=None)
    samples_data.columns = [
        "user_id", "gender", "age", "hist_movie_id", "hist_len", "movie_id",
        "movie_type_id", "label"
    ]

    # Categorical features: each is sized by the largest value seen plus one.
    feature_columns = [
        SparseFeat(name, max(samples_data[name]) + 1, embedding_dim=8)
        for name in ("user_id", "gender", "age", "movie_id", "movie_type_id")
    ]
    feature_columns.append(DenseFeat('hist_len', 1))

    # Both sequence features are sized from the ``movie_id`` column, so the
    # value is computed once and reused.
    movie_vocab_size = max(samples_data["movie_id"]) + 1
    feature_columns += [
        VarLenSparseFeat(seq_name,
                         vocabulary_size=movie_vocab_size,
                         embedding_dim=8,
                         maxlen=50)
        for seq_name in ("hist_movie_id", "neg_hist_movie_id")
    ]

    # Base behavior feature.
    behavior_feature_list = ['movie_id']
    # Behavior sequence feature.
    behavior_seq_feature_list = ['hist_movie_id']
    # Negative-sample sequence feature.
    neg_seq_feature_list = ['neg_hist_movie_id']

    # Instantiate the DIEN model and render its graph.
    model = DIEN(feature_columns,
                 behavior_feature_list,
                 behavior_seq_feature_list,
                 neg_seq_feature_list,
                 use_neg_sample=True)
    keras.utils.plot_model(model, to_file="./imgs/DIEN.png", show_shapes=True)
def plot_ncf():
    """Build an NCF model from the MovieLens-1M ratings and plot it.

    Only ``user_id`` and ``movie_id`` are used as model features. Both are
    label-encoded, and the model graph is written to ``./imgs/NCF.png``
    with layer shapes shown.
    """
    column_names = ['user_id', 'movie_id', 'rating', 'timestamp']
    ratings = pd.read_csv('./data/ml-1m/ratings.dat',
                          sep='::',
                          engine='python',
                          names=column_names)

    # Re-fit the same encoder on each id column in turn.
    encoder = LabelEncoder()
    for id_column in ('user_id', 'movie_id'):
        ratings[id_column] = encoder.fit_transform(ratings[id_column])

    # One sparse feature per id column, sized by its number of distinct ids.
    dnn_feature_columns = [
        SparseFeat(id_column, ratings[id_column].nunique(), 8)
        for id_column in ('user_id', 'movie_id')
    ]

    # Instantiate the NCF model and render its graph.
    model = NCF(dnn_feature_columns)
    keras.utils.plot_model(model, to_file="./imgs/NCF.png", show_shapes=True)
def plot_din():
    """Build a DIN model from the movie sample data and plot it.

    Reads the tab-separated, header-less file ``./data/movie_sample.txt``,
    names its eight columns, declares the feature columns (including the
    ``hist_movie_id`` sequence), instantiates DIN and writes the model graph
    to ``./imgs/DIN.png`` with layer shapes shown.
    """
    # Load the samples; the file has no header row, so name the columns here.
    samples_data = pd.read_csv("./data/movie_sample.txt",
                               sep="\t",
                               header=None)
    samples_data.columns = [
        "user_id", "gender", "age", "hist_movie_id", "hist_len", "movie_id",
        "movie_type_id", "label"
    ]

    # Categorical features: each is sized by the largest value seen plus one.
    categorical_names = ('user_id', 'gender', 'age', 'movie_id',
                         'movie_type_id')
    feature_columns = [
        SparseFeat(name, max(samples_data[name]) + 1, embedding_dim=8)
        for name in categorical_names
    ]
    feature_columns.append(DenseFeat('hist_len', 1))

    # The history sequence is sized from the ``movie_id`` column.
    feature_columns.append(
        VarLenSparseFeat('hist_movie_id',
                         vocabulary_size=max(samples_data["movie_id"]) + 1,
                         embedding_dim=8,
                         maxlen=50))

    # Base behavior feature.
    behavior_feature_list = ['movie_id']
    # Behavior sequence feature.
    behavior_seq_feature_list = ['hist_movie_id']

    # Instantiate the DIN model and render its graph.
    model = DIN(feature_columns, behavior_feature_list,
                behavior_seq_feature_list)
    keras.utils.plot_model(model, to_file="./imgs/DIN.png", show_shapes=True)
def plot_pnn():
    """Build a PNN model on a slice of the Criteo features and plot it.

    The data comes from ``read_criteo_data()``; only the first three dense
    and the first three sparse features are used. The resulting model graph
    is written to ``./imgs/PNN.png`` with layer shapes shown.
    """
    data, dense_features, sparse_features = read_criteo_data()
    dense_subset, sparse_subset = dense_features[:3], sparse_features[:3]

    # Tag each feature with its column type: sparse columns first, then
    # dense columns.
    dnn_feature_columns = []
    for name in sparse_subset:
        dnn_feature_columns.append(
            SparseFeat(name,
                       vocabulary_size=data[name].nunique(),
                       embedding_dim=4))
    for name in dense_subset:
        dnn_feature_columns.append(DenseFeat(name, 1))

    # Instantiate the PNN model and render its graph.
    model = PNN(dnn_feature_columns)
    keras.utils.plot_model(model, to_file="./imgs/PNN.png", show_shapes=True)
if __name__ == "__main__":
    # Load the Criteo sample data.
    data = pd.read_csv('../data/criteo_sample.txt')

    # Split the columns into dense and sparse features: names containing
    # 'I' are treated as dense, names containing 'C' as sparse.
    columns = data.columns.values
    dense_features = [name for name in columns if 'I' in name]
    sparse_features = [name for name in columns if 'C' in name]

    # Simple preprocessing, then re-attach the label column.
    train_data = data_process(data, dense_features, sparse_features)
    train_data['label'] = data['label']

    # Group the features into a linear part and a DNN part (choose per
    # scenario) and tag each one with SparseFeat / DenseFeat. Both groups
    # are built the same way here: sparse columns first, then dense ones.
    linear_feature_columns = [
        SparseFeat(name,
                   vocabulary_size=data[name].nunique(),
                   embedding_dim=4)
        for name in sparse_features
    ]
    linear_feature_columns.extend(
        DenseFeat(name, 1) for name in dense_features)

    dnn_feature_columns = [
        SparseFeat(name,
                   vocabulary_size=data[name].nunique(),
                   embedding_dim=4)
        for name in sparse_features
    ]
    dnn_feature_columns.extend(
        DenseFeat(name, 1) for name in dense_features)

    # Build the xDeepFM model.
# Build the inputs in the format the DIEN model expects. Compared with DIN
# there is one extra column: the negatively sampled behavior history.
X_train = {
    **{key: np.array(X[key]) for key in ("user_id", "gender", "age")},
    # The history columns hold comma-separated ids; parse each row into a
    # list of ints before stacking into an array.
    **{
        key: np.array([[int(token) for token in row.split(',')]
                       for row in X[key]])
        for key in ("hist_movie_id", "neg_hist_movie_id")
    },
    **{
        key: np.array(X[key])
        for key in ("hist_len", "movie_id", "movie_type_id")
    },
}
y_train = np.array(y)

# Feature declarations: each categorical feature is sized by the largest
# value seen plus one.
feature_columns = [
    SparseFeat(key, max(samples_data[key]) + 1, embedding_dim=8)
    for key in ('user_id', 'gender', 'age', 'movie_id', 'movie_type_id')
]
feature_columns.append(DenseFeat('hist_len', 1))
# Both sequence features are sized from the ``movie_id`` column.
feature_columns.extend(
    VarLenSparseFeat(key,
                     vocabulary_size=max(samples_data["movie_id"]) + 1,
                     embedding_dim=8,
                     maxlen=50)
    for key in ('hist_movie_id', 'neg_hist_movie_id'))

# Base behavior feature.
behavior_feature_list = ['movie_id']
# Behavior sequence feature.
behavior_seq_feature_list = ['hist_movie_id']
# Negative-sample sequence feature.
neg_seq_feature_list = ['neg_hist_movie_id']
if __name__ == "__main__":
    # Load the Criteo sample data.
    data = pd.read_csv('./data/criteo_sample.txt')

    # Split the columns into dense and sparse features: names containing
    # 'I' are treated as dense, names containing 'C' as sparse.
    columns = data.columns.values
    dense_features = [name for name in columns if 'I' in name]
    sparse_features = [name for name in columns if 'C' in name]

    # Simple preprocessing, then re-attach the label column.
    train_data = data_process(data, dense_features, sparse_features)
    train_data['label'] = data['label']

    # Group the features into a linear part and a DNN part (choose per
    # scenario) and tag each one with SparseFeat / DenseFeat. Both groups
    # are built the same way here: sparse columns first, then dense ones.
    linear_feature_columns = [
        SparseFeat(name,
                   vocabulary_size=data[name].nunique(),
                   embedding_dim=4)
        for name in sparse_features
    ]
    linear_feature_columns.extend(
        DenseFeat(name, 1) for name in dense_features)

    dnn_feature_columns = [
        SparseFeat(name,
                   vocabulary_size=data[name].nunique(),
                   embedding_dim=4)
        for name in sparse_features
    ]
    dnn_feature_columns.extend(
        DenseFeat(name, 1) for name in dense_features)

    # Build the DCN model. Note that ``history`` holds the model object.
    history = DCN(linear_feature_columns, dnn_feature_columns)
    history.summary()
    history.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["binary_crossentropy",
                 tf.keras.metrics.AUC(name='auc')],
    )

    # Convert the input data into dict form for the model.
return model if __name__ == "__main__": # 读取数据,NCF使用的特征只有user_id和item_id rnames = ['user_id','movie_id','rating','timestamp'] data = pd.read_csv('./data/ml-1m/ratings.dat', sep='::', engine='python', names=rnames) lbe = LabelEncoder() data['user_id'] = lbe.fit_transform(data['user_id']) data['movie_id'] = lbe.fit_transform(data['movie_id']) train_data = data[['user_id', 'movie_id']] train_data['label'] = data['rating'] dnn_feature_columns = [SparseFeat('user_id', train_data['user_id'].nunique(), 8), SparseFeat('movie_id', train_data['movie_id'].nunique(), 8)] # 构建FM模型 history = NCF(dnn_feature_columns) history.summary() # 因为数据目前只有用户点击的数据,没有用户未点击的movie,所以这里不能用于做ctr预估 # 如果需要做ctr预估需要给用户点击和未点击的movie打标签,这里就先预测用户评分 history.compile(optimizer="adam", loss="mse", metrics=['mae']) # 将输入数据转化成字典的形式输入 # 将数据转换成字典的形式,用于Input()层对应 train_model_input = {name: train_data[name] for name in ['user_id', 'movie_id', 'label']} # 模型训练 history.fit(train_model_input, train_data['label'].values,