def test_WDL(sparse_feature_num, wide_feature_num):
    """Smoke-test a WDL model built from a single shared set of feature columns.

    The generated feature columns are used for both WDL feature-column
    arguments, and the model is then handed to ``check_model``.

    Args:
        sparse_feature_num: Passed to ``get_test_data`` as both its second
            and third positional argument.
        wide_feature_num: Accepted for the test parametrisation; not read by
            the body.
    """
    features, labels, columns = get_test_data(
        SAMPLE_SIZE, sparse_feature_num, sparse_feature_num
    )
    wdl = WDL(columns, columns, dnn_hidden_units=[32, 32], dnn_dropout=0.5)
    check_model(wdl, "WDL", features, labels)
def test_WDL(sparse_feature_num, dense_feature_num):
    """Smoke-test WDL on data generated with ``hash_flag=True``.

    The test returns immediately, without building a model, when the
    installed TensorFlow version is 2.0.0 or newer.

    Args:
        sparse_feature_num: Forwarded to ``get_test_data`` by keyword.
        dense_feature_num: Forwarded to ``get_test_data`` by keyword.
    """
    tf_is_v2_or_newer = version.parse(tf.__version__) >= version.parse('2.0.0')
    if tf_is_v2_or_newer:
        return

    inputs, labels, columns = get_test_data(
        SAMPLE_SIZE,
        sparse_feature_num=sparse_feature_num,
        dense_feature_num=dense_feature_num,
        hash_flag=True,
    )
    wdl = WDL(columns, columns, dnn_hidden_units=[4, 4], dnn_dropout=0.5)
    check_model(wdl, "WDL", inputs, labels)
def test_WDL():
    """Train, save and reload a small WDL model on random data.

    Steps exercised, in order: compile + one epoch of ``fit`` with a 50%
    validation split, ``save_weights``/``load_weights`` round trip, and
    ``save_model``/``load_model`` round trip (using the module-level
    ``custom_objects``). Progress is reported with ``print``.

    Side effects: writes ``WDL_weights.h5`` and ``WDL.h5`` to the current
    working directory.
    """
    name = "WDL"
    sample_size = 64
    feature_dim_dict = {
        'sparse': {'sparse_1': 2, 'sparse_2': 5, 'sparse_3': 10},
        'dense': ['dense_1', 'dense_2', 'dense_3'],
    }

    sparse_input = [
        np.random.randint(0, dim, sample_size)
        for dim in feature_dim_dict['sparse'].values()
    ]
    # Use `_` as the loop variable: naming it `name` shadows the model name
    # above and, on Python 2 (where list-comprehension variables leak into
    # the enclosing scope), silently replaces it with the last dense feature
    # name, corrupting the messages and file names below.
    dense_input = [
        np.random.random(sample_size) for _ in feature_dim_dict['dense']
    ]
    y = np.random.randint(0, 2, sample_size)
    x = sparse_input + dense_input

    model = WDL(feature_dim_dict, feature_dim_dict,
                hidden_size=[32, 32], keep_prob=0.5)
    model.compile('adam', 'binary_crossentropy',
                  metrics=['binary_crossentropy'])
    # The same feature dict is given to WDL twice, and the input list is
    # doubled to match -- presumably one copy per feature dict; confirm
    # against the WDL signature in use.
    model.fit(x + x, y, batch_size=100, epochs=1, validation_split=0.5)
    print(name + " test train valid pass!")

    model.save_weights(name + '_weights.h5')
    model.load_weights(name + '_weights.h5')
    print(name + "test save load weight pass!")

    save_model(model, name + '.h5')
    model = load_model(name + '.h5', custom_objects)
    print(name + "test save load model pass!")

    print(name + " test pass!")
def test_WDL(sparse_feature_num, wide_feature_num):
    """Smoke-test WDL with separate deep and wide ``SingleFeat`` feature sets.

    Args:
        sparse_feature_num: Number of sparse AND dense features created for
            the deep side.
        wide_feature_num: Number of sparse AND dense features created for
            the wide side.
    """
    model_name = "WDL"
    sample_size = SAMPLE_SIZE

    # Sparse features get a random dimension in [1, 10); dense features use 0.
    deep_feats = {
        "sparse": [
            SingleFeat('sparse_' + str(idx), np.random.randint(1, 10))
            for idx in range(sparse_feature_num)
        ],
        'dense': [
            SingleFeat('dense_' + str(idx), 0)
            for idx in range(sparse_feature_num)
        ],
    }
    wide_feats = {
        "sparse": [
            SingleFeat('sparsewide_' + str(idx), np.random.randint(1, 10))
            for idx in range(wide_feature_num)
        ],
        'dense': [
            SingleFeat('densewide_' + str(idx), 0)
            for idx in range(wide_feature_num)
        ],
    }

    def random_inputs(feats):
        """Return random arrays for ``feats``: sparse columns first, then dense."""
        sparse_cols = [
            np.random.randint(0, feat.dimension, sample_size)
            for feat in feats['sparse']
        ]
        dense_cols = [np.random.random(sample_size) for _ in feats['dense']]
        return sparse_cols + dense_cols

    x = random_inputs(deep_feats)
    x_wide = random_inputs(wide_feats)
    y = np.random.randint(0, 2, sample_size)

    model = WDL(deep_feats, wide_feats,
                dnn_hidden_units=[32, 32], dnn_dropout=0.5)
    check_model(model, model_name, x + x_wide, y)
def test_WDL(sparse_feature_num, wide_feature_num):
    """Smoke-test WDL with dict-based deep and wide feature descriptions.

    Sparse features are stored as ``{feature_name: dimension}`` and dense
    features as a list of names.

    Args:
        sparse_feature_num: Number of sparse AND dense features created for
            the deep side.
        wide_feature_num: Number of sparse AND dense features created for
            the wide side.
    """
    model_name = "WDL"
    sample_size = 64

    # Each sparse feature gets a random dimension in [1, 10).
    deep_dims = {
        "sparse": {
            'sparse_' + str(idx): np.random.randint(1, 10)
            for idx in range(sparse_feature_num)
        },
        'dense': ['dense_' + str(idx) for idx in range(sparse_feature_num)],
    }
    wide_dims = {
        "sparse": {
            'sparsewide_' + str(idx): np.random.randint(1, 10)
            for idx in range(wide_feature_num)
        },
        'dense': ['densewide_' + str(idx) for idx in range(wide_feature_num)],
    }

    deep_sparse = [
        np.random.randint(0, dim, sample_size)
        for dim in deep_dims['sparse'].values()
    ]
    deep_dense = [np.random.random(sample_size) for _ in deep_dims['dense']]
    wide_sparse = [
        np.random.randint(0, dim, sample_size)
        for dim in wide_dims['sparse'].values()
    ]
    wide_dense = [np.random.random(sample_size) for _ in wide_dims['dense']]
    y = np.random.randint(0, 2, sample_size)

    x = deep_sparse + deep_dense
    x_wide = wide_sparse + wide_dense

    model = WDL(deep_dims, wide_dims, hidden_size=[32, 32], keep_prob=0.5)
    check_model(model, model_name, x + x_wide, y)
def custom_model():
    """Build a binary-task WDL model over 26 sparse and 13 dense features.

    Sparse features are named ``C1``..``C26`` and declared as hashed string
    columns (vocabulary size 10000, embedding dim 4); dense features are
    named ``I1``..``I13`` with dimension 1. The same column list is used
    for both WDL feature-column arguments.

    Returns:
        The constructed (uncompiled) WDL model.
    """
    sparse_names = ["C" + str(n) for n in range(1, 27)]
    dense_names = ["I" + str(n) for n in range(1, 14)]

    columns = [
        SparseFeat(
            col,
            vocabulary_size=10000,
            embedding_dim=4,
            dtype="string",
            use_hash=True,
        )
        for col in sparse_names
    ]
    columns.extend(DenseFeat(col, 1) for col in dense_names)

    return WDL(columns, columns, task="binary")
def find_l2_reg_embedding(linear_feature_columns, dnn_feature_columns,
                          train_model_input, train, test_model_input, test):
    """Train one WDL model per candidate ``l2_reg_embedding`` and tabulate metrics.

    Candidates are read from ``config.param_rand['l2_reg_embedding']`` and
    the epoch count from ``config.model_epoch['epoch']``. Labels are taken
    from the module-level ``target`` column(s) of ``train`` / ``test``.

    Args:
        linear_feature_columns: Feature columns for the linear part of WDL.
        dnn_feature_columns: Feature columns for the DNN part of WDL.
        train_model_input: Model input used for fitting.
        train: Training frame; indexed with ``target`` for the labels.
        test_model_input: Model input used for prediction.
        test: Test frame; indexed with ``target`` for the labels.

    Returns:
        A DataFrame with one row per candidate and the columns
        ``l2_reg_embedding``, ``RMSE``, ``MAE``, ``MSE`` and ``AUC``
        (metrics rounded to 3 decimals).
    """
    cols = ['l2_reg_embedding', 'RMSE', 'MAE', 'MSE', 'AUC']
    candidates = config.param_rand['l2_reg_embedding']
    df_result = pd.DataFrame(columns=cols, index=range(len(candidates)))

    for i, x in enumerate(candidates):
        # TODO: also sweep dnn_hidden_units (original note: "add
        # dnn_hidden_units as b later").
        model = WDL(
            linear_feature_columns,
            dnn_feature_columns,
            dnn_hidden_units=(2, 2),
            l2_reg_linear=0.1,
            l2_reg_embedding=x,
            l2_reg_dnn=0,
            init_std=0.0001,
            seed=1024,
            task='binary')
        model.compile("adam", "mse", metrics=['mse'])
        model.fit(
            train_model_input,
            train[target].values,
            batch_size=256,
            epochs=config.model_epoch['epoch'],
            verbose=2,
            validation_split=0.2,
        )

        pred_ans = model.predict(test_model_input, batch_size=256)
        y_true = test[target].values
        auc = roc_auc_score(y_true, pred_ans)
        mse = mean_squared_error(y_true, pred_ans)

        # Write through a single .loc[row, col] call. The previous
        # `df_result.loc[i].COL = value` form assigns to an intermediate
        # Series, which pandas does not guarantee to be a view of the frame
        # (and never is under Copy-on-Write), so results could be lost.
        df_result.loc[i, 'l2_reg_embedding'] = x
        df_result.loc[i, 'RMSE'] = np.round(math.sqrt(mse), 3)
        df_result.loc[i, 'MAE'] = np.round(
            mean_absolute_error(y_true, pred_ans), 3)
        df_result.loc[i, 'MSE'] = np.round(mse, 3)
        df_result.loc[i, 'AUC'] = np.round(auc, 3)

    return df_result
def widendeep_model(linear_feature_columns, dnn_feature_columns,
                    train_model_input, train, test_model_input, test):
    """Train a WDL model from ``config.widendeep_att``, save it, and report metrics.

    Hyper-parameters (``dnn_hidden_units``, ``dnn_dropout``,
    ``dnn_activation``, ``seed``, ``task``) come from
    ``config.widendeep_att``; the epoch count comes from
    ``config.model_epoch['epoch']``. Labels are taken from the module-level
    ``target`` column(s) of ``train`` / ``test``.

    Side effects: writes the trained model to ``saved_widendeep.h5``.

    Args:
        linear_feature_columns: Feature columns for the linear part of WDL.
        dnn_feature_columns: Feature columns for the DNN part of WDL.
        train_model_input: Model input used for fitting.
        train: Training frame; indexed with ``target`` for the labels.
        test_model_input: Model input used for prediction.
        test: Test frame; indexed with ``target`` for the labels.

    Returns:
        A single-row DataFrame with the columns ``model``, ``RMSE``,
        ``MAE``, ``MSE``, ``AUC`` and ``score``. ``score`` is created but
        never populated by this function.
    """
    cols = ['model', 'RMSE', 'MAE', 'MSE', 'AUC', 'score']
    df_result = pd.DataFrame(columns=cols, index=range(1))

    # l2_reg_linear, l2_reg_embedding, l2_reg_dnn and init_std are
    # deliberately not passed here, so WDL's own defaults apply.
    model = WDL(
        linear_feature_columns,
        dnn_feature_columns,
        dnn_hidden_units=config.widendeep_att["dnn_hidden_units"],
        dnn_dropout=config.widendeep_att['dnn_dropout'],
        dnn_activation=config.widendeep_att['dnn_activation'],
        seed=config.widendeep_att["seed"],
        task=config.widendeep_att["task"])
    model.compile("adam", "mse", metrics=['mse'])
    model.fit(train_model_input,
              train[target].values,
              batch_size=256,
              epochs=config.model_epoch['epoch'],
              verbose=2,
              validation_split=0.2)

    pred_ans = model.predict(test_model_input, batch_size=256)
    save_model(model, 'saved_widendeep.h5')

    y_true = test[target].values
    auc = roc_auc_score(y_true, pred_ans)
    mse = mean_squared_error(y_true, pred_ans)

    # Write through a single .loc[row, col] call. The previous
    # `df_result.loc[0].COL = value` form assigns to an intermediate Series,
    # which pandas does not guarantee to be a view of the frame (and never
    # is under Copy-on-Write), so results could be lost.
    df_result.loc[0, 'model'] = "Wide and Deep"
    df_result.loc[0, 'RMSE'] = np.round(math.sqrt(mse), 3)
    df_result.loc[0, 'MAE'] = np.round(
        mean_absolute_error(y_true, pred_ans), 3)
    df_result.loc[0, 'MSE'] = np.round(mse, 3)
    df_result.loc[0, 'AUC'] = np.round(auc, 3)
    return df_result
# Script section: label-encode the sparse features, fit a WDL regression
# model on `rating`, plot the training loss and print the test RMSE.
target = ['rating']

# Label-encode the values of every sparse feature.
for feature in sparse_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature])

# Count the number of distinct values in each feature.
fixlen_feature_columns = [
    SparseFeat(feature, data[feature].nunique())
    for feature in sparse_features
]
# The linear and DNN parts share the same feature columns.
linear_feature_columns = dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(
    linear_feature_columns + dnn_feature_columns
)

# Split the dataset into training and test sets.
train, test = train_test_split(data, test_size=0.2)
train_model_input = {col: train[col].values for col in feature_names}
test_model_input = {col: test[col].values for col in feature_names}

# Train with WDL.
model = WDL(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile("adam", "mse", metrics=['mse'])
history = model.fit(
    train_model_input,
    train[target].values,
    batch_size=256,
    epochs=100,
    verbose=True,
    validation_split=0.2,
)

# Plot the training loss, one point per recorded epoch.
plt.figure()
x = range(len(history.history['loss']))
plt.plot(x, history.history['loss'])
plt.title('loss')

# Predict with WDL.
pred_ans = model.predict(test_model_input, batch_size=256)

# Report RMSE (square root of the MSE rounded to 4 decimals).
mse = round(mean_squared_error(test[target].values, pred_ans), 4)
rmse = mse ** 0.5
print("test RMSE", rmse)
dnn_feature_columns = fixlen_feature_columns linear_feature_columns = fixlen_feature_columns feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) # 3.generate input data for model train, test = train_test_split(data, test_size=0.2) train_model_input = {name: train[name] for name in feature_names} test_model_input = {name: test[name] for name in feature_names} # 4.Define Model,train,predict and evaluate model = WDL(linear_feature_columns, dnn_feature_columns, task='binary', dnn_hidden_units=(400, 400, 400), dnn_dropout=0.5) model.compile( "adam", "binary_crossentropy", metrics=['binary_crossentropy'], ) history = model.fit( train_model_input, train[target].values, batch_size=256, epochs=10, verbose=2, validation_split=0.2,
# Script section: build sparse feature columns, fit a WDL regression model
# for one epoch and print the test RMSE.
fix_len_feature_columns = [
    SparseFeat(feature, data[feature].nunique())
    for feature in sparse_features
]
# The linear and DNN parts share the same feature columns.
linear_feature_columns = dnn_feature_columns = fix_len_feature_columns
feature_names = get_feature_names(
    linear_feature_columns + dnn_feature_columns
)

# Split the dataset into training and test sets.
train, test = train_test_split(data, test_size=0.2)
train_set = {col: train[col].values for col in feature_names}
test_set = {col: test[col].values for col in feature_names}

# Train with WDL.
model = WDL(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile('adam', 'mse', metrics=['mse'])
history = model.fit(
    train_set,
    train[target].values,
    batch_size=256,
    epochs=1,
    verbose=True,
    validation_split=0.2,
)

# Predict with WDL.
pred_ans = model.predict(test_set, batch_size=256)

# Report RMSE (square root of the MSE rounded to 4 decimals).
mse = round(mean_squared_error(test[target].values, pred_ans), 4)
rmse = mse**0.5
print(f'test rmse: {rmse}')
# Pick the network architecture by name. Every variant is a binary
# classifier using `emb_dim`-sized embeddings; `model` is left unassigned
# when `model_type` matches none of the names below.
if model_type == "xDeepFM":
    model = xDeepFM(
        linear_feature_columns,
        dnn_feature_columns,
        task="binary",
        embedding_size=emb_dim,
        dnn_hidden_units=[400, 400],
        cin_layer_size=[200, 200, 200],
    )
elif model_type == "WDL":
    model = WDL(
        linear_feature_columns,
        dnn_feature_columns,
        task="binary",
        embedding_size=emb_dim,
        dnn_hidden_units=[1024, 512, 256],
    )
elif model_type == "DCN":
    # DCN takes only the DNN feature columns.
    model = DCN(
        dnn_feature_columns,
        task="binary",
        embedding_size=emb_dim,
        dnn_hidden_units=[1024, 1024],
        cross_num=6,
    )

# Pick the optimizer by name; the Adagrad class itself (not an instance)
# is bound here.
if opt == "adagrad":
    optimizer = Adagrad
for feat in sparse_features: lbe = LabelEncoder() data[feat] = lbe.fit_transform(data[feat]) # 2.count #unique features for each sparse field sparse_feature_dim = { feat: data[feat].nunique() for feat in sparse_features } # 3.generate input data for model model_input = [data[feat].values for feat in sparse_feature_dim] if mode == 'train': # 4.Define Model,compile and train model = WDL({ "sparse": sparse_feature_dim, "dense": [] }, final_activation='sigmoid') model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy']) filepath = 'model_save/wdl_sample-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5' checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min') history = model.fit(
length_name='seq_length') ] behavior_feature_list = ['itemId', 'category'] if sys.argv[1] == 'DeepFM_UDG': model = DeepFM_UDG(linear_feature_columns, dnn_feature_columns, untrainable_features_columns, (200, 80), uid_feature_name=udg_features, udg_embedding_size=int(sys.argv[5])) elif sys.argv[1] == 'DeepFM': model = DeepFM(linear_feature_columns, dnn_feature_columns, [], (200, 80)) elif sys.argv[1] == 'PNN_UDG': model = PNN_UDG(dnn_feature_columns, untrainable_features_columns, (200, 80), uid_feature_name=udg_features, udg_embedding_size=int(sys.argv[5])) elif sys.argv[1] == 'PNN': model = PNN(dnn_feature_columns, untrainable_features_columns, (200, 80)) elif sys.argv[1] == 'WDL': model = WDL(linear_feature_columns, dnn_feature_columns, [], (200, 80)) elif sys.argv[1] == 'WDL_UDG': model = WDL_UDG(linear_feature_columns, dnn_feature_columns, untrainable_features_columns, (200, 80), uid_feature_name=udg_features, udg_embedding_size=int(sys.argv[5])) elif sys.argv[1] == 'DIEN': model = DIEN(fixlen_feature_columns, behavior_feature_list, dnn_hidden_units=[200, 80], dnn_dropout=0, gru_type="AUGRU", use_negsampling=True) elif sys.argv[1] == 'DIEN_UDG': model = DIEN_UDG(fixlen_feature_columns, untrainable_features_columns, behavior_feature_list, dnn_hidden_units=[200, 80], dnn_dropout=0, gru_type="AUGRU", use_negsampling=True, uid_feature_name=udg_features, udg_embedding_size=int(sys.argv[5])) elif sys.argv[1] == 'DIN': model = DIN(fixlen_feature_columns, behavior_feature_list, dnn_hidden_units=[200, 80], dnn_dropout=0) elif sys.argv[1] == 'DIN_UDG': model = DIN_UDG(fixlen_feature_columns, untrainable_features_columns, behavior_feature_list, dnn_hidden_units=[200, 80], dnn_dropout=0, uid_feature_name=udg_features, udg_embedding_size=int(sys.argv[5])) if sys.argv[4] == 'focal': model.compile("adam", loss=focal_loss, metrics=['binary_crossentropy'], ) else:
train_model_input['genres'] = genres_list[:len(train), :] test_model_input['genres'] = genres_list[len(train):, :] target = ['rating'] # callback from tensorflow.keras.callbacks import EarlyStopping, TensorBoard callbacks = [EarlyStopping(monitor='val_loss', patience=3, min_delta=1e-2)] # 6,建立模型 model = WDL( linear_feature_columns, dnn_feature_columns, task='regression', l2_reg_linear=1e-5, l2_reg_embedding=1e-5, l2_reg_dnn=0.01, init_std=0.0001, dnn_hidden_units=(256, 128), seed=1024, dnn_dropout=0, dnn_activation='relu', ) model.summary() # 可以进行调优 from tensorflow.keras.optimizers import Adam optmizer = Adam(1e-4) model.compile( optmizer, "mse", metrics=['mse'],