def test_long_dense_vector():
    """Smoke-test DeepFM with a 5-dim dense feature alongside two sparse ids."""
    feature_columns = [
        SparseFeat('user_id', 4, ),
        SparseFeat('item_id', 5, ),
        DenseFeat("pic_vec", 5),
    ]
    fixlen_feature_names = get_feature_names(feature_columns)

    # Three toy samples.
    sample_data = {
        'user_id': np.array([[1], [0], [1]]),
        'item_id': np.array([[3], [2], [1]]),
        'pic_vec': np.array([[0.1, 0.5, 0.4, 0.3, 0.2],
                             [0.1, 0.5, 0.4, 0.3, 0.2],
                             [0.1, 0.5, 0.4, 0.3, 0.2]]),
    }
    label = np.array([1, 0, 1])
    model_input = [sample_data[name] for name in fixlen_feature_names]

    # The dense column is excluded from the DNN-side columns (feature_columns[:-1]).
    model = DeepFM(feature_columns, feature_columns[:-1])
    model.compile('adagrad', 'binary_crossentropy')
    model.fit(model_input, label)
def test_long_dense_vector():
    """Train DeepFM on three hand-made rows that include a long dense vector."""
    # Build the feature columns.
    user_col = SparseFeat('user_id', 4, )
    item_col = SparseFeat('item_id', 5, )
    pic_col = DenseFeat("pic_vec", 5)
    feature_columns = [user_col, item_col, pic_col]
    fixlen_feature_names = get_feature_names(feature_columns)

    # Build the samples.
    user_id = np.array([[1], [0], [1]])
    item_id = np.array([[3], [2], [1]])
    pic_vec = np.array([[0.1, 0.5, 0.4, 0.3, 0.2],
                        [0.1, 0.5, 0.4, 0.3, 0.2],
                        [0.1, 0.5, 0.4, 0.3, 0.2]])
    label = np.array([1, 0, 1])
    input_dict = {'user_id': user_id, 'item_id': item_id, 'pic_vec': pic_vec}
    model_input = [input_dict[name] for name in fixlen_feature_names]

    # Create the model; the dense column is left out of the DNN feature list.
    model = DeepFM(feature_columns, feature_columns[:-1])
    # model.summary()
    # tf.keras.utils.plot_model(model, "test_compu")

    # Train.
    model.compile('adagrad', 'binary_crossentropy')
    model.fit(model_input, label)
def run_deepfm_model():
    """Train a binary DeepFM on prepared data.

    Returns (predictions, true labels, rounded AUC, model tag).
    """
    (train, test, train_model_input, test_model_input,
     dnn_feature_columns, linear_feature_columns,
     feature_names, target) = read_data_as_model()

    # Define, train, predict and evaluate.
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
    model.compile("adam", "binary_crossentropy",
                  metrics=['binary_crossentropy'])
    model.fit(train_model_input, train[target].values,
              batch_size=256, epochs=10, verbose=2, validation_split=0.2)

    pred_ans = model.predict(test_model_input, batch_size=256)
    true_y = test[target].values
    auc = round(roc_auc_score(true_y, pred_ans), 4)
    print("test LogLoss", round(log_loss(true_y, pred_ans), 4))
    print("test AUC", auc)
    return pred_ans, true_y, auc, 'deepfm'
def test_DeepFM(use_fm, hidden_size, sparse_feature_num):
    """Train / save-load round-trip test for the legacy dict-API DeepFM."""
    model_name = "DeepFM"
    sample_size = 64

    # One random vocabulary size per sparse field; dense fields are names only.
    # (The dense count mirrors sparse_feature_num, as in the original harness.)
    feature_dim_dict = {"sparse": {}, 'dense': []}
    for i in range(sparse_feature_num):
        feature_dim_dict["sparse"]["sparse" + '_' + str(i)] = np.random.randint(1, 10)
    for i in range(sparse_feature_num):
        feature_dim_dict['dense'].append("dense" + '_' + str(i))

    sparse_input = [np.random.randint(0, dim, sample_size)
                    for dim in feature_dim_dict['sparse'].values()]
    dense_input = [np.random.random(sample_size)
                   for _ in feature_dim_dict['dense']]
    y = np.random.randint(0, 2, sample_size)
    x = sparse_input + dense_input

    model = DeepFM(feature_dim_dict, use_fm=use_fm,
                   hidden_size=hidden_size, keep_prob=0.5, )
    model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
    model.fit(x, y, batch_size=100, epochs=1, validation_split=0.5)
    print(model_name + " test train valid pass!")

    model.save_weights(model_name + '_weights.h5')
    model.load_weights(model_name + '_weights.h5')
    print(model_name + " test save load weight pass!")

    save_model(model, model_name + '.h5')
    model = load_model(model_name + '.h5', custom_objects)
    print(model_name + " test save load model pass!")
    print(model_name + " test pass!")
def train_deepFM(): k = featureengineer.k #缺失值填充+编码处理 data,appsnum, tags_nums = trainmodel.data,trainmodel.appsnum,trainmodel.tags_nums data[trainmodel.sparse_features] = data[trainmodel.sparse_features].fillna('-1', ) for feat in trainmodel.dense_features: data[feat].fillna(data[feat].dropna().mean(), inplace=True) for feat in trainmodel.sparse_features: data[feat] = data[feat].apply(lambda x:str(x)) lbe = LabelEncoder() data[feat] = lbe.fit_transform(data[feat]) mms = MinMaxScaler(feature_range=(0, 1)) data[trainmodel.dense_features] = mms.fit_transform(data[trainmodel.dense_features]) #数据格式转换 fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max()+1, embedding_dim=8) for i, feat in enumerate(trainmodel.sparse_features)] + \ [DenseFeat(feat, 1, ) for feat in trainmodel.dense_features] lgbOut_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max()+1, embedding_dim=1) for i, feat in enumerate(trainmodel.lgbOut_Features)] key2index_len = {'applist': appsnum+1, 'new_tag': tags_nums} varlen_features = [VarLenSparseFeat('%s' % i, vocabulary_size=key2index_len[i], maxlen=k, embedding_dim=8, combiner='mean',weight_name=None) for i in trainmodel.var_features] dnn_feature_columns = fixlen_feature_columns + varlen_features linear_feature_columns = fixlen_feature_columns + varlen_features + lgbOut_feature_columns feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) sparse_dense_features = trainmodel.sparse_features + trainmodel.dense_features + trainmodel.lgbOut_Features train, test = train_test_split(data, test_size=0.2) train_model_input = {name: train[name] for name in sparse_dense_features} test_model_input = {name: test[name] for name in sparse_dense_features} for x in trainmodel.var_features: if x == 'applist': train_model_input[x] = np.array(train[x].tolist()) test_model_input[x] = np.array(test[x].tolist()) if x == 'new_tag': train_model_input[x] = np.array(train[x].tolist())-appsnum test_model_input[x] = 
np.array(test[x].tolist())-appsnum # 模型 model = DeepFM(linear_feature_columns=linear_feature_columns, dnn_feature_columns=dnn_feature_columns, dnn_hidden_units=(50, 30, 30), l2_reg_linear=0.001, l2_reg_embedding=0.001, l2_reg_dnn=0, init_std=0.0001, seed=1024, dnn_dropout=0.1, dnn_activation='relu', dnn_use_bn=True, task='binary') model.compile("adam", "binary_crossentropy",metrics=['AUC'], ) history = model.fit(train_model_input, train['target'].values, batch_size=256, epochs=1, verbose=2, validation_split=0.2, ) pred_ans = model.predict(test_model_input, batch_size=256) print("test AUC", round(roc_auc_score(test['target'].values, pred_ans), 4))
def main(dataPath, dataPath_val, batch_size):
    """Train a binary DeepFM from CSV shards via generators, checkpointing the
    best weights on validation loss."""
    # Take every 5th CSV file from each directory.
    files = glob.glob(dataPath + "/*.csv")[::5]
    files_val = glob.glob(dataPath_val + "/*.csv")[::5]

    # Count examples so epoch lengths can be derived.
    nexs = get_total_examples(files)
    print("Number of training examples: ", nexs)
    nexs_val = get_total_examples(files_val)
    print("Number of validation examples: ", nexs_val)

    # Data generators expose the feature-column definitions.
    train_gen = DataGenerator(files, nexs, batch_size=batch_size)
    val_gen = DataGenerator(files_val, nexs_val, batch_size=batch_size)
    linear_feature_columns = train_gen.linear_feature_columns
    dnn_feature_columns = train_gen.dnn_feature_columns

    # Define, compile and train the model.
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
    optimizer = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999,
                                      decay=0.0)
    model.compile(optimizer, "binary_crossentropy",
                  metrics=['binary_crossentropy', auroc])

    pbar = ProgbarLogger(count_mode='steps', stateful_metrics=None)  # created but not passed to fit
    weights_file = "model-5-lr0p001.h5"
    model_checkpoint = ModelCheckpoint(weights_file,
                                       monitor="val_binary_crossentropy",
                                       save_best_only=True,
                                       save_weights_only=True,
                                       verbose=1)
    # NOTE(review): validation_steps uses nexs (training count) rather than
    # nexs_val — confirm this is intended.
    history = model.fit_generator(train_gen,
                                  epochs=10,
                                  verbose=1,
                                  steps_per_epoch=nexs / batch_size,
                                  validation_data=val_gen,
                                  validation_steps=nexs / batch_size,
                                  callbacks=[model_checkpoint])
def model_generate(train_X, train_y, val_X, val_y, linear_feature_columns, dnn_feature_columns):
    """Build and fit a DeepFM with early stopping.

    Returns (model, fit history).
    """
    model = DeepFM(linear_feature_columns, dnn_feature_columns,
                   embedding_size=32)
    model.compile("adam", "binary_crossentropy",
                  metrics=[roc_auc_score_pyfunc, log_loss_pyfunc])
    training_run = model.fit(train_X, train_y,
                             validation_data=(val_X, val_y),
                             batch_size=4096, epochs=5,
                             callbacks=[EarlyStopping()])
    return model, training_run
def train_model(train, test, linear_feature, dnn_feature):
    """Fit a binary DeepFM and print test LogLoss / AUC.

    `train` unpacks into model.fit's positional arguments; `test` is a
    (model_input, labels) pair.
    """
    model = DeepFM(linear_feature, dnn_feature, task='binary')
    model.compile("adam", "binary_crossentropy", metrics=['AUC'])

    fit_history = model.fit(*train,
                            batch_size=512,
                            epochs=5,
                            verbose=2,
                            validation_split=0.1)

    predictions = model.predict(test[0], batch_size=512)
    print("test LogLoss", round(log_loss(test[1], predictions), 4))
    print("test AUC", round(roc_auc_score(test[1], predictions), 4))
def test_DeepFM(use_fm, hidden_size):
    """Train / save-load smoke test for the legacy dict-API DeepFM."""
    name = "DeepFM"
    n_samples = 64

    # Fixed toy schema: three sparse fields with small vocabularies and
    # three dense fields.
    feature_dim_dict = {
        'sparse': {'sparse_1': 2, 'sparse_2': 5, 'sparse_3': 10},
        'dense': ['dense_1', 'dense_2', 'dense_3'],
    }
    sparse_input = [np.random.randint(0, dim, n_samples)
                    for dim in feature_dim_dict['sparse'].values()]
    dense_input = [np.random.random(n_samples)
                   for _ in feature_dim_dict['dense']]
    y = np.random.randint(0, 2, n_samples)
    x = sparse_input + dense_input

    model = DeepFM(feature_dim_dict, use_fm=use_fm,
                   hidden_size=hidden_size, keep_prob=0.5, )
    model.compile('adam', 'binary_crossentropy',
                  metrics=['binary_crossentropy'])
    model.fit(x, y, batch_size=100, epochs=1, validation_split=0.5)
    print(name + " test train valid pass!")

    model.save_weights(name + '_weights.h5')
    model.load_weights(name + '_weights.h5')
    print(name + " test save load weight pass!")

    save_model(model, name + '.h5')
    model = load_model(name + '.h5', custom_objects)
    print(name + " test save load model pass!")
    print(name + " test pass!")
def deepfm_model(linear_feature_columns, dnn_feature_columns, train_model_input,
                 train, test_model_input, test):
    """Train a DeepFM configured from `config.deepfm_att`, save it, and return
    a one-row DataFrame of RMSE/MAE/MSE/AUC on the test split."""
    cols = ['model', 'RMSE', 'MAE', 'MSE', 'AUC', 'score']
    df_result = pd.DataFrame(columns=cols, index=range(1))

    att = config.deepfm_att
    model = DeepFM(linear_feature_columns, dnn_feature_columns,
                   dnn_hidden_units=att["dnn_hidden_units"],
                   init_std=att["init_std"],
                   seed=att["seed"],
                   dnn_dropout=att["dnn_dropout"],
                   dnn_activation=att["dnn_activation"],
                   task=att["task"],
                   fm_group=att["fm_group"],
                   dnn_use_bn=att["dnn_use_bn"])
    model.compile("adam", "mse", metrics=['mse'])
    history = model.fit(train_model_input, train[target].values,
                        batch_size=256, epochs=config.model_epoch['epoch'],
                        verbose=2, validation_split=0.2)
    pred_ans = model.predict(test_model_input, batch_size=256)
    save_model(model, 'saved_deepfm.h5')

    # Collect the evaluation metrics, rounded to 3 decimals.
    y_true = test[target].values
    df_result.loc[0].model = "DeepFM"
    df_result.loc[0].RMSE = np.round(
        math.sqrt(mean_squared_error(y_true, pred_ans)), 3)
    df_result.loc[0].MAE = np.round(
        mean_absolute_error(y_true, pred_ans), 3)
    df_result.loc[0].MSE = np.round(
        mean_squared_error(y_true, pred_ans), 3)
    df_result.loc[0].AUC = np.round(roc_auc_score(y_true, pred_ans), 3)
    return df_result
# Binary classification target.
target = ['transportation_issues']

# One SparseFeat per categorical column (embedding size looked up per field
# group) plus a 1-d DenseFeat per numeric column.
fixlen_feature_columns = [SparseFeat(feat,
                                     vocabulary_size=train_data[feat].nunique(),
                                     embedding_dim=size_dict[field_info[feat]],
                                     dtype='int32',
                                     group_name=field_info[feat])
                          for feat in sparse_features] + [DenseFeat(feat, 1,)
                                                          for feat in dense_features]
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

train, test = train_test_split(train_data, test_size=0.2, random_state=2020)
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

# NOTE(review): dnn_hidden_units=(2, 256) puts a 2-unit layer before a 256-unit
# one — looks like it may be a transposed (256, 2); confirm intent.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary',
               dnn_hidden_units=(2, 256), dnn_dropout=0.0)
opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss="binary_crossentropy", metrics=['binary_crossentropy'],
              optimizer=opt)

# 3x class weight on the positive class to offset imbalance.
history = model.fit(train_model_input, train[target].values, batch_size=64,
                    epochs=10, verbose=1, validation_split=0.2,
                    class_weight={0: 1, 1: 3})
pred_ans = model.predict(test_model_input, batch_size=64)
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

# Threshold at 0.5 for a hard-label classification report.
ans = pd.Series(pred_ans.reshape((-1,)))
ans[ans >= 0.5] = 1
ans[ans < 0.5] = 0
# The two value_counts() results below are discarded — presumably leftover
# notebook cells kept for interactive inspection.
pd.Series(test[target].transportation_issues).value_counts()
ans.value_counts()
print(classification_report(test[target], ans))
from deepctr.models import DeepFM

if __name__ == "__main__":
    data = pd.read_csv("./movielens_sample.txt")
    sparse_features = ["movie_id", "user_id",
                       "gender", "age", "occupation", "zip"]
    target = ['rating']

    # 1. Label-encode the sparse features (this sample has no dense features).
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    # 2. Count unique values per sparse field (legacy dict-based deepctr API).
    sparse_feature_dim = {feat: data[feat].nunique()
                          for feat in sparse_features}

    # 3. Build model inputs as a list of per-feature arrays.
    train, test = train_test_split(data, test_size=0.2)
    train_model_input = [train[feat].values for feat in sparse_feature_dim]
    test_model_input = [test[feat].values for feat in sparse_feature_dim]

    # 4. Define, train, predict and evaluate — rating regression, hence the
    # linear output activation and MSE loss.
    model = DeepFM({"sparse": sparse_feature_dim, "dense": []},
                   final_activation='linear')
    model.compile("adam", "mse", metrics=['mse'], )
    history = model.fit(train_model_input, train[target].values,
                        batch_size=256, epochs=1, verbose=2,
                        validation_split=0.2, )
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test MSE", round(mean_squared_error(
        test[target].values, pred_ans), 4))
# NOTE(review): this chunk starts mid-loop — the enclosing
# `for feat in sparse_features:` header (which defines `feat`) is outside
# this view; the first two statements below are its body.
lbe = LabelEncoder()
data[feat] = lbe.fit_transform(data[feat])
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

# 2. Count unique values per sparse field and record dense field names
# (legacy dict/list-based deepctr input format).
sparse_feature_dict = {feat: data[feat].nunique() for feat in sparse_features}
dense_feature_list = dense_features

# 3. Build model inputs: sparse columns first, then dense columns.
train, test = train_test_split(data, test_size=0.2)
train_model_input = [train[feat].values for feat in sparse_feature_dict] + \
    [train[feat].values for feat in dense_feature_list]
test_model_input = [test[feat].values for feat in sparse_feature_dict] + \
    [test[feat].values for feat in dense_feature_list]

# 4. Define, train, predict and evaluate a sigmoid-output (binary) DeepFM.
model = DeepFM({"sparse": sparse_feature_dict, "dense": dense_feature_list},
               final_activation='sigmoid')
model.compile("adam", "binary_crossentropy",
              metrics=['binary_crossentropy'], )
history = model.fit(train_model_input, train[target].values, batch_size=256,
                    epochs=10, verbose=2, validation_split=0.2, )
pred_ans = model.predict(test_model_input, batch_size=256)
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
# Everything except the rating column is a candidate feature.
feats = [i for i in data.columns if i != 'Rating']
X = data[feats]
y = data['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

sparse_features = [
    'UserID', 'MovieID', 'Gender', 'Occupation', 'day', 'weekday'
]
dense_features = ['hour', 'Age']

fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
                          for feat in sparse_features] + \
    [DenseFeat(feat, 1) for feat in dense_features]
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

# NOTE(review): task='multiclass' is not a stock deepctr task (binary /
# regression only), and it is paired with an MSE loss — presumably this
# targets a forked DeepFM; confirm.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='multiclass')
model.compile('adam', 'mse', metrics=['accuracy'])

feature_names = get_feature_names(fixlen_feature_columns)
train_feed_dict = {name: X_train[name] for name in feature_names}
test_feed_dict = {name: X_test[name] for name in feature_names}

model.fit(train_feed_dict, y_train, batch_size=256, epochs=10,
          validation_split=0.2)
pred_ans = model.predict(test_feed_dict, batch_size=256)
# NOTE(review): this chunk opens mid-call — the `DeepFM(...` head with the
# leading positional arguments is outside this view; the keyword tail below
# closes that call.
    l2_reg_dnn=NNconfig_dic["l2_reg_dnn"],
    l2_reg_embedding=NNconfig_dic["l2_reg_embedding"],
    l2_reg_linear=NNconfig_dic["l2_reg_linear"],
    dnn_dropout=NNconfig_dic["dnn_dropout"],
    dnn_use_bn=NNconfig_dic["dnn_use_bn"],
    dnn_activation=NNconfig_dic["dnn_activation"])
# The config dict doubles as a run log: record what was used.
NNconfig_dic["model_name"] = "DeepFM"

# %%
opt = tf.keras.optimizers.Adam(learning_rate=NNconfig_dic["lr"])
NNconfig_dic["optimizer"] = "Adam"

# %%
model.compile(optimizer=opt,
              loss=tf.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.AUC()])

# Timestamped TensorBoard log dir and config-dump path.
log_dir = "logs" + os.path.sep + NNconfig_dic["model_name"] + "_res" + os.path.sep \
    + datetime.now().strftime("%Y%m%d-%H%M%S")
NN_config_path = "logs" + os.path.sep + NNconfig_dic["model_name"] + "_res" + os.path.sep \
    + datetime.now().strftime("%Y%m%d-%H%M%S") + "_" + "NNconfig.json"
# if worker_index == 0:
#     if not os.path.exists("logs" + os.path.sep + NNconfig_dic["model_name"] + "_res"):
#         os.makedirs("logs" + os.path.sep + NNconfig_dic["model_name"] + "_res")
#     with open(NN_config_path, "w+") as conf:
#         json.dump(NNconfig_dic, conf)
callbacks = [tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1,
                                            profile_batch=3), ]
# NOTE(review): this chunk opens mid-call — a `model = <Model>(linear_...,`
# head precedes it outside this view — and it is also truncated at the end,
# inside the get_data_generator(...) call.
    dnn_feature_columns,
    task="binary",
    embedding_size=emb_dim,
    # NOTE(review): cross_num is a DCN-style argument, not stock DeepFM —
    # the truncated head may construct a different/forked model; confirm.
    dnn_hidden_units=[1024, 1024],
    cross_num=6,
)
# Select the optimizer class from the `opt` string.
if opt == "adagrad":
    optimizer = Adagrad
elif opt == "adam":
    optimizer = Adam
else:
    raise ValueError("Invalid optimizer")
model.compile(optimizer(learning_rate), "binary_crossentropy",
              metrics=["binary_crossentropy"])

# Manual epoch loop with early-stopping bookkeeping.
callbacks = []
patience_counter = 0              # epochs since last validation improvement
best_valid_loss = float("Inf")
history_epoch = {}                # per-epoch training history
history_val = {}                  # per-epoch validation history
for epoch in range(epochs):
    breakout = False
    history_epoch[epoch] = {}
    history_val[epoch] = []
    train_generator = get_data_generator(
        base_path,
# 2.count #unique features for each sparse field,and record dense feature field name sparse_feature_list = [SingleFeat(feat, data[feat].nunique()) for feat in sparse_features] dense_feature_list = [SingleFeat(feat, 0,) for feat in dense_features] # 3.generate input data for model train, test = train_test_split(data, test_size=0.2) train_model_input = [train[feat.name].values for feat in sparse_feature_list] + \ [train[feat.name].values for feat in dense_feature_list] test_model_input = [test[feat.name].values for feat in sparse_feature_list] + \ [test[feat.name].values for feat in dense_feature_list] # 4.Define Model,train,predict and evaluate model = DeepFM({"sparse": sparse_feature_list, "dense": dense_feature_list}, task='binary', embedding_size=4, dnn_hidden_units=(64, 64)) # parallel_model = multi_gpu_model(model, gpus=2) model.compile(tf.keras.optimizers.Adam(1e-4), "binary_crossentropy", metrics=['binary_crossentropy'], ) tensorboard = TensorBoard(log_dir="logs/DeepFM") history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=10, verbose=2, validation_split=0.2, callbacks=[tensorboard]) pred_ans = model.predict(test_model_input, batch_size=256) print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4)) print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
class DeepModel:
    """Thin lifecycle wrapper around a deepctr model.

    Usage: prepare_data() -> build() -> train() -> evaluate()/predict().
    Only the "DeepFM" architecture is currently supported.
    """

    def __init__(self, model_name, model_architecture="DeepFM"):
        self.model_name = model_name
        self.model_architecture = model_architecture
        self.model = None       # set by build()
        self.history = None     # set by train()
        self.data = None        # set by prepare_data()
        self.callbacks = []

    # requires tf2
    # def set_notebook_mode(self):
    #     progress_bar_cb = tfa.callbacks.TQDMProgressBar()
    #     self.callbacks.append(progress_bar_cb)

    def prepare_data(self, data_source, sparse_features, target, test_size=0.1):
        """Ingest and prepare the dataset via the project's Data helper."""
        self.data = Data(sparse_features,
                         target,
                         data_format="deepctr",
                         test_size=test_size)
        self.data.ingest(data_source)
        self.data.prepare()

    def build(self, task):
        """Instantiate and compile the model for 'regression' or 'binary'."""
        assert task in ['regression', 'binary']
        if self.model_architecture == "DeepFM":
            self.model = DeepFM(
                self.data.linear_feature_columns,
                self.data.dnn_feature_columns,
                task=task,
            )
        else:
            raise NotImplementedError(
                'At the current stage of the development, only a DeepFM is supported'
            )
        # Loss/metric per task. (The old duplicated if/elif that assigned
        # unused `loss`/`metrics` locals alongside this table was removed.)
        task_attr = {
            'regression': {
                'loss': 'mse',
                'metrics': 'mse'
            },
            'binary': {
                'loss': 'binary_crossentropy',
                'metrics': 'accuracy'
            }
        }
        self.model.compile(optimizer="adam",
                           loss=task_attr[task]['loss'],
                           metrics=task_attr[task]['metrics'])

    def train(self, batch_size=256, epochs=10, validation_split=0.1):
        """Fit on the prepared training split; stores the history."""
        self.history = self.model.fit(
            self.data.X_train,
            self.data.y_train,
            batch_size=batch_size,
            epochs=epochs,
            validation_split=validation_split,
            verbose=2,
            callbacks=self.callbacks,
        )

    def evaluate(self):
        """Evaluate on the held-out test split."""
        self.model.evaluate(self.data.X_test, self.data.y_test, batch_size=4096)

    def prepare_input(self, df):
        """Encode a raw dataframe into the model's input dict (copy; no mutation)."""
        df = df.copy()
        for feat in self.data.sparse_features:
            lbe = self.data.encoders[feat]
            df[feat] = lbe.transform(df[feat])
        X = {name: df[name].values for name in self.data.feature_names}
        return X

    def predict(self, X, batch_size=256):
        return self.model.predict(X, batch_size=batch_size)
print(behavior_feature_list) # model = DIN( dnn_feature_columns, behavior_feature_list,task='multiclass', dnn_hidden_units=(256, 128), # l2_reg_embedding=0.00001, dnn_dropout=0.5, l2_reg_dnn=0.0001,nClass=4) # model = LR(linear_feature_columns, l2_reg_linear=0.0001, init_std=0.0001, seed=1024, task='multiclass',nClass=4) model = DeepFM(linear_feature_columns, dnn_feature_columns, task='multiclass', dnn_hidden_units=(256, 128), l2_reg_embedding=0.0001, l2_reg_linear=0.0001, dnn_dropout=0.5, l2_reg_dnn=0.0001, nClass=4) model.compile(optimizer=keras.optimizers.Adam(0.0001), loss="categorical_crossentropy", metrics=['acc']) history = model.fit(train_model_input, y_true, callbacks=[ keras.callbacks.EarlyStopping(monitor='loss', patience=5), keras.callbacks.ModelCheckpoint("MCDIN.h5", monitor="loss", verbose=1, save_best_only=True) ], batch_size=1024, epochs=200, verbose=2) pred_ans = model.predict(test_model_input, batch_size=256)
def test_DFM_avazu(data, train, test):
    """Hyper-parameter sweep for DeepFM ("DFM") on the avazu dataset.

    Sweeps three axes independently — DNN activation function, dropout rate
    and hidden-unit layout — training one model per setting and recording
    AUC / logloss / RMSE for each. Optionally renders comparison plots.

    Args:
        data:  full dataframe; used only for per-column vocabulary sizes.
        train: training split; column 0 is the target, columns 1..22 sparse.
        test:  held-out split with the same layout.
    """
    print("\nTesting DFM on avazu dataset...\n")

    features_labels = train.columns
    sparse_features_labels = features_labels[1:23]  # the 22 sparse fields
    target_label = features_labels[0]

    def _make_feature_columns():
        """One 4-dim-embedding SparseFeat per sparse field."""
        return [
            SparseFeat(
                feat,
                vocabulary_size=data[feat].nunique(),
                embedding_dim=4,
            ) for feat in sparse_features_labels
        ]

    dnn_feature_columns = _make_feature_columns()
    linear_feature_columns = _make_feature_columns()
    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
    true_y = test[target_label].values

    def _train_and_record(model, results):
        """Compile/fit one configuration and append its test metrics.

        (This replaces the previous triplicated ~20-line train/evaluate
        sequence; it also obsoletes the dead `auc = logloss = rmse = 0`
        initializers.)
        """
        model.compile(
            "adam",
            "binary_crossentropy",
            metrics=['binary_crossentropy'],
        )
        model.fit(
            train_model_input,
            train[target_label].values,
            batch_size=256,
            epochs=10,
            verbose=0,
            validation_split=TEST_PROPORTION,
        )
        pred_y = model.predict(test_model_input, batch_size=256)
        results["auc"].append(compute_auc(true_y, pred_y))
        results["logloss"].append(compute_log_loss(true_y, pred_y))
        results["rmse"].append(compute_rmse(true_y, pred_y))

    results_activation_function = {"auc": [], "logloss": [], "rmse": []}
    results_dropout = {"auc": [], "logloss": [], "rmse": []}
    results_number_of_neurons = {"auc": [], "logloss": [], "rmse": []}

    print("\t\t-- ACTIVATION FUNCTIONS --\t\t")
    for dnn_activation in dnn_activation_list:
        print("\nTesting {dnn_activation}...".format(
            dnn_activation=dnn_activation))
        _train_and_record(
            DeepFM(linear_feature_columns, dnn_feature_columns,
                   dnn_activation=dnn_activation, task='binary'),
            results_activation_function)

    print("\t\t-- DROPOUT RATES --\t\t")
    for dnn_dropout in dnn_dropout_list:
        print("\nTesting {dnn_dropout}...".format(dnn_dropout=dnn_dropout))
        _train_and_record(
            DeepFM(linear_feature_columns, dnn_feature_columns,
                   dnn_dropout=dnn_dropout, task='binary'),
            results_dropout)

    print("\t\t-- HIDDEN UNITS --\t\t")
    for dnn_hidden_units in dnn_hidden_units_list:
        print("\nTesting {dnn_hidden_units}...".format(
            dnn_hidden_units=dnn_hidden_units))
        _train_and_record(
            DeepFM(linear_feature_columns, dnn_feature_columns,
                   dnn_hidden_units=dnn_hidden_units, task='binary'),
            results_number_of_neurons)

    if PLOT:
        create_plots("DFM", "avazu", results_activation_function,
                     "Activation Function", "activation_func",
                     dnn_activation_list)
        create_plots("DFM", "avazu", results_dropout, "Dropout Rate",
                     "dropout", dnn_dropout_list)
        create_plots("DFM", "avazu", results_number_of_neurons,
                     "Number of Neurons per layer", "nr_neurons",
                     dnn_hidden_units_list)
def deepctr_cv(X_train, y_train, folds, logger, cv_path, X_test=None,
               optional_data=None, prep=True, split_conf=None):
    """Cross-validated DeepFM training with out-of-fold predictions.

    Returns (per-fold AUC array, rank-averaged test prediction, OOF meta
    vector aligned with y_train).

    NOTE(review): `if num_fold > 0: break` means only fold 0 actually runs —
    looks like a debugging shortcut left in; confirm before trusting the CV
    scores.
    """
    scores = []
    preds = []  # per-fold test-set predictions
    meta = np.zeros_like(y_train).astype("float64")  # out-of-fold predictions

    if split_conf is None:
        # Derive the embedding/preprocessing config on the fly.
        X_tr, X_te, main_conf, _ = prep_for_embedding(X_train, X_test, conf,
                                                      prep=prep)
        X_train, X_test = X_tr, X_te
    else:
        main_conf = split_conf
    # main_conf[0] holds (column, cardinality, _) triples for categoricals.
    cat_cols = [c for c, _, _ in main_conf[0]]
    cat_fs = [SingleFeat(c, d) for c, d, _ in main_conf[0]]
    num_fs = [SingleFeat(c, 0) for c in conf.num_cols]
    X_test = split_df(X_test, cat_cols, conf.num_cols)

    for num_fold, (tr_ind, tes_ind) in enumerate(folds):
        if num_fold > 0:
            break  # only the first fold runs (see docstring note)
        logger.info(f"fold_{num_fold}")
        fold_path = cv_path / f"fold{num_fold}"
        seed_path = fold_path
        Path(fold_path).mkdir(exist_ok=True, parents=True)
        callbacks = [CSVLogger(str(fold_path / 'epochs.csv'))]

        X_cv_train, X_cv_test = X_train.iloc[tr_ind], X_train.iloc[tes_ind]
        y_cv_train, y_cv_test = y_train.iloc[tr_ind], y_train.iloc[tes_ind]
        X_cv_train = split_df(X_cv_train, cat_cols, conf.num_cols)
        X_cv_test = split_df(X_cv_test, cat_cols, conf.num_cols)

        model = DeepFM({
            'sparse': cat_fs,
            'dense': num_fs
        }, final_activation='sigmoid')
        model.compile("adam", "binary_crossentropy", metrics=['accuracy'])
        model.fit(X_cv_train, y_cv_train, callbacks=callbacks, batch_size=2048,
                  epochs=10, verbose=1,
                  validation_data=(X_cv_test, y_cv_test))
        model.save_weights(str(seed_path / 'weights.h5'), save_format='hdf5')
        gc.collect()

        if X_test is not None:
            pred = model.predict(X_test, batch_size=2048)
            pred = pred[:, 0]
            np.save(seed_path / f"pred.npy", pred)
        # Out-of-fold prediction + AUC for this fold.
        train_oof = model.predict(X_cv_test, batch_size=2048)
        train_oof = train_oof[:, 0]
        auc = roc_auc_score(y_cv_test.values, train_oof)
        logger.info(f"{num_fold}: auc {auc}")
        np.save(seed_path / f"train_oof.npy", train_oof)
        # auc = roc_auc_score(y_cv_test, train_oof)
        # logger.info(f"seed_average: auc {auc}")
        scores.append(auc)
        np.save(fold_path / f"tes_ind.npy", tes_ind)
        meta[tes_ind] += train_oof
        del X_cv_train, y_cv_train, X_cv_test, y_cv_test
        if X_test is not None:
            preds.append(pred)

    scores = np.array(scores)
    preds = np.array(preds)
    pred = rank_average(preds)
    logger.info(f"{scores.mean()}, {scores.std()}")
    return scores, pred, meta
class DeepFMHelper:
    """Sklearn-style wrapper around a DeepCTR DeepFM binary classifier.

    Handles min-max scaling of dense features, transliteration of Cyrillic
    column names (Keras input names must be ASCII-safe), model construction,
    training, inference and (de)serialization.

    NOTE(review): ``save_model`` writes under ``backend/data/`` while
    ``load_model`` reads from ``data/`` — presumably relative to different
    working directories; confirm the paths are intentional.
    """

    def __init__(self):
        # Scaler for all non-categorical columns; fitted in fit().
        self.min_max_scaler = MinMaxScaler(feature_range=(0, 1))
        # Fixed set of categorical feature columns (Russian column names are
        # runtime data — do not translate).
        self.cat_features = [
            "user_Вид тура_last",
            "user_Звездность_last",
            "tour_Страна",
            "tour_Страна тура",
            "user_Тип заявки_last",
        ]
        # The remaining attributes are populated by fit() / load_model().
        self.dense_features = None
        self.fixlen_feature_columns = None
        self.feature_names = None
        self.model = None

    def fit(self, X, y):
        """Fit the scaler and train DeepFM on (X, y).

        X is a DataFrame containing the cat_features columns plus dense
        columns; y is the binary target. Side effects: sets columns_mapping,
        dense_features, fixlen_feature_columns, feature_names and model.
        """
        X_ = X.copy()
        # Everything that is not categorical is treated as dense.
        self.dense_features = list(X_.columns.difference(self.cat_features))
        logger.debug("MinMaxScaler")
        self.min_max_scaler.fit(X_[self.dense_features])
        X_[self.dense_features] = self.min_max_scaler.transform(
            X_[self.dense_features])
        # Build and apply the ASCII-safe column-name mapping.
        self._column_mapping(X_)
        X_.columns = [self.columns_mapping[col] for col in X_.columns]
        # Sparse columns are assumed to be label-encoded ints, so max()+1 is a
        # valid vocabulary size — TODO confirm upstream encoding.
        self.fixlen_feature_columns = [
            SparseFeat(
                self.columns_mapping[feat],
                vocabulary_size=X_[self.columns_mapping[feat]].max() + 1,
                embedding_dim=4,
            ) for i, feat in enumerate(self.cat_features)
        ] + [
            DenseFeat(
                self.columns_mapping[feat],
                1,
            ) for feat in self.dense_features
        ]
        self.feature_names = get_feature_names(self.fixlen_feature_columns)
        logger.debug("Compile DeepFM model")
        # Same columns feed both the linear and DNN parts of DeepFM.
        self.model = DeepFM(self.fixlen_feature_columns,
                            self.fixlen_feature_columns,
                            task="binary")
        self.model.compile(
            "adam",
            "binary_crossentropy",
            metrics=["binary_crossentropy"],
        )
        logger.debug("Fit DeepFM")
        train_model_input = {
            name: X_[name].values
            for name in self.feature_names
        }
        self.model.fit(
            train_model_input,
            y,
            batch_size=256,
            epochs=3,
            verbose=2,
            validation_split=0.2,
        )

    def predict_proba(self, X):
        """Return 1-D positive-class probabilities for X (same schema as fit)."""
        X_ = X.copy()
        X_[self.dense_features] = self.min_max_scaler.transform(
            X_[self.dense_features])
        X_.columns = [self.columns_mapping[col] for col in X_.columns]
        model_input = {name: X_[name].values for name in self.feature_names}
        # Graph-compiled forward pass; returns a tf tensor of shape (n, 1).
        pred = self.inference(model_input)
        pred = pred[:, 0].numpy()
        return pred

    def _column_mapping(self, X):
        """Build columns_mapping: original column name -> ASCII-safe name.

        Transliterates Cyrillic characters and replaces spaces / '$' so names
        are valid Keras input-layer names.
        """
        symbols = (
            "абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
            "abvgdeejzijklmnoprstufhzcss_y_euaABVGDEEJZIJKLMNOPRSTUFHZCSS_Y_EUA",
        )
        # Per-character translation table for str.translate.
        tr = {ord(a): ord(b) for a, b in zip(*symbols)}
        self.columns_mapping = dict(
            zip(
                X.columns,
                [
                    col.translate(tr).replace(" ", "_").replace("$", "dollar")
                    for col in X.columns
                ],
            ))

    @tf.function()
    def inference(self, test_model_input):
        # tf.function-compiled call for faster repeated inference.
        return self.model(test_model_input)

    def save_model(self):
        """Persist model weights plus the preprocessing state needed to reload."""
        self.model.save_weights("backend/data/DeepFM_w.h5")
        with open("backend/data/DeepFM_data.pkl", "wb") as f_out:
            pickle.dump(
                (
                    self.columns_mapping,
                    self.min_max_scaler,
                    self.dense_features,
                    self.fixlen_feature_columns,
                    self.feature_names,
                ),
                f_out,
            )

    def load_model(self):
        """Restore preprocessing state, rebuild the model and load weights.

        NOTE(review): pickle.load on these files assumes they are trusted —
        never point this at externally supplied data.
        """
        with open("data/DeepFM_data.pkl", "rb") as f_in:
            (
                self.columns_mapping,
                self.min_max_scaler,
                self.dense_features,
                self.fixlen_feature_columns,
                self.feature_names,
            ) = pickle.load(f_in)
        # The architecture must be rebuilt identically before loading weights.
        self.model = DeepFM(self.fixlen_feature_columns,
                            self.fixlen_feature_columns,
                            task="binary")
        self.model.compile(
            "adam",
            "binary_crossentropy",
            metrics=["binary_crossentropy"],
        )
        self.model.load_weights("data/DeepFM_w.h5")
print(model_input) # print(model_input.shape) # 4.Define Model,compile and train model = DeepFM( { "sparse": sparse_feat_list, "dense": dense_feat_list, "sequence": sequence_feature }, final_activation='linear', embedding_size=8, use_fm=False, hidden_size=(64, 64)) model.compile( "adam", "mape", metrics=['mape'], ) history = model.fit( model_input, df_train[target].values, batch_size=2048, epochs=200, verbose=2, validation_split=0.2, ) pred = model.predict(model_input) print(pred) print(smape(df_train[target].values, pred))
SparseFeat(feature, data[feature].nunique()) for feature in sparse_features ] linear_feature_columns = fixlen_feature_columns dnn_feature_columns = fixlen_feature_columns feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) # 将数据集切分成训练集和测试集 train, test = train_test_split(data, test_size=0.2) train_model_input = {name: train[name].values for name in feature_names} test_model_input = {name: test[name].values for name in feature_names} # 使用DeepFM进行训练 model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression') model.compile( "adam", "mse", metrics=['mse'], ) history = model.fit( train_model_input, train[target].values, batch_size=256, epochs=1, verbose=True, validation_split=0.2, ) # 使用DeepFM进行预测 pred_ans = model.predict(test_model_input, batch_size=256) # 输出RMSE或MSE mse = round(mean_squared_error(test[target].values, pred_ans), 4) rmse = mse**0.5
# model = DeepFM(linear_feature_columns=linear_feature_columns, # dnn_feature_columns=dnn_feature_columns, # dnn_dropout=0.1, # dnn_hidden_units=(512, 128), # task='binary') model = DeepFM( linear_feature_columns, dnn_feature_columns, task='binary', dnn_dropout=0.1, dnn_hidden_units=(512, 128), ) model = multi_gpu_model(model, NUM_WORKERS) model.compile( optimizer=tf.keras.optimizers.Adam(3e-4), # loss="binary_crossentropy", loss=multi_category_focal_loss2(alpha=0.1), metrics=[auroc], ) dirpath = Path('checkpoint') if dirpath.exists() and dirpath.is_dir(): shutil.rmtree(dirpath) os.mkdir('checkpoint') hist = model.fit(online_train_model_input, train_df['label'].values, batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=2, validation_split=0.1, shuffle=True,
dataset = dataset.repeat(repeat_count) # Repeats dataset this # times return dataset linear_feature_columns = varlen_feature_columns + fixed_feature_columns dnn_feature_columns = varlen_feature_columns + fixed_feature_columns callbacks = [] GPU = True if GPU: strategy = tf.distribute.MirroredStrategy(devices=['/gpu:0', '/gpu:1', '/gpu:2', '/gpu:3']) # strategy = tf.distribute.MirroredStrategy(devices=['/gpu:3']) with strategy.scope(): model = DeepFM(linear_feature_columns, dnn_feature_columns, dnn_hidden_units=[1024, 512, 256], task='binary', dnn_dropout=0, dnn_activation='relu', dnn_use_bn=False) model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy', tf.keras.metrics.AUC()]) # model.run_eagerly = True model.fit_generator(generator=get_dataset(), steps_per_epoch=None, epochs=10, verbose=2, callbacks=callbacks, validation_data=get_dataset(eval_data_path), validation_steps=None, validation_freq=1, class_weight=None, max_queue_size=100, workers=10, use_multiprocessing=False, shuffle=True, initial_epoch=0) tf.saved_model.save(model, "./models") else: model = DeepFM(linear_feature_columns, dnn_feature_columns, dnn_hidden_units=[1024, 512, 256], task='binary', dnn_dropout=0, dnn_activation='relu', dnn_use_bn=False) model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy', tf.keras.metrics.AUC()]) model.run_eagerly = True model.fit_generator(generator=get_dataset(), steps_per_epoch=None, epochs=10, verbose=2, callbacks=callbacks, validation_data=get_dataset(eval_data_path), validation_steps=None, validation_freq=1, class_weight=None, max_queue_size=100, workers=10, use_multiprocessing=False, shuffle=True, initial_epoch=0)
] # print(fix_feature_columns) linear_feature_columns = fix_feature_columns dnn_feature_columns = fix_feature_columns feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) # 4. 将数据集切分成训练集和测试集 train, test = train_test_split(data, test_size=0.2) train_model_input = {name: train[name].values for name in feature_names} test_model_input = {name: test[name].values for name in feature_names} # 5. 使用DeepFM进行训练 model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression') model.compile('adam', 'mse', metrics=['mse']) history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=1, verbose=True, validation_split=0.2) # 6. 使用DeepFM进行预测 pred_ans = model.predict(test_model_input, batch_size=256) # 7. 输出RMSE或MSE mse = round(mean_squared_error(test[target].values, pred_ans), 4) rmse = mse**0.5 print('test RMSE', rmse)
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) # 3.generate input data for model train, test = train_test_split(data, test_size=0.2) train_model_input = {name:train[name] for name in feature_names} test_model_input = {name:test[name] for name in feature_names} # print(type(train_model_input)) # 4.Define Model,train,predict and evaluate model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary') # compile(self, optimizer, loss, metrics=None, loss_weights=None, sample_weight_mode=None, weighted_metrics=None, # target_tensors=None) model.compile("adam", "binary_crossentropy", metrics=['accuracy'] ) history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=50, verbose=2, validation_split=0.2) pred_ans = model.predict(test_model_input, batch_size=256) scores = [] for index in range(0, len(pred_ans)): result = llfun(test[target].values, pred_ans,index) scores.append(result) print(sum(scores) / len(scores)) # pred_ans = model.predict(test_model_input, batch_size=256) # # print(pred_ans) # # print(type(pred_ans)) #
l2_reg_linear=0.001, l2_reg_embedding=0.001, l2_reg_dnn=0, init_std=0.0001, seed=1024, dnn_dropout=0.5, dnn_activation='relu', dnn_use_bn=True, task='binary') try: model.load_weights(checkpoint_path) print('load weights') except: pass model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['accuracy', 'AUC']) history = model.fit(train_model_input, train[target], batch_size=8192, epochs=5, verbose=1, shuffle=True, callbacks=[cp_callback], validation_data=(val_model_input, val[target])) data['predict'] = 0 data.loc[train_index, 'predict'] = model.predict(train_model_input, batch_size=8192) data.loc[val_index, 'predict'] = model.predict(val_model_input, batch_size=8192)
dnn_hidden_units=(128, 256), l2_reg_linear=0.01, l2_reg_embedding=0.01, init_std=0.0001, seed=1024, dnn_dropout=0.3, dnn_activation='selu', dnn_use_bn=True, ) import tensorflow.keras as keras import tensorflow as tf opt = tf.keras.optimizers.Adam(learning_rate=1e-5) model.compile( opt, "binary_crossentropy", metrics=['accuracy'], ) history = model.fit(train_model_input, train[target].values, batch_size=128, epochs=30, verbose=1, validation_data=(valid_model_input, valid[target].values)) def plot_learning_curves(history): pd.DataFrame(history.history).plot(figsize=(8, 5)) plt.grid(True) plt.gca().set_ylim(0, 1)
dnn_feature_columns) # 3.generate input data for model train, test = train_test_split(data, test_size=0.2) train_model_input = {name: train[name] for name in feature_names} test_model_input = {name: test[name] for name in feature_names} # 4.Define Model,train,predict and evaluate model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary') model = multi_gpu_model(model, gpus=2) model.compile( "adam", "binary_crossentropy", metrics=['binary_crossentropy'], ) history = model.fit( train_model_input, train[target].values, batch_size=256, epochs=10, verbose=2, validation_split=0.2, ) pred_ans = model.predict(test_model_input, batch_size=256) print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4)) print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
# Model inputs as ordered lists of columns (list API, matching feature_names order).
train_model_input = [train[name] for name in feature_names]
test_model_input = [test[name] for name in feature_names]

#model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model = DeepFM(linear_feature_columns,
               dnn_feature_columns,
               task='binary',
               use_fm=False,                 # FM branch off: DNN-only variant
               dnn_hidden_units=(128, 128),
               dnn_dropout=0)
# model = DCN(dnn_feature_columns, embedding_size=8)
model.compile(
    Adam(lr=0.005),  # NOTE(review): `lr` is the legacy Keras kwarg; newer versions use `learning_rate`
    "binary_crossentropy",
    metrics=['binary_crossentropy'],
)
#es = EarlyStopping(monitor='val_binary_crossentropy')
history = model.fit(train_model_input,
                    train[target].values,
                    validation_split=0.3,
                    callbacks=[
                        # Stop once validation loss fails to improve for 3 epochs.
                        EarlyStopping(monitor='val_loss',
                                      patience=3,
                                      verbose=0,
                                      mode='auto')
                    ],
                    batch_size=4096,
                    epochs=10,
                    verbose=1)