def bl_IsRec_best(a_dataset):
    """Train (when no saved weights exist) and evaluate the 'IsRec_best' baseline.

    One ``HINRec_new`` model is built for every LDA topic count in
    ``topic_counts``.  A model whose ``weight_path`` already exists on disk is
    reported and skipped.

    :param a_dataset: object exposing ``train_data`` and ``test_data``
    """
    topic_counts = [50]  # other values that were tried: 10, 25, 75, 100, 125, 150
    shared_settings = dict(
        model_name='IsRec_best',  # alternatives: 'IsRec', 'IsRec_best_modified'
        semantic_mode='TF_IDF',
        epoch_num=20,
        neighbor_size=15,
        topTopicNum=3,
        cluster_mode='LDA',
    )
    # Only the test split is used below; the training split is discarded.
    _, test_data = get_train_test_data(a_dataset.train_data,
                                       a_dataset.test_data)

    for topic_count in topic_counts:
        hin_model = HINRec_new(cluster_mode_topic_num=topic_count,
                               **shared_settings)
        if os.path.exists(hin_model.weight_path):
            print('have trained,return!')
            continue

        # NOTE(review): the file's indentation was lost; training, saving and
        # evaluation are assumed to all belong to the "not yet trained" branch.
        hin_model.train(test_data)
        hin_model.save_model()
        evalute_by_epoch(hin_model,
                         hin_model,
                         hin_model.model_name,
                         test_data,
                         evaluate_by_slt_apiNum=True)
def bl_PasRec(a_dataset):
    """Train (when no saved weights exist) and evaluate the 'PasRec_2path' baseline.

    :param a_dataset: object exposing ``train_data`` and ``test_data``
    """
    # Only the test split is used below; the training split is discarded.
    _, test_data = get_train_test_data(a_dataset.train_data,
                                       a_dataset.test_data)

    # epoch_num used to be 40; per the original note 40 was slightly worse.
    # An LDA-based similarity variant (semantic_mode='LDA', LDA_topic_num in
    # 50/100/150) was also experimented with by the original author.
    pasrec_model = HINRec_new(model_name='PasRec_2path',
                              epoch_num=20,
                              neighbor_size=15,
                              topTopicNum=3)

    if os.path.exists(pasrec_model.weight_path):
        print('have trained,return!')
        return

    # NOTE(review): the file's indentation was lost; training, saving and
    # evaluation are assumed to all belong to the "not yet trained" branch.
    # Per the original comment, the model is tested every 20 epochs during
    # training, which is why train() receives the test data.
    pasrec_model.train(test_data)
    pasrec_model.save_model()
    evalute_by_epoch(pasrec_model,
                     pasrec_model,
                     pasrec_model.model_name,
                     test_data,
                     evaluate_by_slt_apiNum=True)
def bl_IsRec(a_dataset):
    """Train (when no saved weights exist) and evaluate the 'IsRec' baseline.

    A model is built for every (epoch count, top-topic count) combination,
    epoch counts varying slowest.

    :param a_dataset: object exposing ``train_data`` and ``test_data``
    """
    epoch_options = [20]  # other values that were tried: 15, 100, 1000
    top_topic_options = [3]  # other values that were tried: 3, 4, 5, 6
    # Only the test split is used below; the training split is discarded.
    _, test_data = get_train_test_data(a_dataset.train_data,
                                       a_dataset.test_data)

    settings = [(epochs, top_topics)
                for epochs in epoch_options
                for top_topics in top_topic_options]
    for epochs, top_topics in settings:
        isrec_model = HINRec_new(model_name='IsRec',
                                 epoch_num=epochs,
                                 neighbor_size=15,
                                 topTopicNum=top_topics)
        if os.path.exists(isrec_model.weight_path):
            print('have trained,return!')
            continue

        # NOTE(review): the file's indentation was lost; training, saving and
        # evaluation are assumed to all belong to the "not yet trained" branch.
        isrec_model.train(test_data)
        isrec_model.save_model()
        evalute_by_epoch(isrec_model,
                         isrec_model,
                         isrec_model.model_name,
                         test_data,
                         evaluate_by_slt_apiNum=True)
def bl_DHSR(a_dataset):
    """Build, train (or load) and evaluate the DHSR baseline.

    :param a_dataset: object exposing ``train_data`` and ``test_data``
    """
    wrapper = DHSR_model()
    core_model = wrapper.get_model()

    # The original author left a_dataset.transfer() disabled here, with an
    # open question about dropping duplicate samples for the 'newScene' /
    # need_slt_apis=False setting.
    train_data, test_data = get_train_test_data(a_dataset.train_data,
                                                a_dataset.test_data)

    # The training parameters come from the global configuration object.
    train_paras = new_Para.param.train_paras
    core_model = load_preTrained_model(wrapper, core_model, train_data,
                                       test_data, *train_paras)
    wrapper.save_sth()

    evalute_by_epoch(wrapper, core_model, wrapper.model_name, test_data)
def bl_DHSR_new(a_dataset):
    """Train and evaluate one DHSR model per number of already-selected services.

    :param a_dataset: object exposing ``transfer_false_test_DHSR``; it is
        called with ``if_reduct_train=True`` (reduce the training set) and must
        return per-``slt_num`` lists of training and test data
    """
    train_datas, test_datas = a_dataset.transfer_false_test_DHSR(
        if_reduct_train=True)

    # A separate model is trained and evaluated for each selected-service count.
    for position in range(new_Para.param.slt_item_num):
        slt_num = position + 1
        train_data, test_data = train_datas[position], test_datas[position]

        # old_new may be 'new', 'new_sigmoid' or 'new_reduct'; per the original
        # note 'new_reduct' performed best.
        wrapper = DHSR_model(old_new='new_reduct', slt_num=slt_num)
        core_model = load_preTrained_model(wrapper, wrapper.get_model(),
                                           train_data, test_data,
                                           *new_Para.param.train_paras)

        evalute_by_epoch(wrapper,
                         core_model,
                         wrapper.model_name,
                         test_data,
                         evaluate_by_slt_apiNum=True)
        wrapper.save_sth()
        print('DHSR, slt_num:{}, train_predict,done!'.format(slt_num))
def DINRec(a_dataset, new_old='new'):
    """Train (or load) a CI model, stack a DIN_Rec model on it and evaluate.

    :param a_dataset: object exposing ``train_data`` and ``test_data``
    :param new_old: forwarded to ``CI_Model`` (the original note mentions 'old'
        as the other value)
    """
    train_data = a_dataset.train_data
    test_data = a_dataset.test_data
    train_paras = new_Para.param.train_paras

    # Stage 1: the content-interaction model.
    ci_wrapper = CI_Model(new_old)
    ci_wrapper.prepare()
    ci_core = ci_wrapper.get_model()
    # The returned model object is not used afterwards; the call is kept for
    # its training/loading effect on ci_wrapper.
    ci_core = load_preTrained_model(ci_wrapper, ci_core, train_data,
                                    test_data, *train_paras)

    # Stage 2: the DIN-based recommender built on top of the CI model.
    din_wrapper = DIN_Rec(ci_wrapper, new_Para.param.predict_fc_unit_nums)
    din_wrapper.prepare()
    din_core = din_wrapper.get_model()
    din_core = load_preTrained_model(din_wrapper, din_core, train_data,
                                     test_data, *train_paras)

    evalute_by_epoch(din_wrapper,
                     din_core,
                     din_wrapper.simple_name,
                     test_data,
                     if_save_recommend_result=True,
                     evaluate_by_slt_apiNum=True)
def train_save_by_early_stop(recommend_model, model, train_data, test_data):
    """Train with early stopping on validation loss, then evaluate on the test data.

    :param recommend_model: the overall recommendation model (wrapper object)
    :param model: the core model; it is trained directly unless pairwise
        training is enabled, and it is always the one evaluated and returned
    :param train_data: sequence whose last element holds the labels; the
        preceding elements are forwarded to ``recommend_model.get_instances``
    :param test_data: forwarded to ``evalute_by_epoch``
    :return: ``model``
    """
    param = new_Para.param
    use_pairwise = bool(param.pairwise)

    labels = train_data[-1]
    # NOTE(review): test_phase_flag is True exactly when pairwise training is
    # on, which is the opposite polarity to train_save_best_NDCG_model --
    # confirm this is intended.
    instances = recommend_model.get_instances(*train_data[:-1],
                                              test_phase_flag=use_pairwise)

    if use_pairwise:
        train_model = recommend_model.get_pairwise_model()
        # The pairwise model's output is used directly as the loss value.
        loss = lambda y_true, y_pred: y_pred
    else:
        train_model = model
        loss = 'binary_crossentropy'
    train_model.compile(optimizer=recommend_model.optimizer,
                        loss=loss,
                        metrics=['accuracy'])

    stopper = EarlyStopping(monitor='val_loss',
                            patience=10,
                            verbose=2,
                            mode='min')
    hist = train_model.fit(list(instances),
                           labels,
                           epochs=param.num_epochs,
                           batch_size=param.small_batch_size,
                           callbacks=[stopper],
                           validation_split=param.validation_split,
                           shuffle=True)

    # Saving the 'min_loss' weights was disabled by the original author
    # (marked as needing correction).
    model_name = '{}{}_min_loss'.format(recommend_model.get_simple_name(),
                                        recommend_model.get_name())
    save_loss_acc(hist, model_name, if_multi_epoch=True)
    evalute_by_epoch(recommend_model, model, model_name, test_data)
    return model
def test_model(self, test_data):
    """Evaluate this object on ``test_data`` through ``evalute_by_epoch``.

    The object is passed both as the recommendation wrapper and as the core
    model, and its ``model_name`` attribute is used as the label.
    """
    label = self.model_name
    evalute_by_epoch(self, self, label, test_data)
def train_save_best_NDCG_model(recommend_model,
                               model,
                               train_data,
                               test_data,
                               start_epoch=0,
                               true_candidates_dict=None,
                               CI_start_test_epoch=2,
                               earlyStop_epochs=5):
    """Train one epoch at a time, test after each, and keep the best-scoring weights.

    After every tested epoch the value ``epoch_evaluate_result[0][3]`` (treated
    as NDCG@5) is compared with the best value so far; when it is at least as
    good, the weights of ``model`` are saved for that epoch.  Training stops
    early once ``earlyStop_epochs`` epochs pass without improvement.  At the
    end the best epoch and score are written to disk, weight files of all other
    epochs are removed, and the best weights are loaded back into ``model``.

    :param recommend_model: the overall recommendation model (wrapper object)
    :param model: the core model whose weights are saved and reloaded
    :param train_data: sequence whose last element holds the labels; the
        preceding elements are forwarded to ``recommend_model.get_instances``
    :param test_data: forwarded to ``evalute_by_epoch``
    :param start_epoch: number of epochs already trained earlier; training
        continues from this epoch number
    :param true_candidates_dict: forwarded unchanged to ``evalute_by_epoch``
    :param CI_start_test_epoch: first epoch that is tested when
        ``recommend_model`` is a ``CI_Model`` but not an ``NI_Model_online``
    :param earlyStop_epochs: number of epochs without improvement after which
        training stops
    :return: ``model``
    """
    print('training_save_best_NDCG_model...')
    param = new_Para.param
    # Evaluation results keyed by ABSOLUTE epoch number, so that looking up
    # best_epoch stays correct when training resumes with start_epoch > 0.
    results_by_epoch = {}

    train_labels = train_data[-1]
    if param.final_activation == 'softmax':
        # A softmax output layer needs one-hot labels.
        train_labels = utils.to_categorical(train_labels, num_classes=2)

    # With online NI the hidden representations are obtained differently in
    # training and testing, hence the flag (per the original comment).
    test_phase_flag = not (param.pairwise or param.NI_OL_mode == 'OL_GE'
                           or param.train_mashup_best)
    train_instances_tuple = recommend_model.get_instances(
        *train_data[:-1], test_phase_flag=test_phase_flag)

    # Restore the best indicators recorded by an earlier run.
    if start_epoch > 0:
        # NOTE(review): these files are READ from a path formatted with
        # get_simple_name() but WRITTEN below with model_dir -- confirm both
        # resolve to the same file.
        with open(
                dataset.crt_ds.new_best_epoch_path.format(
                    recommend_model.get_simple_name()), 'r') as f:
            best_epoch = int(f.read().strip())
        with open(
                dataset.crt_ds.new_best_NDCG_path.format(
                    recommend_model.get_simple_name()), 'r') as f:
            best_NDCG_5 = float(f.read().strip())
    else:
        best_epoch, best_NDCG_5 = 0, 0

    # Train the pairwise wrapper when pairwise training is enabled.
    train_model = (recommend_model.get_pairwise_model()
                   if param.pairwise else model)

    # The first epoch to test does not change between epochs.  Per the
    # original note, the first CI epochs perform poorly and are not tested.
    if isinstance(recommend_model, CI_Model) and not isinstance(
            recommend_model, NI_Model_online):
        first_test_epoch = CI_start_test_epoch
    else:
        first_test_epoch = 0

    for epoch in range(start_epoch, param.num_epochs):
        if start_epoch == 0 and epoch == 0:
            # A fresh model has to be compiled before its first fit.
            if param.pairwise:
                train_model.compile(optimizer=recommend_model.optimizer,
                                    loss=lambda y_true, y_pred: y_pred,
                                    metrics=['accuracy'])
            else:
                train_model.compile(optimizer=recommend_model.optimizer,
                                    loss='binary_crossentropy',
                                    metrics=['accuracy'])
            print('model compile,done!')
        if start_epoch > 0:
            # Continue from the previously trained model.
            # NOTE(review): as in the original, this runs on every epoch of a
            # resumed run -- confirm that reloading each epoch is intended.
            train_model = load_trained_model(recommend_model, model)
        print('Epoch {}'.format(epoch))

        # A tuple of input arrays is handed to fit() as a list.
        if isinstance(train_instances_tuple, tuple):
            fit_inputs = [*train_instances_tuple]
        else:
            fit_inputs = train_instances_tuple
        hist = train_model.fit(fit_inputs,
                               np.array(train_labels),
                               batch_size=param.batch_size,
                               epochs=1,
                               verbose=2,
                               shuffle=True,
                               validation_split=param.validation_split)
        print('model train,done!')

        # Only the very first epoch carries the full descriptive name
        # (dataset, model name, training settings); later epochs use ''.
        if epoch == 0:
            model_name = (dataset.crt_ds.data_name +
                          recommend_model.get_name() + param.train_name)
        else:
            model_name = ''
        save_loss_acc(hist, model_name, epoch=epoch)

        if not os.path.exists(recommend_model.model_dir):
            os.makedirs(recommend_model.model_dir)

        if epoch < first_test_epoch:
            # Skip testing for now to save time.
            continue
        if epoch == first_test_epoch:
            # Label the timing record of the first tested epoch.
            with open(param.time_path, 'a+') as f1:
                f1.write(recommend_model.get_simple_name())
                f1.write('\n')

        # Per the original note, `model` is an object reference that also
        # reflects updates made through the pairwise model, so it is
        # evaluated directly.
        # NOTE(review): record_time is tied to epoch 1, not first_test_epoch.
        epoch_evaluate_result = evalute_by_epoch(
            recommend_model,
            model,
            model_name,
            test_data,
            record_time=(epoch == 1),
            true_candidates_dict=true_candidates_dict)
        results_by_epoch[epoch] = epoch_evaluate_result

        current_NDCG_5 = epoch_evaluate_result[0][3]
        if current_NDCG_5 >= best_NDCG_5:
            # At least as good as the best so far: remember it and save.
            best_NDCG_5 = current_NDCG_5
            best_epoch = epoch
            model.save_weights(
                dataset.crt_ds.new_model_para_path.format(
                    recommend_model.model_dir, epoch))
        elif epoch - best_epoch >= earlyStop_epochs:
            # No improvement for several epochs: stop early.
            break

    # Persist the best epoch and the best NDCG@5.
    with open(
            dataset.crt_ds.new_best_epoch_path.format(
                recommend_model.model_dir), 'w') as f:
        f.write(str(best_epoch))
    with open(
            dataset.crt_ds.new_best_NDCG_path.format(
                recommend_model.model_dir), 'w') as f:
        f.write(str(best_NDCG_5))
    print('best epoch:{},best NDCG@5:{}'.format(best_epoch, best_NDCG_5))

    # Record the best indicators.  The best epoch may have no result in this
    # run (nothing tested, or the best comes from an earlier run).
    best_result = results_by_epoch.get(best_epoch)
    if best_result is not None:
        csv_table_name = 'best_indicaters\n'
        summary(param.evaluate_path, csv_table_name, best_result,
                param.topKs)
    else:
        print('no evaluation result of epoch {} in this run, '
              'summary skipped'.format(best_epoch))

    # Best effort: delete the weights of non-best epochs and reload the best.
    # A failure is reported instead of being silently swallowed, and the
    # model is still returned.
    try:
        for i in range(param.num_epochs):
            temp_path = dataset.crt_ds.new_model_para_path.format(
                recommend_model.model_dir, i)
            if i != best_epoch and os.path.exists(temp_path):
                os.remove(temp_path)
        model.load_weights(
            dataset.crt_ds.new_model_para_path.format(
                recommend_model.model_dir, best_epoch))
    except Exception as err:
        print('cleaning up / reloading the best weights failed: {!r}'.format(
            err))
    return model