import heapq
import os

import numpy as np

# Project-level helpers used below (data_repository, meta_data, new_Para,
# evalute, evaluate, evalute_by_epoch, summary, save_loss_acc, cos_sim,
# get_default_gd, combinations, MF, CI_Model, simple_DeepFM, data_generator,
# getNum_testData, utils, evaluate_path) are assumed to be imported from the
# surrounding package.


def pop():
    """Popularity baseline: rank each test mashup's candidate APIs by global API popularity."""
    api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(data_repository.get_ds().test_mashup_id_list)):
        candidate_ids = data_repository.get_ds().test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)
        predict_results = [api2pop[api_id] for api_id in candidate_ids]
        all_predict_results.append(predict_results)
    print('pop test, done!')

    evaluate_result = evalute(
        candidate_ids_list, all_predict_results,
        data_repository.get_ds().test_data.get('all_ground_api_ids'),
        data_repository.get_args().topKs)  # evaluate
    csv_table_name = data_repository.get_ds().name + 'pop' + "\n"  # whole_model.name
    summary(evaluate_path, csv_table_name, evaluate_result,
            data_repository.get_args().topKs)  # record results
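# --- Hedged sketch (not part of the original module). The exact contract of
# `meta_data.pd.get_api_co_vecs()` is not shown here; a plausible minimal
# stand-in, assuming popularity is simply the number of training mashups that
# invoke each API:
def _sketch_api2pop(mashup_api_pairs, api_num):
    """mashup_api_pairs: iterable of (mashup_id, api_id) training interactions."""
    from collections import Counter
    counts = Counter(api_id for _, api_id in mashup_api_pairs)
    return {a: counts[a] for a in range(api_num)}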
def MF_baseline(train_datas, test_datas, mode=''):
    # Named MF_baseline to avoid shadowing the MF factorization class
    # instantiated below (with the name `MF` the call would be recursive).
    for slt_num in range(1, data_repository.get_args().slt_item_num + 1):  # train/test splits with different numbers of already-selected APIs
        all_predict_results = []  # scores for every test case (several APIs each) of this split
        test_mashup_id_list, test_api_id_list, grounds = test_datas[slt_num - 1]
        # interface for computing and reading the MF results
        UV_obj = MF(data_repository.get_ds().data_root, mode,
                    train_datas[slt_num - 1], slt_num)
        m_id2index, a_id2index = UV_obj.m_id2index, UV_obj.a_id2index
        for i in range(len(test_mashup_id_list)):
            test_mashup_id = test_mashup_id_list[i][0]  # one mashup id per test case
            predict_results = []
            for test_api_id in test_api_id_list[i]:
                if test_mashup_id not in m_id2index or test_api_id not in a_id2index:
                    dot = 0  # cold start: mashup or API unseen during training
                else:
                    m_embedding = UV_obj.m_embeddings[m_id2index[test_mashup_id]]
                    a_embedding = UV_obj.a_embeddings[a_id2index[test_api_id]]
                    dot = np.dot(m_embedding, a_embedding)
                predict_results.append(dot)
            all_predict_results.append(predict_results)
        print('{}_{} test, done!'.format(mode, slt_num))

        evaluate_result = evalute(test_api_id_list, all_predict_results,
                                  grounds,  # ground truth of this split
                                  data_repository.get_args().topKs)  # evaluate
        csv_table_name = data_repository.get_ds().name + mode + str(slt_num) + "\n"  # whole_model.name
        summary(evaluate_path, csv_table_name, evaluate_result,
                data_repository.get_args().topKs)  # record results
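# --- Hedged sketch: the MF scoring rule above on toy data. Assuming `UV_obj`
# exposes learned latent factors, the predicted mashup-API affinity is just the
# inner product of a mashup row and an API row:
def _sketch_mf_score():
    rng = np.random.default_rng(0)
    m_embeddings = rng.normal(size=(3, 8))  # toy mashup factors, shape (num_mashups, d)
    a_embeddings = rng.normal(size=(5, 8))  # toy API factors, shape (num_apis, d)
    return float(np.dot(m_embeddings[0], a_embeddings[2]))  # score(m=0, a=2)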
def recommend(self, topKs=[5]):
    """
    Recommendation entry point for external callers.
    :param topKs: cutoff list, e.g. NDCG@5 / NDCG@10
    :return:
    """
    self.update_embedding_paras()  # update embedding parameters first, so similarities can be computed
    self.clustering()  # e.g. 10 clusters
    all_indicators = []  # dim 1: mashups; dim 2: top5/top10; dim 3: each metric
    test_m_num = len(data_repository.get_ds().test_mashup_id_list)
    for i in range(test_m_num):
        a_mashup_indicators = []
        test_m_id = data_repository.get_ds().test_mashup_id_list[i][0]  # one mashup id per test case
        query_feature = self.m_features[test_m_id]  # for each test mashup/query
        # prune clusters and pick the most similar ones; a few APIs are chosen per cluster
        self.cls_deduction(query_feature, a_wight=0.6, b_wight=0.15, topK=5)
        for topK in topKs:  # NDCG@5 or NDCG@10
            # combine the topK most similar clusters into candidate API sets
            candidate_results = combinations([
                self.deduct_cls[cls_index]
                for cls_index in self.sorted_cls_index[:topK]
            ]).get_results()
            index2utility = {
                index: self.score_set(candidate_results[index])
                for index in range(len(candidate_results))
            }
            sorted_index = sorted(range(len(candidate_results)),
                                  key=lambda index: index2utility[index],
                                  reverse=True)
            # final recommendation (only the single best set is used here?)
            max_k_candidates = candidate_results[sorted_index[0]]
            # print(max_k_candidates)
            # evaluate: five indicators; K matters for NDCG etc.
            a_mashup_indicators.append(
                list(evaluate(max_k_candidates,
                              data_repository.get_ds().grounds[i], topK)))
        all_indicators.append(a_mashup_indicators)
    all_indicators = np.average(all_indicators, axis=0)
    summary(new_Para.param.evaluate_path,
            'set_rec_{}_clustingTs:{}_clsNum:{}'.format(
                self.cluster_method, self.cluster_ts, self.cls_num),
            all_indicators, topKs)  # model-distinguishing parameters are appended to the name
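# --- Hedged sketch (assumption): the project's `combinations(...)` helper is
# not shown in this file. Enumerating candidate sets with one API drawn from
# each selected cluster could look like this:
def _sketch_cluster_combinations(clusters):
    """clusters: list of lists of api ids, one inner list per selected cluster."""
    import itertools
    return [list(combo) for combo in itertools.product(*clusters)]
# _sketch_cluster_combinations([[1, 2], [7]]) -> [[1, 7], [2, 7]]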
def hdp_pop(if_pop=True):
    """HDP text-feature baseline, optionally weighted by API popularity."""
    root = os.path.join(data_repository.get_ds().data_root, 'baselines')
    if not os.path.exists(root):
        os.makedirs(root)
    mashup_hdp_path = os.path.join(root, 'mashup_HDP.txt')  # ...
    api_hdp_path = os.path.join(root, 'api_HDP.txt')

    _mashup_hdp_features = np.loadtxt(mashup_hdp_path)
    _api_hdp_features = np.loadtxt(api_hdp_path)
    if if_pop:
        api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()

    # test
    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(data_repository.get_ds().test_mashup_id_list)):
        test_mashup_id = data_repository.get_ds().test_mashup_id_list[i][0]  # one mashup id per test case
        candidate_ids = data_repository.get_ds().test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)
        predict_results = []
        for api_id in candidate_ids:
            sim_score = cos_sim(_mashup_hdp_features[test_mashup_id],
                                _api_hdp_features[api_id])
            if if_pop:
                sim_score *= api2pop[api_id]
            predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('hdp_pop test, done!')

    evaluate_result = evalute(
        candidate_ids_list, all_predict_results,
        data_repository.get_ds().test_data.get('all_ground_api_ids'),
        data_repository.get_args().topKs)  # evaluate
    name = 'hdp_pop' if if_pop else 'hdp'
    csv_table_name = data_repository.get_ds().name + name + "\n"  # whole_model.name
    summary(evaluate_path, csv_table_name, evaluate_result,
            data_repository.get_args().topKs)  # record results
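# --- Hedged sketch (assumption): `cos_sim` is a project helper not defined in
# this file. A minimal NumPy version consistent with how it is called here:
def _sketch_cos_sim(a, b, eps=1e-12):
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + eps))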
def binary_keyword(if_pop=False):
    """WVSM (Weighted Vector Space Model) baseline over binary keyword vectors."""
    api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    gd = get_default_gd()
    mashup_binary_matrix, api_binary_matrix, mashup_words_list, api_words_list = gd.get_binary_v()

    # test WVSM
    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(data_repository.get_ds().test_mashup_id_list)):
        test_mashup_id = data_repository.get_ds().test_mashup_id_list[i][0]  # one mashup id per test case
        candidate_ids = data_repository.get_ds().test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)
        predict_results = []
        for api_id in candidate_ids:
            # without popularity, this tests the effect of the feature vectors alone
            sim_score = cos_sim(mashup_binary_matrix[test_mashup_id],
                                api_binary_matrix[api_id])
            if if_pop:
                sim_score *= api2pop[api_id]
            predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('WVSM test, done!')

    evaluate_result = evalute(
        candidate_ids_list, all_predict_results,
        data_repository.get_ds().test_data.get('all_ground_api_ids'),
        data_repository.get_args().topKs)  # evaluate
    name = 'WVSM_pop' if if_pop else 'WVSM'
    csv_table_name = data_repository.get_ds().name + name + "\n"  # whole_model.name
    summary(evaluate_path, csv_table_name, evaluate_result,
            data_repository.get_args().topKs)  # record results
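# --- Hedged sketch (assumption): `gd.get_binary_v()` is a project helper; the
# binary keyword matrices it returns could be approximated like this:
def _sketch_binary_matrix(docs):
    """docs: list of raw description strings; returns a 0/1 term-document matrix."""
    from sklearn.feature_extraction.text import CountVectorizer
    vec = CountVectorizer(binary=True)
    return vec.fit_transform(docs).toarray()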
def TF_IDF(if_pop):
    """
    Could be folded into the Samanta class, but that would be messy and is unnecessary.
    :return:
    """
    gd = get_default_gd()
    api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    _mashup_IFIDF_features, _api_IFIDF_features = gd.model_pcs('TF_IDF')

    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(data_repository.get_ds().test_mashup_id_list)):
        test_mashup_id = data_repository.get_ds().test_mashup_id_list[i][0]  # one mashup id per test case
        candidate_ids = data_repository.get_ds().test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)
        predict_results = []
        for api_id in candidate_ids:
            sim_score = cos_sim(_mashup_IFIDF_features[test_mashup_id],
                                _api_IFIDF_features[api_id])
            # score each candidate exactly once (the original code appended
            # sim_score a second time, doubling the score list's length)
            if if_pop:
                predict_results.append(sim_score * api2pop[api_id])
            else:
                predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('TF_IDF test, done!')

    name = 'TFIDF_pop' if if_pop else 'TFIDF'
    evaluate_result = evalute(
        candidate_ids_list, all_predict_results,
        data_repository.get_ds().test_data.get('all_ground_api_ids'),
        data_repository.get_args().topKs)  # evaluate
    csv_table_name = data_repository.get_ds().name + name + "\n"  # whole_model.name
    summary(evaluate_path, csv_table_name, evaluate_result,
            data_repository.get_args().topKs)  # record results
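# --- Hedged sketch (assumption): `gd.model_pcs('TF_IDF')` is a project helper;
# comparable TF-IDF features over a shared vocabulary could be built with
# scikit-learn:
def _sketch_tfidf_features(mashup_docs, api_docs):
    """Both doc lists are raw strings; fit one vocabulary so vectors are comparable."""
    from sklearn.feature_extraction.text import TfidfVectorizer
    vec = TfidfVectorizer()
    vec.fit(mashup_docs + api_docs)
    return vec.transform(mashup_docs).toarray(), vec.transform(api_docs).toarray()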
def train_best_NDCG_model(recommend_model, model, train_data, test_data,
                          true_candidates_dict=None, CI_start_test_epoch=0,
                          earlyStop_epochs=5):
    """
    Train for several epochs, evaluating after each one; keep and return the
    model whose final metrics (NDCG etc.) are best.
    :param recommend_model: the overall recommendation model
    :param model: model_core
    :param train_data:
    :param test_data:
    :param true_candidates_dict:
    :return:
    """
    print('training_save_best_NDCG_model...')
    epoch_evaluate_results = []
    # model
    train_model = recommend_model.get_pairwise_model() \
        if data_repository.get_args().pairwise else model
    # data
    train_instances_dict = recommend_model.get_instances(
        train_data,
        pairwise_train_phase_flag=data_repository.get_args().pairwise)
    train_labels = train_data.get('label')
    if data_repository.get_args().final_activation == 'softmax':
        # transform the labels for softmax
        train_labels = utils.to_categorical(train_labels, num_classes=2)

    best_epoch, best_NDCG_5 = 0, 0
    for epoch in range(data_repository.get_args().num_epochs):
        if epoch == 0:  # compile before the first epoch
            # loss_ = lambda y_true, y_pred: y_pred if data_repository.get_args().pairwise else 'binary_crossentropy'
            # train_model.compile(optimizer=recommend_model.optimizer, loss=loss_, metrics=['accuracy'])
            train_model.compile(optimizer=recommend_model.optimizer,
                                loss='binary_crossentropy',
                                metrics=['accuracy'])
            print('whole_model compile, done!')
        print('Epoch {}'.format(epoch))
        hist = train_model.fit(
            train_instances_dict, np.array(train_labels),
            batch_size=data_repository.get_args().batch_size,
            epochs=1, verbose=1, shuffle=True,
            validation_split=data_repository.get_args().validation_split)
        print('Epoch {}, train done!'.format(epoch))

        # record the dataset, model architecture and training settings
        record_name = (recommend_model.get_name()
                       + data_repository.get_args().train_name) if epoch == 0 else ''
        # record test-set performance, written to evalute.csv
        save_loss_acc(hist, record_name, epoch=epoch)  # logged every epoch

        # a CI model performs poorly in its first few epochs, so skip testing
        # them to save time
        first_test_epoch = CI_start_test_epoch \
            if isinstance(recommend_model, CI_Model) else 0
        if epoch < first_test_epoch:
            epoch_evaluate_results.append(None)
            continue

        # per-epoch evaluation
        epoch_evaluate_result = evalute_by_epoch(
            recommend_model, model, record_name, test_data,
            record_time=True if epoch == 0 else False,
            true_candidates_dict=true_candidates_dict)
        epoch_evaluate_results.append(epoch_evaluate_result)

        # save model parameters only when NDCG@5 improves  TODO
        if epoch_evaluate_result[0][3] >= best_NDCG_5:
            best_NDCG_5 = epoch_evaluate_result[0][3]
            best_epoch = epoch
            model.save_weights(
                data_repository.get_ds().new_model_para_path.format(
                    recommend_model.model_dir, epoch))
        elif epoch - best_epoch >= earlyStop_epochs:
            # no improvement for several epochs: stop early
            break

    # record the best epoch and the best NDCG@5
    with open(data_repository.get_ds().new_best_epoch_path.format(
            recommend_model.model_dir), 'w') as f:
        f.write(str(best_epoch))
    with open(data_repository.get_ds().new_best_NDCG_path.format(
            recommend_model.model_dir), 'w') as f:
        f.write(str(best_NDCG_5))
    print('best epoch:{}, best NDCG@5:{}'.format(best_epoch, best_NDCG_5))

    # record the best metrics
    csv_table_name = 'best_indicaters\n'
    summary(evaluate_path, csv_table_name,
            epoch_evaluate_results[best_epoch],
            data_repository.get_args().topKs)

    # check whether the word-embedding matrix changed, especially padding's zeros:
    # print('some embedding parameters after {} epoch:'.format(epoch))
    # print(recommend_model.embedding_layer.get_weights()[0][:2])

    # delete the stored parameters of every non-best epoch, then reload the best
    try:
        for i in range(data_repository.get_args().num_epochs):
            temp_path = data_repository.get_ds().new_model_para_path.format(
                recommend_model.model_dir, i)
            if i != best_epoch and os.path.exists(temp_path):
                os.remove(temp_path)
        model.load_weights(data_repository.get_ds().new_model_para_path.format(
            recommend_model.model_dir, best_epoch))
    finally:
        # return the model even if cleanup fails (note that `return` inside
        # `finally` also suppresses any exception raised in the `try` block)
        return model
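# --- Hedged sketch (assumption): a typical call site for the trainer above.
# `build_recommend_model`, `load_train_data` and `load_test_data` are
# hypothetical helpers, not part of this module:
# recommend_model = build_recommend_model(data_repository.get_args())
# model = recommend_model.get_model()
# train_data, test_data = load_train_data(), load_test_data()
# best_model = train_best_NDCG_model(recommend_model, model, train_data, test_data)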
def run_new_deepFM(CI_feas, NI_feas, train_data, test_data, all_api_num,
                   epoch_num=10):
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    # session = tf.Session(config=config)
    # graph = tf.get_default_graph()
    # set_session(session)

    model = simple_DeepFM(CI_feature_num=4, NI_feature_num=2,
                          CI_feature_dim=50, NI_feature_dim=25,
                          final_feature_dim=32, task='binary', use_fm=True,
                          l2_reg_linear=0, dnn_hidden_units=[])
    model.compile("adam", "binary_crossentropy",
                  metrics=['binary_crossentropy'])
    print('build simple_DeepFM, done!')

    batch_size = 32
    len_train = len(train_data[0])
    mashup_texts_features, mashup_tag_features, api_texts_features, api_tag_features = CI_feas
    mashup_NI_features, api_NI_features = NI_feas
    features = [
        mashup_texts_features, mashup_tag_features, api_texts_features,
        api_tag_features, mashup_NI_features, api_NI_features
    ]
    train_generator = data_generator(train_data, *features, bs=batch_size,
                                     all_api_num=all_api_num, mode="train")
    print('generate train_generator, done!')

    # evaluate after every training epoch
    num_test_instances = getNum_testData(test_data)
    for i in range(epoch_num):
        history = model.fit_generator(train_generator,
                                      steps_per_epoch=len_train // batch_size,
                                      epochs=1, verbose=2)
        test_generator = data_generator(test_data, *features, bs=batch_size,
                                        all_api_num=all_api_num, mode="test")
        print('generate test_generator, done!')
        predictions = model.predict_generator(
            test_generator, steps=num_test_instances // batch_size + 1)[:, 1]
        print(predictions.shape)

        # regroup the flat prediction vector by mashup for evaluation
        reshaped_predictions = []
        test_api_id_list, grounds = test_data[1], test_data[-1]
        index = 0
        for test_api_ids in test_api_id_list:
            size = len(test_api_ids)  # number of candidate APIs for this mashup
            reshaped_predictions.append(
                predictions[index:index + size])  # min(index + size, len(predictions))
            index += size
        print(index)
        evaluate_result = evalute(test_api_id_list, reshaped_predictions,
                                  grounds, new_Para.param.topKs)  # evaluate
        summary(new_Para.param.evaluate_path, 'deepFM_epoch_{}'.format(i),
                evaluate_result, new_Para.param.topKs)
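# --- Hedged sketch (assumption): the project's `data_generator` is defined
# elsewhere and takes six feature tables; this simplified two-table version
# only illustrates the Keras-generator pattern used above:
def _sketch_feature_generator(pairs, labels, m_feats, a_feats, bs=32):
    """pairs: list of (mashup_id, api_id); yields (inputs, labels) batches forever."""
    while True:
        for start in range(0, len(pairs), bs):
            batch = pairs[start:start + bs]
            m_x = np.array([m_feats[m] for m, _ in batch])  # mashup-side features
            a_x = np.array([a_feats[a] for _, a in batch])  # API-side features
            y = np.array(labels[start:start + bs])
            yield [m_x, a_x], y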
def Samanta(topK, if_pop=2, MF_mode='node2vec', pop_mode='', text_mode='HDP',
            LDA_topic_num=None):
    """
    :param topK: represent a new query's MF feature via its K nearest neighbours
    :param if_pop: how popularity is used: 0 not used; 1 re-ranking only; 2 rank by the full product
    :param MF_mode: for simplicity, node2vec is used directly
    :param pop_mode: whether popularity is squashed into [0, 1] with a sigmoid
    :param text_mode: which feature-extraction method to use, LDA or HDP
    :return:
    """
    api2pop = None
    if if_pop:
        api_co_vecs, api2pop = data_repository.get_md().get_api_co_vecs(pop_mode)  # TODO

    root = os.path.join(data_repository.get_ds().data_root, 'baselines')
    if not os.path.exists(root):
        os.makedirs(root)
    mashup_feature_path = os.path.join(root, 'mashup_{}.txt'.format(text_mode))  # ...
    api_feature_path = os.path.join(root, 'api_{}.txt'.format(text_mode))

    # obtain mashup/api text features (e.g. HDP), cached on disk
    if not os.path.exists(api_feature_path):
        gd = get_default_gd()
        _mashup_features, _api_features = gd.model_pcs(text_mode, LDA_topic_num)
        np.savetxt(mashup_feature_path, _mashup_features)
        np.savetxt(api_feature_path, _api_features)
    else:
        _mashup_features = np.loadtxt(mashup_feature_path)
        _api_features = np.loadtxt(api_feature_path)

    candidate_ids_list = []
    all_predict_results = []
    test_data = data_repository.get_ds().test_data
    test_mashup_num = len(test_data.get('mashup'))
    mashup_emb_df = data_repository.get_ds().MF_obj.mashup_emb_df
    api_emb_df = data_repository.get_ds().MF_obj.api_emb_df

    for i in range(test_mashup_num):
        test_m_id = test_data.get('mashup')[i][0]  # one mashup id per test case
        candidate_ids = test_data.get('api')[i]
        candidate_ids_list.append(candidate_ids)

        # represent the query by a similarity-weighted sum of its neighbours'
        # latent factors
        mid2sim = {}
        for train_m_id in mashup_emb_df.index.tolist():
            mid2sim[train_m_id] = cos_sim(_mashup_features[test_m_id],
                                          _mashup_features[train_m_id])  # TODO
        topK_ids, topK_sims = zip(*(sorted(mid2sim.items(),
                                           key=lambda x: x[1],
                                           reverse=True)[:topK]))
        topK_sims = np.array(topK_sims) / sum(topK_sims)  # normalize the similarities
        cf_feature = np.zeros((data_repository.get_args().implict_feat_dim,))
        for z in range(len(topK_ids)):
            cf_feature += topK_sims[z] * mashup_emb_df['embedding'][topK_ids[z]]

        # score every candidate API
        predict_results = []
        temp_predict_results = []  # helper list when popularity is used for re-ranking
        api_zeros = np.zeros((data_repository.get_args().implict_feat_dim))
        api_ids = set(api_emb_df.index.tolist())
        for api_id in candidate_ids:
            # a test-set API may never appear in training, hence the zero fallback
            api_i_feature = api_emb_df['embedding'][api_id] \
                if api_id in api_ids else api_zeros
            cf_score = np.sum(np.multiply(api_i_feature, cf_feature))  # inner product of mashup and API latent factors
            sim_score = cos_sim(_mashup_features[test_m_id],
                                _api_features[api_id])  # cosine similarity of the text features
            if if_pop == 1:
                temp_predict_results.append((api_id, cf_score * sim_score))
            elif if_pop == 0:
                predict_results.append(cf_score * sim_score)
            elif if_pop == 2:
                predict_results.append(cf_score * sim_score * api2pop[api_id])
        if if_pop == 1:
            # rank once by the product, then re-rank the topK by popularity
            max_k_pairs = heapq.nlargest(topK, temp_predict_results,
                                         key=lambda x: x[1])
            max_k_candidates, _ = zip(*max_k_pairs)
            max_k_candidates = set(max_k_candidates)
            predict_results = [
                api2pop[api_id] if api_id in max_k_candidates else -1
                for api_id in candidate_ids
            ]
        all_predict_results.append(predict_results)
    print('Samanta test, done!')

    evaluate_result = evalute(
        candidate_ids_list, all_predict_results,
        data_repository.get_ds().test_data.get('all_ground_api_ids'),
        data_repository.get_args().topKs)  # evaluate
    _name = '_pop_{}'.format(if_pop)
    _name += data_repository.get_args().mf_mode
    csv_table_name = data_repository.get_ds().name \
        + 'Samanta_model_{}'.format(topK) + _name + "\n"  # whole_model.name
    summary(evaluate_path, csv_table_name, evaluate_result,
            data_repository.get_args().topKs)  # record results

    def divide(slt_apiNum):
        test_api_id_list_, predictions_, grounds_ = [], [], []
        for i in range(test_mashup_num):
            if len(data_repository.get_ds().slt_api_ids_instances[i]) == slt_apiNum:
                test_api_id_list_.append(candidate_ids_list[i])
                predictions_.append(all_predict_results[i])
                grounds_.append(data_repository.get_ds().test_data.get(
                    'all_ground_api_ids')[i])
        return test_api_id_list_, predictions_, grounds_

    if data_repository.get_args().data_mode == 'newScene':
        # evaluate separately by the number of already-selected APIs (1..3)
        for slt_apiNum in range(3):
            test_api_id_list_, predictions_, grounds_ = divide(slt_apiNum + 1)
            evaluate_result = evalute(test_api_id_list_, predictions_, grounds_,
                                      data_repository.get_args().topKs)
            summary(evaluate_path, str(slt_apiNum + 1) + '_' + csv_table_name,
                    evaluate_result, data_repository.get_args().topKs)
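# --- Hedged sketch: the core Samanta step on toy data — represent an unseen
# query as a similarity-weighted average of its K nearest training mashups'
# latent factors (reuses the `_sketch_cos_sim` helper sketched earlier):
def _sketch_knn_cf_feature(query_text_vec, train_text_vecs, train_latents, k=2):
    """train_text_vecs: (n, t) text features; train_latents: (n, d) MF factors."""
    sims = np.array([_sketch_cos_sim(query_text_vec, v) for v in train_text_vecs])
    top = np.argsort(sims)[::-1][:k]          # indices of the k most similar mashups
    weights = sims[top] / sims[top].sum()     # normalized similarity weights
    return np.sum(weights[:, None] * np.asarray(train_latents)[top], axis=0)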