def binary_keyword(if_pop=False):
    """WVSM baseline (Weighted Vector Space Model).

    Ranks each test mashup's candidate APIs by cosine similarity of binary
    keyword vectors, optionally weighted by API popularity.

    :param if_pop: if True, multiply each similarity by the API's popularity
        (the WVSM_pop variant).
    :return: None; evaluation results are written via summary().
    """
    # api2pop maps api id -> popularity; only needed for the _pop variant,
    # but the original always loads it, so that behavior is kept.
    api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    gd = get_default_gd()
    mashup_binary_matrix, api_binary_matrix, mashup_words_list, api_words_list = gd.get_binary_v()

    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(dataset.crt_ds.test_mashup_id_list)):
        test_mashup_id = dataset.crt_ds.test_mashup_id_list[i][0]  # one mashup id per test case
        candidate_ids = dataset.crt_ds.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)
        predict_results = []
        for api_id in candidate_ids:
            # FIX: the cosine similarity was computed in both branches of the
            # if/else; hoist it and apply the popularity weight conditionally.
            sim_score = cos_sim(mashup_binary_matrix[test_mashup_id],
                                api_binary_matrix[api_id])
            if if_pop:
                sim_score *= api2pop[api_id]
            predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('WVSM test,done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              dataset.crt_ds.grounds, new_Para.param.topKs)  # evaluate
    name = 'WVSM_pop' if if_pop else 'WVSM'
    csv_table_name = dataset.crt_ds.data_name + name + "\n"  # model.name
    summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result,
            new_Para.param.topKs)  # record


# FIX: the legacy Para-based implementation below was kept in a module-level
# triple-quoted string whose closing quotes were missing, leaving the string
# unterminated; it is properly closed here. Retained for reference only
# (it also contains the WJaccard variant, not implemented elsewhere).
"""
def binary_keyword(text_tag_recommend_model):
    all_mashup_num = len(text_tag_recommend_model.pd.get_mashup_api_index2name('mashup'))
    all_api_num = len(text_tag_recommend_model.pd.get_mashup_api_index2name('api'))
    api_co_vecs, api2pop = text_tag_recommend_model.pd.get_api_co_vecs()
    gd = gensim_data(*text_tag_recommend_model.get_instances(
        [i for i in range(all_mashup_num)], [i for i in range(all_api_num)], False))
    mashup_binary_matrix, api_binary_matrix, mashup_words_list, api_words_list = gd.get_binary_v(
        all_mashup_num, all_api_num)

    # WVSM (Weighted Vector Space Model)
    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(Para.test_mashup_id_list)):
        test_mashup_id = Para.test_mashup_id_list[i][0]
        candidate_ids = Para.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)
        predict_results = []
        for api_id in candidate_ids:
            sim_score = cos_sim(mashup_binary_matrix[test_mashup_id],
                                api_binary_matrix[api_id]) * api2pop[api_id]
            predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('WVSM test,done!')
    evaluate_result = evalute(candidate_ids_list, all_predict_results, Para.grounds, Para.topKs)
    csv_table_name = Para.data_name + 'WVSM' + "\n"
    summary(Para.evaluate_path, csv_table_name, evaluate_result, Para.topKs)

    # WJaccard (Weighted Jaccard)
    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(Para.test_mashup_id_list)):
        test_mashup_id = Para.test_mashup_id_list[i][0]
        candidate_ids = Para.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)
        predict_results = []
        for api_id in candidate_ids:
            mashup_set = set(mashup_words_list[test_mashup_id])
            api_set = set(api_words_list[api_id])
            sim_score = 1.0 * len(mashup_set.intersection(api_set)) / len(
                mashup_set.union(api_set)) * api2pop[api_id]
            predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('WJaccard test,done!')
    evaluate_result = evalute(candidate_ids_list, all_predict_results, Para.grounds, Para.topKs)
    csv_table_name = Para.data_name + 'WJaccard' + "\n"
    summary(Para.evaluate_path, csv_table_name, evaluate_result, Para.topKs)
"""
def TF_IDF(if_pop):
    """TF-IDF baseline: score each candidate API by cosine similarity of
    TF-IDF features, optionally weighted by API popularity.

    (Could be folded into the Samanta class, but kept separate for clarity.)

    :param if_pop: if truthy, multiply each similarity by API popularity
        (TFIDF_pop variant).
    :return: None; evaluation results are written via summary().
    """
    gd = get_default_gd()
    api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    _mashup_IFIDF_features, _api_IFIDF_features = gd.model_pcs('TF_IDF')
    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(dataset.crt_ds.test_mashup_id_list)):
        test_mashup_id = dataset.crt_ds.test_mashup_id_list[i][0]  # one mashup id per test case
        candidate_ids = dataset.crt_ds.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)
        predict_results = []
        for api_id in candidate_ids:
            sim_score = cos_sim(_mashup_IFIDF_features[test_mashup_id],
                                _api_IFIDF_features[api_id])
            # BUG FIX: the original appended sim_score a second time after the
            # if/else, so predict_results was twice as long as candidate_ids
            # and misaligned with the candidate order during evaluation.
            if if_pop:
                predict_results.append(sim_score * api2pop[api_id])
            else:
                predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('TF_IDF test,done!')
    name = 'TFIDF_pop' if if_pop else 'TFIDF'
    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              dataset.crt_ds.grounds, new_Para.param.topKs)  # evaluate
    csv_table_name = dataset.crt_ds.data_name + name + "\n"  # model.name
    summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result,
            new_Para.param.topKs)  # record
def hdp_pop(if_pop=True):
    """HDP baseline: rank candidate APIs by cosine similarity of precomputed
    HDP topic features loaded from disk, optionally weighted by popularity.

    :param if_pop: if True, multiply each similarity by API popularity.
    :return: None; evaluation results are written via summary().
    """
    baseline_dir = os.path.join(dataset.crt_ds.root_path, 'baselines')
    if not os.path.exists(baseline_dir):
        os.makedirs(baseline_dir)
    mashup_hdp_path = os.path.join(baseline_dir, 'mashup_HDP.txt')
    api_hdp_path = os.path.join(baseline_dir, 'api_HDP.txt')
    mashup_feats = np.loadtxt(mashup_hdp_path)
    api_feats = np.loadtxt(api_hdp_path)

    if if_pop:
        _api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()

    candidate_ids_list = []
    all_predict_results = []
    # Walk the paired (mashup, candidate-apis) test lists in lockstep.
    for mashup_entry, candidate_ids in zip(dataset.crt_ds.test_mashup_id_list,
                                           dataset.crt_ds.test_api_id_list):
        mashup_id = mashup_entry[0]  # one mashup id per test case
        candidate_ids_list.append(candidate_ids)
        scores = []
        for api_id in candidate_ids:
            score = cos_sim(mashup_feats[mashup_id], api_feats[api_id])
            if if_pop:
                score *= api2pop[api_id]
            scores.append(score)
        all_predict_results.append(scores)
    print('hdp_pop test,done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              dataset.crt_ds.grounds, new_Para.param.topKs)
    name = 'hdp_pop' if if_pop else 'hdp'
    csv_table_name = dataset.crt_ds.data_name + name + "\n"
    summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result,
            new_Para.param.topKs)
def recommend(self, topKs=None):
    """Public recommendation entry point.

    Updates embedding parameters, clusters the items, then for each test
    mashup prunes/picks the nearest clusters, enumerates candidate API sets,
    and evaluates the highest-utility set.

    :param topKs: iterable of K values (e.g. NDCG@5 / @10). Defaults to [5].
        FIX: the original used a mutable default argument ``topKs=[5]``;
        replaced with a None sentinel (behavior unchanged for callers).
    :return: None; averaged indicators are written via summary().
    """
    if topKs is None:
        topKs = [5]
    self.update_embedding_paras()  # refresh embedding params first so sims can be computed
    self.clustering()  # e.g. 10 clusters
    all_indicators = []  # dim0: mashup; dim1: top5/top10; dim2: each metric
    test_m_num = len(dataset.crt_ds.test_mashup_id_list)
    for i in range(test_m_num):
        a_mashup_indicators = []
        test_m_id = dataset.crt_ds.test_mashup_id_list[i][0]  # one mashup id per test case
        query_feature = self.m_features[test_m_id]  # feature of this test mashup/query
        # Prune and pick the most similar clusters; a few candidates per cluster.
        self.cls_deduction(query_feature, a_wight=0.6, b_wight=0.15, topK=5)
        for topK in topKs:  # NDCG@5 or NDCG@10
            candidate_results = combinations([
                self.deduct_cls[cls_index]
                for cls_index in self.sorted_cls_index[:topK]
            ]).get_results()  # combine the topK clusters
            index2utity = {}
            for index in range(len(candidate_results)):
                index2utity[index] = self.score_set(candidate_results[index])
            sorted_index = sorted(
                [i for i in range(len(candidate_results))],
                key=lambda index: index2utity[index],
                reverse=True)  # final ranking by set utility
            max_k_candidates = candidate_results[sorted_index[0]]  # only one set is used?
            # print(max_k_candidates)
            # evaluate() yields five indicators; K matters for NDCG etc.
            a_mashup_indicators.append(
                list(evaluate(max_k_candidates, dataset.crt_ds.grounds[i], topK)))
        all_indicators.append(a_mashup_indicators)
    all_indicators = np.average(all_indicators, axis=0)
    summary(new_Para.param.evaluate_path,
            'set_rec_{}_clustingTs:{}_clsNum:{}'.format(
                self.cluster_method, self.cluster_ts, self.cls_num),
            all_indicators, topKs)  # include model-distinguishing params in the name
def pop():
    """Popularity-only baseline: each candidate API is scored purely by its
    co-occurrence popularity.

    :return: None; evaluation results are written via summary().
    """
    _api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    candidate_ids_list = []
    all_predict_results = []
    num_cases = len(dataset.crt_ds.test_mashup_id_list)
    for idx in range(num_cases):
        _mashup_id = dataset.crt_ds.test_mashup_id_list[idx][0]  # kept for parity; unused
        apis = dataset.crt_ds.test_api_id_list[idx]
        candidate_ids_list.append(apis)
        # Score is the popularity itself.
        all_predict_results.append([api2pop[api_id] for api_id in apis])
    print('pop test,done!')
    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              dataset.crt_ds.grounds, new_Para.param.topKs)
    csv_table_name = dataset.crt_ds.data_name + 'pop' + "\n"
    summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result,
            new_Para.param.topKs)
def TF_IDF(text_tag_recommend_model):
    """TF-IDF baseline (legacy variant driven by the Para globals).

    Could be folded into the Samanta class, but kept separate to avoid clutter.
    NOTE(review): this module defines ``TF_IDF`` twice; whichever definition
    comes later shadows the earlier one at import time — confirm which is intended.

    :return: None; evaluation results are written via summary().
    """
    index2name = text_tag_recommend_model.pd.get_mashup_api_index2name
    all_mashup_num = len(index2name('mashup'))
    all_api_num = len(index2name('api'))
    gd = gensim_data(*text_tag_recommend_model.get_instances(
        [i for i in range(all_mashup_num)],
        [i for i in range(all_api_num)], False))
    mashup_feats, api_feats = gd.model_pcs('TF_IDF', all_mashup_num, all_api_num)

    candidate_ids_list = []
    all_predict_results = []
    for mashup_entry, candidate_ids in zip(Para.test_mashup_id_list,
                                           Para.test_api_id_list):
        mashup_id = mashup_entry[0]  # one mashup id per test case
        candidate_ids_list.append(candidate_ids)
        all_predict_results.append([
            cos_sim(mashup_feats[mashup_id], api_feats[api_id])
            for api_id in candidate_ids
        ])
    print('TF_IDF test,done!')
    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              Para.grounds, Para.topKs)
    csv_table_name = Para.data_name + 'TF_IDF' + "\n"
    summary(Para.evaluate_path, csv_table_name, evaluate_result, Para.topKs)
def MF(train_datas, test_datas, mode=''):
    """Matrix-factorization baseline: score (mashup, api) pairs by the dot
    product of their MF embeddings, separately for each count of
    already-selected APIs (slt_num).

    :param train_datas: per-slt_num training splits, forwarded to get_UV.
    :param test_datas: per-slt_num tuples
        (test_mashup_id_list, test_api_id_list, grounds).
    :param mode: MF flavor name, forwarded to get_UV and used in the record name.
    :return: None; evaluation results are written via summary().
    """
    for slt_num in range(1, new_Para.param.slt_item_num + 1):  # one split per selected-API count
        test_mashup_id_list, test_api_id_list, grounds = test_datas[slt_num - 1]
        # BUG FIX: the accumulator was initialized once before this loop, so
        # predictions of earlier splits leaked into later evaluations and the
        # list length no longer matched test_api_id_list. Reset it per split.
        all_predict_results = []
        # Interface for computing/reading the MF result for this split.
        UV_obj = get_UV(dataset.crt_ds.root_path, mode, train_datas[slt_num - 1], slt_num)
        m_id2index, a_id2index = UV_obj.m_id2index, UV_obj.a_id2index
        for i in range(len(test_mashup_id_list)):
            test_mashup_id = test_mashup_id_list[i][0]  # one mashup id per test case
            predict_results = []
            for test_api_id in test_api_id_list[i]:
                if test_mashup_id not in m_id2index or test_api_id not in a_id2index:
                    dot = 0  # mashup/api unseen in training: no embedding available
                else:
                    m_embedding = UV_obj.m_embeddings[m_id2index[test_mashup_id]]
                    a_embedding = UV_obj.a_embeddings[a_id2index[test_api_id]]
                    dot = np.dot(m_embedding, a_embedding)
                predict_results.append(dot)
            all_predict_results.append(predict_results)
        print('{}_{} test,done!'.format(mode, slt_num))
        # BUG FIX: evaluate against this split's unpacked `grounds` (previously
        # the unrelated global dataset.crt_ds.grounds was passed, and the local
        # `grounds` went unused).
        evaluate_result = evalute(test_api_id_list, all_predict_results,
                                  grounds, new_Para.param.topKs)  # evaluate
        csv_table_name = dataset.crt_ds.data_name + mode + str(slt_num) + "\n"  # model.name
        summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result,
                new_Para.param.topKs)  # record
def train_save_best_NDCG_model(recommend_model, model, train_data, test_data,
                               start_epoch=0, true_candidates_dict=None,
                               CI_start_test_epoch=2, earlyStop_epochs=5):
    """Train for multiple epochs, test after each, and return the model whose
    final indicators (NDCG@5 etc.) are best.

    :param recommend_model: the overall recommendation model (wrapper object)
    :param model: the model core whose weights are saved/restored
    :param train_data: training data; last element is the labels
    :param test_data: test data passed to evalute_by_epoch
    :param start_epoch: number of epochs this model was already trained for;
        training resumes on top of that
    :param true_candidates_dict: forwarded to evalute_by_epoch (purpose unclear
        in this view — marked '?' by the original author)
    :param CI_start_test_epoch: for CI models, skip testing before this epoch
        (early epochs are typically poor)
    :param earlyStop_epochs: stop when this many epochs pass without improvement
    :return: the model, with the best epoch's weights loaded
    """
    print('training_save_best_NDCG_model...')
    epoch_evaluate_results = []  # one evaluation result (or None) per epoch
    train_labels = train_data[-1]
    if new_Para.param.final_activation == 'softmax':  # transform labels for softmax
        train_labels = utils.to_categorical(train_labels, num_classes=2)  # ??? depends on how instances were obtained
    # During online NI, training and test phases derive implicit representations
    # differently, hence the flag.
    test_phase_flag = False if (new_Para.param.pairwise
                                or new_Para.param.NI_OL_mode == 'OL_GE'
                                or new_Para.param.train_mashup_best) else True
    # Negative instances are fetched only when needed (pairwise training).
    train_instances_tuple = recommend_model.get_instances(
        *train_data[:-1], test_phase_flag=test_phase_flag)

    # Read the best metrics recorded by previous training runs, if resuming.
    if start_epoch > 0:
        with open(
                dataset.crt_ds.new_best_epoch_path.format(
                    recommend_model.get_simple_name()), 'r') as f:
            best_epoch = int(f.read().strip())
        with open(
                dataset.crt_ds.new_best_NDCG_path.format(
                    recommend_model.get_simple_name()), 'r') as f:
            best_NDCG_5 = float(f.read().strip())
    else:
        best_epoch, best_NDCG_5 = 0, 0

    # Wrap into a pairwise model when required.
    train_model = recommend_model.get_pairwise_model(
    ) if new_Para.param.pairwise else model

    for epoch in range(new_Para.param.num_epochs - start_epoch):
        if start_epoch == 0 and epoch == 0:  # compile only before the first-ever epoch
            if new_Para.param.pairwise:
                # Pairwise loss: the model's output IS the loss value.
                train_model.compile(optimizer=recommend_model.optimizer,
                                    loss=lambda y_true, y_pred: y_pred,
                                    metrics=['accuracy'])
            else:
                train_model.compile(optimizer=recommend_model.optimizer,
                                    loss='binary_crossentropy',
                                    metrics=['accuracy'])
            print('model compile,done!')
        if start_epoch > 0:
            # Resume: load the previously trained model and continue from it.
            # NOTE(review): this runs on EVERY loop iteration, reloading the
            # saved weights each epoch — looks like it was intended only for
            # the first iteration; confirm.
            train_model = load_trained_model(recommend_model, model)
            epoch = epoch + start_epoch  # shift to the absolute epoch number
        print('Epoch {}'.format(epoch))
        # test_model = model if not new_Para.param.pairwise else recommend_model.get_single_model()  # pairwise must reuse the relevant parameters!!!
        if type(train_instances_tuple) == tuple:
            hist = train_model.fit(
                [*train_instances_tuple],
                np.array(train_labels),
                batch_size=new_Para.param.batch_size,
                epochs=1,
                verbose=2,
                shuffle=True,
                validation_split=new_Para.param.validation_split
            )  # validation_split lets us watch over/under-fitting, e.g. 0.1
        else:
            hist = train_model.fit(
                train_instances_tuple,
                np.array(train_labels),
                batch_size=new_Para.param.batch_size,
                epochs=1,
                verbose=2,
                shuffle=True,
                validation_split=new_Para.param.validation_split)
        print('model train,done!')

        # Record: dataset info, model freshness/completeness, architecture,
        # training settings. (Name is only emitted on the first epoch.)
        model_name = dataset.crt_ds.data_name + recommend_model.get_name(
        ) + new_Para.param.train_name if epoch == 0 else ''
        # Log per-epoch loss/accuracy (written into evalute.csv).
        save_loss_acc(hist, model_name, epoch=epoch)

        if not os.path.exists(recommend_model.model_dir):
            os.makedirs(recommend_model.model_dir)
        if isinstance(recommend_model, CI_Model) and not isinstance(
                recommend_model, NI_Model_online):
            first_test_epoch = CI_start_test_epoch  # early epochs are poor; usually skip testing
        else:
            first_test_epoch = 0
        if epoch < first_test_epoch:  # skip testing for speed
            epoch_evaluate_results.append(None)
            continue
        if epoch == first_test_epoch:  # record the time of the first tested epoch
            with open(new_Para.param.time_path, 'a+') as f1:
                f1.write(recommend_model.get_simple_name())
                f1.write('\n')
        # test_model = model if not new_Para.param.pairwise else recommend_model.get_single_model()  # pairwise must reuse the relevant parameters!!!
        # No need to re-fetch via get_model: `model` is an object reference,
        # so pairwise updates are reflected in it too.
        # Per-epoch test:
        epoch_evaluate_result = evalute_by_epoch(
            recommend_model,
            model,
            model_name,
            test_data,
            record_time=True if epoch == 1 else False,
            true_candidates_dict=true_candidates_dict)
        epoch_evaluate_results.append(epoch_evaluate_result)

        # [0][3] is NDCG@5 in the result layout; keep weights only on improvement.
        if epoch_evaluate_result[0][3] >= best_NDCG_5:
            best_NDCG_5 = epoch_evaluate_result[0][3]
            best_epoch = epoch
            model.save_weights(
                dataset.crt_ds.new_model_para_path.format(
                    recommend_model.model_dir, epoch))  # save this epoch's weights ***
        else:
            if epoch - best_epoch >= earlyStop_epochs:  # no improvement for several epochs: stop early
                break
        #@@@#
        # After the first epoch, store the HIN_sim object??? The only_MLP_model
        # check was removed — replaced by CI? Why would NI need recording?
        # or isinstance(recommend_model,NI_Model)
        # if epoch==0 and (isinstance(recommend_model,gx_text_tag_continue_only_MLP_model) ):
        #     recommend_model.save_HIN_sim()
        # Check whether the word-embedding matrix changed, especially padding zeros:
        # print('some embedding parameters after {} epoch:'.format(epoch))
        # print (recommend_model.embedding_layer.get_weights ()[0][:2])

    # Persist the best epoch and best NDCG@5.
    with open(
            dataset.crt_ds.new_best_epoch_path.format(
                recommend_model.model_dir), 'w') as f:
        f.write(str(best_epoch))
    with open(
            dataset.crt_ds.new_best_NDCG_path.format(
                recommend_model.model_dir), 'w') as f:
        f.write(str(best_NDCG_5))
    print('best epoch:{},best NDCG@5:{}'.format(best_epoch, best_NDCG_5))

    # Record the best indicators.
    csv_table_name = 'best_indicaters\n'  # naming convention!!!
    summary(new_Para.param.evaluate_path, csv_table_name,
            epoch_evaluate_results[best_epoch], new_Para.param.topKs)

    # Delete all saved weights except the best epoch's, then load the best.
    try:
        for i in range(new_Para.param.num_epochs):
            temp_path = dataset.crt_ds.new_model_para_path.format(
                recommend_model.model_dir, i)
            if i != best_epoch and os.path.exists(temp_path):
                os.remove(temp_path)
        model.load_weights(
            dataset.crt_ds.new_model_para_path.format(
                recommend_model.model_dir, best_epoch))
    finally:
        # NOTE(review): `return` inside finally swallows any exception raised
        # in the try block — confirm this is intentional.
        return model
def run_new_deepFM(CI_feas, NI_feas, train_data, test_data, all_api_num,
                   epoch_num=10):
    """Train a simple DeepFM on CI + NI features via generators and evaluate
    after every epoch.

    :param CI_feas: tuple of four CI feature blocks
        (mashup texts, mashup tags, api texts, api tags)
    :param NI_feas: tuple of two NI feature blocks (mashup NI, api NI)
    :param train_data: training split; train_data[0] sets the number of samples
    :param test_data: test split; test_data[1] is the per-mashup candidate api
        id lists, test_data[-1] the ground truths
    :param all_api_num: total number of APIs, forwarded to data_generator
    :param epoch_num: number of train/evaluate rounds
    :return: None; per-epoch results are written via summary().
    """
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    # session = tf.Session(config=config)
    # graph = tf.get_default_graph()
    # set_session(session)
    model = simple_DeepFM(CI_feature_num=4,
                          NI_feature_num=2,
                          CI_feature_dim=50,
                          NI_feature_dim=25,
                          final_feature_dim=32,
                          task='binary',
                          use_fm=True,
                          l2_reg_linear=0,
                          dnn_hidden_units=[])
    model.compile(
        "adam",
        "binary_crossentropy",
        metrics=['binary_crossentropy'],
    )
    print('bulid simple_DeepFM,done!')
    batch_size = 32
    len_train = len(train_data[0])  # number of training samples
    mashup_texts_features, mashup_tag_features, api_texts_features, api_tag_features = CI_feas
    mashup_NI_features, api_NI_features = NI_feas
    features = [
        mashup_texts_features, mashup_tag_features, api_texts_features,
        api_tag_features, mashup_NI_features, api_NI_features
    ]
    train_generator = data_generator(train_data,
                                     *features,
                                     bs=batch_size,
                                     all_api_num=all_api_num,
                                     mode="train")
    print('genarate train_generator ,done!')

    # Test once after every training epoch.
    num_test_instances = getNum_testData(test_data)
    for i in range(epoch_num):
        history = model.fit_generator(train_generator,
                                      steps_per_epoch=len_train // batch_size,
                                      epochs=1,
                                      verbose=2)
        test_generator = data_generator(test_data,
                                        *features,
                                        bs=batch_size,
                                        all_api_num=all_api_num,
                                        mode="test")
        print('genarate test_generator,done!')
        # [:, 1]: keep the positive-class column of the prediction output.
        predictions = model.predict_generator(
            test_generator, steps=num_test_instances // batch_size + 1)[:, 1]
        print(predictions.shape)

        # Regroup the flat prediction vector back into per-mashup score lists.
        reshaped_predictions = []
        test_api_id_list, grounds = test_data[1], test_data[-1]
        index = 0
        for test_api_ids in test_api_id_list:
            size = len(test_api_ids)  # number of candidate APIs for this mashup
            reshaped_predictions.append(
                predictions[index:index + size])  # min(index + size,len(predictions))
            index += size
        print(index)
        evaluate_result = evalute(test_api_id_list, reshaped_predictions,
                                  grounds, new_Para.param.topKs)  # evaluate
        summary(new_Para.param.evaluate_path, 'deepFM_epoch_{}'.format(i),
                evaluate_result, new_Para.param.topKs)  #
def Samanta(text_tag_recommend_model, topK, if_pop=False):
    """Samanta baseline (legacy Para-based variant).

    Represents each test mashup's collaborative-filtering feature as a
    similarity-weighted sum of its topK most similar training mashups'
    latent factors, then scores each candidate API by
    (CF dot product) * (HDP cosine similarity), optionally re-ranked by
    popularity.

    :param text_tag_recommend_model: provides the base data for this model
    :param topK: number of nearest training mashups used for the CF feature
    :param if_pop: if True, re-rank the topK scored APIs by popularity
    :return: None; evaluation results are written via summary().
    """
    api2pop = None
    if if_pop:
        api_co_vecs, api2pop = text_tag_recommend_model.pd.get_api_co_vecs()
    test_mashup_num = len(Para.test_mashup_id_list)
    all_mashup_num = len(
        text_tag_recommend_model.pd.get_mashup_api_index2name('mashup'))
    all_api_num = len(
        text_tag_recommend_model.pd.get_mashup_api_index2name('api'))
    mashup_hdp_path = os.path.join(Para.data_dir, 'mashup_hdp.txt')
    api_hdp_path = os.path.join(Para.data_dir, 'api_hdp.txt')

    # Obtain mashup_hdp_features / api_hdp_features (compute and cache on
    # first use, then load from disk).
    if not os.path.exists(api_hdp_path):
        # Encoded text/tag vectors, as arrays.
        gd = gensim_data(*text_tag_recommend_model.get_instances(
            [i for i in range(all_mashup_num)],
            [i for i in range(all_api_num)], False))
        _mashup_hdp_features, _api_hdp_features = gd.model_pcs(
            'HDP', all_mashup_num, all_api_num)
        np.savetxt(mashup_hdp_path, _mashup_hdp_features)
        np.savetxt(api_hdp_path, _api_hdp_features)
    else:
        _mashup_hdp_features = np.loadtxt(mashup_hdp_path)
        _api_hdp_features = np.loadtxt(api_hdp_path)

    candidate_ids_list = []
    all_predict_results = []
    for i in range(test_mashup_num):
        test_mashup_id = Para.test_mashup_id_list[i][0]  # one mashup id per test case
        candidate_ids = Para.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)
        # Similarity of the query to every training mashup
        # (u_factors_matrix is indexed by the LOCAL training index).
        id2sim = {}
        for local_train_mashup_index in range(
                len(Para.feature_train_mashup_ids)):
            id2sim[local_train_mashup_index] = cos_sim(
                _mashup_hdp_features[test_mashup_id],
                _mashup_hdp_features[
                    Para.feature_train_mashup_ids[local_train_mashup_index]])
        topK_indexes, topK_sims = zip(
            *(sorted(id2sim.items(), key=lambda x: x[1], reverse=True)[:topK]))
        topK_sims = np.array(topK_sims) / sum(topK_sims)  # normalize the sims
        # CF feature of the query: sim-weighted sum of neighbors' latent factors.
        cf_feature = np.zeros((Para.num_feat))
        for z in range(len(topK_indexes)):
            cf_feature += topK_sims[z] * Para.u_factors_matrix[topK_indexes[z]]

        predict_results = []
        temp_predict_results = []  # helper when re-ranking by popularity
        api_zeros = np.zeros((Para.num_feat))
        for api_id in candidate_ids:
            # Test APIs may not appear in training — fall back to a zero vector.
            api_i_feature = Para.i_factors_matrix[
                Para.i_id2index[api_id]] if api_id in Para.i_id2index.keys(
                ) else api_zeros
            cf_score = np.sum(np.multiply(api_i_feature, cf_feature))
            sim_score = cos_sim(_mashup_hdp_features[test_mashup_id],
                                _api_hdp_features[api_id])
            if if_pop:
                temp_predict_results.append((api_id, cf_score * sim_score))
            else:
                predict_results.append(cf_score * sim_score)
        if if_pop:
            # Pick topK by score first, then re-rank those by popularity;
            # everything else gets -1.
            max_k_pairs = heapq.nlargest(topK, temp_predict_results,
                                         key=lambda x: x[1])
            max_k_candidates, _ = zip(*max_k_pairs)
            max_k_candidates = set(max_k_candidates)
            predict_results = [
                api2pop[api_id] if api_id in max_k_candidates else -1
                for api_id in candidate_ids
            ]  # re-rank
        all_predict_results.append(predict_results)
    print('Samanta test,done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              Para.grounds, Para.topKs)  # evaluate
    _name = '_pop' if if_pop else ''
    csv_table_name = Para.data_name + 'Samanta_model' + _name + "\n"  # model.name
    summary(Para.evaluate_path, csv_table_name, evaluate_result,
            Para.topKs)  # record
def Samanta(topK, if_pop=2, MF_mode='node2vec', pop_mode='', text_mode='HDP',
            LDA_topic_num=None):
    """Samanta baseline (current dataset-based variant).

    :param topK: number of nearest training mashups (KNN) used to build the
        new query's MF feature
    :param if_pop: how popularity is used — 0: not used; 1: re-rank only;
        2: full product cf*sim*pop is used for ranking
    :param MF_mode: latent-factor source; node2vec used here for convenience
    :param pop_mode: whether pop values are squashed to 0-1 via sigmoid
    :param text_mode: which feature extraction to use — LDA or HDP
    :param LDA_topic_num: topic count forwarded to model_pcs
    :return: None; evaluation results are written via summary().
    """
    api2pop = None
    if if_pop:
        api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs(pop_mode)
    root = os.path.join(dataset.crt_ds.root_path, 'baselines')
    if not os.path.exists(root):
        os.makedirs(root)
    mashup_feature_path = os.path.join(
        root, 'mashup_{}.txt'.format(text_mode))  # ...
    api_feature_path = os.path.join(root, 'api_{}.txt'.format(text_mode))

    # Obtain mashup/api text features (compute and cache on first use).
    if not os.path.exists(api_feature_path):
        gd = get_default_gd()
        _mashup_features, _api_features = gd.model_pcs(text_mode, LDA_topic_num)
        np.savetxt(mashup_feature_path, _mashup_features)
        np.savetxt(api_feature_path, _api_features)
    else:
        _mashup_features = np.loadtxt(mashup_feature_path)
        _api_features = np.loadtxt(api_feature_path)

    # Para.set_MF_mode(MF_mode)  # set the latent-factor mode
    # new_Para.param.mf_mode = MF_mode  # mutates the parameter object — use with care

    candidate_ids_list = []
    all_predict_results = []
    test_mashup_num = len(dataset.crt_ds.test_mashup_id_list)
    for i in range(test_mashup_num):
        test_mashup_id = dataset.crt_ds.test_mashup_id_list[i][0]  # one mashup id per test case
        candidate_ids = dataset.crt_ds.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)

        # Represent the query by its nearest-neighbor mashups' latent factors
        # (u_factors_matrix is indexed by the LOCAL training index).
        localIndex2sim = {}
        for local_index, train_m_id in enumerate(dataset.UV_obj.m_ids):
            localIndex2sim[local_index] = cos_sim(
                _mashup_features[test_mashup_id], _mashup_features[train_m_id])
        topK_indexes, topK_sims = zip(*(sorted(
            localIndex2sim.items(), key=lambda x: x[1], reverse=True)[:topK]))
        topK_sims = np.array(topK_sims) / sum(topK_sims)  # normalize the sims
        cf_feature = np.zeros((new_Para.param.num_feat,))
        for z in range(len(topK_indexes)):
            cf_feature += topK_sims[z] * dataset.UV_obj.m_embeddings[topK_indexes[z]]

        # Score every candidate API.
        predict_results = []
        temp_predict_results = []  # helper when re-ranking by popularity
        api_zeros = np.zeros((new_Para.param.num_feat))
        for api_id in candidate_ids:
            a_id2index = dataset.UV_obj.a_id2index
            # Test APIs may not appear in training — fall back to a zero vector.
            api_i_feature = dataset.UV_obj.a_embeddings[
                a_id2index[api_id]] if api_id in a_id2index.keys() else api_zeros
            # Inner product of mashup and api latent factors.
            cf_score = np.sum(np.multiply(api_i_feature, cf_feature))
            # Cosine similarity of the text features.
            sim_score = cos_sim(_mashup_features[test_mashup_id],
                                _api_features[api_id])
            if if_pop == 1:
                temp_predict_results.append((api_id, cf_score * sim_score))
            elif if_pop == 0:
                predict_results.append(cf_score * sim_score)
            elif if_pop == 2:
                predict_results.append(cf_score * sim_score * api2pop[api_id])
        if if_pop == 1:
            # First rank once by the product, then re-rank the topK by pop.
            max_k_pairs = heapq.nlargest(topK, temp_predict_results,
                                         key=lambda x: x[1])
            max_k_candidates, _ = zip(*max_k_pairs)
            max_k_candidates = set(max_k_candidates)
            predict_results = [
                api2pop[api_id] if api_id in max_k_candidates else -1
                for api_id in candidate_ids
            ]  # re-rank
        all_predict_results.append(predict_results)
    print('Samanta test,done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              dataset.crt_ds.grounds,
                              new_Para.param.topKs)  # evaluate
    _name = '_pop_{}'.format(if_pop)
    _name += new_Para.param.mf_mode
    csv_table_name = dataset.crt_ds.data_name + 'Samanta_model_{}'.format(
        topK) + _name + "\n"  # model.name
    summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result,
            new_Para.param.topKs)  # record

    def divide(slt_apiNum):
        # Split the results by how many APIs were already selected for the
        # mashup (closure over this call's accumulated result lists).
        test_api_id_list_, predictions_, grounds_ = [], [], []
        for i in range(test_mashup_num):
            if len(dataset.crt_ds.slt_api_ids_instances[i]) == slt_apiNum:
                test_api_id_list_.append(candidate_ids_list[i])
                predictions_.append(all_predict_results[i])
                grounds_.append(dataset.crt_ds.grounds[i])
        return test_api_id_list_, predictions_, grounds_

    if new_Para.param.data_mode == 'newScene':
        # Additionally evaluate per selected-API count (1..3).
        for slt_apiNum in range(3):
            test_api_id_list_, predictions_, grounds_ = divide(slt_apiNum + 1)
            evaluate_result = evalute(test_api_id_list_, predictions_, grounds_,
                                      new_Para.param.topKs)
            summary(new_Para.param.evaluate_path,
                    str(slt_apiNum + 1) + '_' + csv_table_name,
                    evaluate_result, new_Para.param.topKs)  #