def binary_keyword(if_pop=False):
    # popularity
    api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    gd = get_default_gd()
    mashup_binary_matrix, api_binary_matrix, mashup_words_list, api_words_list = gd.get_binary_v()

    # test WVSM (Weighted Vector Space Model)
    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(dataset.crt_ds.test_mashup_id_list)):
        test_mashup_id = dataset.crt_ds.test_mashup_id_list[i][0]  # each test mashup id
        candidate_ids = dataset.crt_ds.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)
        predict_results = []
        for api_id in candidate_ids:
            if if_pop:
                sim_score = cos_sim(mashup_binary_matrix[test_mashup_id], api_binary_matrix[api_id]) * api2pop[api_id]
            else:
                # test using the feature vectors only
                sim_score = cos_sim(mashup_binary_matrix[test_mashup_id], api_binary_matrix[api_id])
            predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('WVSM test, done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results, dataset.crt_ds.grounds, new_Para.param.topKs)  # evaluate
    name = 'WVSM_pop' if if_pop else 'WVSM'
    csv_table_name = dataset.crt_ds.data_name + name + "\n"  # model.name
    summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result, new_Para.param.topKs)  # log results
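# cos_sim is used throughout this file but defined elsewhere in the repo. A minimal
# sketch of what it is assumed to compute (plain cosine similarity over numpy vectors);
# the name cos_sim_sketch and the zero-norm guard are assumptions, not the repo's code.
import numpy as np

def cos_sim_sketch(vec1, vec2):
    norm1, norm2 = np.linalg.norm(vec1), np.linalg.norm(vec2)
    if norm1 == 0 or norm2 == 0:  # assumed handling of all-zero vectors
        return 0.0
    return float(np.dot(vec1, vec2) / (norm1 * norm2))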
def get_p4_sim(self, m_id2, mashup_slt_apis_list, aTagSem=None, api_tags=None):
    # p4: SBS-service-category-service-SBS
    # aTagSem: the representation of api tags, 'MetaPath' or another form such as 'Deep', 'HDP', 'TF_IDF'
    if aTagSem is None:
        raise ValueError('must feed Para."aTagSem"!')
    if aTagSem == 'MetaPath':
        if self.p4_sims is None:
            self.p4_sims = self.load_sims('p4_sims_{}.dat'.format(aTagSem))
        self_sim_dict = self.p4_sims
    else:  # other tag-feature representations
        if self.p4_sims_sem is None:
            self.p4_sims_sem = self.load_sims('p4_sims_sem{}.dat'.format(aTagSem))
        self_sim_dict = self.p4_sims_sem

    _key = (m_id2, tuple(mashup_slt_apis_list))
    if _key not in self_sim_dict.keys():
        if aTagSem == 'MetaPath':
            self.flag4 = True
            if api_tags is None:  # fall back to the default tags if the caller passes none
                api_tags = self.unpadded_encoded_api_tags
            m1_api_category = {a_id: set(api_tags[a_id]) for a_id in mashup_slt_apis_list}
            m2_api_category = {a_id: set(api_tags[a_id]) for a_id in self.mashup_apis[m_id2]}
            p4_sim = cpt_p46_sim(m1_api_category, m2_api_category)
        else:  # use the supplied tag features plus set-maximization
            self.flag4_sem = True
            tag_sim_matrix = [[cos_sim(self.api_tag_features[a_id1], self.api_tag_features[a_id2])
                               for a_id1 in mashup_slt_apis_list]
                              for a_id2 in self.mashup_apis[m_id2]]
            p4_sim = cpt_2list_sim(tag_sim_matrix)
        self_sim_dict[_key] = p4_sim
    else:
        p4_sim = self_sim_dict[_key]
    return p4_sim
def TF_IDF(if_pop):
    """
    Could live in the Samanta class, but that would be too cluttered and is unnecessary.
    :return:
    """
    gd = get_default_gd()
    api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    _mashup_IFIDF_features, _api_IFIDF_features = gd.model_pcs('TF_IDF')

    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(dataset.crt_ds.test_mashup_id_list)):
        test_mashup_id = dataset.crt_ds.test_mashup_id_list[i][0]  # each test mashup id
        candidate_ids = dataset.crt_ds.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)
        predict_results = []
        for api_id in candidate_ids:
            sim_score = cos_sim(_mashup_IFIDF_features[test_mashup_id], _api_IFIDF_features[api_id])
            if if_pop:
                predict_results.append(sim_score * api2pop[api_id])
            else:
                predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('TF_IDF test, done!')

    name = 'TFIDF_pop' if if_pop else 'TFIDF'
    evaluate_result = evalute(candidate_ids_list, all_predict_results, dataset.crt_ds.grounds, new_Para.param.topKs)  # evaluate
    csv_table_name = dataset.crt_ds.data_name + name + "\n"  # model.name
    summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result, new_Para.param.topKs)  # log results
def hdp_pop(if_pop=True):
    # popularity
    root = os.path.join(dataset.crt_ds.root_path, 'baselines')
    if not os.path.exists(root):
        os.makedirs(root)
    mashup_hdp_path = os.path.join(root, 'mashup_HDP.txt')  # ...
    api_hdp_path = os.path.join(root, 'api_HDP.txt')

    _mashup_hdp_features = np.loadtxt(mashup_hdp_path)
    _api_hdp_features = np.loadtxt(api_hdp_path)
    if if_pop:
        api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()

    # test
    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(dataset.crt_ds.test_mashup_id_list)):
        test_mashup_id = dataset.crt_ds.test_mashup_id_list[i][0]  # each test mashup id
        candidate_ids = dataset.crt_ds.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)
        predict_results = []
        for api_id in candidate_ids:
            sim_score = cos_sim(_mashup_hdp_features[test_mashup_id], _api_hdp_features[api_id])
            if if_pop:
                sim_score *= api2pop[api_id]
            predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('hdp_pop test, done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results, dataset.crt_ds.grounds, new_Para.param.topKs)  # evaluate
    name = 'hdp_pop' if if_pop else 'hdp'
    csv_table_name = dataset.crt_ds.data_name + name + "\n"  # model.name
    summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result, new_Para.param.topKs)  # log results
def get_mean(self, mashup_text_index, api_text_index):
    embedding1 = np.array([self.wordindex2embedding.get(word) for word in mashup_text_index]).mean(axis=0)
    embedding2 = np.array([self.wordindex2embedding.get(word) for word in api_text_index]).mean(axis=0)
    sim = cos_sim(embedding1, embedding2)
    return sim
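# get_mean above assumes every index appears in self.wordindex2embedding; dict.get
# returns None for unknown indices and the mean then fails. A hypothetical defensive
# variant (an assumption about the intended behaviour, not code from the repo) that
# skips out-of-vocabulary indices:
def get_mean_safe(self, mashup_text_index, api_text_index):
    vecs1 = [self.wordindex2embedding[w] for w in mashup_text_index if w in self.wordindex2embedding]
    vecs2 = [self.wordindex2embedding[w] for w in api_text_index if w in self.wordindex2embedding]
    if not vecs1 or not vecs2:  # no known words on one side: no meaningful similarity
        return 0.0
    return cos_sim(np.array(vecs1).mean(axis=0), np.array(vecs2).mean(axis=0))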
def cpt_mashup_sim(self, m_id1, m_id2, mode='feature_cosine'):
    if m_id1 == m_id2:
        return 0
    if mode == 'feature_cosine':
        if self.m_feature_cosin_sim[m_id1][m_id2] < 0:
            sim = cos_sim(self.m_text_features[m_id1], self.m_text_features[m_id2])
            self.m_feature_cosin_sim[m_id1][m_id2] = sim
            self.m_feature_cosin_sim[m_id2][m_id1] = sim
        else:
            sim = self.m_feature_cosin_sim[m_id1][m_id2]
        return sim
def cpt_sim(self, fea1, fea2):  # 'W', 'HDP', 'DL'
    """
    Compute text similarity based on the learned W; each feature is assumed to be a row vector.
    :param fea1:
    :param fea2:
    :return:
    """
    if self.embedding_mode == 'W':
        # NOTE: for 1-D vectors .dot is an inner product; if you need a 2-D product,
        # handle it yourself (expanding dims before dot is too slow).
        return (fea1.dot(self.W)).dot(fea2)
    else:
        return cos_sim(fea1, fea2)  # plain cosine similarity
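# A small self-contained illustration of the 'W' branch above: for 1-D numpy arrays,
# fea1.dot(W).dot(fea2) is the bilinear form fea1^T W fea2. Shapes and values below
# are hypothetical; the real feature dimension depends on the embedding mode in use.
import numpy as np

fea1_demo = np.random.rand(8)
fea2_demo = np.random.rand(8)
W_demo = np.random.rand(8, 8)
bilinear = fea1_demo.dot(W_demo).dot(fea2_demo)            # what cpt_sim computes in 'W' mode
assert np.allclose(bilinear, fea1_demo @ W_demo @ fea2_demo)  # equivalent formulation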
def get_p2_sim_sem(self, min_m_id, max_m_id, mTextTrueSem=None):
    # p2: SBS-content-SBS
    # feature + cosine, using any feature extraction method, e.g. 'Deep', 'HDP', 'TF_IDF'
    if mTextTrueSem is not None:
        if self.p2_sims_sem is None:
            self.p2_sims_sem = self.load_sims('p2_sims_sem{}.dat'.format(mTextTrueSem))
        if (min_m_id, max_m_id) not in self.p2_sims_sem.keys():
            # print('min_m_id, max_m_id, not in p2 sim!')
            self.flag2_sem = True
            # use the externally supplied text features; they should match the parameter name
            p2_sim = cos_sim(self.mashup_texts_features[min_m_id], self.mashup_texts_features[max_m_id])
            self.p2_sims_sem[(min_m_id, max_m_id)] = p2_sim
        else:
            p2_sim = self.p2_sims_sem[(min_m_id, max_m_id)]
        return p2_sim
def cpt_wod_cos_sim(self, id1, id2):
    """
    Compute the similarity between two words (by id) and cache it for later lookups.
    :param id1:
    :param id2:
    :return:
    """
    if id1 == id2:
        return 1
    id_b = max(id1, id2)
    id_s = min(id1, id2)
    value = self.words_Sim.get((id_s, id_b))  # always keyed (smaller id, larger id)
    if value is None:
        value = cos_sim(self.wordindex2embedding.get(id_s), self.wordindex2embedding.get(id_b))
        self.words_Sim[(id_s, id_b)] = value
    return value
def get_word_cos_sim(self, id1, id2):
    """
    Compute the similarity between two words (by id) and cache it for later lookups.
    :param id1:
    :param id2:
    :return:
    """
    if id1 == 0 or id2 == 0:  # index 0 is the padding token, so return 0
        return 0
    if id1 == id2:
        return 1
    id_b = max(id1, id2)
    id_s = min(id1, id2)
    value = self.words_Sim.get((id_s, id_b))  # always keyed (smaller id, larger id)
    if value is None:
        value = cos_sim(self.wordindex2embedding[id_s], self.wordindex2embedding[id_b])
        self.words_Sim[(id_s, id_b)] = value
    return value
def get_p1_sim_sem(self, min_m_id, max_m_id, mTagTrueSem=None):
    """
    p1: SBS-category-SBS, semantic form
    :param min_m_id:
    :param max_m_id:
    :param mTagTrueSem: the "semantic form" of the tags: any feature extraction method, e.g. 'Deep', 'HDP', 'TF_IDF'
    :return:
    """
    if mTagTrueSem is not None:
        if self.p1_sims_sem is None:
            self.p1_sims_sem = self.load_sims('p1_sims_sem{}.dat'.format(mTagTrueSem))
        if (min_m_id, max_m_id) not in self.p1_sims_sem.keys():
            # print(min_m_id, max_m_id, 'not in p1 sim!')
            self.flag1_sem = True
            # use the externally supplied tag features; they should match the parameter name
            p1_sim = cos_sim(self.mashup_tag_features[min_m_id], self.mashup_tag_features[max_m_id])
            self.p1_sims_sem[(min_m_id, max_m_id)] = p1_sim
        else:
            p1_sim = self.p1_sims_sem[(min_m_id, max_m_id)]
        return p1_sim
def Samanta(topK, if_pop=2, MF_mode='node2vec', pop_mode='', text_mode='HDP', LDA_topic_num=None):
    """
    :param topK: use the top-K nearest mashups to build the MF feature of the new query
    :param if_pop: how popularity is used: 0, not used; 1, only for re-ranking; 2, rank by the overall product
    :param MF_mode: to keep things simple, just use node2vec
    :param pop_mode: whether the popularity value is squashed into [0, 1] with a sigmoid
    :param text_mode: which feature extraction method to use, e.g. LDA or HDP
    :param LDA_topic_num:
    :return:
    """
    api2pop = None
    if if_pop:
        api_co_vecs, api2pop = data_repository.get_md().get_api_co_vecs(pop_mode)  # TODO
    root = os.path.join(data_repository.get_ds().data_root, 'baselines')
    if not os.path.exists(root):
        os.makedirs(root)
    mashup_feature_path = os.path.join(root, 'mashup_{}.txt'.format(text_mode))  # ...
    api_feature_path = os.path.join(root, 'api_{}.txt'.format(text_mode))

    # build or load the mashup/api text features (e.g. HDP)
    if not os.path.exists(api_feature_path):
        gd = get_default_gd()
        _mashup_features, _api_features = gd.model_pcs(text_mode, LDA_topic_num)
        np.savetxt(mashup_feature_path, _mashup_features)
        np.savetxt(api_feature_path, _api_features)
    else:
        _mashup_features = np.loadtxt(mashup_feature_path)
        _api_features = np.loadtxt(api_feature_path)

    candidate_ids_list = []
    all_predict_results = []
    test_data = data_repository.get_ds().test_data
    test_mashup_num = len(test_data.get('mashup'))
    mashup_emb_df = data_repository.get_ds().MF_obj.mashup_emb_df
    api_emb_df = data_repository.get_ds().MF_obj.api_emb_df
    for i in range(test_mashup_num):
        test_m_id = test_data.get('mashup')[i][0]  # each test mashup id
        candidate_ids = test_data.get('api')[i]
        candidate_ids_list.append(candidate_ids)

        # represent the query as a similarity-weighted sum of its nearest mashups' latent factors
        mid2sim = {}
        for train_m_id in mashup_emb_df.index.tolist():
            mid2sim[train_m_id] = cos_sim(_mashup_features[test_m_id], _mashup_features[train_m_id])  # TODO
        topK_ids, topK_sims = zip(*(sorted(mid2sim.items(), key=lambda x: x[1], reverse=True)[:topK]))
        topK_sims = np.array(topK_sims) / sum(topK_sims)  # normalize the similarities
        cf_feature = np.zeros((data_repository.get_args().implict_feat_dim,))
        for z in range(len(topK_ids)):
            cf_feature += topK_sims[z] * mashup_emb_df['embedding'][topK_ids[z]]

        # compute a score for every candidate api
        predict_results = []
        temp_predict_results = []  # helper when re-ranking by popularity
        api_zeros = np.zeros((data_repository.get_args().implict_feat_dim))
        api_ids = set(api_emb_df.index.tolist())
        for api_id in candidate_ids:
            # a test api may never have appeared in the training set
            api_i_feature = api_emb_df['embedding'][api_id] if api_id in api_ids else api_zeros
            cf_score = np.sum(np.multiply(api_i_feature, cf_feature))  # inner product of the mashup and api latent factors
            sim_score = cos_sim(_mashup_features[test_m_id], _api_features[api_id])  # cosine similarity of the text features
            if if_pop == 1:
                temp_predict_results.append((api_id, cf_score * sim_score))
            elif if_pop == 0:
                predict_results.append(cf_score * sim_score)
            elif if_pop == 2:
                predict_results.append(cf_score * sim_score * api2pop[api_id])
        if if_pop == 1:
            max_k_pairs = heapq.nlargest(topK, temp_predict_results, key=lambda x: x[1])  # first rank once by the product
            max_k_candidates, _ = zip(*max_k_pairs)
            max_k_candidates = set(max_k_candidates)
            # re-rank: keep only the top-K candidates and order them by popularity
            predict_results = [api2pop[api_id] if api_id in max_k_candidates else -1 for api_id in candidate_ids]
        all_predict_results.append(predict_results)
    print('Samanta test, done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              data_repository.get_ds().test_data.get('all_ground_api_ids'),
                              data_repository.get_args().topKs)  # evaluate
    _name = '_pop_{}'.format(if_pop)
    _name += data_repository.get_args().mf_mode
    csv_table_name = data_repository.get_ds().name + 'Samanta_model_{}'.format(topK) + _name + "\n"  # whole_model.name
    summary(evaluate_path, csv_table_name, evaluate_result, data_repository.get_args().topKs)  # log results

    def divide(slt_apiNum):
        test_api_id_list_, predictions_, grounds_ = [], [], []
        for i in range(test_mashup_num):
            if len(data_repository.get_ds().slt_api_ids_instances[i]) == slt_apiNum:
                test_api_id_list_.append(candidate_ids_list[i])
                predictions_.append(all_predict_results[i])
                grounds_.append(data_repository.get_ds().test_data.get('all_ground_api_ids')[i])
        return test_api_id_list_, predictions_, grounds_

    if data_repository.get_args().data_mode == 'newScene':
        for slt_apiNum in range(3):
            test_api_id_list_, predictions_, grounds_ = divide(slt_apiNum + 1)
            evaluate_result = evalute(test_api_id_list_, predictions_, grounds_, data_repository.get_args().topKs)
            summary(evaluate_path, str(slt_apiNum + 1) + '_' + csv_table_name, evaluate_result,
                    data_repository.get_args().topKs)
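# Standalone illustration of the if_pop == 1 re-ranking above, with made-up numbers:
# keep the top-K candidates by cf_score * sim_score, then order just those by popularity;
# everything else gets a sentinel score of -1 so it sinks to the bottom of the ranking.
# The ids and popularity counts below are hypothetical.
import heapq

candidate_ids_demo = [3, 7, 11, 20]
scored_demo = [(3, 0.9), (7, 0.2), (11, 0.8), (20, 0.5)]        # (api_id, cf_score * sim_score)
api2pop_demo = {3: 120, 7: 5, 11: 40, 20: 300}                  # hypothetical popularity counts
top2_demo = {a for a, _ in heapq.nlargest(2, scored_demo, key=lambda x: x[1])}   # {3, 11}
reranked_demo = [api2pop_demo[a] if a in top2_demo else -1 for a in candidate_ids_demo]  # [120, -1, 40, -1]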
def Samanta(topK, if_pop=2, MF_mode='node2vec', pop_mode='', text_mode='HDP', LDA_topic_num=None):
    """
    :param topK: use the top-K nearest mashups to build the MF feature of the new query
    :param if_pop: how popularity is used: 0, not used; 1, only for re-ranking; 2, rank by the overall product
    :param MF_mode: to keep things simple, just use node2vec
    :param pop_mode: whether the popularity value is squashed into [0, 1] with a sigmoid
    :param text_mode: which feature extraction method to use, e.g. LDA or HDP
    :param LDA_topic_num:
    :return:
    """
    api2pop = None
    if if_pop:
        api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs(pop_mode)
    root = os.path.join(dataset.crt_ds.root_path, 'baselines')
    if not os.path.exists(root):
        os.makedirs(root)
    mashup_feature_path = os.path.join(root, 'mashup_{}.txt'.format(text_mode))  # ...
    api_feature_path = os.path.join(root, 'api_{}.txt'.format(text_mode))

    # build or load the mashup/api text features (e.g. HDP)
    if not os.path.exists(api_feature_path):
        gd = get_default_gd()
        _mashup_features, _api_features = gd.model_pcs(text_mode, LDA_topic_num)
        np.savetxt(mashup_feature_path, _mashup_features)
        np.savetxt(api_feature_path, _api_features)
    else:
        _mashup_features = np.loadtxt(mashup_feature_path)
        _api_features = np.loadtxt(api_feature_path)

    # Para.set_MF_mode(MF_mode)  # set the latent-factor mode
    # new_Para.param.mf_mode = MF_mode  # mutates the global parameter object; use with care

    candidate_ids_list = []
    all_predict_results = []
    test_mashup_num = len(dataset.crt_ds.test_mashup_id_list)
    for i in range(test_mashup_num):
        test_mashup_id = dataset.crt_ds.test_mashup_id_list[i][0]  # each test mashup id
        candidate_ids = dataset.crt_ds.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)

        # represent the query as a similarity-weighted sum of its nearest mashups' latent factors
        localIndex2sim = {}
        for local_index, train_m_id in enumerate(dataset.UV_obj.m_ids):  # u_factors_matrix uses local indices
            localIndex2sim[local_index] = cos_sim(_mashup_features[test_mashup_id], _mashup_features[train_m_id])
        topK_indexes, topK_sims = zip(*(sorted(localIndex2sim.items(), key=lambda x: x[1], reverse=True)[:topK]))
        topK_sims = np.array(topK_sims) / sum(topK_sims)  # normalize the similarities
        cf_feature = np.zeros((new_Para.param.num_feat,))
        for z in range(len(topK_indexes)):
            cf_feature += topK_sims[z] * dataset.UV_obj.m_embeddings[topK_indexes[z]]

        # compute a score for every candidate api
        predict_results = []
        temp_predict_results = []  # helper when re-ranking by popularity
        api_zeros = np.zeros((new_Para.param.num_feat))
        for api_id in candidate_ids:
            a_id2index = dataset.UV_obj.a_id2index
            # a test api may never have appeared in the training set
            api_i_feature = dataset.UV_obj.a_embeddings[a_id2index[api_id]] if api_id in a_id2index.keys() else api_zeros
            cf_score = np.sum(np.multiply(api_i_feature, cf_feature))  # inner product of the mashup and api latent factors
            sim_score = cos_sim(_mashup_features[test_mashup_id], _api_features[api_id])  # cosine similarity of the text features
            if if_pop == 1:
                temp_predict_results.append((api_id, cf_score * sim_score))
            elif if_pop == 0:
                predict_results.append(cf_score * sim_score)
            elif if_pop == 2:
                predict_results.append(cf_score * sim_score * api2pop[api_id])
        if if_pop == 1:
            max_k_pairs = heapq.nlargest(topK, temp_predict_results, key=lambda x: x[1])  # first rank once by the product
            max_k_candidates, _ = zip(*max_k_pairs)
            max_k_candidates = set(max_k_candidates)
            # re-rank: keep only the top-K candidates and order them by popularity
            predict_results = [api2pop[api_id] if api_id in max_k_candidates else -1 for api_id in candidate_ids]
        all_predict_results.append(predict_results)
    print('Samanta test, done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results, dataset.crt_ds.grounds, new_Para.param.topKs)  # evaluate
    _name = '_pop_{}'.format(if_pop)
    _name += new_Para.param.mf_mode
    csv_table_name = dataset.crt_ds.data_name + 'Samanta_model_{}'.format(topK) + _name + "\n"  # model.name
    summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result, new_Para.param.topKs)  # log results

    def divide(slt_apiNum):
        test_api_id_list_, predictions_, grounds_ = [], [], []
        for i in range(test_mashup_num):
            if len(dataset.crt_ds.slt_api_ids_instances[i]) == slt_apiNum:
                test_api_id_list_.append(candidate_ids_list[i])
                predictions_.append(all_predict_results[i])
                grounds_.append(dataset.crt_ds.grounds[i])
        return test_api_id_list_, predictions_, grounds_

    if new_Para.param.data_mode == 'newScene':
        for slt_apiNum in range(3):
            test_api_id_list_, predictions_, grounds_ = divide(slt_apiNum + 1)
            evaluate_result = evalute(test_api_id_list_, predictions_, grounds_, new_Para.param.topKs)
            summary(new_Para.param.evaluate_path, str(slt_apiNum + 1) + '_' + csv_table_name, evaluate_result, new_Para.param.topKs)
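# Hedged usage sketch (not from the repo): how these baselines would typically be run
# once the repo's globals (dataset / new_Para or data_repository, depending on which
# version of the code is in use) have been initialized elsewhere. The argument values
# are illustrative only, not the settings used in any reported experiments.
if __name__ == '__main__':
    binary_keyword(if_pop=False)                  # WVSM
    binary_keyword(if_pop=True)                   # WVSM weighted by api popularity
    TF_IDF(if_pop=False)                          # TF-IDF cosine baseline
    hdp_pop(if_pop=True)                          # HDP features weighted by popularity
    Samanta(topK=50, if_pop=2, text_mode='HDP')   # Samanta baseline, ranked by the full product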