Пример #1
0
def binary_keyword(if_pop = False):
    """Evaluate the keyword-vector cosine baseline (reported as "WVSM").

    Every candidate API of every test mashup is scored with ``cos_sim`` between
    the mashup row and the API row of the matrices returned by
    ``gd.get_binary_v()``. The scores are passed to ``evalute`` and the result
    is recorded with ``summary``.

    :param if_pop: when truthy, each similarity is multiplied by
        ``api2pop[api_id]`` and the result is recorded under ``WVSM_pop``.
    :return: None
    """
    # The popularity table is fetched regardless of ``if_pop``.
    _unused_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    default_gd = get_default_gd()
    mashup_vecs, api_vecs, _mashup_words, _api_words = default_gd.get_binary_v()

    def _score(mashup_id, api_id):
        # Cosine similarity of the two feature rows, optionally popularity-weighted.
        similarity = cos_sim(mashup_vecs[mashup_id], api_vecs[api_id])
        if if_pop:
            return similarity * api2pop[api_id]
        return similarity

    # Score the test set (WVSM = Weighted Vector Space Model).
    candidate_ids_list = []
    all_predict_results = []
    for row, mashup_entry in enumerate(dataset.crt_ds.test_mashup_id_list):
        mashup_id = mashup_entry[0]  # the mashup id is the first element of each entry
        api_candidates = dataset.crt_ds.test_api_id_list[row]
        candidate_ids_list.append(api_candidates)
        all_predict_results.append(
            [_score(mashup_id, api_id) for api_id in api_candidates])
    print('WVSM test,done!')

    # Evaluate and record the result.
    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              dataset.crt_ds.grounds, new_Para.param.topKs)
    if if_pop:
        model_name = 'WVSM_pop'
    else:
        model_name = 'WVSM'
    csv_table_name = dataset.crt_ds.data_name + model_name + "\n"
    summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result,
            new_Para.param.topKs)

    """
Пример #2
0
    def get_p4_sim(self, m_id2, mashup_slt_apis_list,aTagSem=None,api_tags = None):
        """Return the p4 (SBS-service-category-service-SBS) similarity, with caching.

        The result is cached under the key ``(m_id2, tuple(mashup_slt_apis_list))``
        in ``self.p4_sims`` (``aTagSem == 'MetaPath'``) or ``self.p4_sims_sem``
        (any other value). Each cache is loaded through ``self.load_sims`` the
        first time it is needed.

        :param m_id2: id of the mashup whose APIs (``self.mashup_apis[m_id2]``)
            are compared against ``mashup_slt_apis_list``.
        :param mashup_slt_apis_list: iterable of selected API ids.
        :param aTagSem: representation of the API tags: 'MetaPath', or another
            form such as 'Deep', 'HDP', 'TF_IDF'. Required.
        :param api_tags: per-API tags used in 'MetaPath' mode; defaults to
            ``self.unpadded_encoded_api_tags`` when omitted.
        :return: the cached or freshly computed similarity.
        :raises ValueError: if ``aTagSem`` is None.
        """
        if aTagSem is None:
            raise ValueError('must feed Para."aTagSem"!')

        meta_path_mode = (aTagSem == 'MetaPath')

        # Pick (and lazily load) the cache that matches the tag representation.
        if meta_path_mode:
            if self.p4_sims is None:
                self.p4_sims = self.load_sims('p4_sims_{}.dat'.format(aTagSem))
            sim_cache = self.p4_sims
        else:
            if self.p4_sims_sem is None:
                self.p4_sims_sem = self.load_sims('p4_sims_sem{}.dat'.format(aTagSem))
            sim_cache = self.p4_sims_sem

        cache_key = (m_id2, tuple(mashup_slt_apis_list))
        if cache_key in sim_cache:
            return sim_cache[cache_key]

        if meta_path_mode:
            self.flag4 = True  # a new entry is about to be added to p4_sims
            if api_tags is None:  # fall back to the default tags
                api_tags = self.unpadded_encoded_api_tags

            selected_categories = {}
            for a_id in mashup_slt_apis_list:
                selected_categories[a_id] = set(api_tags[a_id])
            other_categories = {}
            for a_id in self.mashup_apis[m_id2]:
                other_categories[a_id] = set(api_tags[a_id])
            result = cpt_p46_sim(selected_categories, other_categories)
        else:
            # Use the supplied tag features: one row per API of m_id2, one
            # column per selected API, then reduce with cpt_2list_sim.
            self.flag4_sem = True  # a new entry is about to be added to p4_sims_sem
            tag_sim_matrix = []
            for a_id2 in self.mashup_apis[m_id2]:
                tag_sim_matrix.append([
                    cos_sim(self.api_tag_features[a_id1], self.api_tag_features[a_id2])
                    for a_id1 in mashup_slt_apis_list
                ])
            result = cpt_2list_sim(tag_sim_matrix)

        sim_cache[cache_key] = result
        return result
Пример #3
0
def TF_IDF(if_pop):
    """Evaluate the TF-IDF cosine-similarity baseline on the test set.

    Each candidate API of each test mashup is scored with ``cos_sim`` between
    the mashup's and the API's TF-IDF features (from ``gd.model_pcs('TF_IDF')``),
    optionally multiplied by the API's popularity. Exactly one score is
    produced per candidate, so the prediction lists stay aligned with the
    candidate lists handed to ``evalute``.

    (Original note: this could be folded into the Samanta class, but that
    would be messy and is unnecessary.)

    :param if_pop: when truthy, multiply each similarity by ``api2pop[api_id]``
        and record the result under ``TFIDF_pop`` instead of ``TFIDF``.
    :return: None; the evaluation result is written through ``summary``.
    """
    gd = get_default_gd()
    _, api2pop = meta_data.pd.get_api_co_vecs()
    _mashup_IFIDF_features, _api_IFIDF_features = gd.model_pcs('TF_IDF')

    candidate_ids_list = []
    all_predict_results = []
    for i, mashup_entry in enumerate(dataset.crt_ds.test_mashup_id_list):
        test_mashup_id = mashup_entry[0]  # the mashup id is the first element of each entry
        candidate_ids = dataset.crt_ds.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)

        predict_results = []
        for api_id in candidate_ids:
            sim_score = cos_sim(_mashup_IFIDF_features[test_mashup_id],
                                _api_IFIDF_features[api_id])
            if if_pop:
                sim_score = sim_score * api2pop[api_id]
            # Append once per candidate. A second, unconditional append here
            # used to double the list length and misalign scores and candidates.
            predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('TF_IDF test,done!')

    name = 'TFIDF_pop' if if_pop else 'TFIDF'
    # Evaluate and record the result.
    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              dataset.crt_ds.grounds, new_Para.param.topKs)
    csv_table_name = dataset.crt_ds.data_name + name + "\n"
    summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result,
            new_Para.param.topKs)
Пример #4
0
def hdp_pop(if_pop = True):
    """Evaluate the HDP-feature cosine baseline, optionally popularity-weighted.

    Mashup and API features are read with ``np.loadtxt`` from
    ``<root_path>/baselines/mashup_HDP.txt`` and ``.../api_HDP.txt``. Every
    candidate API of every test mashup is scored with ``cos_sim``; the scores
    are evaluated with ``evalute`` and recorded with ``summary``.

    :param if_pop: when truthy, multiply each similarity by ``api2pop[api_id]``
        and record the result under ``hdp_pop`` instead of ``hdp``.
    :return: None
    """
    baseline_dir = os.path.join(dataset.crt_ds.root_path, 'baselines')
    if not os.path.exists(baseline_dir):
        os.makedirs(baseline_dir)

    mashup_features = np.loadtxt(os.path.join(baseline_dir, 'mashup_HDP.txt'))
    api_features = np.loadtxt(os.path.join(baseline_dir, 'api_HDP.txt'))

    # The popularity table is only loaded when it is going to be used.
    if if_pop:
        _unused_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()

    def _score(mashup_id, api_id):
        # Cosine similarity of the HDP features, optionally popularity-weighted.
        similarity = cos_sim(mashup_features[mashup_id], api_features[api_id])
        if if_pop:
            similarity *= api2pop[api_id]
        return similarity

    # Score the test set.
    candidate_ids_list = []
    all_predict_results = []
    for row, mashup_entry in enumerate(dataset.crt_ds.test_mashup_id_list):
        mashup_id = mashup_entry[0]  # the mashup id is the first element of each entry
        api_candidates = dataset.crt_ds.test_api_id_list[row]
        candidate_ids_list.append(api_candidates)
        all_predict_results.append(
            [_score(mashup_id, api_id) for api_id in api_candidates])
    print('hdp_pop test,done!')

    # Evaluate and record the result.
    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              dataset.crt_ds.grounds, new_Para.param.topKs)
    if if_pop:
        model_name = 'hdp_pop'
    else:
        model_name = 'hdp'
    csv_table_name = dataset.crt_ds.data_name + model_name + "\n"
    summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result,
            new_Para.param.topKs)
Пример #5
0
 def get_mean(self, mashup_text_index, api_text_index):
     """Return ``cos_sim`` between the mean word embeddings of two texts.

     :param mashup_text_index: iterable of word indexes of the mashup text.
     :param api_text_index: iterable of word indexes of the API text.
     :return: cosine similarity of the two averaged embeddings; each word is
         looked up with ``self.wordindex2embedding.get``.
     """
     def _mean_embedding(word_indexes):
         # Stack the embeddings of all words and average over the word axis.
         vectors = [self.wordindex2embedding.get(idx) for idx in word_indexes]
         return np.array(vectors).mean(axis=0)

     mashup_mean = _mean_embedding(mashup_text_index)
     api_mean = _mean_embedding(api_text_index)
     return cos_sim(mashup_mean, api_mean)
Пример #6
0
 def cpt_mashup_sim(self,m_id1,m_id2,mode='feature_cosine'):
     """Return the similarity of two mashups, memoised in ``self.m_feature_cosin_sim``.

     :param m_id1: id of the first mashup.
     :param m_id2: id of the second mashup.
     :param mode: only 'feature_cosine' is handled; any other value yields None.
     :return: 0 when both ids are equal; otherwise the cached value, or a
         freshly computed ``cos_sim`` of the two text features that is stored
         symmetrically in the cache. A negative cache entry means "not
         computed yet".
     """
     if m_id1 == m_id2:
         return 0
     if mode != 'feature_cosine':
         return None

     cache = self.m_feature_cosin_sim
     if cache[m_id1][m_id2] < 0:
         # Not computed yet: compute once and fill both symmetric cells.
         value = cos_sim(self.m_text_features[m_id1], self.m_text_features[m_id2])
         cache[m_id1][m_id2] = value
         cache[m_id2][m_id1] = value
         return value
     return cache[m_id1][m_id2]
Пример #7
0
 def cpt_sim(self, fea1, fea2):  # 'w','HDP','DL'
     """Compute the similarity of two feature vectors.

     Each feature is treated as a 1-D (row) vector.

     :param fea1: first feature vector.
     :param fea2: second feature vector.
     :return: ``fea1 . W . fea2`` when ``self.embedding_mode == 'W'`` (using
         the learned matrix ``self.W``), otherwise the plain ``cos_sim``.
     """
     if self.embedding_mode != 'W':
         return cos_sim(fea1, fea2)  # ordinary cosine similarity

     # NOTE: ``.dot`` on 1-D vectors is an inner product; lifting to 2-D first
     # just to multiply would be needlessly slow.
     projected = fea1.dot(self.W)
     return projected.dot(fea2)
Пример #8
0
    def get_p2_sim_sem(self, min_m_id, max_m_id, mTextTrueSem=None):
        """Return the p2 (SBS-content-SBS) feature-cosine similarity, with caching.

        :param min_m_id: first mashup id of the cache key.
        :param max_m_id: second mashup id of the cache key.
        :param mTextTrueSem: name of the text feature extractor (e.g. 'Deep',
            'HDP', 'TF_IDF'); it selects the cache file. When None, nothing
            is computed.
        :return: the cached or freshly computed similarity, or None when
            ``mTextTrueSem`` is None.
        """
        if mTextTrueSem is None:
            return None

        # Load the cache on first use.
        if self.p2_sims_sem is None:
            self.p2_sims_sem = self.load_sims('p2_sims_sem{}.dat'.format(mTextTrueSem))

        pair = (min_m_id, max_m_id)
        if pair in self.p2_sims_sem:
            return self.p2_sims_sem[pair]

        self.flag2_sem = True  # a new entry is about to be added to the cache
        # Uses the externally supplied text features, which should match mTextTrueSem.
        similarity = cos_sim(self.mashup_texts_features[min_m_id],
                             self.mashup_texts_features[max_m_id])
        self.p2_sims_sem[pair] = similarity
        return similarity
Пример #9
0
    def cpt_wod_cos_sim(self, id1, id2):
        """Return the cosine similarity between two words (by id), memoised.

        Results are stored in ``self.words_Sim`` under the key
        ``(smaller_id, larger_id)`` so each unordered pair is computed once.

        :param id1: id of the first word.
        :param id2: id of the second word.
        :return: 1 when both ids are equal, otherwise the cached or freshly
            computed ``cos_sim`` of the two embeddings.
        """
        if id1 == id2:
            return 1

        pair = (min(id1, id2), max(id1, id2))  # canonical order: small to large
        similarity = self.words_Sim.get(pair)
        if similarity is not None:
            return similarity

        embedding_of = self.wordindex2embedding.get
        similarity = cos_sim(embedding_of(pair[0]), embedding_of(pair[1]))
        self.words_Sim[pair] = similarity
        return similarity
Пример #10
0
 def get_word_cos_sim(self, id1, id2):
     """Return the cosine similarity between two words (by id), memoised.

     Results are stored in ``self.words_Sim`` under the key
     ``(smaller_id, larger_id)`` so each unordered pair is computed once.

     :param id1: id of the first word.
     :param id2: id of the second word.
     :return: 0 when either id is 0 (the index used for padding), 1 when
         both ids are equal, otherwise the cached or freshly computed
         ``cos_sim`` of the two embeddings.
     """
     if id1 == 0 or id2 == 0:  # padding index
         return 0
     if id1 == id2:
         return 1

     pair = (min(id1, id2), max(id1, id2))  # canonical order: small to large
     similarity = self.words_Sim.get(pair)
     if similarity is not None:
         return similarity

     similarity = cos_sim(self.wordindex2embedding[pair[0]],
                          self.wordindex2embedding[pair[1]])
     self.words_Sim[pair] = similarity
     return similarity
Пример #11
0
 def get_p1_sim_sem(self, min_m_id, max_m_id, mTagTrueSem=None):
     """Return the p1 (SBS-category-SBS) similarity in its "semantic" form, with caching.

     :param min_m_id: first mashup id of the cache key.
     :param max_m_id: second mashup id of the cache key.
     :param mTagTrueSem: "semantic form" of the tags, i.e. the feature
         extractor used (e.g. 'Deep', 'HDP', 'TF_IDF'); it selects the cache
         file. When None, nothing is computed.
     :return: the cached or freshly computed similarity, or None when
         ``mTagTrueSem`` is None.
     """
     if mTagTrueSem is None:
         return None

     # Load the cache on first use.
     if self.p1_sims_sem is None:
         self.p1_sims_sem = self.load_sims('p1_sims_sem{}.dat'.format(mTagTrueSem))

     pair = (min_m_id, max_m_id)
     if pair in self.p1_sims_sem:
         return self.p1_sims_sem[pair]

     self.flag1_sem = True  # a new entry is about to be added to the cache
     # Uses the externally supplied tag features, which should match mTagTrueSem.
     similarity = cos_sim(self.mashup_tag_features[min_m_id],
                          self.mashup_tag_features[max_m_id])
     self.p1_sims_sem[pair] = similarity
     return similarity
Пример #12
0
def Samanta(topK,
            if_pop=2,
            MF_mode='node2vec',
            pop_mode='',
            text_mode='HDP',
            LDA_topic_num=None):
    """Evaluate the Samanta baseline on the test set and record the result.

    For each test mashup, a latent factor is built as the similarity-weighted
    sum of the embeddings of its ``topK`` most similar training mashups (by
    ``cos_sim`` of text features). Each candidate API is then scored with
    (latent-factor inner product) * (text-feature cosine similarity),
    optionally combined with API popularity.

    Text features are cached as ``mashup_<text_mode>.txt`` and
    ``api_<text_mode>.txt`` under ``<data_root>/baselines``. They are generated
    with ``gd.model_pcs`` when the API feature file is missing.

    :param topK: number of nearest training mashups used to represent a new
        query's MF feature; also the shortlist size when ``if_pop == 1``.
    :param if_pop: how popularity is used: 0 = not used; 1 = only to re-rank
        the ``topK`` best candidates; 2 = multiplied into the score.
    :param MF_mode: not used by this function (original note: node2vec is
        simply used directly).
    :param pop_mode: forwarded to ``get_api_co_vecs`` (original note: whether
        popularity is squashed into the 0-1 range with a sigmoid).
    :param text_mode: feature extraction method, e.g. 'LDA' or 'HDP'.
    :param LDA_topic_num: forwarded to ``gd.model_pcs``.
    :return: None
    """

    api2pop = None
    if if_pop:
        _, api2pop = data_repository.get_md().get_api_co_vecs(
            pop_mode)  # TODO (marker kept from the original)

    root = os.path.join(data_repository.get_ds().data_root, 'baselines')
    if not os.path.exists(root):
        os.makedirs(root)
    mashup_feature_path = os.path.join(
        root, 'mashup_{}.txt'.format(text_mode))
    api_feature_path = os.path.join(root, 'api_{}.txt'.format(text_mode))

    # Obtain the mashup / API text features: compute and cache, or load the cache.
    if not os.path.exists(api_feature_path):
        gd = get_default_gd()
        _mashup_features, _api_features = gd.model_pcs(text_mode,
                                                       LDA_topic_num)
        np.savetxt(mashup_feature_path, _mashup_features)
        np.savetxt(api_feature_path, _api_features)
    else:
        _mashup_features = np.loadtxt(mashup_feature_path)
        _api_features = np.loadtxt(api_feature_path)

    candidate_ids_list = []
    all_predict_results = []

    test_data = data_repository.get_ds().test_data
    test_mashup_num = len(test_data.get('mashup'))
    mashup_emb_df = data_repository.get_ds().MF_obj.mashup_emb_df
    api_emb_df = data_repository.get_ds().MF_obj.api_emb_df

    # Loop invariants, computed once instead of once per test mashup.
    feat_dim = data_repository.get_args().implict_feat_dim
    train_m_ids = mashup_emb_df.index.tolist()
    api_ids = set(api_emb_df.index.tolist())
    api_zeros = np.zeros((feat_dim))  # stand-in factor for APIs unseen in training

    for i in range(test_mashup_num):
        test_m_id = test_data.get('mashup')[i][0]  # the mashup id
        candidate_ids = test_data.get('api')[i]
        candidate_ids_list.append(candidate_ids)

        # Represent the test mashup by the weighted latent factors of its
        # nearest training mashups.
        mid2sim = {}
        for train_m_id in train_m_ids:
            mid2sim[train_m_id] = cos_sim(_mashup_features[test_m_id],
                                          _mashup_features[train_m_id])  # TODO (marker kept from the original)
        topK_ids, topK_sims = zip(*(
            sorted(mid2sim.items(), key=lambda x: x[1], reverse=True)[:topK]))
        topK_sims = np.array(topK_sims) / sum(topK_sims)  # normalise the similarities
        cf_feature = np.zeros((feat_dim, ))
        for neighbor_id, weight in zip(topK_ids, topK_sims):
            cf_feature += weight * mashup_emb_df['embedding'][neighbor_id]

        # Score every candidate API.
        predict_results = []
        temp_predict_results = []  # (api_id, score) pairs, used when re-ranking by popularity
        for api_id in candidate_ids:
            # A test API may never have appeared in the training set.
            api_i_feature = api_emb_df['embedding'][
                api_id] if api_id in api_ids else api_zeros
            cf_score = np.sum(np.multiply(
                api_i_feature, cf_feature))  # inner product of the latent factors
            sim_score = cos_sim(_mashup_features[test_m_id],
                                _api_features[api_id])  # cosine similarity of text features
            if if_pop == 1:
                temp_predict_results.append((api_id, cf_score * sim_score))
            elif if_pop == 0:
                predict_results.append(cf_score * sim_score)
            elif if_pop == 2:
                predict_results.append(cf_score * sim_score * api2pop[api_id])
        if if_pop == 1:
            # First shortlist by the product score, then rank the shortlist by
            # popularity; everything outside the shortlist gets -1.
            max_k_pairs = heapq.nlargest(topK,
                                         temp_predict_results,
                                         key=lambda x: x[1])
            max_k_candidates, _ = zip(*max_k_pairs)
            max_k_candidates = set(max_k_candidates)
            predict_results = [
                api2pop[api_id] if api_id in max_k_candidates else -1
                for api_id in candidate_ids
            ]

        all_predict_results.append(predict_results)
    print('Samanta test,done!')

    # Evaluate.
    evaluate_result = evalute(
        candidate_ids_list, all_predict_results,
        data_repository.get_ds().test_data.get('all_ground_api_ids'),
        data_repository.get_args().topKs)
    _name = '_pop_{}'.format(if_pop)
    _name += data_repository.get_args().mf_mode
    csv_table_name = data_repository.get_ds().name + 'Samanta_model_{}'.format(
        topK) + _name + "\n"
    # NOTE(review): `evaluate_path` is not defined inside this function; it is
    # presumably a module-level name - confirm it exists where this is used.
    summary(evaluate_path, csv_table_name, evaluate_result,
            data_repository.get_args().topKs)  # record

    def divide(slt_apiNum):
        # Keep only the test instances that have exactly `slt_apiNum` selected APIs.
        test_api_id_list_, predictions_, grounds_ = [], [], []
        for i in range(test_mashup_num):
            if len(data_repository.get_ds().slt_api_ids_instances[i]
                   ) == slt_apiNum:
                test_api_id_list_.append(candidate_ids_list[i])
                predictions_.append(all_predict_results[i])
                grounds_.append(data_repository.get_ds().test_data.get(
                    'all_ground_api_ids')[i])
        return test_api_id_list_, predictions_, grounds_

    if data_repository.get_args().data_mode == 'newScene':
        # Additionally report results separately for 1, 2 and 3 selected APIs.
        for slt_apiNum in range(3):
            test_api_id_list_, predictions_, grounds_ = divide(slt_apiNum + 1)
            evaluate_result = evalute(test_api_id_list_, predictions_,
                                      grounds_,
                                      data_repository.get_args().topKs)
            summary(evaluate_path,
                    str(slt_apiNum + 1) + '_' + csv_table_name,
                    evaluate_result,
                    data_repository.get_args().topKs)
Пример #13
0
def Samanta(topK,if_pop=2,MF_mode='node2vec',pop_mode='',text_mode='HDP',LDA_topic_num=None):
    """Evaluate the Samanta baseline on the test set and record the result.

    For each test mashup, a latent factor is built as the similarity-weighted
    sum of the embeddings of its ``topK`` most similar training mashups (by
    ``cos_sim`` of text features). Each candidate API is then scored with
    (latent-factor inner product) * (text-feature cosine similarity),
    optionally combined with API popularity.

    Text features are cached as ``mashup_<text_mode>.txt`` and
    ``api_<text_mode>.txt`` under ``<root_path>/baselines``. They are generated
    with ``gd.model_pcs`` when the API feature file is missing.

    :param topK: number of nearest training mashups used to represent a new
        query's MF feature; also the shortlist size when ``if_pop == 1``.
    :param if_pop: how popularity is used: 0 = not used; 1 = only to re-rank
        the ``topK`` best candidates; 2 = multiplied into the score.
    :param MF_mode: not applied by this function (the code that set it on the
        global parameter object is disabled; original note: node2vec is simply
        used directly).
    :param pop_mode: forwarded to ``get_api_co_vecs`` (original note: whether
        popularity is squashed into the 0-1 range with a sigmoid).
    :param text_mode: feature extraction method, e.g. 'LDA' or 'HDP'.
    :param LDA_topic_num: forwarded to ``gd.model_pcs``.
    :return: None
    """

    api2pop = None
    if if_pop:
        _, api2pop = meta_data.pd.get_api_co_vecs(pop_mode)

    root = os.path.join(dataset.crt_ds.root_path, 'baselines')
    if not os.path.exists(root):
        os.makedirs(root)
    mashup_feature_path = os.path.join(root, 'mashup_{}.txt'.format(text_mode))
    api_feature_path = os.path.join(root, 'api_{}.txt'.format(text_mode))

    # Obtain the mashup / API text features: compute and cache, or load the cache.
    if not os.path.exists(api_feature_path):
        gd = get_default_gd()
        _mashup_features, _api_features = gd.model_pcs(text_mode, LDA_topic_num)
        np.savetxt(mashup_feature_path, _mashup_features)
        np.savetxt(api_feature_path, _api_features)
    else:
        _mashup_features = np.loadtxt(mashup_feature_path)
        _api_features = np.loadtxt(api_feature_path)

    candidate_ids_list = []
    all_predict_results = []

    # Loop invariants, computed once instead of once per mashup / per candidate.
    num_feat = new_Para.param.num_feat
    a_id2index = dataset.UV_obj.a_id2index
    api_zeros = np.zeros((num_feat))  # stand-in factor for APIs unseen in training

    test_mashup_num = len(dataset.crt_ds.test_mashup_id_list)
    for i in range(test_mashup_num):
        test_mashup_id = dataset.crt_ds.test_mashup_id_list[i][0]  # the mashup id
        candidate_ids = dataset.crt_ds.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)

        # Represent the test mashup by the weighted latent factors of its
        # nearest training mashups. The embedding matrix is addressed by local
        # (positional) index, so similarities are keyed by that index.
        localIndex2sim = {}
        for local_index, train_m_id in enumerate(dataset.UV_obj.m_ids):
            localIndex2sim[local_index] = cos_sim(_mashup_features[test_mashup_id],
                                                  _mashup_features[train_m_id])
        topK_indexes, topK_sims = zip(*(
            sorted(localIndex2sim.items(), key=lambda x: x[1], reverse=True)[:topK]))
        topK_sims = np.array(topK_sims) / sum(topK_sims)  # normalise the similarities
        cf_feature = np.zeros((num_feat,))
        for local_index, weight in zip(topK_indexes, topK_sims):
            cf_feature += weight * dataset.UV_obj.m_embeddings[local_index]

        # Score every candidate API.
        predict_results = []
        temp_predict_results = []  # (api_id, score) pairs, used when re-ranking by popularity
        for api_id in candidate_ids:
            # A test API may never have appeared in the training set.
            if api_id in a_id2index:
                api_i_feature = dataset.UV_obj.a_embeddings[a_id2index[api_id]]
            else:
                api_i_feature = api_zeros
            cf_score = np.sum(np.multiply(api_i_feature, cf_feature))  # inner product of the latent factors
            sim_score = cos_sim(_mashup_features[test_mashup_id],
                                _api_features[api_id])  # cosine similarity of text features
            if if_pop == 1:
                temp_predict_results.append((api_id, cf_score * sim_score))
            elif if_pop == 0:
                predict_results.append(cf_score * sim_score)
            elif if_pop == 2:
                predict_results.append(cf_score * sim_score * api2pop[api_id])
        if if_pop == 1:
            # First shortlist by the product score, then rank the shortlist by
            # popularity; everything outside the shortlist gets -1.
            max_k_pairs = heapq.nlargest(topK, temp_predict_results, key=lambda x: x[1])
            max_k_candidates, _ = zip(*max_k_pairs)
            max_k_candidates = set(max_k_candidates)
            predict_results = [api2pop[api_id] if api_id in max_k_candidates else -1
                               for api_id in candidate_ids]

        all_predict_results.append(predict_results)
    print('Samanta test,done!')

    # Evaluate and record.
    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              dataset.crt_ds.grounds, new_Para.param.topKs)
    _name = '_pop_{}'.format(if_pop)
    _name += new_Para.param.mf_mode
    csv_table_name = dataset.crt_ds.data_name + 'Samanta_model_{}'.format(topK) + _name + "\n"
    summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result,
            new_Para.param.topKs)

    def divide(slt_apiNum):
        # Keep only the test instances that have exactly `slt_apiNum` selected APIs.
        test_api_id_list_, predictions_, grounds_ = [], [], []
        for i in range(test_mashup_num):
            if len(dataset.crt_ds.slt_api_ids_instances[i]) == slt_apiNum:
                test_api_id_list_.append(candidate_ids_list[i])
                predictions_.append(all_predict_results[i])
                grounds_.append(dataset.crt_ds.grounds[i])
        return test_api_id_list_, predictions_, grounds_

    if new_Para.param.data_mode == 'newScene':
        # Additionally report results separately for 1, 2 and 3 selected APIs.
        for slt_apiNum in range(3):
            test_api_id_list_, predictions_, grounds_ = divide(slt_apiNum + 1)
            evaluate_result = evalute(test_api_id_list_, predictions_, grounds_,
                                      new_Para.param.topKs)
            summary(new_Para.param.evaluate_path,
                    str(slt_apiNum + 1) + '_' + csv_table_name,
                    evaluate_result, new_Para.param.topKs)