Example #1
def MF_baseline(train_datas, test_datas, mode=''):
    all_predict_results = []  # predicted scores for each test sample (several APIs each)
    for slt_num in range(1, data_repository.get_args().slt_item_num + 1):  # train/test sets with different numbers of selected items
        test_mashup_id_list, test_api_id_list, grounds = test_datas[slt_num - 1]
        # interface for computing and reading the MF results;
        # MF here is the matrix-factorization class, distinct from this wrapper function
        UV_obj = MF(data_repository.get_ds().data_root, mode,
                    train_datas[slt_num - 1], slt_num)
        m_id2index, a_id2index = UV_obj.m_id2index, UV_obj.a_id2index
        for i in range(len(test_mashup_id_list)):
            test_mashup_id = test_mashup_id_list[i][0]  # one mashup id per sample
            predict_results = []
            for test_api_id in test_api_id_list[i]:
                if test_mashup_id not in m_id2index or test_api_id not in a_id2index:
                    dot = 0  # no latent factors for unseen mashups/apis
                else:
                    m_embedding = UV_obj.m_embeddings[m_id2index[test_mashup_id]]
                    a_embedding = UV_obj.a_embeddings[a_id2index[test_api_id]]
                    dot = np.dot(m_embedding, a_embedding)
                predict_results.append(dot)
            all_predict_results.append(predict_results)
        print('{}_{} test, done!'.format(mode, slt_num))

        evaluate_result = evalute(
            test_api_id_list, all_predict_results,
            data_repository.get_ds().test_data.get('all_ground_api_ids'),
            data_repository.get_args().topKs)  # evaluate
        csv_table_name = data_repository.get_ds().name + mode + str(slt_num) + "\n"  # whole_model.name
        summary(evaluate_path, csv_table_name, evaluate_result,
                data_repository.get_args().topKs)  # record
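The prediction step above is plain matrix factorization: a mashup-api pair's score is the dot product of their latent factors. A minimal standalone sketch of that scoring (toy dimensions and random factor matrices, purely illustrative):

import numpy as np

rng = np.random.default_rng(0)
m_embeddings = rng.random((5, 8))   # 5 mashups x 8 latent factors
a_embeddings = rng.random((10, 8))  # 10 apis x 8 latent factors

def mf_score(m_index, a_index):
    # predicted affinity of a mashup-api pair under plain MF
    return float(np.dot(m_embeddings[m_index], a_embeddings[a_index]))

print(mf_score(0, 3))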
Example #2
    def get_true_candi_apis(self):
        # TODO: currently unused
        # Following IsRec's idea, only APIs invoked by neighbor mashups are kept as candidates
        self.mid2candiAids = {}  # filled below, or loaded from disk
        self.mid2candiAids_path = os.path.join(self.model_dir, 'true_candi_apis.txt')

        if not os.path.exists(self.mid2candiAids_path):
            for key, id2PathSims in self.mID2PathSims.items():
                m_id = key[0]  # key = (m_id, tuple(slt_apis_list))
                if m_id not in self.mid2candiAids.keys():
                    all_neighbor_mids = set()
                    for id2sim in id2PathSims:  # similarities to the pruned candidate neighbors under one path
                        num = min(self.neighbor_size, len(id2sim))
                        sorted_id2sim = sorted(id2sim.items(), key=lambda x: x[1], reverse=True)[:num]  # neighbors under this path
                        sorted_ids, _ = zip(*sorted_id2sim)
                        all_neighbor_mids = all_neighbor_mids.union(set(sorted_ids))
                    true_candi_apis = set()
                    for neighbor_mid in all_neighbor_mids:
                        if neighbor_mid in data_repository.get_ds().train_mashup_api_dict.keys():
                            true_candi_apis = true_candi_apis.union(
                                set(data_repository.get_ds().train_mashup_api_dict[neighbor_mid]))  # APIs invoked by that neighbor mashup
                    self.mid2candiAids[m_id] = true_candi_apis
            save_dict(self.mid2candiAids_path, self.mid2candiAids)
        else:
            self.mid2candiAids = read_dict(self.mid2candiAids_path)
        return self.mid2candiAids
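The neighbor selection inside the loop is a plain sort of an {id: similarity} dict; extracted as a small sketch (names are illustrative):

def top_k_neighbors(id2sim, k):
    # ids of the k highest-similarity entries of an {id: sim} dict
    k = min(k, len(id2sim))
    ranked = sorted(id2sim.items(), key=lambda x: x[1], reverse=True)[:k]
    return [mid for mid, _ in ranked]

print(top_k_neighbors({7: 0.9, 3: 0.4, 11: 0.7}, 2))  # [7, 11]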
Example #3
def bl_PasRec():
    model_name = 'PasRec_2path'  # 'PasRec_2path'
    epoch_num = 60  # was 40 before; 40 was slightly worse than 20
    neighbor_size = 15
    topTopicNum = 3

    args = data_repository.get_args()
    train_data, test_data = (data_repository.get_ds().train_data,
                             data_repository.get_ds().test_data)

    HINRec_model = HINRec(args,
                          model_name=model_name,
                          epoch_num=epoch_num,
                          neighbor_size=neighbor_size,
                          topTopicNum=topTopicNum)
    if os.path.exists(HINRec_model.weight_path):
        print('already trained, return!')
    else:
        # the model is evaluated every 20 epochs, so test_data is passed into train()
        HINRec_model.train(test_data)
        HINRec_model.save_model()
        evalute_by_epoch(
            HINRec_model,
            HINRec_model,
            HINRec_model.model_name,
            test_data,
            evaluate_by_slt_apiNum=False)  # ,if_save_recommend_result=True)
Example #4
def pop():
    """
    Popularity baseline: score each candidate API by its popularity.
    :return:
    """
    api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(data_repository.get_ds().test_mashup_id_list)):
        test_mashup_id = data_repository.get_ds().test_mashup_id_list[i][0]  # one mashup id per sample
        candidate_ids = data_repository.get_ds().test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)

        predict_results = []
        for api_id in candidate_ids:
            predict_results.append(api2pop[api_id])
        all_predict_results.append(predict_results)
    print('pop test, done!')

    evaluate_result = evalute(
        candidate_ids_list, all_predict_results,
        data_repository.get_ds().test_data.get('all_ground_api_ids'),
        data_repository.get_args().topKs)  # evaluate
    csv_table_name = data_repository.get_ds().name + 'pop' + "\n"  # whole_model.name
    summary(evaluate_path, csv_table_name, evaluate_result,
            data_repository.get_args().topKs)  # record
Example #5
def CI_NI_fineTuning():
    args = data_repository.get_args()
    train_data, test_data = (data_repository.get_ds().train_data,
                             data_repository.get_ds().test_data)

    CI_recommend_model = CI_Model(args)
    CI_model_obj = CI_recommend_model.get_model()
    CI_model_obj = train_model(CI_recommend_model, CI_model_obj, train_data,
                               test_data, args.train_mode, args.train_new)
Example #6
def divide(slt_apiNum):
    test_api_id_list_, predictions_, grounds_ = [], [], []
    for i in range(test_mashup_num):
        if len(data_repository.get_ds().slt_api_ids_instances[i]) == slt_apiNum:
            test_api_id_list_.append(candidate_ids_list[i])
            predictions_.append(all_predict_results[i])
            grounds_.append(data_repository.get_ds().test_data.get('all_ground_api_ids')[i])
    return test_api_id_list_, predictions_, grounds_
Example #7
        def initialize():
            # merge text and tag information: everything about one mashup/api goes into one row
            if tag_times > 0:
                assert len(mashup_descriptions) == len(mashup_categories)
                self.mashup_dow = []
                for i in range(len(mashup_descriptions)):
                    # simply concatenate text and tags; is there a better way?
                    self.mashup_dow.append(mashup_descriptions[i] + mashup_categories[i] * tag_times)
            else:
                self.mashup_dow = mashup_descriptions

            if tag_times > 0:
                assert len(api_descriptions) == len(api_categories)
                self.api_dow = []
                for i in range(len(api_descriptions)):
                    self.api_dow.append(api_descriptions[i] + api_categories[i] * tag_times)
            else:
                self.api_dow = api_descriptions

            if self.strict_train:
                # encodings of the mashups/apis used for training
                self.train_mashup_dow = [self.mashup_dow[m_id] for m_id in data_repository.get_ds().his_mashup_ids]
                self.dct = Dictionary(self.train_mashup_dow)
                self.train_mashup_dow = [self.dct.doc2bow(mashup_info) for mashup_info in
                                         self.train_mashup_dow]  # (word id, count) pairs
            else:
                self.dct = Dictionary(self.mashup_dow + self.api_dow)

            # compute a feature for every mashup/api
            self.mashup_dow = [self.dct.doc2bow(mashup_info) for mashup_info in self.mashup_dow]  # (word id, count) for every mashup text
            self.api_dow = [self.dct.doc2bow(api_info) for api_info in self.api_dow]
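For reference, the gensim Dictionary/doc2bow pattern used above in isolation; a minimal sketch with toy documents (the tokens are made up):

from gensim.corpora import Dictionary

# toy corpus: each document is a token list (description plus repeated tags, as above)
docs = [['map', 'geo', 'travel', 'travel'], ['photo', 'share', 'social']]
dct = Dictionary(docs)                 # builds the token -> integer-id table
bows = [dct.doc2bow(doc) for doc in docs]
print(bows[0])                         # [(id, count), ...] pairs; 'travel' counted twice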
Example #8
def load_pretrained_model(recommend_model, model):
    """
    Simply load and return the already-trained model.
    :param recommend_model: wrapper object holding the model directory
    :param model: the model whose weights are loaded
    :return:
    """
    with open(
            data_repository.get_ds().new_best_epoch_path.format(
                recommend_model.model_dir), 'r') as f:
        best_epoch = int(f.readline())
    para_path = data_repository.get_ds().new_model_para_path.format(
        recommend_model.model_dir, best_epoch)
    model.load_weights(para_path)
    print('load whole_model:{}, done!'.format(recommend_model.simple_name))
    return model
Example #9
    def get_name(self):
        """
        Used when recording results: records dataset info + model info.
        :return:
        """
        if not self.name:
            self.name = (data_repository.get_md().name + '_' +
                         data_repository.get_ds().name + '_' + self.simple_name)
        return self.name
Example #10
    def set_embedding_matrixs(self):
        # id -> embedding
        self.i_factors_matrix = np.zeros(
            (data_repository.get_md().api_num + 1, self.args.implict_feat_dim))
        api_emb_df = data_repository.get_ds().MF_obj.api_emb_df
        for api_id, embedding in zip(api_emb_df.index.tolist(),
                                     api_emb_df.embedding.tolist()):
            if isinstance(embedding, str):
                embedding = eval(embedding)  # embedding stored as its repr string
            self.i_factors_matrix[api_id] = embedding
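Parsing a stringified vector with eval works but executes arbitrary code; a safer drop-in for this pattern, sketched with ast.literal_eval:

import ast
import numpy as np

s = '[0.12, -0.5, 0.33]'             # an embedding serialized as its repr string
vec = np.array(ast.literal_eval(s))  # literal_eval accepts only Python literals
print(vec.dtype, vec.shape)          # float64 (3,)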
Example #11
def binary_keyword(if_pop=False):
    # pop
    api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    gd = get_default_gd()
    mashup_binary_matrix, api_binary_matrix, mashup_words_list, api_words_list = gd.get_binary_v()

    # test WVSM (Weighted Vector Space Model)
    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(data_repository.get_ds().test_mashup_id_list)):
        test_mashup_id = data_repository.get_ds().test_mashup_id_list[i][0]  # one mashup id per sample
        candidate_ids = data_repository.get_ds().test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)

        predict_results = []
        for api_id in candidate_ids:
            if if_pop:
                sim_score = cos_sim(
                    mashup_binary_matrix[test_mashup_id],
                    api_binary_matrix[api_id]) * api2pop[api_id]
            else:
                sim_score = cos_sim(mashup_binary_matrix[test_mashup_id],
                                    api_binary_matrix[api_id])  # test the effect of the feature vectors alone
            predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('WVSM test, done!')

    evaluate_result = evalute(
        candidate_ids_list, all_predict_results,
        data_repository.get_ds().test_data.get('all_ground_api_ids'),
        data_repository.get_args().topKs)  # evaluate
    name = 'WVSM_pop' if if_pop else 'WVSM'
    csv_table_name = data_repository.get_ds().name + name + "\n"  # whole_model.name
    summary(evaluate_path, csv_table_name, evaluate_result,
            data_repository.get_args().topKs)  # record
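The cos_sim helper called throughout these snippets is not shown; a minimal numpy version consistent with how it is used might look like this (assumed implementation, not necessarily the project's actual one):

import numpy as np

def cos_sim(a, b):
    # cosine similarity of two 1-D vectors; 0 if either is all-zero
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom else 0.0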
    """
Example #12
    def set_paths(self):
        # path setup
        self.model_dir = os.path.join(data_repository.get_ds().data_root,
                                      self.get_simple_name())  # model directory
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)
        self.model_name_path = os.path.join(self.model_dir, 'model_name.dat')
        # self.CI_features_path = os.path.join(self.model_dir, 'CI_features.fea')
        # self.train_slt_apis_mid_features_path = os.path.join(self.model_dir, 'train_slt_apis_mid_features.csv')
        # self.test_slt_apis_mid_features_path = os.path.join(self.model_dir, 'test_slt_apis_mid_features.csv')
        self.ma_text_tag_feas_path = os.path.join(
            self.model_dir,
            'mashup_api_text_tag_feas.dat')  # extracted text features of mashups and apis
Example #13
def TF_IDF(if_pop):
    """
    Could be folded into the Samanta class, but that would be too messy; unnecessary.
    :return:
    """
    gd = get_default_gd()
    api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    _mashup_IFIDF_features, _api_IFIDF_features = gd.model_pcs('TF_IDF')

    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(data_repository.get_ds().test_mashup_id_list)):
        test_mashup_id = data_repository.get_ds().test_mashup_id_list[i][0]  # one mashup id per sample
        candidate_ids = data_repository.get_ds().test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)

        predict_results = []
        for api_id in candidate_ids:
            sim_score = cos_sim(_mashup_IFIDF_features[test_mashup_id],
                                _api_IFIDF_features[api_id])
            if if_pop:
                predict_results.append(sim_score * api2pop[api_id])
            else:
                predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('TF_IDF test, done!')

    name = 'TFIDF_pop' if if_pop else 'TFIDF'
    evaluate_result = evalute(
        candidate_ids_list, all_predict_results,
        data_repository.get_ds().test_data.get('all_ground_api_ids'),
        data_repository.get_args().topKs)  # evaluate
    csv_table_name = data_repository.get_ds().name + name + "\n"  # whole_model.name
    summary(evaluate_path, csv_table_name, evaluate_result,
            data_repository.get_args().topKs)  # record
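gd.model_pcs('TF_IDF') is project-specific; as a generic point of comparison, the same TF-IDF-plus-cosine scoring can be sketched with scikit-learn (the toy texts are made up):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

mashup_texts = ['map travel route planner', 'photo sharing social feed']
api_texts = ['maps and routing api', 'image upload api']

vec = TfidfVectorizer()
tfidf = vec.fit_transform(mashup_texts + api_texts)   # shared vocabulary for both sides
mashup_tfidf, api_tfidf = tfidf[:2], tfidf[2:]
print(cosine_similarity(mashup_tfidf, api_tfidf))     # 2x2 mashup-api similarity matrix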
Example #14
    def train(self, test_data):
        """
        Mimics the librec implementation: each sample pairs an api with one positive and one
        negative mashup, with at most 50 samples per api (balance issue?).
        Evaluates every 20 epochs; training data is not passed in, the dataset is used directly.
        :param test_data:
        :return:
        """
        for index in range(self.epoch_num):
            loss = 0
            for sampleCount in range(len(self.his_a_ids) * self.sample_ratio):  # sample_ratio pairs per api
                while True:
                    a_id = choice(self.his_a_ids)
                    if len(self.train_aid2mids[a_id]) == len(data_repository.get_ds().his_mashup_ids):  # invoked by every mashup: no negative example exists
                        continue
                    pos_m_ids = self.train_aid2mids[a_id]  # positive examples
                    pos_m_id = choice(list(pos_m_ids))
                    neg_m_ids = data_repository.get_ds().his_mashup_ids_set - pos_m_ids
                    neg_m_id = choice(list(neg_m_ids))
                    break

                # when computing similarities during training, the selected services must exclude the current api
                posPredictRating, posPathScores = self.predict_an_instance(
                    pos_m_id, a_id, data_repository.get_ds().train_mashup_api_dict[pos_m_id] - {a_id})
                negPredictRating, negPathScores = self.predict_an_instance(
                    neg_m_id, a_id, data_repository.get_ds().train_mashup_api_dict[neg_m_id] - {a_id})
                diffValue = posPredictRating - negPredictRating
                deriValue = sigmoid(-diffValue)
                lossValue = -math.log(sigmoid(diffValue))
                loss += lossValue

                for i in range(len(self.path_weights)):  # optimize the weight of the i-th path
                    temp_value = self.path_weights[i]
                    self.path_weights[i] += self.learning_rate * (deriValue * (posPathScores[i] - negPathScores[i]) - self.reg * temp_value)
                    loss += self.reg * temp_value * temp_value
            print('epoch:{}, loss:{}'.format(index, loss))

            if index > 0 and index % 20 == 0:
                self.test_model(test_data)
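The inner update is a standard BPR-style pairwise step over the meta-path weights; a condensed sketch of just that gradient update, detached from the class (the sigmoid helper and names are illustrative):

import math

def sigmoid(x):
    return 1.0 / (1.0 + math.exp(-x))

def bpr_step(weights, pos_scores, neg_scores, lr=0.001, reg=0.001):
    # one pairwise update: push the positive item's score above the negative one's
    diff = sum(w * (p - n) for w, p, n in zip(weights, pos_scores, neg_scores))
    deri = sigmoid(-diff)                # derivative of -log(sigmoid(diff))
    loss = -math.log(sigmoid(diff))
    for i, w in enumerate(weights):
        weights[i] += lr * (deri * (pos_scores[i] - neg_scores[i]) - reg * w)
        loss += reg * w * w
    return loss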
Example #15
    def process(self, sim_model=None, train_data=None, test_data=None):
        # prepare the various similarities: sim_model may be a CI model providing text/tag
        # features, or a HINRec_model providing similarity support
        self.his_mashup_NI_feas = data_repository.get_ds().MF_obj.mashup_emb_df['embedding'][
            data_repository.get_ds().his_mashup_ids].tolist()  # TODO
        if isinstance(self.his_mashup_NI_feas[0], str):
            self.his_mashup_NI_feas = list(map(eval, self.his_mashup_NI_feas))
        self.his_mashup_NI_feas = np.array(self.his_mashup_NI_feas)
        if self.NI_sim_mode == 'tagSim':  # similarity from the CI features; used by MISR TODO
            self.set_mashup_api_features(sim_model)
        else:
            self.m2neighors_path = os.path.join(sim_model.model_dir,
                                                'm2neighors.dat')
            self.m2neighors = {}
            self.path_weights = sim_model.path_weights  # meta-path weights from the pretrained similarity model

            self.m2AllSimsPath = os.path.join(
                sim_model.model_dir,
                'mID2AllSims_{}.sim'.format(self.NI_sim_mode))
            self.m2ASimPath = os.path.join(
                sim_model.model_dir,
                'mID2ASim_{}_{}_{}.sim'.format(self.NI_sim_mode,
                                               self.path_topK_mode, self.topK))
            self.m2ASim, self.m2AllSims = {}, {}

            all_paths_sim_modes = [
                'PasRec', 'PasRec_2path', 'IsRec', 'IsRec_best'
            ]
            if self.NI_sim_mode in all_paths_sim_modes:  # computing the mashup representation needs the already-selected services
                self.m2NI_feas = {}
                self.m2NI_feas_path = os.path.join(
                    sim_model.model_dir,
                    'NI_m_id2{}_{}_{}.feas'.format(self.NI_sim_mode,
                                                   self.path_topK_mode,
                                                   self.topK))
                self.get_samples_m_feas(train_data, test_data, sim_model)
Example #16
def train_model(recommend_model,
                model,
                train_data,
                test_data,
                train_mode,
                retrain=True,
                true_candidates_dict=None):
    """
    Works for all model variants (fully and partially cold-start, full and partial).
    :param recommend_model:
    :param model:
    :param train_data: matches the args; determines whether slt_api_ids are included
    :param test_data:
    :param train_mode: 'best_NDCG' or 'min_loss'
    :param retrain: whether to retrain the model
    :return:
    """
    # everything model-related lives under this dataset's directory, one folder per model!!!

    model_dir = recommend_model.model_dir
    if not os.path.exists(model_dir):
        print('makedirs for:', model_dir)
        os.makedirs(model_dir)

    if os.path.exists(data_repository.get_ds().new_best_epoch_path.format(
            model_dir)) and not retrain:  # load previously computed results
        print('preTrained whole_model, exists!')
        return load_pretrained_model(recommend_model, model)
    else:
        if train_mode == 'best_NDCG':
            model = train_best_NDCG_model(
                recommend_model,
                model,
                train_data,
                test_data,
                true_candidates_dict=true_candidates_dict)
        elif train_mode == 'min_loss':
            model = train_early_stop(recommend_model, model, train_data,
                                     test_data)
        elif train_mode == 'monitor loss&acc':
            train_monitoring_loss_acc_model(recommend_model, model, train_data,
                                            test_data)
        else:
            print('wrong train_mode:', train_mode)
        return model
Example #17
    def get_id2PathSims(self, m_id, slt_apis_list=None, if_temp_save=True, if_cutByTopics=True):
        key = (m_id, tuple(slt_apis_list)) if slt_apis_list else m_id
        if key in self.mID2PathSims.keys():  # empty after the model is reloaded; recomputed on the fly for NI when needed
            return self.mID2PathSims.get(key)
        else:
            his_m_ids = set(data_repository.get_ds().his_mashup_ids) - {m_id}
            if 'IsRec' in self.simple_name and if_cutByTopics:
                # IsRec's pruning strategy: choose neighbors only among mashups that share a tag
                final_his_m_ids = []
                for topic in self.m_id2topic[m_id]:
                    final_his_m_ids += list(filter(lambda x: x in his_m_ids, self.topic2m_ids[topic]))
                his_m_ids = final_his_m_ids

            if self.simple_name == 'PasRec':
                id2P1Sim = {neigh_m_id: self.mhs.get_p1_sim(min(m_id, neigh_m_id), max(m_id, neigh_m_id), 'MetaPath') for neigh_m_id in his_m_ids}
                # special case: for text similarity the content topics are used as tags, via get_p1_sim
                id2P2Sim = {neigh_m_id: self.mhs.get_p1_sim(min(m_id, neigh_m_id), max(m_id, neigh_m_id), 'MetaPath', self.m_id2topic) for neigh_m_id in his_m_ids}
                id2P3Sim = {neigh_m_id: self.mhs.get_p3_sim(neigh_m_id, slt_apis_list) for neigh_m_id in his_m_ids}
                id2P4Sim = {neigh_m_id: self.mhs.get_p4_sim(neigh_m_id, slt_apis_list, 'MetaPath') for neigh_m_id in his_m_ids}
                # special case: for text similarity the content topics are used as tags, via get_p4_sim
                id2P5Sim = {neigh_m_id: self.mhs.get_p4_sim(neigh_m_id, slt_apis_list, 'MetaPath', self.a_id2topic) for neigh_m_id in his_m_ids}
                id2P6Sim = {neigh_m_id: self.mhs.get_p6_sim(neigh_m_id, slt_apis_list) for neigh_m_id in his_m_ids}
                id2PathSims = [id2P1Sim, id2P2Sim, id2P3Sim, id2P4Sim, id2P5Sim, id2P6Sim]
            elif self.simple_name == 'IsRec':
                id2P1Sim = {neigh_m_id: self.mhs.get_p1_sim(min(m_id, neigh_m_id), max(m_id, neigh_m_id), 'MetaPath') for neigh_m_id in his_m_ids}
                id2P2Sim = {neigh_m_id: self.mhs.get_p2_sim(min(m_id, neigh_m_id), max(m_id, neigh_m_id), 'EmbMax') for neigh_m_id in his_m_ids}
                id2P3Sim = {neigh_m_id: self.mhs.get_p3_sim(neigh_m_id, slt_apis_list) for neigh_m_id in his_m_ids}
                id2P4Sim = {neigh_m_id: self.mhs.get_p4_sim(neigh_m_id, slt_apis_list, 'MetaPath') for neigh_m_id in his_m_ids}
                id2P5Sim = {neigh_m_id: self.mhs.get_p5_sim(neigh_m_id, slt_apis_list, 'EmbMax') for neigh_m_id in his_m_ids}
                id2P6Sim = {neigh_m_id: self.mhs.get_p6_sim(neigh_m_id, slt_apis_list) for neigh_m_id in his_m_ids}
                id2P7Sim = {neigh_m_id: self.mhs.get_p2_sim_sem(min(m_id, neigh_m_id), max(m_id, neigh_m_id), 'TF_IDF') for neigh_m_id in his_m_ids}
                id2PathSims = [id2P1Sim, id2P2Sim, id2P3Sim, id2P4Sim, id2P5Sim, id2P6Sim, id2P7Sim]
            elif self.simple_name == 'PasRec_2path':
                id2P1Sim = {neigh_m_id: self.mhs.get_p1_sim(min(m_id, neigh_m_id), max(m_id, neigh_m_id), 'MetaPath') for neigh_m_id in his_m_ids}
                id2P2Sim = {neigh_m_id: self.mhs.get_p1_sim(min(m_id, neigh_m_id), max(m_id, neigh_m_id), 'MetaPath', self.m_id2topic) for neigh_m_id in his_m_ids}
                id2PathSims = [id2P1Sim, id2P2Sim]
            elif self.simple_name == 'IsRec_best':
                id2P1Sim = {neigh_m_id: self.mhs.get_p1_sim(min(m_id, neigh_m_id), max(m_id, neigh_m_id), 'MetaPath') for neigh_m_id in his_m_ids}
                id2P2Sim = {neigh_m_id: self.mhs.get_p2_sim(min(m_id, neigh_m_id), max(m_id, neigh_m_id), 'EmbMax') for neigh_m_id in his_m_ids}
                id2P3Sim = {neigh_m_id: self.mhs.get_p2_sim_sem(min(m_id, neigh_m_id), max(m_id, neigh_m_id), 'TF_IDF') for neigh_m_id in his_m_ids}
                id2PathSims = [id2P1Sim, id2P2Sim, id2P3Sim]
            if if_temp_save:
                self.mID2PathSims[key] = id2PathSims
            return id2PathSims
Example #18
def train_early_stop(recommend_model, model, train_data, test_data):
    """
    Train with early stopping on the validation loss to obtain the best model,
    then evaluate that model.
    :return:
    """
    pairwise = data_repository.get_args().pairwise
    train_labels = train_data[-1]
    train_instances_tuple = recommend_model.get_instances(
        *train_data[:-1], pairwise_train_phase_flag=pairwise)

    train_model = recommend_model.get_pairwise_model() if pairwise else model
    if pairwise:
        train_model.compile(optimizer=recommend_model.optimizer,
                            loss=lambda y_true, y_pred: y_pred,
                            metrics=['accuracy'])
    else:
        train_model.compile(optimizer=recommend_model.optimizer,
                            loss='binary_crossentropy',
                            metrics=['accuracy'])

    early_stopping = EarlyStopping(monitor='val_loss',
                                   patience=10,
                                   verbose=2,
                                   mode='min')
    hist = train_model.fit(
        [*train_instances_tuple],
        train_labels,
        epochs=data_repository.get_args().num_epochs,
        batch_size=data_repository.get_args().small_batch_size,
        callbacks=[early_stopping],
        validation_split=data_repository.get_args().validation_split,
        shuffle=True)
    model.save_weights(data_repository.get_ds().new_model_para_path.format(
        recommend_model.model_dir, 'min_loss'))  # !!! fix

    model_name = recommend_model.get_simple_name() + recommend_model.get_name() + '_min_loss'
    save_loss_acc(hist, model_name, if_multi_epoch=True)

    epoch_evaluate_result = evalute_by_epoch(recommend_model, model,
                                             model_name, test_data)
    return model
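For reference, the Keras early-stopping pattern above in isolation; a minimal runnable sketch with a toy model (data, dimensions, and hyperparameters are made up, and tf.keras is assumed here):

import numpy as np
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

X, y = np.random.rand(200, 8), np.random.randint(0, 2, 200)
model = Sequential([Dense(16, activation='relu', input_shape=(8,)),
                    Dense(1, activation='sigmoid')])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# stop once val_loss fails to improve for 10 consecutive epochs, as above
early_stopping = EarlyStopping(monitor='val_loss', patience=10, mode='min', verbose=2)
model.fit(X, y, epochs=100, batch_size=32, validation_split=0.2,
          callbacks=[early_stopping], shuffle=True)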
Example #19
def hdp_pop(if_pop=True):
    # pop
    root = os.path.join(data_repository.get_ds().data_root, 'baselines')
    if not os.path.exists(root):
        os.makedirs(root)
    mashup_hdp_path = os.path.join(root, 'mashup_HDP.txt')  # ...
    api_hdp_path = os.path.join(root, 'api_HDP.txt')

    _mashup_hdp_features = np.loadtxt(mashup_hdp_path)
    _api_hdp_features = np.loadtxt(api_hdp_path)

    if if_pop:
        api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    # test
    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(data_repository.get_ds().test_mashup_id_list)):
        test_mashup_id = data_repository.get_ds().test_mashup_id_list[i][0]  # one mashup id per sample
        candidate_ids = data_repository.get_ds().test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)

        predict_results = []
        for api_id in candidate_ids:
            sim_score = cos_sim(_mashup_hdp_features[test_mashup_id],
                                _api_hdp_features[api_id])
            if if_pop:
                sim_score *= api2pop[api_id]
            predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('hdp_pop test, done!')

    evaluate_result = evalute(
        candidate_ids_list, all_predict_results,
        data_repository.get_ds().test_data.get('all_ground_api_ids'),
        data_repository.get_args().topKs)  # evaluate
    name = 'hdp_pop' if if_pop else 'hdp'
    csv_table_name = data_repository.get_ds().name + name + "\n"  # whole_model.name
    summary(evaluate_path, csv_table_name, evaluate_result,
            data_repository.get_args().topKs)  # record
Example #20
    def set_paths(self):
        self.model_dir = data_repository.get_ds().model_path.format(
            self.get_simple_name())  # model directory
Example #21
    def set_paths(self):
        self.model_dir = os.path.join(data_repository.get_ds().data_root,
                                      self.get_simple_name())  # model directory
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)
        self.model_name_path = os.path.join(self.model_dir, 'model_name.dat')
Example #22
def Samanta(topK,
            if_pop=2,
            MF_mode='node2vec',
            pop_mode='',
            text_mode='HDP',
            LDA_topic_num=None):
    """
    :param topK: number of KNN neighbors used to build the MF feature of a new query
    :param if_pop: how popularity is used: 0 not used; 1 re-ranking only; 2 rank by the full product
    :param MF_mode: for simplicity, just use node2vec
    :param pop_mode: whether popularity values are squashed into [0, 1] with a sigmoid
    :param text_mode: which feature-extraction method to use: LDA or HDP
    :param LDA_topic_num:
    :return:
    """

    api2pop = None
    if if_pop:
        api_co_vecs, api2pop = data_repository.get_md().get_api_co_vecs(
            pop_mode)  # TODO

    root = os.path.join(data_repository.get_ds().data_root, 'baselines')
    if not os.path.exists(root):
        os.makedirs(root)
    mashup_feature_path = os.path.join(
        root, 'mashup_{}.txt'.format(text_mode))  # ...
    api_feature_path = os.path.join(root, 'api_{}.txt'.format(text_mode))

    # obtain mashup_hdp_features and api_hdp_features
    if not os.path.exists(api_feature_path):
        gd = get_default_gd()
        _mashup_features, _api_features = gd.model_pcs(text_mode,
                                                       LDA_topic_num)
        np.savetxt(mashup_feature_path, _mashup_features)
        np.savetxt(api_feature_path, _api_features)
    else:
        _mashup_features = np.loadtxt(mashup_feature_path)
        _api_features = np.loadtxt(api_feature_path)

    candidate_ids_list = []
    all_predict_results = []

    test_data = data_repository.get_ds().test_data
    test_mashup_num = len(test_data.get('mashup'))
    mashup_emb_df = data_repository.get_ds().MF_obj.mashup_emb_df
    api_emb_df = data_repository.get_ds().MF_obj.api_emb_df

    for i in range(test_mashup_num):
        test_m_id = test_data.get('mashup')[i][0]  # one mashup id per sample
        candidate_ids = test_data.get('api')[i]
        candidate_ids_list.append(candidate_ids)

        # represent the new mashup as a weighted sum of its neighbor mashups' latent factors
        mid2sim = {}
        for train_m_id in mashup_emb_df.index.tolist():
            mid2sim[train_m_id] = cos_sim(_mashup_features[test_m_id],
                                          _mashup_features[train_m_id])  # TODO
        topK_ids, topK_sims = zip(*(
            sorted(mid2sim.items(), key=lambda x: x[1], reverse=True)[:topK]))
        topK_sims = np.array(topK_sims) / sum(topK_sims)  # normalize the sims
        cf_feature = np.zeros((data_repository.get_args().implict_feat_dim, ))
        for z in range(len(topK_ids)):
            cf_feature += topK_sims[z] * mashup_emb_df['embedding'][topK_ids[z]]

        # score every candidate api
        predict_results = []
        temp_predict_results = []  # helper when re-ranking by popularity
        api_zeros = np.zeros((data_repository.get_args().implict_feat_dim))
        api_ids = set(api_emb_df.index.tolist())
        for api_id in candidate_ids:
            api_i_feature = api_emb_df['embedding'][
                api_id] if api_id in api_ids else api_zeros  # a test api may never appear in training
            cf_score = np.sum(np.multiply(
                api_i_feature, cf_feature))  # inner product of the mashup and api latent factors
            sim_score = cos_sim(_mashup_features[test_m_id],
                                _api_features[api_id])  # cosine similarity of the features
            if if_pop == 1:
                temp_predict_results.append((api_id, cf_score * sim_score))
            elif if_pop == 0:
                predict_results.append(cf_score * sim_score)
            elif if_pop == 2:
                predict_results.append(cf_score * sim_score * api2pop[api_id])
        if if_pop == 1:
            max_k_pairs = heapq.nlargest(topK,
                                         temp_predict_results,
                                         key=lambda x: x[1])  # first rank once by the product
            max_k_candidates, _ = zip(*max_k_pairs)
            max_k_candidates = set(max_k_candidates)
            predict_results = [
                api2pop[api_id] if api_id in max_k_candidates else -1
                for api_id in candidate_ids
            ]  # re-rank

        all_predict_results.append(predict_results)
    print('Samanta test, done!')

    evaluate_result = evalute(
        candidate_ids_list, all_predict_results,
        data_repository.get_ds().test_data.get('all_ground_api_ids'),
        data_repository.get_args().topKs)  # evaluate
    _name = '_pop_{}'.format(if_pop)
    _name += data_repository.get_args().mf_mode
    csv_table_name = data_repository.get_ds().name + 'Samanta_model_{}'.format(
        topK) + _name + "\n"  # whole_model.name
    summary(evaluate_path, csv_table_name, evaluate_result,
            data_repository.get_args().topKs)  # record

    def divide(slt_apiNum):
        test_api_id_list_, predictions_, grounds_ = [], [], []
        for i in range(test_mashup_num):
            if len(data_repository.get_ds().slt_api_ids_instances[i]) == slt_apiNum:
                test_api_id_list_.append(candidate_ids_list[i])
                predictions_.append(all_predict_results[i])
                grounds_.append(data_repository.get_ds().test_data.get('all_ground_api_ids')[i])
        return test_api_id_list_, predictions_, grounds_

    if data_repository.get_args().data_mode == 'newScene':
        for slt_apiNum in range(3):
            test_api_id_list_, predictions_, grounds_ = divide(slt_apiNum + 1)
            evaluate_result = evalute(test_api_id_list_, predictions_,
                                      grounds_,
                                      data_repository.get_args().topKs)
            summary(evaluate_path,
                    str(slt_apiNum + 1) + '_' + csv_table_name,
                    evaluate_result,
                    data_repository.get_args().topKs)
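The core of the Samanta baseline is the KNN-weighted latent factor: the new mashup's CF vector is the similarity-weighted average of its top-K training neighbors' embeddings. A condensed standalone sketch of that step (array shapes are illustrative):

import numpy as np

def knn_cf_feature(query_text_vec, train_text_vecs, train_embeddings, topK=3):
    # cosine similarity of the query's text feature to every training mashup's
    sims = np.array([np.dot(query_text_vec, v) /
                     (np.linalg.norm(query_text_vec) * np.linalg.norm(v))
                     for v in train_text_vecs])
    top = np.argsort(sims)[::-1][:topK]      # indices of the K nearest neighbors
    weights = sims[top] / sims[top].sum()    # normalize similarities into weights
    return weights @ train_embeddings[top]   # weighted sum: the query's CF feature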
Example #23
def train_best_NDCG_model(recommend_model,
                          model,
                          train_data,
                          test_data,
                          true_candidates_dict=None,
                          CI_start_test_epoch=0,
                          earlyStop_epochs=5):
    """
    训练多个epoch,每个之后均测试,选择并返回NDCG等最终指标最优的模型
    :param recommend_model:  整体的推荐模型
    :param model:  model_core
    :param train_data:
    :param test_data:
    :param start_epoch: 之前该模型已经训练过多个epoch,在这个基础上接着训练
    :param true_candidates_dict:
    :return:
    """
    print('training_save_best_NDCG_model...')
    epoch_evaluate_results = []

    # model
    train_model = recommend_model.get_pairwise_model(
    ) if data_repository.get_args().pairwise else model

    # data
    train_instances_dict = recommend_model.get_instances(
        train_data,
        pairwise_train_phase_flag=data_repository.get_args().pairwise)
    train_labels = train_data.get('label')
    if data_repository.get_args().final_activation == 'softmax':  # transform the labels for softmax
        train_labels = utils.to_categorical(train_labels, num_classes=2)

    best_epoch, best_NDCG_5 = 0, 0
    for epoch in range(data_repository.get_args().num_epochs):
        if epoch == 0:  # compile on the first run
            # loss_ = lambda y_true, y_pred: y_pred if data_repository.get_args().pairwise else 'binary_crossentropy'
            # train_model.compile(optimizer=recommend_model.optimizer, loss=loss_,metrics=['accuracy'])
            train_model.compile(optimizer=recommend_model.optimizer,
                                loss='binary_crossentropy',
                                metrics=['accuracy'])
            print('whole_model compile, done!')
        print('Epoch {}'.format(epoch))

        hist = train_model.fit(
            train_instances_dict,
            np.array(train_labels),
            batch_size=data_repository.get_args().batch_size,
            epochs=1,
            verbose=1,
            shuffle=True,
            validation_split=data_repository.get_args().validation_split)
        print('Epoch {}, train done!'.format(epoch))

        # record: dataset setup, model architecture, training settings
        record_name = recommend_model.get_name() + data_repository.get_args(
        ).train_name if epoch == 0 else ''  # test-set results are written to evalute.csv
        save_loss_acc(hist, record_name, epoch=epoch)  # record every epoch

        # CI performs poorly in the first few epochs; skipping their evaluation saves time
        first_test_epoch = CI_start_test_epoch if isinstance(
            recommend_model, CI_Model) else 0
        if epoch < first_test_epoch:
            epoch_evaluate_results.append(None)
            continue

        # per-epoch evaluation
        epoch_evaluate_result = evalute_by_epoch(
            recommend_model,
            model,
            record_name,
            test_data,
            record_time=True if epoch == 0 else False,
            true_candidates_dict=true_candidates_dict)
        epoch_evaluate_results.append(epoch_evaluate_result)

        # save the model parameters only when NDCG@5 beats the current best TODO
        if epoch_evaluate_result[0][3] >= best_NDCG_5:
            best_NDCG_5 = epoch_evaluate_result[0][3]
            best_epoch = epoch
            model.save_weights(
                data_repository.get_ds().new_model_para_path.format(
                    recommend_model.model_dir, epoch))
        else:
            if epoch - best_epoch >= earlyStop_epochs:  # no improvement for several epochs: stop immediately
                break

    # record the best epoch and the best NDCG@5
    with open(
            data_repository.get_ds().new_best_epoch_path.format(
                recommend_model.model_dir), 'w') as f:
        f.write(str(best_epoch))
    with open(
            data_repository.get_ds().new_best_NDCG_path.format(
                recommend_model.model_dir), 'w') as f:
        f.write(str(best_NDCG_5))
    print('best epoch:{}, best NDCG@5:{}'.format(best_epoch, best_NDCG_5))

    # record the best metrics
    csv_table_name = 'best_indicaters\n'
    summary(evaluate_path, csv_table_name, epoch_evaluate_results[best_epoch],
            data_repository.get_args().topKs)

    # check whether the word-embedding matrix changed, especially the zero padding row
    # print('some embedding parameters after {} epoch:'.format(epoch))
    # print (recommend_model.embedding_layer.get_weights ()[0][:2])

    # delete the saved parameters of every non-best epoch
    try:
        for i in range(data_repository.get_args().num_epochs):
            temp_path = data_repository.get_ds().new_model_para_path.format(
                recommend_model.model_dir, i)
            if i != best_epoch and os.path.exists(temp_path):
                os.remove(temp_path)
        model.load_weights(data_repository.get_ds().new_model_para_path.format(
            recommend_model.model_dir, best_epoch))
    finally:
        return model  # note: returning from finally suppresses any exception raised above
Example #24
    def __init__(self, args, model_name='PasRec', semantic_mode='HDP', LDA_topic_num='',
                 epoch_num=15, neighbor_size=15, topTopicNum=3,
                 cluster_mode='LDA', cluster_mode_topic_num=100):
        # semantic_mode='HDP', LDA_topic_num=None: about features in the HIN
        # cluster_mode='LDA', cluster_mode_topic_num: about clustering by LDA...

        self.simple_name = model_name
        if self.simple_name == 'IsRec_best':
            self.p1_weight, self.p2_weight, self.p3_weight = 1/3, 1/3, 1/3
            self.path_weights = [self.p1_weight, self.p2_weight, self.p3_weight]
        elif self.simple_name == 'PasRec_2path':
            self.p1_weight, self.p2_weight = 1/2, 1/2
            self.path_weights = [self.p1_weight, self.p2_weight]
        elif self.simple_name == 'IsRec':
            self.p1_weight, self.p2_weight, self.p3_weight, self.p4_weight, \
                self.p5_weight, self.p6_weight, self.p7_weight = (1/7,) * 7
            self.path_weights = [self.p1_weight, self.p2_weight, self.p3_weight,
                                 self.p4_weight, self.p5_weight, self.p6_weight, self.p7_weight]
        else:
            self.p1_weight, self.p2_weight, self.p3_weight, self.p4_weight, \
                self.p5_weight, self.p6_weight = (1/6,) * 6
            self.path_weights = [self.p1_weight, self.p2_weight, self.p3_weight,
                                 self.p4_weight, self.p5_weight, self.p6_weight]

        self.neighbor_size = neighbor_size  # neighborhood size when searching for nearest neighbors
        self.epoch_num = epoch_num
        self.learning_rate = 0.001
        self.reg = 0.001
        self.sample_ratio = 50  # pairwise optimization: number of training pairs per api

        self.model_name = '{}_{}_epoch{}_nbSize{}TopicNum{}{}{}'.format(
            self.simple_name, semantic_mode, epoch_num, neighbor_size,
            topTopicNum, cluster_mode, cluster_mode_topic_num)
        self.model_dir = data_repository.get_ds().model_path.format(self.model_name)  # model directory !!!
        self.weight_path = os.path.join(self.model_dir, 'weights.npy')  # the essential data; save only this, nothing else is needed!

        # training set: api_id -> set(mashup_ids)
        self.train_aid2mids = {}
        for mashup_id, api_id in data_repository.get_ds().train_mashup_api_list:
            if api_id not in self.train_aid2mids.keys():
                self.train_aid2mids[api_id] = set()
            self.train_aid2mids[api_id].add(mashup_id)
        self.his_a_ids = list(self.train_aid2mids.keys())  # api ids that appear in the training set !!!
        self.notInvokeScore = 0  # baseline score for an api never invoked by any historical mashup; tried 1 and 0, 0.5 was very bad!!!

        # text and HIN similarities
        self.HIN_path = os.path.join(self.model_dir, 'HIN_sims')  # root directory of the HIN_sim source files !!!
        self.semantic_mode = semantic_mode
        self.LDA_topic_num = LDA_topic_num

        # text similarity inside the HIN: only used by IsRec_best, because PasRec and IsRec
        # compute text similarity either with topics as tags or with EmbMax!!!
        HIN_gd = get_default_gd(tag_times=2, strict_train=False)
        embedding_matrix = get_embedding_matrix(HIN_gd.dct.token2id, args.embedding_name,
                                                dimension=args.embedding_dim)  # embedding of every encoded word
        HIN_gd.model_pcs(model_name=self.semantic_mode, LDA_topic_num=self.LDA_topic_num)  # IsRec_best needs TF_IDF
        HIN_gd.get_all_encoded_comments()
        self.mhs = mashup_HIN_sims(embedding_matrix, gd=HIN_gd, semantic_name=self.semantic_mode,
                                   HIN_path=self.HIN_path, features=(HIN_gd._mashup_features, HIN_gd._api_features),
                                   if_text_sem=True, if_tag_sem=False)
        self.mID2PathSims = {}  # per mashup ID (with its already-invoked apis): similarities to historical mashups under each path
        self.HIN_sims_changed_flag = False

        # topTopicNum: in PasRec, used for LDA-style topic-based content similarity;
        # in IsRec, used to search for neighbors within the K clusters!!!
        self.topTopicNum = topTopicNum
        topic_gd = get_default_gd(tag_times=0, strict_train=True)  # process the text with gensim; no tags added to the text
        topic_gd.model_pcs(model_name=cluster_mode, LDA_topic_num=cluster_mode_topic_num)  # use HDP for clustering/features for now; switch to LDA once the topic number is settled
        self.m_id2topic, self.a_id2topic = topic_gd.get_topTopics(self.topTopicNum)
        # all mashups: topic -> mashup ids, i.e. mashups grouped by topic
        self.topic2m_ids = {}
        for m_id, topic_indexes in enumerate(self.m_id2topic):
            for topic_index in topic_indexes:
                if topic_index not in self.topic2m_ids:
                    self.topic2m_ids[topic_index] = []
                self.topic2m_ids[topic_index].append(m_id)

        self.read_model()  # mainly reads the weight parameters; the rest does not matter
Example #25
    def get_m2ASim(self, train_data, test_data, sim_model):
        """Get each mashup's normalized aggregate similarity vector over the other mashups."""
        if os.path.exists(self.m2neighors_path) and os.path.exists(
                self.m2ASimPath):  # ... used for the explicit computation
            with open(self.m2ASimPath, 'rb') as f:
                self.m2ASim = pickle.load(f)
            with open(self.m2neighors_path, 'rb') as f:
                self.m2neighors = pickle.load(f)
        else:  # compute everything at once and store it
            print('m2ASim not exist, computing!')
            dict_ = self.get_m2AllSims(
                train_data, test_data,
                sim_model)  # self.m2AllSims: the similarity mapping of every sample
            for key, id2PathSims in dict_.items():
                m_id = key if isinstance(key, int) else key[0]  # mashup ID

                if self.path_topK_mode == 'eachPathTopK':  # topK within each path
                    for i in range(len(id2PathSims)):  # similarities under one path
                        id2PathSim = id2PathSims[i]
                        num = min(self.topK, len(id2PathSim))
                        id2PathSim = sorted(id2PathSim.items(),
                                            key=lambda x: x[1],
                                            reverse=True)[:num]
                        id2PathSims[i] = {key: value for key, value in id2PathSim}

                id2score = {
                    his_m_id: 0
                    for his_m_id in data_repository.get_ds().his_mashup_ids
                }  # aggregate similarity to every historical mashup
                for his_m_id in id2score.keys():  # every historical neighbor mashup
                    if his_m_id != m_id:  # skip itself
                        for path_index, id2aPathSim in enumerate(
                                id2PathSims):  # each similarity path
                            pathSim = 0 if his_m_id not in id2aPathSim.keys(
                            ) else id2aPathSim[his_m_id]  # a historical mid may lack some path similarity
                            id2score[his_m_id] += pathSim * self.path_weights[path_index]

                # for the explicit design: store the topK neighbors after combining all paths
                num = min(self.topK, len(id2score))
                self.m2neighors[key], _ = zip(
                    *(sorted(id2score.items(),
                             key=lambda x: x[1],
                             reverse=True)[:num]))  # topK neighbor ids, in order

                if self.path_topK_mode == 'allPathsTopK':  # topK over the final combined score across all paths
                    num = min(self.topK, len(id2score))
                    id2score = sorted(id2score.items(),
                                      key=lambda x: x[1],
                                      reverse=True)[:num]
                    id2score = {key: value for key, value in id2score}

                sims = np.array([
                    id2score[his_m_id] if his_m_id in id2score.keys() else 0
                    for his_m_id in data_repository.get_ds().his_mashup_ids
                ])  # sims ordered by his_mashup_ids: shape (#his_m_ids,)
                sum_sim = sum(sims)
                if sum_sim == 0:
                    print('sims sum=0!')
                else:
                    sims = sims / sum_sim
                self.m2ASim[key] = sims

            print('m2ASim, computed!')
            with open(self.m2ASimPath, 'wb') as f:
                pickle.dump(self.m2ASim, f)

            with open(self.m2neighors_path, 'wb') as f:
                pickle.dump(self.m2neighors, f)
        return self.m2ASim
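The aggregation above boils down to a weighted sum of per-path similarities followed by L1 normalization; a condensed sketch of just that step (dict and list shapes are illustrative):

import numpy as np

def combine_path_sims(id2PathSims, path_weights, his_m_ids):
    # weighted sum of the per-path {id: sim} dicts, L1-normalized over his_m_ids
    scores = np.zeros(len(his_m_ids))
    for w, id2sim in zip(path_weights, id2PathSims):    # one dict per meta-path
        for j, his_m_id in enumerate(his_m_ids):
            scores[j] += w * id2sim.get(his_m_id, 0.0)  # missing sims count as 0
    total = scores.sum()
    return scores / total if total else scores          # normalized similarity vector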