示例#1
0
def pop():
    """
    Popularity baseline: score every candidate API by its popularity only.

    For each test mashup the candidate API ids are taken from the data set
    and each candidate receives ``api2pop[api_id]`` as its score. The scores
    are handed to ``evalute`` and the result is recorded through ``summary``.

    :return: None
    """
    # Only the popularity lookup is used; the co-occurrence vectors are not.
    _, api2pop = meta_data.pd.get_api_co_vecs()
    ds = data_repository.get_ds()
    top_ks = data_repository.get_args().topKs

    # One candidate list per test mashup (indexed in parallel with
    # test_mashup_id_list, as in the other baselines).
    candidate_ids_list = [
        ds.test_api_id_list[pos] for pos in range(len(ds.test_mashup_id_list))
    ]
    all_predict_results = [[api2pop[api_id] for api_id in candidate_ids]
                           for candidate_ids in candidate_ids_list]
    print('pop test,done!')

    # Evaluate against the ground-truth API ids and record the table.
    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              ds.test_data.get('all_ground_api_ids'), top_ks)
    csv_table_name = ds.name + 'pop' + "\n"
    summary(evaluate_path, csv_table_name, evaluate_result, top_ks)
示例#2
0
def MF(train_datas, test_datas, mode=''):
    """
    Evaluate matrix-factorization embeddings for each "selected item" count.

    For every ``slt_num`` in ``1..slt_item_num`` a factorization object is
    built from ``train_datas[slt_num - 1]``; each (test mashup, candidate API)
    pair is scored with the dot product of their embeddings (0 when either id
    has no embedding), and the scores are evaluated and recorded.

    :param train_datas: sequence indexed by ``slt_num - 1``; each entry is
        passed to the factorization object.
    :param test_datas: sequence indexed by ``slt_num - 1``; each entry unpacks
        to ``(test_mashup_id_list, test_api_id_list, grounds)``.
    :param mode: forwarded to the factorization object and used in the log
        line and the result-table name.
    :return: None
    """
    args = data_repository.get_args()
    for slt_num in range(1, args.slt_item_num + 1):
        # NOTE(review): ``grounds`` is unpacked but the evaluation below uses
        # the data set's 'all_ground_api_ids' instead -- confirm which one is
        # intended.
        test_mashup_id_list, test_api_id_list, grounds = test_datas[slt_num -
                                                                    1]
        # Scores of this slt_num only. They must be reset on every iteration:
        # sharing one list across iterations would hand evalute() more score
        # rows than there are entries in the current test_api_id_list.
        all_predict_results = []

        # NOTE(review): inside this function the name ``MF`` refers to the
        # function itself unless another ``MF`` is rebound elsewhere; this
        # 4-argument call presumably targets a factorization class -- confirm.
        UV_obj = MF(data_repository.get_ds().data_root, mode,
                    train_datas[slt_num - 1], slt_num)
        m_id2index, a_id2index = UV_obj.m_id2index, UV_obj.a_id2index

        for i in range(len(test_mashup_id_list)):
            test_mashup_id = test_mashup_id_list[i][0]
            predict_results = []
            for test_api_id in test_api_id_list[i]:
                if test_mashup_id in m_id2index and test_api_id in a_id2index:
                    m_embedding = UV_obj.m_embeddings[
                        m_id2index[test_mashup_id]]
                    a_embedding = UV_obj.a_embeddings[a_id2index[test_api_id]]
                    dot = np.dot(m_embedding, a_embedding)
                else:
                    # No embedding for the mashup or the API: score 0.
                    dot = 0
                predict_results.append(dot)
            all_predict_results.append(predict_results)
        print('{}_{} test,done!'.format(mode, slt_num))

        # Evaluate and record one table per slt_num.
        evaluate_result = evalute(
            test_api_id_list, all_predict_results,
            data_repository.get_ds().test_data.get('all_ground_api_ids'),
            args.topKs)
        csv_table_name = data_repository.get_ds().name + mode + str(
            slt_num) + "\n"
        summary(evaluate_path, csv_table_name, evaluate_result, args.topKs)
示例#3
0
    def recommend(self, topKs=None):
        """
        Public entry point: recommend one API set per test mashup and record
        the averaged evaluation indicators.

        The embedding parameters are refreshed and the clustering is run
        first. Then, for every test mashup and every K in ``topKs``, the
        first K entries of ``self.sorted_cls_index`` are combined into
        candidate sets, the candidate with the highest ``self.score_set``
        value is chosen and evaluated against the mashup's ground truth.

        :param topKs: iterable of cut-offs K; defaults to ``[5]``.
        :return: None; the averaged indicators are written through
            ``summary``.
        """
        # Avoid a shared mutable default argument.
        if topKs is None:
            topKs = [5]

        # Embedding parameters must be up to date before the clustering
        # (and the similarities it relies on) can be computed.
        self.update_embedding_paras()
        self.clustering()

        ds = data_repository.get_ds()
        # Axis 0: mashup; axis 1: one entry per K; axis 2: the indicators.
        all_indicators = []
        for mashup_pos, test_mashup in enumerate(ds.test_mashup_id_list):
            test_m_id = test_mashup[0]
            query_feature = self.m_features[test_m_id]

            # Prune the clusters for this query; this fills the
            # deduct_cls / sorted_cls_index attributes read below.
            self.cls_deduction(query_feature,
                               a_wight=0.6,
                               b_wight=0.15,
                               topK=5)

            a_mashup_indicators = []
            for topK in topKs:
                # Combine the members of the K best-ranked clusters.
                candidate_results = combinations([
                    self.deduct_cls[cls_index]
                    for cls_index in self.sorted_cls_index[:topK]
                ]).get_results()

                # Pick the candidate set with the highest utility. max()
                # returns the first maximum, which is the same winner a
                # stable descending sort would put first.
                utilities = [
                    self.score_set(candidate_results[idx])
                    for idx in range(len(candidate_results))
                ]
                best_idx = max(range(len(utilities)),
                               key=utilities.__getitem__)
                max_k_candidates = candidate_results[best_idx]

                a_mashup_indicators.append(
                    list(
                        evaluate(max_k_candidates, ds.grounds[mashup_pos],
                                 topK)))

            all_indicators.append(a_mashup_indicators)

        # Average over mashups, keeping the per-K / per-indicator axes.
        all_indicators = np.average(all_indicators, axis=0)
        summary(new_Para.param.evaluate_path,
                'set_rec_{}_clustingTs:{}_clsNum:{}'.format(
                    self.cluster_method, self.cluster_ts,
                    self.cls_num), all_indicators, topKs)
示例#4
0
def hdp_pop(if_pop=True):
    """
    HDP baseline: rank candidate APIs by the cosine similarity between the
    mashup's and the API's HDP feature vectors, optionally multiplied by the
    API's popularity.

    The feature matrices are loaded from ``mashup_HDP.txt`` / ``api_HDP.txt``
    in the ``baselines`` directory under the data root.

    :param if_pop: when truthy, multiply each similarity by
        ``api2pop[api_id]``.
    :return: None; results are recorded through ``summary``.
    """
    dataset = data_repository.get_ds()
    baseline_dir = os.path.join(dataset.data_root, 'baselines')
    if not os.path.exists(baseline_dir):
        os.makedirs(baseline_dir)

    mashup_vectors = np.loadtxt(os.path.join(baseline_dir, 'mashup_HDP.txt'))
    api_vectors = np.loadtxt(os.path.join(baseline_dir, 'api_HDP.txt'))

    # The popularity table is only needed for the popularity-weighted variant.
    if if_pop:
        _co_vecs, api2pop = meta_data.pd.get_api_co_vecs()

    candidate_ids_list = []
    all_predict_results = []
    for pos, mashup_entry in enumerate(dataset.test_mashup_id_list):
        mashup_id = mashup_entry[0]
        candidates = dataset.test_api_id_list[pos]
        candidate_ids_list.append(candidates)

        scores = []
        for api_id in candidates:
            score = cos_sim(mashup_vectors[mashup_id], api_vectors[api_id])
            if if_pop:
                score *= api2pop[api_id]
            scores.append(score)
        all_predict_results.append(scores)
    print('hdp_pop test,done!')

    top_ks = data_repository.get_args().topKs
    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              dataset.test_data.get('all_ground_api_ids'),
                              top_ks)
    variant = 'hdp_pop' if if_pop else 'hdp'
    csv_table_name = dataset.name + variant + "\n"
    summary(evaluate_path, csv_table_name, evaluate_result, top_ks)
示例#5
0
def binary_keyword(if_pop=False):
    """
    Binary keyword-vector baseline (reported as WVSM): rank candidate APIs by
    the cosine similarity between the mashup's and the API's binary vectors,
    optionally multiplied by the API's popularity.

    :param if_pop: when truthy, multiply each similarity by
        ``api2pop[api_id]``; otherwise use the similarity alone.
    :return: None; results are recorded through ``summary``.
    """
    _co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    # get_binary_v() yields four values; only the two matrices are used here.
    (mashup_binary_matrix, api_binary_matrix, _mashup_words,
     _api_words) = get_default_gd().get_binary_v()

    dataset = data_repository.get_ds()
    candidate_ids_list = []
    all_predict_results = []
    for pos, mashup_entry in enumerate(dataset.test_mashup_id_list):
        mashup_id = mashup_entry[0]
        candidates = dataset.test_api_id_list[pos]
        candidate_ids_list.append(candidates)

        scores = []
        for api_id in candidates:
            similarity = cos_sim(mashup_binary_matrix[mashup_id],
                                 api_binary_matrix[api_id])
            if if_pop:
                similarity = similarity * api2pop[api_id]
            scores.append(similarity)
        all_predict_results.append(scores)
    print('WVSM test,done!')

    top_ks = data_repository.get_args().topKs
    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              dataset.test_data.get('all_ground_api_ids'),
                              top_ks)
    variant = 'WVSM_pop' if if_pop else 'WVSM'
    csv_table_name = dataset.name + variant + "\n"
    summary(evaluate_path, csv_table_name, evaluate_result, top_ks)
    """
示例#6
0
def TF_IDF(if_pop):
    """
    TF-IDF baseline: rank candidate APIs by the cosine similarity between the
    mashup's and the API's TF-IDF feature vectors, optionally multiplied by
    the API's popularity.

    :param if_pop: when truthy, multiply each similarity by
        ``api2pop[api_id]``.
    :return: None; results are recorded through ``summary``.
    """
    gd = get_default_gd()
    _, api2pop = meta_data.pd.get_api_co_vecs()
    mashup_tfidf_features, api_tfidf_features = gd.model_pcs('TF_IDF')

    ds = data_repository.get_ds()
    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(ds.test_mashup_id_list)):
        test_mashup_id = ds.test_mashup_id_list[i][0]
        candidate_ids = ds.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)

        predict_results = []
        for api_id in candidate_ids:
            sim_score = cos_sim(mashup_tfidf_features[test_mashup_id],
                                api_tfidf_features[api_id])
            if if_pop:
                sim_score = sim_score * api2pop[api_id]
            # Exactly one score per candidate, so that predict_results stays
            # aligned index-by-index with candidate_ids.
            predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('TF_IDF test,done!')

    name = 'TFIDF_pop' if if_pop else 'TFIDF'
    top_ks = data_repository.get_args().topKs
    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              ds.test_data.get('all_ground_api_ids'), top_ks)
    csv_table_name = ds.name + name + "\n"
    summary(evaluate_path, csv_table_name, evaluate_result, top_ks)
示例#7
0
def train_best_NDCG_model(recommend_model,
                          model,
                          train_data,
                          test_data,
                          true_candidates_dict=None,
                          CI_start_test_epoch=0,
                          earlyStop_epochs=5):
    """
    Train one epoch at a time, evaluate after each epoch, and return the
    model loaded with the weights of the best-scoring epoch.

    The score compared across epochs is ``epoch_evaluate_result[0][3]``
    (presumably NDCG@5, going by the variable names -- confirm against
    ``evalute_by_epoch``). Weights are saved whenever the score is at least
    as good as the best so far; training stops early once
    ``earlyStop_epochs`` epochs have passed without such an epoch.

    :param recommend_model: the overall recommendation model; provides the
        pairwise model, the training instances, the optimizer, its name and
        its model directory.
    :param model: the core model; evaluated, saved and returned.
    :param train_data: training data; ``train_data.get('label')`` are the
        labels.
    :param test_data: test data passed to ``evalute_by_epoch``.
    :param true_candidates_dict: forwarded to ``evalute_by_epoch``.
    :param CI_start_test_epoch: for ``CI_Model`` instances, epochs before
        this index are trained but not evaluated (to save time).
    :param earlyStop_epochs: patience, in epochs, for early stopping.
    :return: ``model`` (with the best epoch's weights if they could be
        reloaded).
    """
    print('training_save_best_NDCG_model...')
    args = data_repository.get_args()
    ds = data_repository.get_ds()
    epoch_evaluate_results = []

    # Model to train: the pairwise wrapper when pairwise training is enabled.
    train_model = recommend_model.get_pairwise_model(
    ) if args.pairwise else model

    # Training instances and labels.
    train_instances_dict = recommend_model.get_instances(
        train_data, pairwise_train_phase_flag=args.pairwise)
    train_labels = train_data.get('label')
    if args.final_activation == 'softmax':
        # A softmax output needs one-hot labels.
        train_labels = utils.to_categorical(train_labels, num_classes=2)

    # Loop-invariant: the first epoch that gets evaluated.
    first_test_epoch = CI_start_test_epoch if isinstance(
        recommend_model, CI_Model) else 0

    best_epoch, best_NDCG_5 = 0, 0
    for epoch in range(args.num_epochs):
        if epoch == 0:
            # The model has to be compiled before the first fit.
            train_model.compile(optimizer=recommend_model.optimizer,
                                loss='binary_crossentropy',
                                metrics=['accuracy'])
            print('whole_model compile,done!')
        print('Epoch {}'.format(epoch))

        hist = train_model.fit(train_instances_dict,
                               np.array(train_labels),
                               batch_size=args.batch_size,
                               epochs=1,
                               verbose=1,
                               shuffle=True,
                               validation_split=args.validation_split)
        print('Epoch {}, train done!'.format(epoch))

        # Only the first epoch carries the full record name.
        record_name = (recommend_model.get_name() +
                       args.train_name) if epoch == 0 else ''
        save_loss_acc(hist, record_name, epoch=epoch)

        if epoch < first_test_epoch:
            # Keep the list aligned with the epoch index.
            epoch_evaluate_results.append(None)
            continue

        epoch_evaluate_result = evalute_by_epoch(
            recommend_model,
            model,
            record_name,
            test_data,
            record_time=(epoch == 0),
            true_candidates_dict=true_candidates_dict)
        epoch_evaluate_results.append(epoch_evaluate_result)

        # Save the weights only when this epoch is at least as good as the
        # best so far; otherwise check the early-stopping patience.
        epoch_score = epoch_evaluate_result[0][3]
        if epoch_score >= best_NDCG_5:
            best_NDCG_5 = epoch_score
            best_epoch = epoch
            model.save_weights(
                ds.new_model_para_path.format(recommend_model.model_dir,
                                              epoch))
        elif epoch - best_epoch >= earlyStop_epochs:
            break

    # Record the best epoch and its score.
    with open(ds.new_best_epoch_path.format(recommend_model.model_dir),
              'w') as f:
        f.write(str(best_epoch))
    with open(ds.new_best_NDCG_path.format(recommend_model.model_dir),
              'w') as f:
        f.write(str(best_NDCG_5))
    print('best epoch:{},best NDCG@5:{}'.format(best_epoch, best_NDCG_5))

    # Record the best indicators, if any epoch was actually evaluated
    # (the list is empty when num_epochs is 0 and holds None for epochs
    # that were skipped).
    csv_table_name = 'best_indicaters\n'
    best_result = (epoch_evaluate_results[best_epoch]
                   if best_epoch < len(epoch_evaluate_results) else None)
    if best_result is None:
        print('no evaluated epoch, skip recording best indicators')
    else:
        summary(evaluate_path, csv_table_name, best_result, args.topKs)

    # Best-effort cleanup: delete the weights of every non-best epoch and
    # reload the best ones. Failures are reported but do not abort, and
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    try:
        for i in range(args.num_epochs):
            temp_path = ds.new_model_para_path.format(
                recommend_model.model_dir, i)
            if i != best_epoch and os.path.exists(temp_path):
                os.remove(temp_path)
        model.load_weights(
            ds.new_model_para_path.format(recommend_model.model_dir,
                                          best_epoch))
    except Exception as err:
        print('cleanup/reload of best weights failed: {}'.format(err))
    return model
示例#8
0
def run_new_deepFM(CI_feas,
                   NI_feas,
                   train_data,
                   test_data,
                   all_api_num,
                   epoch_num=10):
    """
    Build a ``simple_DeepFM`` model, then alternate one training epoch with
    one evaluation pass for ``epoch_num`` rounds.

    :param CI_feas: four feature collections, unpacked as mashup text,
        mashup tag, API text and API tag features.
    :param NI_feas: two feature collections, unpacked as mashup and API
        features.
    :param train_data: training data; ``len(train_data[0])`` is used as the
        number of training instances.
    :param test_data: test data; ``test_data[1]`` holds the candidate API
        ids per mashup and ``test_data[-1]`` the ground truth.
    :param all_api_num: forwarded to ``data_generator``.
    :param epoch_num: number of train/evaluate rounds.
    :return: None; each round's result is recorded through ``summary``.
    """
    model = simple_DeepFM(CI_feature_num=4,
                          NI_feature_num=2,
                          CI_feature_dim=50,
                          NI_feature_dim=25,
                          final_feature_dim=32,
                          task='binary',
                          use_fm=True,
                          l2_reg_linear=0,
                          dnn_hidden_units=[])
    model.compile(
        "adam",
        "binary_crossentropy",
        metrics=['binary_crossentropy'],
    )
    print('bulid simple_DeepFM,done!')

    batch_size = 32
    train_steps = len(train_data[0]) // batch_size

    # Unpack explicitly so that a wrong number of feature groups fails here.
    m_text_feas, m_tag_feas, a_text_feas, a_tag_feas = CI_feas
    m_ni_feas, a_ni_feas = NI_feas
    feature_inputs = (m_text_feas, m_tag_feas, a_text_feas, a_tag_feas,
                      m_ni_feas, a_ni_feas)

    train_generator = data_generator(train_data,
                                     *feature_inputs,
                                     bs=batch_size,
                                     all_api_num=all_api_num,
                                     mode="train")
    print('genarate train_generator ,done!')

    # Evaluate after every training epoch.
    test_steps = getNum_testData(test_data) // batch_size + 1
    for epoch in range(epoch_num):
        model.fit_generator(train_generator,
                            steps_per_epoch=train_steps,
                            epochs=1,
                            verbose=2)

        # A fresh test generator is created for every evaluation pass.
        test_generator = data_generator(test_data,
                                        *feature_inputs,
                                        bs=batch_size,
                                        all_api_num=all_api_num,
                                        mode="test")
        print('genarate test_generator,done!')
        scores = model.predict_generator(test_generator,
                                         steps=test_steps)[:, 1]
        print(scores.shape)

        # Cut the flat score vector into one slice per test mashup, each as
        # long as that mashup's candidate list.
        test_api_id_list, grounds = test_data[1], test_data[-1]
        per_mashup_scores = []
        offset = 0
        for candidate_apis in test_api_id_list:
            end = offset + len(candidate_apis)
            per_mashup_scores.append(scores[offset:end])
            offset = end
        print(offset)

        evaluate_result = evalute(test_api_id_list, per_mashup_scores,
                                  grounds, new_Para.param.topKs)
        summary(new_Para.param.evaluate_path,
                'deepFM_epoch_{}'.format(epoch), evaluate_result,
                new_Para.param.topKs)
示例#9
0
def Samanta(topK,
            if_pop=2,
            MF_mode='node2vec',
            pop_mode='',
            text_mode='HDP',
            LDA_topic_num=None):
    """
    Samanta baseline: combine a collaborative-filtering score, a text
    similarity and (optionally) API popularity.

    A test mashup has no latent factor of its own, so it is represented by
    the similarity-weighted sum of the latent factors of its ``topK`` most
    text-similar training mashups. Each candidate API is then scored with
    (latent-factor inner product) * (text cosine similarity), and popularity
    is applied according to ``if_pop``.

    :param topK: number of nearest training mashups used to build the
        latent factor; also the number of candidates kept before the
        popularity re-ranking when ``if_pop == 1``.
    :param if_pop: how popularity is used. 0: not at all; 1: keep the
        ``topK`` best candidates by score and rank those by popularity
        (all others get -1); 2: multiply the score by the popularity.
    :param MF_mode: not read in this function; the result-table name uses
        ``data_repository.get_args().mf_mode`` instead.
        NOTE(review): confirm this is intended.
    :param pop_mode: forwarded to ``get_api_co_vecs``.
    :param text_mode: name of the text feature extractor (e.g. 'HDP',
        'LDA'); also selects the cached feature files.
    :param LDA_topic_num: forwarded to ``model_pcs`` when the features have
        to be computed.
    :return: None; results are recorded through ``summary``.
    """
    args = data_repository.get_args()
    ds = data_repository.get_ds()

    api2pop = None
    if if_pop:
        _, api2pop = data_repository.get_md().get_api_co_vecs(pop_mode)

    root = os.path.join(ds.data_root, 'baselines')
    os.makedirs(root, exist_ok=True)
    mashup_feature_path = os.path.join(root,
                                       'mashup_{}.txt'.format(text_mode))
    api_feature_path = os.path.join(root, 'api_{}.txt'.format(text_mode))

    # Text features: compute and cache them on first use, load afterwards.
    if not os.path.exists(api_feature_path):
        gd = get_default_gd()
        _mashup_features, _api_features = gd.model_pcs(text_mode,
                                                       LDA_topic_num)
        np.savetxt(mashup_feature_path, _mashup_features)
        np.savetxt(api_feature_path, _api_features)
    else:
        _mashup_features = np.loadtxt(mashup_feature_path)
        _api_features = np.loadtxt(api_feature_path)

    candidate_ids_list = []
    all_predict_results = []

    test_data = ds.test_data
    test_mashups = test_data.get('mashup')
    test_apis = test_data.get('api')
    test_mashup_num = len(test_mashups)
    mashup_emb_df = ds.MF_obj.mashup_emb_df
    api_emb_df = ds.MF_obj.api_emb_df

    # Loop-invariant lookups, built once instead of once per test mashup.
    feat_dim = args.implict_feat_dim
    train_m_ids = mashup_emb_df.index.tolist()
    mashup_embeddings = mashup_emb_df['embedding']
    api_embeddings = api_emb_df['embedding']
    known_api_ids = set(api_emb_df.index.tolist())
    api_zeros = np.zeros((feat_dim))

    for i in range(test_mashup_num):
        test_m_id = test_mashups[i][0]
        candidate_ids = test_apis[i]
        candidate_ids_list.append(candidate_ids)

        # Represent the test mashup by the weighted latent factors of its
        # nearest training mashups (weights: normalized text similarity).
        mid2sim = {
            train_m_id: cos_sim(_mashup_features[test_m_id],
                                _mashup_features[train_m_id])
            for train_m_id in train_m_ids
        }
        topK_ids, topK_sims = zip(*(
            sorted(mid2sim.items(), key=lambda x: x[1], reverse=True)[:topK]))
        topK_sims = np.array(topK_sims) / sum(topK_sims)
        cf_feature = np.zeros((feat_dim, ))
        for neighbor_id, weight in zip(topK_ids, topK_sims):
            cf_feature += weight * mashup_embeddings[neighbor_id]

        # Score every candidate API.
        predict_results = []
        temp_predict_results = []  # (api_id, score) pairs for if_pop == 1
        for api_id in candidate_ids:
            # A test API may never have occurred in training: use zeros.
            api_i_feature = api_embeddings[
                api_id] if api_id in known_api_ids else api_zeros
            cf_score = np.sum(np.multiply(api_i_feature, cf_feature))
            sim_score = cos_sim(_mashup_features[test_m_id],
                                _api_features[api_id])
            if if_pop == 1:
                temp_predict_results.append((api_id, cf_score * sim_score))
            elif if_pop == 0:
                predict_results.append(cf_score * sim_score)
            elif if_pop == 2:
                predict_results.append(cf_score * sim_score * api2pop[api_id])

        if if_pop == 1:
            # First rank by score, then re-rank the survivors by popularity.
            # The set comprehension also copes with an empty candidate list.
            max_k_pairs = heapq.nlargest(topK,
                                         temp_predict_results,
                                         key=lambda x: x[1])
            max_k_candidates = {api_id for api_id, _ in max_k_pairs}
            predict_results = [
                api2pop[api_id] if api_id in max_k_candidates else -1
                for api_id in candidate_ids
            ]

        all_predict_results.append(predict_results)
    print('Samanta test,done!')

    all_grounds = ds.test_data.get('all_ground_api_ids')
    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              all_grounds, args.topKs)
    _name = '_pop_{}'.format(if_pop)
    _name += args.mf_mode
    csv_table_name = ds.name + 'Samanta_model_{}'.format(
        topK) + _name + "\n"
    summary(evaluate_path, csv_table_name, evaluate_result, args.topKs)

    def divide(slt_apiNum):
        """Select the test samples whose selected-API list has the given length."""
        test_api_id_list_, predictions_, grounds_ = [], [], []
        for i in range(test_mashup_num):
            if len(ds.slt_api_ids_instances[i]) == slt_apiNum:
                test_api_id_list_.append(candidate_ids_list[i])
                predictions_.append(all_predict_results[i])
                grounds_.append(all_grounds[i])
        return test_api_id_list_, predictions_, grounds_

    # In the 'newScene' data mode, also report per number of selected APIs.
    if args.data_mode == 'newScene':
        for slt_apiNum in range(3):
            test_api_id_list_, predictions_, grounds_ = divide(slt_apiNum + 1)
            evaluate_result = evalute(test_api_id_list_, predictions_,
                                      grounds_, args.topKs)
            summary(evaluate_path,
                    str(slt_apiNum + 1) + '_' + csv_table_name,
                    evaluate_result, args.topKs)