Пример #1
0
def binary_keyword(text_tag_recommend_model):
    """Evaluate the popularity-weighted WVSM and WJaccard keyword baselines.

    Every candidate API of every test mashup is scored and the textual
    similarity is multiplied by the API's popularity (``api2pop``):

    * WVSM (Weighted Vector Space Model): ``cos_sim`` between the mashup
      and API rows of the matrices returned by ``gd.get_binary_v``.
    * WJaccard (Weighted Jaccard): Jaccard similarity between the word
      lists returned by ``gd.get_binary_v``, taken as sets.

    Each baseline is evaluated with ``evalute`` and recorded with
    ``summary`` under the table name ``Para.data_name + <baseline name>``.

    :param text_tag_recommend_model: model whose ``pd`` attribute and
        ``get_instances`` method supply the mashup/API data.
    :return: None; results are reported through ``summary``.
    """
    model_pd = text_tag_recommend_model.pd
    all_mashup_num = len(model_pd.get_mashup_api_index2name('mashup'))
    all_api_num = len(model_pd.get_mashup_api_index2name('api'))
    _, api2pop = model_pd.get_api_co_vecs()

    gd = gensim_data(*text_tag_recommend_model.get_instances(
        list(range(all_mashup_num)), list(range(all_api_num)), False))
    (mashup_binary_matrix, api_binary_matrix, mashup_words_list,
     api_words_list) = gd.get_binary_v(all_mashup_num, all_api_num)

    def wvsm_scores(mashup_id, candidate_ids):
        """Cosine similarity of the binary vectors, weighted by popularity."""
        mashup_vec = mashup_binary_matrix[mashup_id]
        return [
            cos_sim(mashup_vec, api_binary_matrix[api_id]) * api2pop[api_id]
            for api_id in candidate_ids
        ]

    def wjaccard_scores(mashup_id, candidate_ids):
        """Jaccard similarity of the word sets, weighted by popularity."""
        # The mashup word set does not depend on the candidate, build it once.
        mashup_set = set(mashup_words_list[mashup_id])
        scores = []
        for api_id in candidate_ids:
            api_set = set(api_words_list[api_id])
            union_size = len(mashup_set | api_set)
            if union_size == 0:
                # Both word lists are empty: Jaccard is undefined, score it
                # as "no overlap" instead of raising ZeroDivisionError.
                scores.append(0.0)
                continue
            jaccard = 1.0 * len(mashup_set & api_set) / union_size
            scores.append(jaccard * api2pop[api_id])
        return scores

    def run_baseline(name, score_candidates):
        """Score all test mashups with ``score_candidates`` and record it."""
        candidate_ids_list = []
        all_predict_results = []
        for mashup_row, candidate_ids in zip(Para.test_mashup_id_list,
                                             Para.test_api_id_list):
            # Each row of test_mashup_id_list carries the mashup id first.
            candidate_ids_list.append(candidate_ids)
            all_predict_results.append(
                score_candidates(mashup_row[0], candidate_ids))
        print(name + ' test,done!')

        evaluate_result = evalute(candidate_ids_list, all_predict_results,
                                  Para.grounds, Para.topKs)  # evaluate
        csv_table_name = Para.data_name + name + "\n"  # model.name
        summary(Para.evaluate_path, csv_table_name, evaluate_result,
                Para.topKs)  # record

    run_baseline('WVSM', wvsm_scores)
    run_baseline('WJaccard', wjaccard_scores)
Пример #2
0
def binary_keyword(if_pop = False):
    """Evaluate the WVSM (Weighted Vector Space Model) keyword baseline.

    Each candidate API of each test mashup is scored with ``cos_sim`` over
    the mashup/API rows of the matrices returned by ``gd.get_binary_v``.

    :param if_pop: when truthy, multiply every similarity by the API's
        popularity (``api2pop``) and report the run as ``WVSM_pop``;
        otherwise use the plain similarity and report it as ``WVSM``.
    :return: None; results are reported through ``summary``.
    """
    _co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    binary_source = get_default_gd()
    mashup_vecs, api_vecs, _mashup_words, _api_words = binary_source.get_binary_v()

    # Test WVSM (Weighted Vector Space Model)
    candidate_ids_list = []
    all_predict_results = []
    for row_no, mashup_row in enumerate(dataset.crt_ds.test_mashup_id_list):
        mashup_id = mashup_row[0]  # the mashup id is the first entry of the row
        api_ids = dataset.crt_ds.test_api_id_list[row_no]
        candidate_ids_list.append(api_ids)

        row_scores = []
        for api_id in api_ids:
            score = cos_sim(mashup_vecs[mashup_id], api_vecs[api_id])
            if if_pop:
                score = score * api2pop[api_id]
            # without if_pop only the feature vectors are used
            row_scores.append(score)
        all_predict_results.append(row_scores)
    print('WVSM test,done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              dataset.crt_ds.grounds,
                              new_Para.param.topKs)  # evaluate
    if if_pop:
        name = 'WVSM_pop'
    else:
        name = 'WVSM'
    csv_table_name = dataset.crt_ds.data_name + name + "\n"  # model.name
    summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result,
            new_Para.param.topKs)  # record

    """
Пример #3
0
def TF_IDF(if_pop):
    """Evaluate the TF-IDF cosine-similarity baseline.

    Each candidate API of each test mashup is scored with ``cos_sim`` over
    the features returned by ``gd.model_pcs('TF_IDF')``. Exactly one score
    is produced per candidate so the predictions stay aligned with
    ``candidate_ids``.

    This could be folded into the Samanta class, but that would be too
    messy and is not necessary.

    :param if_pop: when truthy, multiply every similarity by the API's
        popularity (``api2pop``) and report the run as ``TFIDF_pop``;
        otherwise report it as ``TFIDF``.
    :return: None; results are reported through ``summary``.
    """
    gd = get_default_gd()
    _, api2pop = meta_data.pd.get_api_co_vecs()
    _mashup_IFIDF_features, _api_IFIDF_features = gd.model_pcs('TF_IDF')

    candidate_ids_list = []
    all_predict_results = []
    for mashup_row, candidate_ids in zip(dataset.crt_ds.test_mashup_id_list,
                                         dataset.crt_ds.test_api_id_list):
        test_mashup_id = mashup_row[0]  # the mashup id is the first entry
        candidate_ids_list.append(candidate_ids)

        mashup_vec = _mashup_IFIDF_features[test_mashup_id]
        predict_results = []
        for api_id in candidate_ids:
            sim_score = cos_sim(mashup_vec, _api_IFIDF_features[api_id])
            if if_pop:
                sim_score = sim_score * api2pop[api_id]
            # Append once per candidate; a second append here would double
            # the list and misalign scores with candidate_ids.
            predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('TF_IDF test,done!')

    name = 'TFIDF_pop' if if_pop else 'TFIDF'
    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              dataset.crt_ds.grounds,
                              new_Para.param.topKs)  # evaluate
    csv_table_name = dataset.crt_ds.data_name + name + "\n"  # model.name
    summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result,
            new_Para.param.topKs)  # record
Пример #4
0
def hdp_pop(if_pop = True):
    """Evaluate the HDP-feature cosine-similarity baseline.

    Mashup and API features are read with ``np.loadtxt`` from
    ``mashup_HDP.txt`` / ``api_HDP.txt`` in the ``baselines`` directory under
    ``dataset.crt_ds.root_path`` (the directory is created when missing).

    :param if_pop: when truthy, multiply every similarity by the API's
        popularity (``api2pop``) and report the run as ``hdp_pop``;
        otherwise report it as ``hdp``.
    :return: None; results are reported through ``summary``.
    """
    baseline_dir = os.path.join(dataset.crt_ds.root_path, 'baselines')
    if not os.path.exists(baseline_dir):
        os.makedirs(baseline_dir)

    mashup_feats = np.loadtxt(os.path.join(baseline_dir, 'mashup_HDP.txt'))
    api_feats = np.loadtxt(os.path.join(baseline_dir, 'api_HDP.txt'))

    # Popularity is only fetched when it is going to be used.
    api2pop = None
    if if_pop:
        _co_vecs, api2pop = meta_data.pd.get_api_co_vecs()

    # Test
    candidate_ids_list = []
    all_predict_results = []
    for row_no, mashup_row in enumerate(dataset.crt_ds.test_mashup_id_list):
        mashup_id = mashup_row[0]  # the mashup id is the first entry of the row
        api_ids = dataset.crt_ds.test_api_id_list[row_no]
        candidate_ids_list.append(api_ids)

        row_scores = []
        for api_id in api_ids:
            score = cos_sim(mashup_feats[mashup_id], api_feats[api_id])
            if if_pop:
                score *= api2pop[api_id]
            row_scores.append(score)
        all_predict_results.append(row_scores)
    print('hdp_pop test,done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              dataset.crt_ds.grounds,
                              new_Para.param.topKs)  # evaluate
    if if_pop:
        name = 'hdp_pop'
    else:
        name = 'hdp'
    csv_table_name = dataset.crt_ds.data_name + name + "\n"  # model.name
    summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result,
            new_Para.param.topKs)  # record
Пример #5
0
def pop():
    """Evaluate the popularity-only baseline.

    Every candidate API is scored with its popularity value from
    ``api2pop``; the mashup itself does not influence the score.

    :return: None; results are reported through ``summary``.
    """
    _co_vecs, api2pop = meta_data.pd.get_api_co_vecs()

    candidate_ids_list = []
    all_predict_results = []
    for row_no, mashup_row in enumerate(dataset.crt_ds.test_mashup_id_list):
        mashup_id = mashup_row[0]  # mashup id of this row (not used for scoring)
        api_ids = dataset.crt_ds.test_api_id_list[row_no]
        candidate_ids_list.append(api_ids)
        all_predict_results.append([api2pop[api_id] for api_id in api_ids])
    print('pop test,done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              dataset.crt_ds.grounds,
                              new_Para.param.topKs)  # evaluate
    csv_table_name = dataset.crt_ds.data_name + 'pop' + "\n"  # model.name
    summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result,
            new_Para.param.topKs)  # record
Пример #6
0
def TF_IDF(text_tag_recommend_model):
    """Evaluate the TF-IDF cosine-similarity baseline.

    Features come from ``gd.model_pcs('TF_IDF', ...)`` built over all
    mashups and APIs known to ``text_tag_recommend_model``; each candidate
    API of each test mashup is scored with ``cos_sim``.

    This could be folded into the Samanta class, but that would be too
    messy and is not necessary.

    :param text_tag_recommend_model: model whose ``pd`` attribute and
        ``get_instances`` method supply the mashup/API data.
    :return: None; results are reported through ``summary``.
    """
    all_mashup_num = len(
        text_tag_recommend_model.pd.get_mashup_api_index2name('mashup'))
    all_api_num = len(
        text_tag_recommend_model.pd.get_mashup_api_index2name('api'))

    every_mashup = list(range(all_mashup_num))
    every_api = list(range(all_api_num))
    instances = text_tag_recommend_model.get_instances(
        every_mashup, every_api, False)
    gd = gensim_data(*instances)
    mashup_feats, api_feats = gd.model_pcs('TF_IDF', all_mashup_num,
                                           all_api_num)

    candidate_ids_list = []
    all_predict_results = []
    for row_no, mashup_row in enumerate(Para.test_mashup_id_list):
        mashup_id = mashup_row[0]  # the mashup id is the first entry of the row
        api_ids = Para.test_api_id_list[row_no]
        candidate_ids_list.append(api_ids)
        all_predict_results.append([
            cos_sim(mashup_feats[mashup_id], api_feats[api_id])
            for api_id in api_ids
        ])
    print('TF_IDF test,done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              Para.grounds, Para.topKs)  # evaluate
    csv_table_name = Para.data_name + 'TF_IDF' + "\n"  # model.name
    summary(Para.evaluate_path, csv_table_name, evaluate_result,
            Para.topKs)  # record
Пример #7
0
def MF(train_datas,test_datas,mode = ''):
    """Evaluate a matrix-factorisation baseline on every train/test split.

    For each ``slt_num`` in ``1..new_Para.param.slt_item_num`` the embeddings
    are obtained through ``get_UV`` for ``train_datas[slt_num - 1]`` and each
    candidate API of each test mashup is scored with the dot product of the
    mashup and API embeddings. A mashup or API without an entry in the
    id-to-index maps scores 0.

    Predictions are collected per split, so they line up one-to-one with that
    split's ``test_api_id_list``, and each split is evaluated against the
    ``grounds`` delivered with its own test data.

    :param train_datas: sequence of training data, one entry per split.
    :param test_datas: sequence of
        ``(test_mashup_id_list, test_api_id_list, grounds)``, one per split.
    :param mode: label passed to ``get_UV`` and used in the result-table name.
    :return: None; results are reported through ``summary``.
    """
    for slt_num in range(1, new_Para.param.slt_item_num + 1):  # one split per count
        test_mashup_id_list, test_api_id_list, grounds = test_datas[slt_num - 1]
        # Interface that computes or reads the MF result for this split.
        UV_obj = get_UV(dataset.crt_ds.root_path, mode,
                        train_datas[slt_num - 1], slt_num)
        m_id2index, a_id2index = UV_obj.m_id2index, UV_obj.a_id2index

        # Reset per split: accumulating across splits would misalign the
        # predictions with this split's candidate lists.
        all_predict_results = []  # one score list per test mashup
        for mashup_row, candidate_ids in zip(test_mashup_id_list,
                                             test_api_id_list):
            test_mashup_id = mashup_row[0]  # the mashup id is the first entry
            if test_mashup_id in m_id2index:
                m_embedding = UV_obj.m_embeddings[m_id2index[test_mashup_id]]
            else:
                m_embedding = None  # unseen mashup: every candidate scores 0

            predict_results = []
            for test_api_id in candidate_ids:
                if m_embedding is None or test_api_id not in a_id2index:
                    dot = 0
                else:
                    a_embedding = UV_obj.a_embeddings[a_id2index[test_api_id]]
                    dot = np.dot(m_embedding, a_embedding)
                predict_results.append(dot)
            all_predict_results.append(predict_results)
        print('{}_{} test,done!'.format(mode,slt_num))

        evaluate_result = evalute(test_api_id_list, all_predict_results,
                                  grounds, new_Para.param.topKs)  # evaluate
        csv_table_name = dataset.crt_ds.data_name + mode + str(slt_num) + "\n"  # model.name
        summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result,
                new_Para.param.topKs)  # record
Пример #8
0
def run_new_deepFM(CI_feas,
                   NI_feas,
                   train_data,
                   test_data,
                   all_api_num,
                   epoch_num=10):
    """Train ``simple_DeepFM`` and evaluate it on the test data every epoch.

    The model is fitted one epoch at a time from a single training
    generator; after each epoch a fresh test generator is created, the
    predictions are cut into one slice per test mashup, evaluated with
    ``evalute`` and recorded with ``summary`` as ``deepFM_epoch_<i>``.

    :param CI_feas: four feature collections, unpacked as mashup text,
        mashup tag, api text and api tag features.
    :param NI_feas: two feature collections, unpacked as mashup and api NI
        features.
    :param train_data: training data; ``len(train_data[0])`` is used as the
        number of training instances.
    :param test_data: test data; ``test_data[1]`` holds the candidate api
        ids per mashup and ``test_data[-1]`` the ground truth.
    :param all_api_num: forwarded to ``data_generator``.
    :param epoch_num: number of train/evaluate rounds.
    :return: None; results are reported through ``summary``.
    """
    model = simple_DeepFM(
        CI_feature_num=4,
        NI_feature_num=2,
        CI_feature_dim=50,
        NI_feature_dim=25,
        final_feature_dim=32,
        task='binary',
        use_fm=True,
        l2_reg_linear=0,
        dnn_hidden_units=[],
    )
    model.compile("adam", "binary_crossentropy",
                  metrics=['binary_crossentropy'])
    print('bulid simple_DeepFM,done!')

    batch_size = 32
    train_steps = len(train_data[0]) // batch_size

    m_text, m_tag, a_text, a_tag = CI_feas
    m_ni, a_ni = NI_feas
    # Order matters: the generator receives the features positionally.
    features = [m_text, m_tag, a_text, a_tag, m_ni, a_ni]

    train_generator = data_generator(train_data,
                                     *features,
                                     bs=batch_size,
                                     all_api_num=all_api_num,
                                     mode="train")
    print('genarate train_generator ,done!')

    # Evaluate on the test data after every single training epoch.
    num_test_instances = getNum_testData(test_data)
    for epoch in range(epoch_num):
        model.fit_generator(train_generator,
                            steps_per_epoch=train_steps,
                            epochs=1,
                            verbose=2)

        test_generator = data_generator(test_data,
                                        *features,
                                        bs=batch_size,
                                        all_api_num=all_api_num,
                                        mode="test")
        print('genarate test_generator,done!')

        raw_predictions = model.predict_generator(
            test_generator, steps=num_test_instances // batch_size + 1)
        predictions = raw_predictions[:, 1]  # keep column 1 of the output
        print(predictions.shape)

        # Cut the flat prediction array into one slice per test mashup.
        test_api_id_list = test_data[1]
        grounds = test_data[-1]
        reshaped_predictions = []
        offset = 0
        for test_api_ids in test_api_id_list:
            end = offset + len(test_api_ids)  # candidate count of this mashup
            reshaped_predictions.append(predictions[offset:end])
            offset = end
        print(offset)

        evaluate_result = evalute(test_api_id_list, reshaped_predictions,
                                  grounds, new_Para.param.topKs)  # evaluate
        summary(new_Para.param.evaluate_path,
                'deepFM_epoch_{}'.format(epoch), evaluate_result,
                new_Para.param.topKs)
Пример #9
0
def Samanta(text_tag_recommend_model, topK, if_pop=False):
    """Evaluate the Samanta baseline (HDP similarity x neighbour-based CF).

    For every test mashup:

    1. ``cos_sim`` over the HDP features is computed against every training
       mashup in ``Para.feature_train_mashup_ids``; the ``topK`` most
       similar ones are kept and their similarities normalised to sum to 1.
    2. The mashup's latent factor is the similarity-weighted sum of the
       neighbours' rows in ``Para.u_factors_matrix``.
    3. Every candidate API is scored with (latent-factor inner product) x
       (HDP cosine similarity between mashup and API). APIs missing from
       ``Para.i_id2index`` use an all-zero latent factor.
    4. With ``if_pop`` the ``topK`` best-scored candidates are re-ranked by
       popularity (``api2pop``) and all other candidates get ``-1``.

    HDP features are cached in ``mashup_hdp.txt`` / ``api_hdp.txt`` under
    ``Para.data_dir``. They are recomputed unless *both* files exist, so a
    half-written cache no longer makes ``np.loadtxt`` fail.

    :param text_tag_recommend_model: model that provides the basic data
        (``pd`` attribute and ``get_instances``).
    :param topK: number of neighbour mashups, and number of candidates kept
        for popularity re-ranking.
    :param if_pop: re-rank the top candidates by popularity when truthy.
    :return: None; results are reported through ``summary``.
    """
    api2pop = None
    if if_pop:
        _, api2pop = text_tag_recommend_model.pd.get_api_co_vecs()

    all_mashup_num = len(
        text_tag_recommend_model.pd.get_mashup_api_index2name('mashup'))
    all_api_num = len(
        text_tag_recommend_model.pd.get_mashup_api_index2name('api'))

    mashup_hdp_path = os.path.join(Para.data_dir, 'mashup_hdp.txt')
    api_hdp_path = os.path.join(Para.data_dir, 'api_hdp.txt')

    # Load the cached HDP features, or build and cache them.
    if os.path.exists(mashup_hdp_path) and os.path.exists(api_hdp_path):
        _mashup_hdp_features = np.loadtxt(mashup_hdp_path)
        _api_hdp_features = np.loadtxt(api_hdp_path)
    else:
        gd = gensim_data(*text_tag_recommend_model.get_instances(
            list(range(all_mashup_num)), list(range(all_api_num)), False))
        _mashup_hdp_features, _api_hdp_features = gd.model_pcs(
            'HDP', all_mashup_num, all_api_num)
        np.savetxt(mashup_hdp_path, _mashup_hdp_features)
        np.savetxt(api_hdp_path, _api_hdp_features)

    train_mashup_ids = Para.feature_train_mashup_ids
    i_id2index = Para.i_id2index
    # Latent factor used for APIs that never appeared in the training data.
    api_zeros = np.zeros((Para.num_feat))

    candidate_ids_list = []
    all_predict_results = []
    for mashup_row, candidate_ids in zip(Para.test_mashup_id_list,
                                         Para.test_api_id_list):
        test_mashup_id = mashup_row[0]  # the mashup id is the first entry
        candidate_ids_list.append(candidate_ids)
        mashup_vec = _mashup_hdp_features[test_mashup_id]

        # u_factors_matrix is addressed by the local (training) index.
        local_sims = [
            (local_index, cos_sim(mashup_vec, _mashup_hdp_features[train_id]))
            for local_index, train_id in enumerate(train_mashup_ids)
        ]
        neighbours = sorted(local_sims, key=lambda pair: pair[1],
                            reverse=True)[:topK]
        topK_indexes, topK_sims = zip(*neighbours)
        # NOTE(review): a zero similarity sum yields NaN weights, as before.
        topK_sims = np.array(topK_sims) / sum(topK_sims)
        cf_feature = np.zeros((Para.num_feat))
        for local_index, weight in zip(topK_indexes, topK_sims):
            cf_feature += weight * Para.u_factors_matrix[local_index]

        cf_sim_scores = []
        for api_id in candidate_ids:
            if api_id in i_id2index:
                api_i_feature = Para.i_factors_matrix[i_id2index[api_id]]
            else:
                api_i_feature = api_zeros
            cf_score = np.sum(np.multiply(api_i_feature, cf_feature))
            sim_score = cos_sim(mashup_vec, _api_hdp_features[api_id])
            cf_sim_scores.append(cf_score * sim_score)

        if if_pop:
            # Keep the topK candidates by score, then rank them by popularity.
            max_k_pairs = heapq.nlargest(topK,
                                         zip(candidate_ids, cf_sim_scores),
                                         key=lambda pair: pair[1])
            # A set comprehension also copes with an empty candidate list.
            max_k_candidates = {api_id for api_id, _ in max_k_pairs}
            predict_results = [
                api2pop[api_id] if api_id in max_k_candidates else -1
                for api_id in candidate_ids
            ]
        else:
            predict_results = cf_sim_scores

        all_predict_results.append(predict_results)
    print('Samanta test,done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              Para.grounds, Para.topKs)  # evaluate
    _name = '_pop' if if_pop else ''
    csv_table_name = Para.data_name + 'Samanta_model' + _name + "\n"  # model.name
    summary(Para.evaluate_path, csv_table_name, evaluate_result,
            Para.topKs)  # record
Пример #10
0
def Samanta(topK,if_pop=2,MF_mode='node2vec',pop_mode='',text_mode='HDP',LDA_topic_num=None):
    """Evaluate the Samanta baseline (text similarity x neighbour-based CF).

    For every test mashup the ``topK`` most similar training mashups
    (``cos_sim`` over the text features) are found, their similarities are
    normalised to sum to 1 and the mashup's latent factor is the weighted
    sum of the neighbours' rows in ``dataset.UV_obj.m_embeddings``. Every
    candidate API is then scored with (latent-factor inner product) x (text
    cosine similarity), optionally combined with popularity.

    Text features are cached in ``mashup_<text_mode>.txt`` /
    ``api_<text_mode>.txt`` in the ``baselines`` directory under
    ``dataset.crt_ds.root_path``. They are recomputed unless *both* files
    exist. NOTE(review): the cache name does not include ``LDA_topic_num``,
    so changing the topic number appears to reuse the old files - confirm.

    When ``new_Para.param.data_mode == 'newScene'`` the results are also
    evaluated separately for test instances with 1, 2 and 3 selected APIs.

    :param topK: number of neighbour mashups (KNN) used to represent the
        query's latent factor; also the number of candidates kept when
        ``if_pop == 1``.
    :param if_pop: how popularity is used: 0 not at all; 1 re-rank the topK
        best-scored candidates by popularity (others get -1); 2 multiply the
        score by popularity.
    :param MF_mode: currently unused by this function; the result-table name
        uses ``new_Para.param.mf_mode`` instead.
    :param pop_mode: forwarded to ``get_api_co_vecs`` (originally described
        as: whether the pop value is squashed into 0-1 with a sigmoid).
    :param text_mode: feature extraction method passed to ``gd.model_pcs``
        (documented values: ``'LDA'``, ``'HDP'``).
    :param LDA_topic_num: forwarded to ``gd.model_pcs``.
    :raises ValueError: if ``if_pop`` is not 0, 1 or 2 (such values
        previously produced empty prediction lists silently).
    :return: None; results are reported through ``summary``.
    """
    if if_pop not in (0, 1, 2):
        raise ValueError('if_pop must be 0, 1 or 2, got {!r}'.format(if_pop))

    api2pop = None
    if if_pop:
        _, api2pop = meta_data.pd.get_api_co_vecs(pop_mode)

    root = os.path.join(dataset.crt_ds.root_path, 'baselines')
    if not os.path.exists(root):
        os.makedirs(root)
    mashup_feature_path = os.path.join(root, 'mashup_{}.txt'.format(text_mode))
    api_feature_path = os.path.join(root, 'api_{}.txt'.format(text_mode))

    # Load the cached text features, or build and cache them.
    if os.path.exists(mashup_feature_path) and os.path.exists(api_feature_path):
        _mashup_features = np.loadtxt(mashup_feature_path)
        _api_features = np.loadtxt(api_feature_path)
    else:
        gd = get_default_gd()
        _mashup_features, _api_features = gd.model_pcs(text_mode, LDA_topic_num)
        np.savetxt(mashup_feature_path, _mashup_features)
        np.savetxt(api_feature_path, _api_features)

    uv_obj = dataset.UV_obj
    a_id2index = uv_obj.a_id2index
    # Latent factor used for APIs that never appeared in the training data.
    api_zeros = np.zeros((new_Para.param.num_feat))

    candidate_ids_list = []
    all_predict_results = []

    test_mashup_num = len(dataset.crt_ds.test_mashup_id_list)
    for mashup_row, candidate_ids in zip(dataset.crt_ds.test_mashup_id_list,
                                         dataset.crt_ds.test_api_id_list):
        test_mashup_id = mashup_row[0]  # the mashup id is the first entry
        candidate_ids_list.append(candidate_ids)
        mashup_vec = _mashup_features[test_mashup_id]

        # Represent the mashup by the weighted latent factors of its nearest
        # training mashups; m_embeddings is addressed by the local index.
        local_sims = [
            (local_index, cos_sim(mashup_vec, _mashup_features[train_m_id]))
            for local_index, train_m_id in enumerate(uv_obj.m_ids)
        ]
        neighbours = sorted(local_sims, key=lambda pair: pair[1],
                            reverse=True)[:topK]
        topK_indexes, topK_sims = zip(*neighbours)
        # Normalise the similarities.
        # NOTE(review): a zero similarity sum yields NaN weights, as before.
        topK_sims = np.array(topK_sims) / sum(topK_sims)
        cf_feature = np.zeros((new_Para.param.num_feat,))
        for local_index, weight in zip(topK_indexes, topK_sims):
            cf_feature += weight * uv_obj.m_embeddings[local_index]

        # Score every candidate API.
        predict_results = []
        temp_predict_results = []  # (api_id, score) pairs for re-ranking
        for api_id in candidate_ids:
            if api_id in a_id2index:
                api_i_feature = uv_obj.a_embeddings[a_id2index[api_id]]
            else:
                api_i_feature = api_zeros
            # Inner product of the mashup and API latent factors.
            cf_score = np.sum(np.multiply(api_i_feature, cf_feature))
            # Cosine similarity of the text features.
            sim_score = cos_sim(mashup_vec, _api_features[api_id])
            base_score = cf_score * sim_score
            if if_pop == 1:
                temp_predict_results.append((api_id, base_score))
            elif if_pop == 0:
                predict_results.append(base_score)
            else:  # if_pop == 2
                predict_results.append(base_score * api2pop[api_id])

        if if_pop == 1:
            # Rank by the product first, then re-rank the topK by popularity.
            max_k_pairs = heapq.nlargest(topK, temp_predict_results,
                                         key=lambda pair: pair[1])
            # A set comprehension also copes with an empty candidate list.
            max_k_candidates = {api_id for api_id, _ in max_k_pairs}
            predict_results = [
                api2pop[api_id] if api_id in max_k_candidates else -1
                for api_id in candidate_ids
            ]

        all_predict_results.append(predict_results)
    print('Samanta test,done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              dataset.crt_ds.grounds,
                              new_Para.param.topKs)  # evaluate
    _name = '_pop_{}'.format(if_pop)
    _name += new_Para.param.mf_mode
    csv_table_name = dataset.crt_ds.data_name + 'Samanta_model_{}'.format(topK) + _name + "\n"  # model.name
    summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result,
            new_Para.param.topKs)  # record

    def divide(slt_apiNum):
        """Select the test instances that have ``slt_apiNum`` selected APIs."""
        test_api_id_list_, predictions_, grounds_ = [], [], []
        for i in range(test_mashup_num):
            if len(dataset.crt_ds.slt_api_ids_instances[i]) == slt_apiNum:
                test_api_id_list_.append(candidate_ids_list[i])
                predictions_.append(all_predict_results[i])
                grounds_.append(dataset.crt_ds.grounds[i])
        return test_api_id_list_, predictions_, grounds_

    if new_Para.param.data_mode == 'newScene':
        for slt_apiNum in range(1, 4):
            test_api_id_list_, predictions_, grounds_ = divide(slt_apiNum)
            evaluate_result = evalute(test_api_id_list_, predictions_,
                                      grounds_, new_Para.param.topKs)
            summary(new_Para.param.evaluate_path,
                    str(slt_apiNum) + '_' + csv_table_name, evaluate_result,
                    new_Para.param.topKs)