Пример #1
0
def get_AL_predict(test_feature, choose_feature, unlabel_feature, test_query, choose_query, choose_answer, unlabel_query, unlabel_answer, rec_api_test, rec_api_choose, rec_api_unlabel, w2v, idf):
    unlabel_feedback_info = feedback.get_feedback_inf(unlabel_query, choose_query, choose_answer, rec_api_unlabel, w2v, idf)
    label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer, rec_api_choose, w2v, idf)
    X_train, y_train = braid_AL.get_active_data(unlabel_feedback_info, unlabel_feature)
    X_feedback, y_feedback = braid_AL.get_active_data(label_feedback_info, choose_feature)

    # initializing the active learner
    learner = ActiveLearner(
        estimator=KNeighborsClassifier(n_neighbors=4),
        # estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=X_feedback, y_training=y_feedback
    )

    length = len(rec_api_test)
    predict, sel_query, add_unlabel_feature = [], [], []
    if len(unlabel_query) > 0:
        # pool-based sampling
        n_queries = 40
        for idx in range(n_queries):
            query_idx, query_instance = uncertainty_sampling(classifier=learner, X=X_train)
            idx = int(query_idx/10)
            learner.teach(
                X=X_train[query_idx].reshape(1, -1),
                y=y_train[query_idx].reshape(1, )
            )

            # add queried instance into FR
            choose_query.append(unlabel_query[idx])
            choose_answer.append(unlabel_answer[idx])
            rec_api_choose.extend(rec_api_unlabel[idx*10:idx*10+10])
            choose_feature.extend(unlabel_feature[idx*10:idx*10+10])

            # remove queried instance from pool
            for i in range(10):
                X_train = np.delete(X_train, idx*10, axis=0)
                y_train = np.delete(y_train, idx*10)
            del unlabel_query[idx]
            del unlabel_answer[idx]
            del rec_api_unlabel[idx*10:idx*10+10]
            del unlabel_feature[idx*10:idx*10+10]
            if len(X_train) == 0:
                break

    add_label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer, rec_api_choose, w2v, idf)
    new_X_feedback, new_y_feedback = braid_AL.get_active_data(add_label_feedback_info, choose_feature)
    learner = ActiveLearner(
        estimator=KNeighborsClassifier(n_neighbors=4),
        # estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=new_X_feedback, y_training=new_y_feedback
    )
    feedback_info = feedback.get_feedback_inf(test_query, choose_query, choose_answer, rec_api_test, w2v, idf)
    X = split_data.get_test_feature_matrix(feedback_info, test_feature)

    X_test = np.array(X)
    # 用反馈数据学习过后的模型来预测测试数据
    for query_idx in range(length):
        try:
            y_pre = learner.predict_proba(X=X_test[query_idx].reshape(1, -1))
        except ValueError:
            predict = [0.0 for n in range(length)]
        else:
            predict.append(float(y_pre[0, 1]))

    return predict, X, new_X_feedback, new_y_feedback
Пример #2
0
def process_input(msg='how to convert int to string?'):
    global rerank
    global responseToClient
    global g_query_str
    global feedback_inf
    global api_feature
    global sort
    query = msg
    query_matrix, query_idf_vector = feedback.load_matrix(query, w2v, idf)

    top_questions = recommendation.get_topk_questions(query, query_matrix, query_idf_vector, questions, 50, parent)
    recommended_api = recommendation.recommend_api(query_matrix, query_idf_vector, top_questions, questions, javadoc,javadoc_dict_methods,-1)
    # recommended_api = recommendation.recommend_api_class(query_matrix, query_idf_vector, top_questions, questions, javadoc,javadoc_dict_classes,-1)    

    # combine api_relevant feature with FF
    pos = -1
    rec_api = []
    api_dict_desc = {}
    x, api_feature, responseToClient = [], [], []
    for i,api in enumerate(recommended_api):
        # print('Rank',i+1,':',api)
        rec_api.append(api)
        # recommendation.summarize_api_method(api,top_questions,questions,javadoc,javadoc_dict_methods)
        api_descriptions, questions_titles = recommendation.summarize_api_method(api, top_questions, questions, javadoc,
                                                                                 javadoc_dict_methods)
        
        api_dict_desc[api] = api_descriptions

        sum_inf, api_inf, api_desc_inf = text2feat(api, api_descriptions, w2v, idf, query_matrix, query_idf_vector)
        api_feature.append(sum_inf)
        # print(api_feature)
        if i == 9:
            break
    # print('##################')

    start1 = time.time()
    # feedback info of user query from SO
    fr = open('../data/feedback_all.csv', 'r')
    reader = csv.reader(fr)
    so_query, so_answer = [], []
    for row in reader:
        so_query.append(row[0])
        so_answer.append(row[1:])

    # feedback info of user query from FR
    fr = open('../data/feedback_rec.csv', 'r')
    reader = csv.reader(fr)
    choose_query, choose_answer = [], []
    for row in reader:
        choose_query.append(row[0])
        choose_answer.append(row[1:])
    feedback_inf = feedback.get_feedback_inf(query, choose_query, choose_answer, rec_api, w2v, idf)

    # FV = RF+FF
    for i in range(len(api_feature)):
        sum = api_feature[i]
        sum.extend(feedback_inf[i])
        x.append(sum)

    # feature info of FR
    fr = open('../data/feedback_feature_rec.csv', 'r')
    reader = csv.reader(fr)
    y_feature, x_feautre, api_relevant_feature, rec_api_choose = [], [], [], []
    for row in reader:
        # y_feature.append(row[0])
        x_feautre.append(row[:-1])
        api_relevant_feature.append(row[1:3])
        rec_api_choose.append(row[-1])

    #feature info of SO
    fr = open('../data/get_feature_method.csv', 'r')
    reader = csv.reader(fr)
    unlabel_feature, rec_api_unlabel = [], []
    for row in reader:
        # y_feature.append(row[0])
        unlabel_feature.append(row[:-1])
        rec_api_unlabel.append(row[-1])


    # AL_choose_feature, AL_unlabel_feature = split_data.get_choose(AL_train_feature, choose)
    pred2, add_x_FR, add_x_FV, add_y_FV = get_AL_predict(x, x_feautre, unlabel_feature, query, choose_query, choose_answer, so_query, so_answer, rec_api, rec_api_choose, rec_api_unlabel, w2v, idf)

    pred1 = braid_LTR.get_LTR_predict(add_x_FR, add_x_FV, add_y_FV)

    rem = -10

    rec, rec_LTR, rec_AL = [], [], []
    sort, sort_LTR, sort_AL = [], [], []
    pred = []
    sum_pred1, sum_pred2 = 0, 0
    for i in range(len(api_feature)):
        sum_pred1 += pred1[i]+5
        sum_pred2 += pred2[i]
    al_idx = []
    rerank_al = sorted(pred2, reverse=True)
    for i in range(len(api_feature)):
        temp = rerank_al.index(pred2[i])+1
        while temp in al_idx:
            temp += 1
        al_idx.append(temp)

    m = 0.6
    for num in range(len(api_feature)):
        sum = (pred1[num]+5)/len(api_feature) + m*pred2[num]/al_idx[num]
        pred.append(sum)

    for i in range(len(api_feature)):
        sort.append(pred.index(max(pred)) + 1)
        sort_LTR.append(pred1.index(max(pred1)) + 1)
        sort_AL.append(pred2.index(max(pred2)) + 1)
        rec.append(max(pred))
        rec_LTR.append(max(pred1))
        rec_AL.append(max(pred2))
        pred[pred.index(max(pred))] = rem
        pred1[pred1.index(max(pred1))] = rem
        pred2[pred2.index(max(pred2))] = rem

    # 将api重新排序,输出相关结果
    
    for i in sort:
        api_mod = rec_api[i-1]
        print(sort.index(i) + 1, api_mod)
        api_obj = {'id':sort.index(i) + 1, 'api':api_mod, 'desc':api_dict_desc[api_mod] }
        rerank.append(api_mod)
        responseToClient.append(api_obj)

    print(type(rerank))
    print(json.dumps(rerank))
    print(rerank)
    print(responseToClient)
    return responseToClient
Пример #3
0
    test_query, test_answer, train_query, train_answer, test_feature, train_feature, rec_api_test, rec_api_train = split_data.get_test_train(
    )
    print('test_answer', test_answer)
    LTR_train_feature = get_LTR_feature(train_answer, rec_api_train,
                                        train_feature)
    num_choose = 373
    top1, top3, top5, map, mrr = 0, 0, 0, 0, 0
    # iteration begin
    for round in range(1):
        choose_query, choose_answer, rec_api_choose, unlabel_query, unlabel_answer, rec_api_unlabel, choose = split_data.split_choose_unlabel(
            train_query, train_answer, rec_api_train, num_choose)
        choose_feature, unlabel_feature = split_data.get_choose(
            LTR_train_feature, choose)

        train_feedback_info = feedback.get_feedback_inf(
            choose_query, choose_query, choose_answer, rec_api_choose, w2v,
            idf)
        test_feedback_info = feedback.get_feedback_inf(test_query,
                                                       choose_query,
                                                       choose_answer,
                                                       rec_api_test, w2v, idf)
        # print(11, len(train_feedback_info), len(LTR_train_feature), len(choose_feature), len(unlabel_feature))

        train_x_FV, train_y_FV = split_data.get_train_feature_matrix(
            train_feedback_info, choose_feature)
        test_feature = np.array(test_feature)

        y_predict = get_LTR_predict_LTR(test_feature, test_feedback_info,
                                        choose_feature, train_feedback_info)
        rank_mod, rankall = [], []
        for n in range(len(test_query)):
Пример #4
0
    # feedback info of user query from SO
    fr = open('../data/feedback_all.csv', 'r')
    reader = csv.reader(fr)
    so_query, so_answer = [], []
    for row in reader:
        so_query.append(row[0])
        so_answer.append(row[1:])

    # feedback info of user query from FR
    fr = open('../data/feedback_rec.csv', 'r')
    reader = csv.reader(fr)
    choose_query, choose_answer = [], []
    for row in reader:
        choose_query.append(row[0])
        choose_answer.append(row[1:])
    feedback_inf = feedback.get_feedback_inf(query, choose_query,
                                             choose_answer, rec_api, w2v, idf)

    # FV = RF+FF
    for i in range(len(api_feature)):
        sum = api_feature[i]
        sum.extend(feedback_inf[i])
        x.append(sum)

    # feature info of FR
    fr = open('../data/feedback_feature_rec.csv', 'r')
    reader = csv.reader(fr)
    y_feature, x_feautre, api_relevant_feature, rec_api_choose = [], [], [], []
    for row in reader:
        # y_feature.append(row[0])
        x_feautre.append(row[:-1])
        api_relevant_feature.append(row[1:3])
Пример #5
0
def get_AL_predict(test_feature, choose_feature, unlabel_feature, test_query, choose_query, choose_answer, unlabel_query, unlabel_answer, rec_api_test, rec_api_choose, rec_api_unlabel, w2v, idf):
    unlabel_feedback_info = feedback.get_feedback_inf(unlabel_query, choose_query, choose_answer, rec_api_unlabel, w2v, idf)
    label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer, rec_api_choose, w2v, idf)
    X_train, y_train = get_active_data(unlabel_feedback_info, unlabel_feature)
    X_feedback, y_feedback = get_active_data(label_feedback_info, choose_feature)

    # initializing the active learner
    learner = ActiveLearner(
        # estimator=KNeighborsClassifier(n_neighbors=4),
        estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=X_feedback, y_training=y_feedback
    )

    predict, sel_query, add_unlabel_feature = [], [], []
    if len(unlabel_query) > 0:
        # pool-based sampling
        n_queries = 100
        sel_idx, sel_label = [], []
        for idx in range(n_queries):
            # query_idx, query_instance = learner.query(X=X_train)
            query_idx, query_instance = uncertainty_sampling(classifier=learner, X=X_train)
            idx = int(query_idx/10)
            # print(idx, len(X_train))
            # print('uncertain', query_idx, X_train[query_idx], y_train[query_idx])
            learner.teach(
                X=X_train[query_idx].reshape(1, -1),
                y=y_train[query_idx].reshape(1, )
            )

            # add queried instance into FR
            choose_query.append(unlabel_query[idx])
            choose_answer.append(unlabel_answer[idx])
            rec_api_choose.extend(rec_api_unlabel[idx*10:idx*10+10])
            choose_feature.extend(unlabel_feature[idx*10:idx*10+10])
            # learner.teach(
            #     X=new_X_train.reshape(1, -1),
            #     y=new_y_train.reshape(1, )
            # )
            # print(unlabel_query[idx], unlabel_query[idx], rec_api_unlabel[idx*10:idx*10+10], rec_api_unlabel[idx*10:idx*10+10])

            # remove queried instance from pool
            for i in range(10):
                X_train = np.delete(X_train, idx*10, axis=0)
                y_train = np.delete(y_train, idx*10)
            del unlabel_query[idx]
            del unlabel_answer[idx]
            del rec_api_unlabel[idx*10:idx*10+10]
            del unlabel_feature[idx*10:idx*10+10]
            if len(X_train) == 0:
                break

    add_label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer, rec_api_choose, w2v, idf)
    new_X_feedback, new_y_feedback = get_active_data(add_label_feedback_info, choose_feature)
    learner = ActiveLearner(
        # estimator=KNeighborsClassifier(n_neighbors=4),
        estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=new_X_feedback, y_training=new_y_feedback
    )
    feedback_info = feedback.get_feedback_inf(test_query, choose_query, choose_answer, rec_api_test, w2v, idf)
    X = split_data.get_test_feature_matrix(feedback_info, test_feature)

    X_test = np.array(X)
    # 用反馈数据学习过后的模型来预测测试数据
    for query_idx in range(400):
        y_pre = learner.predict_proba(X=X_test[query_idx].reshape(1, -1))
        predict.append(float(y_pre[0, 1]))
        # predict.append(math.log(float(y_pre[0, 1])+1))
        # predict.extend(y_pre.tolist())
        x = X_test[query_idx].reshape(1, -1)
    # print(predict)
    # print('new_choose', len(choose_query), len(choose_answer))
    # fw = open('../data/add_FR.csv', 'a+', newline='')
    # writer = csv.writer(fw)
    # for i, fr_q in enumerate(choose_query):
    #     writer.writerow((fr_q, choose_answer[i]))
    # fw.close()

    return predict, X, new_X_feedback, new_y_feedback #sorted(sel_query)