# Imports needed by this excerpt; `feedback`, `braid_AL`, and `split_data` are
# project-local modules, assumed importable under these names. Other project-local
# helpers and globals (recommendation, braid_LTR, text2feat, w2v, idf, ...) are
# defined elsewhere in the project.
import numpy as np
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import feedback
import braid_AL
import split_data


def get_AL_predict(test_feature, choose_feature, unlabel_feature, test_query, choose_query, choose_answer,
                   unlabel_query, unlabel_answer, rec_api_test, rec_api_choose, rec_api_unlabel, w2v, idf):
    # feedback features for the unlabeled pool and the already-labeled (FR) queries
    unlabel_feedback_info = feedback.get_feedback_inf(unlabel_query, choose_query, choose_answer, rec_api_unlabel, w2v, idf)
    label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer, rec_api_choose, w2v, idf)
    X_train, y_train = braid_AL.get_active_data(unlabel_feedback_info, unlabel_feature)
    X_feedback, y_feedback = braid_AL.get_active_data(label_feedback_info, choose_feature)

    # initializing the active learner
    learner = ActiveLearner(
        estimator=KNeighborsClassifier(n_neighbors=4),
        # estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=X_feedback, y_training=y_feedback
    )

    length = len(rec_api_test)
    predict, sel_query, add_unlabel_feature = [], [], []

    if len(unlabel_query) > 0:
        # pool-based sampling
        n_queries = 40
        for _ in range(n_queries):
            query_idx, query_instance = uncertainty_sampling(classifier=learner, X=X_train)
            idx = int(query_idx / 10)  # each query owns 10 consecutive rows, so this is the query's index
            learner.teach(
                X=X_train[query_idx].reshape(1, -1),
                y=y_train[query_idx].reshape(1, )
            )
            # add queried instance into FR
            choose_query.append(unlabel_query[idx])
            choose_answer.append(unlabel_answer[idx])
            rec_api_choose.extend(rec_api_unlabel[idx * 10:idx * 10 + 10])
            choose_feature.extend(unlabel_feature[idx * 10:idx * 10 + 10])
            # remove queried instance from pool
            for i in range(10):
                X_train = np.delete(X_train, idx * 10, axis=0)
                y_train = np.delete(y_train, idx * 10)
            del unlabel_query[idx]
            del unlabel_answer[idx]
            del rec_api_unlabel[idx * 10:idx * 10 + 10]
            del unlabel_feature[idx * 10:idx * 10 + 10]
            if len(X_train) == 0:
                break

    # retrain on the enlarged FR set
    add_label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer, rec_api_choose, w2v, idf)
    new_X_feedback, new_y_feedback = braid_AL.get_active_data(add_label_feedback_info, choose_feature)
    learner = ActiveLearner(
        estimator=KNeighborsClassifier(n_neighbors=4),
        # estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=new_X_feedback, y_training=new_y_feedback
    )

    feedback_info = feedback.get_feedback_inf(test_query, choose_query, choose_answer, rec_api_test, w2v, idf)
    X = split_data.get_test_feature_matrix(feedback_info, test_feature)
    X_test = np.array(X)

    # use the model trained on the feedback data to score the test data
    for query_idx in range(length):
        try:
            y_pre = learner.predict_proba(X=X_test[query_idx].reshape(1, -1))
        except ValueError:
            # fall back to all-zero scores if the learner cannot produce probabilities
            predict = [0.0 for n in range(length)]
            break
        else:
            predict.append(float(y_pre[0, 1]))
    return predict, X, new_X_feedback, new_y_feedback
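# --- Illustrative sketch (not part of the original file) ---------------------------
# get_AL_predict() relies on modAL's pool-based query/teach loop. The helper below
# shows that loop in isolation on synthetic data; the pool, seed set, and query budget
# are made-up values for illustration only, not the project's.
def _demo_pool_based_al():
    import numpy as np
    from modAL.models import ActiveLearner
    from sklearn.neighbors import KNeighborsClassifier

    rng = np.random.default_rng(0)
    X_pool = rng.random((200, 5))                    # synthetic unlabeled pool
    y_pool = (X_pool[:, 0] > 0.5).astype(int)        # synthetic oracle labels
    X_seed, y_seed = X_pool[:20], y_pool[:20]        # small labeled seed set

    demo_learner = ActiveLearner(
        estimator=KNeighborsClassifier(n_neighbors=4),
        X_training=X_seed, y_training=y_seed,        # uncertainty sampling is modAL's default strategy
    )
    for _ in range(10):
        q_idx, q_inst = demo_learner.query(X_pool)   # most uncertain instance
        demo_learner.teach(X=q_inst, y=y_pool[q_idx])
        X_pool = np.delete(X_pool, q_idx, axis=0)    # drop the queried row from the pool
        y_pool = np.delete(y_pool, q_idx)
    return demo_learner.predict_proba(X_pool[:5])    # class probabilities for a few remaining rows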
def process_input(msg='how to convert int to string?'):
    global rerank
    global responseToClient
    global g_query_str
    global feedback_inf
    global api_feature
    global sort

    query = msg
    query_matrix, query_idf_vector = feedback.load_matrix(query, w2v, idf)
    top_questions = recommendation.get_topk_questions(query, query_matrix, query_idf_vector, questions, 50, parent)
    recommended_api = recommendation.recommend_api(query_matrix, query_idf_vector, top_questions, questions, javadoc, javadoc_dict_methods, -1)
    # recommended_api = recommendation.recommend_api_class(query_matrix, query_idf_vector, top_questions, questions, javadoc, javadoc_dict_classes, -1)

    # combine the api-relevant feature with FF for the top 10 recommended APIs
    pos = -1
    rec_api = []
    api_dict_desc = {}
    x, api_feature, responseToClient = [], [], []
    for i, api in enumerate(recommended_api):
        # print('Rank', i + 1, ':', api)
        rec_api.append(api)
        api_descriptions, questions_titles = recommendation.summarize_api_method(api, top_questions, questions, javadoc, javadoc_dict_methods)
        api_dict_desc[api] = api_descriptions
        sum_inf, api_inf, api_desc_inf = text2feat(api, api_descriptions, w2v, idf, query_matrix, query_idf_vector)
        api_feature.append(sum_inf)
        # print(api_feature)
        if i == 9:
            break
    # print('##################')

    start1 = time.time()

    # feedback info of user queries from SO
    with open('../data/feedback_all.csv', 'r') as fr:
        reader = csv.reader(fr)
        so_query, so_answer = [], []
        for row in reader:
            so_query.append(row[0])
            so_answer.append(row[1:])

    # feedback info of user queries from FR
    with open('../data/feedback_rec.csv', 'r') as fr:
        reader = csv.reader(fr)
        choose_query, choose_answer = [], []
        for row in reader:
            choose_query.append(row[0])
            choose_answer.append(row[1:])

    feedback_inf = feedback.get_feedback_inf(query, choose_query, choose_answer, rec_api, w2v, idf)

    # FV = RF + FF: append the feedback features to each API's relevance features
    for i in range(len(api_feature)):
        fv = api_feature[i]
        fv.extend(feedback_inf[i])
        x.append(fv)

    # feature info of FR
    with open('../data/feedback_feature_rec.csv', 'r') as fr:
        reader = csv.reader(fr)
        y_feature, x_feature, api_relevant_feature, rec_api_choose = [], [], [], []
        for row in reader:
            # y_feature.append(row[0])
            x_feature.append(row[:-1])
            api_relevant_feature.append(row[1:3])
            rec_api_choose.append(row[-1])

    # feature info of SO
    with open('../data/get_feature_method.csv', 'r') as fr:
        reader = csv.reader(fr)
        unlabel_feature, rec_api_unlabel = [], []
        for row in reader:
            # y_feature.append(row[0])
            unlabel_feature.append(row[:-1])
            rec_api_unlabel.append(row[-1])

    # AL_choose_feature, AL_unlabel_feature = split_data.get_choose(AL_train_feature, choose)
    pred2, add_x_FR, add_x_FV, add_y_FV = get_AL_predict(x, x_feature, unlabel_feature, query, choose_query, choose_answer,
                                                         so_query, so_answer, rec_api, rec_api_choose, rec_api_unlabel, w2v, idf)
    pred1 = braid_LTR.get_LTR_predict(add_x_FR, add_x_FV, add_y_FV)

    rem = -10
    rec, rec_LTR, rec_AL = [], [], []
    sort, sort_LTR, sort_AL = [], [], []
    pred = []
    sum_pred1, sum_pred2 = 0, 0
    for i in range(len(api_feature)):
        sum_pred1 += pred1[i] + 5
        sum_pred2 += pred2[i]

    # 1-based rank of each API according to the AL score, breaking ties by first occurrence
    al_idx = []
    rerank_al = sorted(pred2, reverse=True)
    for i in range(len(api_feature)):
        temp = rerank_al.index(pred2[i]) + 1
        while temp in al_idx:
            temp += 1
        al_idx.append(temp)

    # fuse the LTR score (shifted by +5 and averaged) with the rank-discounted AL score
    m = 0.6
    for num in range(len(api_feature)):
        score = (pred1[num] + 5) / len(api_feature) + m * pred2[num] / al_idx[num]
        pred.append(score)

    # repeatedly pop the current maximum to build the fused, LTR-only, and AL-only rankings
    for i in range(len(api_feature)):
        sort.append(pred.index(max(pred)) + 1)
        sort_LTR.append(pred1.index(max(pred1)) + 1)
        sort_AL.append(pred2.index(max(pred2)) + 1)
        rec.append(max(pred))
        rec_LTR.append(max(pred1))
        rec_AL.append(max(pred2))
        pred[pred.index(max(pred))] = rem
        pred1[pred1.index(max(pred1))] = rem
        pred2[pred2.index(max(pred2))] = rem

    # re-rank the APIs and output the results
    for i in sort:
        api_mod = rec_api[i - 1]
        print(sort.index(i) + 1, api_mod)
        api_obj = {'id': sort.index(i) + 1, 'api': api_mod, 'desc': api_dict_desc[api_mod]}
        rerank.append(api_mod)
        responseToClient.append(api_obj)

    print(type(rerank))
    print(json.dumps(rerank))
    print(rerank)
    print(responseToClient)
    return responseToClient
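# --- Illustrative sketch (not part of the original file) ---------------------------
# process_input() fuses the LTR score (pred1, shifted by +5 and divided by the list
# length) with the AL probability (pred2, discounted by its 1-based AL rank, weight
# m = 0.6) and then repeatedly pops the maximum to build the final ranking. The helper
# below restates that fusion on its own; the dummy scores in the usage note are
# illustrative values only.
def fuse_and_rank(pred1, pred2, m=0.6):
    n = len(pred1)
    # 1-based rank of each AL score, breaking ties by first occurrence
    ranked = sorted(pred2, reverse=True)
    al_rank, used = [], set()
    for p in pred2:
        r = ranked.index(p) + 1
        while r in used:
            r += 1
        al_rank.append(r)
        used.add(r)
    # fused score: shifted, averaged LTR score plus rank-discounted AL probability
    fused = [(pred1[i] + 5) / n + m * pred2[i] / al_rank[i] for i in range(n)]
    # 1-based API indices ordered by fused score, best first
    return [i + 1 for i in sorted(range(n), key=lambda i: fused[i], reverse=True)]

# fuse_and_rank([0.2, -1.3, 0.9], [0.7, 0.1, 0.4]) -> [1, 3, 2]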
test_query, test_answer, train_query, train_answer, test_feature, train_feature, rec_api_test, rec_api_train = split_data.get_test_train()
print('test_answer', test_answer)
LTR_train_feature = get_LTR_feature(train_answer, rec_api_train, train_feature)
num_choose = 373
top1, top3, top5, map, mrr = 0, 0, 0, 0, 0

# iteration begin
for round in range(1):
    choose_query, choose_answer, rec_api_choose, unlabel_query, unlabel_answer, rec_api_unlabel, choose = split_data.split_choose_unlabel(
        train_query, train_answer, rec_api_train, num_choose)
    choose_feature, unlabel_feature = split_data.get_choose(LTR_train_feature, choose)
    train_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer, rec_api_choose, w2v, idf)
    test_feedback_info = feedback.get_feedback_inf(test_query, choose_query, choose_answer, rec_api_test, w2v, idf)
    # print(11, len(train_feedback_info), len(LTR_train_feature), len(choose_feature), len(unlabel_feature))
    train_x_FV, train_y_FV = split_data.get_train_feature_matrix(train_feedback_info, choose_feature)
    test_feature = np.array(test_feature)
    y_predict = get_LTR_predict_LTR(test_feature, test_feedback_info, choose_feature, train_feedback_info)
    rank_mod, rankall = [], []
    for n in range(len(test_query)):
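# --- Illustrative sketch (not part of the original file) ---------------------------
# The per-query evaluation loop is cut off above. The counters top1/top3/top5/map/mrr
# are the usual Top-k hit, mean-average-precision, and mean-reciprocal-rank
# accumulators; a generic way to compute one query's contribution (a hypothetical
# helper, not the project's own loop body, which is not shown) looks like this:
def rank_metrics(ranked_apis, relevant_apis):
    hits = [1 if api in relevant_apis else 0 for api in ranked_apis]
    hit1, hit3, hit5 = int(any(hits[:1])), int(any(hits[:3])), int(any(hits[:5]))
    # average precision: precision at every position that holds a relevant API
    precisions = [sum(hits[:k + 1]) / (k + 1) for k, h in enumerate(hits) if h]
    ap = sum(precisions) / max(1, len(relevant_apis))
    # reciprocal rank of the first relevant API (0 if none is retrieved)
    rr = next((1.0 / (k + 1) for k, h in enumerate(hits) if h), 0.0)
    return hit1, hit3, hit5, ap, rr

# rank_metrics(['a.b()', 'c.d()', 'e.f()'], {'c.d()'}) -> (0, 1, 1, 0.5, 0.5)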
def get_AL_predict(test_feature, choose_feature, unlabel_feature, test_query, choose_query, choose_answer,
                   unlabel_query, unlabel_answer, rec_api_test, rec_api_choose, rec_api_unlabel, w2v, idf):
    # feedback features for the unlabeled pool and the already-labeled (FR) queries
    unlabel_feedback_info = feedback.get_feedback_inf(unlabel_query, choose_query, choose_answer, rec_api_unlabel, w2v, idf)
    label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer, rec_api_choose, w2v, idf)
    X_train, y_train = get_active_data(unlabel_feedback_info, unlabel_feature)
    X_feedback, y_feedback = get_active_data(label_feedback_info, choose_feature)

    # initializing the active learner
    learner = ActiveLearner(
        # estimator=KNeighborsClassifier(n_neighbors=4),
        estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=X_feedback, y_training=y_feedback
    )

    predict, sel_query, add_unlabel_feature = [], [], []
    if len(unlabel_query) > 0:
        # pool-based sampling
        n_queries = 100
        sel_idx, sel_label = [], []
        for _ in range(n_queries):
            # query_idx, query_instance = learner.query(X=X_train)
            query_idx, query_instance = uncertainty_sampling(classifier=learner, X=X_train)
            idx = int(query_idx / 10)  # each query owns 10 consecutive rows, so this is the query's index
            # print(idx, len(X_train))
            # print('uncertain', query_idx, X_train[query_idx], y_train[query_idx])
            learner.teach(
                X=X_train[query_idx].reshape(1, -1),
                y=y_train[query_idx].reshape(1, )
            )
            # add queried instance into FR
            choose_query.append(unlabel_query[idx])
            choose_answer.append(unlabel_answer[idx])
            rec_api_choose.extend(rec_api_unlabel[idx * 10:idx * 10 + 10])
            choose_feature.extend(unlabel_feature[idx * 10:idx * 10 + 10])
            # learner.teach(
            #     X=new_X_train.reshape(1, -1),
            #     y=new_y_train.reshape(1, )
            # )
            # print(unlabel_query[idx], unlabel_query[idx], rec_api_unlabel[idx*10:idx*10+10], rec_api_unlabel[idx*10:idx*10+10])
            # remove queried instance from pool
            for i in range(10):
                X_train = np.delete(X_train, idx * 10, axis=0)
                y_train = np.delete(y_train, idx * 10)
            del unlabel_query[idx]
            del unlabel_answer[idx]
            del rec_api_unlabel[idx * 10:idx * 10 + 10]
            del unlabel_feature[idx * 10:idx * 10 + 10]
            if len(X_train) == 0:
                break

    # retrain on the enlarged FR set
    add_label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer, rec_api_choose, w2v, idf)
    new_X_feedback, new_y_feedback = get_active_data(add_label_feedback_info, choose_feature)
    learner = ActiveLearner(
        # estimator=KNeighborsClassifier(n_neighbors=4),
        estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=new_X_feedback, y_training=new_y_feedback
    )

    feedback_info = feedback.get_feedback_inf(test_query, choose_query, choose_answer, rec_api_test, w2v, idf)
    X = split_data.get_test_feature_matrix(feedback_info, test_feature)
    X_test = np.array(X)

    # use the model trained on the feedback data to score the test data
    for query_idx in range(400):  # hard-coded number of test instances
        y_pre = learner.predict_proba(X=X_test[query_idx].reshape(1, -1))
        predict.append(float(y_pre[0, 1]))
        # predict.append(math.log(float(y_pre[0, 1]) + 1))
        # predict.extend(y_pre.tolist())
        x = X_test[query_idx].reshape(1, -1)
    # print(predict)
    # print('new_choose', len(choose_query), len(choose_answer))

    # optionally persist the enlarged FR set
    # fw = open('../data/add_FR.csv', 'a+', newline='')
    # writer = csv.writer(fw)
    # for i, fr_q in enumerate(choose_query):
    #     writer.writerow((fr_q, choose_answer[i]))
    # fw.close()
    return predict, X, new_X_feedback, new_y_feedback  # sorted(sel_query)
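# --- Illustrative sketch (not part of the original file) ---------------------------
# Both versions of get_AL_predict read the positive-class probability as y_pre[0, 1].
# In scikit-learn the columns of predict_proba follow estimator.classes_, so indexing
# column 1 only means P(label == 1) when the training labels are exactly {0, 1}
# (assumed here). A tiny self-contained check on dummy data:
def _demo_predict_proba_columns():
    import numpy as np
    from sklearn.linear_model import LogisticRegression

    X = np.array([[0.0], [0.2], [0.8], [1.0]])
    y = np.array([0, 0, 1, 1])
    clf = LogisticRegression(penalty='l1', solver='liblinear').fit(X, y)
    assert list(clf.classes_) == [0, 1]              # column 1 therefore holds P(y == 1)
    return clf.predict_proba([[0.9]])[0, 1]          # probability that the candidate API is relevant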