# NCP synthetic-data test: factorize a random CP tensor and log the loss history.
# Imports mirror those in ncp_demo.py; synthetic_data_cp comes from tensorD's DataGenerator.
from tensorD.factorization.env import Environment
from tensorD.dataproc.provider import Provider
from tensorD.factorization.ncp import NCP_BCU
from tensorD.demo.DataGenerator import *


def ncp_run(N1, N2, N3, gR, dR, time):
    # NCP test: generate an N1 x N2 x N3 CP tensor of ground rank gR (noise-free),
    # decompose it at rank dR, and dump the loss history to a text file.
    X = synthetic_data_cp([N1, N2, N3], gR, 0)
    data_provider = Provider()
    data_provider.full_tensor = lambda: X
    env = Environment(data_provider, summary_path='/tmp/ncp_' + str(N1))
    ncp = NCP_BCU(env)
    args = NCP_BCU.NCP_Args(rank=dR, validation_internal=200)
    ncp.build_model(args)
    print('\n\nNCP with %dx%dx%d, gR=%d, dR=%d, time=%d' % (N1, N2, N3, gR, dR, time))
    loss_hist = ncp.train(6000)
    scale = str(N1) + '_' + str(gR) + '_' + str(dR)
    out_path = '/root/tensorD_f/data_out_tmp/python_out/ncp_' + scale + '_' + str(time) + '.txt'
    with open(out_path, 'w') as out:
        for loss in loss_hist:
            out.write('%.6f\n' % loss)
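# Hypothetical driver loop (not in the original source): one way this test might
# be invoked, sweeping tensor sizes and repeating each setting a few times. The
# sizes and ranks below are placeholders for illustration; note that ncp_run
# writes to a hard-coded /root/... path, which must already exist.
if __name__ == '__main__':
    for size in (30, 50, 100):   # assumed sizes, for illustration only
        for run in range(3):     # the `time` argument is the repetition index
            ncp_run(size, size, size, gR=10, dR=10, time=run)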
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/10/4 PM8:41
# @Author  : Shiloh Leung
# @Site    :
# @File    : ncp_demo.py
# @Software: PyCharm Community Edition

from tensorD.factorization.env import Environment
from tensorD.dataproc.provider import Provider
from tensorD.factorization.ncp import NCP_BCU
from tensorD.demo.DataGenerator import *

if __name__ == '__main__':
    print('=========Train=========')
    X = synthetic_data_cp([30, 30, 30], 10)
    data_provider = Provider()
    data_provider.full_tensor = lambda: X
    env = Environment(data_provider, summary_path='/tmp/ncp_demo_' + '30')
    ncp = NCP_BCU(env)
    args = NCP_BCU.NCP_Args(rank=10, validation_internal=1)
    ncp.build_model(args)
    ncp.train(100)
    factor_matrices = ncp.factors
    lambdas = ncp.lambdas
    print('Training ends.\n\n\n')
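    # Sketch (not part of the original demo): reconstruct the full tensor from the
    # learned factors and report the fit. It mirrors the tl.kruskal_to_tensor(...)
    # call used elsewhere in this repo (the factor-list signature of older tensorly
    # versions), ignores the lambda weights as that later code does, and assumes X
    # and the trained factor matrices are numpy-compatible.
    import numpy as np
    import tensorly as tl
    full = tl.kruskal_to_tensor(factor_matrices)
    rel_err = np.linalg.norm(np.asarray(X) - np.asarray(full)) / np.linalg.norm(np.asarray(X))
    print('relative reconstruction error: %.6f' % rel_err)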
# API-recommendation experiment: build a (question, code-context, API) tensor from
# the top-k similar questions and recommend APIs via non-negative factorization.
# Imports reconstructed for the third-party libraries used below; `util`,
# `similarity`, `calculateNDCG`, and the parsed `args` object are project-local
# and assumed to be defined elsewhere in this repository (a sketch of
# `removelist` follows main()).
import csv
import logging
import os
import pickle
import time

import gensim
import nimfa
import numpy as np
import tensorflow as tf
import tensorly as tl
from nltk.stem import SnowballStemmer
from nltk.tokenize import WordPunctTokenizer

from tensorD.dataproc.provider import Provider
from tensorD.factorization.env import Environment
from tensorD.factorization.ncp import NCP_BCU


def main():
    # pre-trained word embedding
    w2v = gensim.models.Word2Vec.load('../data/skip_w2v_model_stemmed')
    # pre-trained idf values of all words in the w2v dictionary
    idf = pickle.load(open('../data/my_idf', 'rb'))
    records = pickle.load(open('../data/records_final.pkl', 'rb'))
    print(len(records))

    # load the questions that need recommendations
    experiments = util.get_class_experiments()
    print(len(experiments))

    # output file for the results
    csvfile_path = os.path.join(args.output_path, "topclass_expand11-10.csv")
    csvfile = open(csvfile_path, 'w', newline="")
    writer = csv.writer(csvfile)
    writer.writerow(
        ["question_title", "top5", "ground_truth_intersection", "true_apis"])
    # record_api (built per experiment below) is the union of APIs over all similar
    # questions; we check whether the answer exists in that set

    # count how many questions can receive a recommendation and how many of
    # those recommendations hit the ground truth
    recommend_num = 0
    recommend_success_num = 0
    processnum = 0

    # evaluation metrics; map_sum replaces the original variable name `map`,
    # which shadowed the Python builtin
    mrr = 0.0
    map_sum = 0.0
    precision = 0.0
    recall = 0.0
    ndcg = 0.0
    rec_num = args.rec_num

    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    for experiment in experiments:
        experiment_method_annotation = experiment.method_annotation
        experiment_now_method_flat = experiment.now_method_flat
        experiment_true_api = experiment.true_api
        experiment_now_api = experiment.now_api
        # drop APIs already present in the current context from the ground truth
        experiment_true_api = set(experiment_true_api) - set(experiment_now_api)

        query = experiment_method_annotation
        query_words = WordPunctTokenizer().tokenize(query.lower())
        query_words = [
            SnowballStemmer('english').stem(word) for word in query_words
        ]
        query_matrix = similarity.init_doc_matrix(query_words, w2v)
        query_idf_vector = similarity.init_doc_idf_vector(query_words, idf)

        # retrieve the top-N similar questions
        top_questions = similarity.get_topk_questions(
            query_words, query_matrix, query_idf_vector, records, 11, 0.0)
        similar_questions_length = len(top_questions)

        # check whether the current question is among the similar questions;
        # if it is, the tensor is built directly, otherwise the query is added first
        flag = False
        similar_records_list = list(top_questions.keys())
        for record in similar_records_list:
            if record.title_words == query_words:
                flag = True
                processnum += 1

        record_method_annotation_words = list()
        record_method_flat = list()
        record_api = list()
        for record in similar_records_list:
            if record.title_words not in record_method_annotation_words:
                record_method_annotation_words.append(record.title_words)
            if record.method_block_flat not in record_method_flat:
                record_method_flat.append(record.method_block_flat)
            for api in record.method_api_sequence:
                if api not in record_api:
                    record_api.append(api)
        # add the APIs that appear in the current programming context
        for now_api in experiment_now_api:
            if now_api not in record_api:
                record_api.append(now_api)

        api_rec_all = []
        if flag:
            # the current question is among the similar questions
            recommend_num += 1
            # build the tensor; each dict maps index -> item, so the reverse
            # lookups below are plain list.index() calls on the underlying lists
            print(len(record_method_annotation_words), len(record_method_flat),
                  len(record_api))
            record_method_annotation_words_dict = dict(
                zip(range(len(record_method_annotation_words)),
                    record_method_annotation_words))
            record_method_flat_dict = dict(
                zip(range(len(record_method_flat)), record_method_flat))
            record_api_dict = dict(zip(range(len(record_api)), record_api))
            tensor = np.zeros((len(record_method_annotation_words),
                               len(record_method_flat), len(record_api)),
                              dtype=int)
            for record in similar_records_list:
                for concrete_api in record.method_api_sequence:
                    tensor[record_method_annotation_words.index(record.title_words),
                           record_method_flat.index(record.method_block_flat),
                           record_api.index(concrete_api)] = 1
            for api in experiment_now_api:
                if api in record_api:
                    tensor[record_method_annotation_words.index(query_words), :,
                           record_api.index(api)] = 1

            # handle degenerate cases where the data does not form a full tensor
            one = query_words
            if len(record_api) == 0:
                continue
            if (len(record_method_annotation_words) == 1
                    or len(record_method_flat) == 1 or len(record_api) == 1):
                if (len(record_method_annotation_words) == 1
                        and len(record_method_flat) == 1
                        or len(record_method_flat) == 1 and len(record_api) == 1
                        or len(record_api) == 1
                        and len(record_method_annotation_words) == 1):
                    # two modes collapse: fall back to recommending every known API
                    api_rec_all = record_api  # aliases record_api, as in the original
                    for m in set(experiment_now_api):
                        if m in api_rec_all:
                            api_rec_all.remove(m)
                elif len(record_api) == 1:
                    api_rec_all = record_api
                    for m in set(experiment_now_api):
                        if m in api_rec_all:
                            api_rec_all.remove(m)
                else:
                    if len(record_method_annotation_words) == 1:
                        # single annotation: NMF on the mode-1 unfolding
                        # (method_flat x API matrix)
                        matrix = tl.unfold(tensor, mode=1)
                        nmf = nimfa.Nmf(matrix,
                                        max_iter=200,
                                        rank=round(min(matrix.shape) / 2),
                                        update='euclidean',
                                        objective='fro')
                        nmf_fit = nmf()
                        W = nmf_fit.basis()
                        H = nmf_fit.coef()
                        matrix = np.dot(W, H)
                        two = list(
                            similarity.get_topk_method_flat(
                                experiment_now_method_flat,
                                list(record_method_flat_dict.values()),
                                1, 1, -1, 1).values())[0]
                        rec_combine_api_key = np.argsort(
                            -matrix[record_method_flat.index(two), :]).tolist()[0]
                        api_rec_all = [
                            record_api_dict[i] for i in rec_combine_api_key
                        ]
                        for m in set(experiment_now_api):
                            if m in api_rec_all:
                                api_rec_all.remove(m)
                    elif len(record_method_flat) == 1:
                        # single code context: NMF on the mode-0 unfolding
                        # (annotation x API matrix)
                        matrix = tl.unfold(tensor, mode=0)
                        nmf = nimfa.Nmf(matrix,
                                        max_iter=200,
                                        rank=round(min(matrix.shape) / 2),
                                        update='euclidean',
                                        objective='fro')
                        nmf_fit = nmf()
                        W = nmf_fit.basis()
                        H = nmf_fit.coef()
                        matrix = np.dot(W, H)
                        rec_combine_api_key = np.argsort(
                            -matrix[record_method_annotation_words.index(one), :]
                        ).tolist()[0]
                        api_rec_all = [
                            record_api_dict[i] for i in rec_combine_api_key
                        ]
                        for m in set(experiment_now_api):
                            if m in api_rec_all:
                                api_rec_all.remove(m)
            else:
                # tensor decomposition (non-negative CP via block coordinate update)
                tf.reset_default_graph()
                tensor = tl.tensor(tensor).astype(np.float32)
                data_provider = Provider()
                data_provider.full_tensor = lambda: tensor
                env = Environment(data_provider, summary_path='/tensor/ncp_ml')
                ncp = NCP_BCU(env)
                arg = NCP_BCU.NCP_Args(
                    rank=round(
                        min(len(record_method_annotation_words),
                            len(record_method_flat), len(record_api)) / 2),
                    validation_internal=1)
                ncp.build_model(arg)
                loss_hist = ncp.train(100)
                factor_matrices = ncp.factors
                full_tensor = tl.kruskal_to_tensor(factor_matrices)
                two = list(
                    similarity.get_topk_method_flat(
                        experiment_now_method_flat,
                        list(record_method_flat_dict.values()),
                        1, 1, -1, 1).values())[0]
                rec_combine_api_key = np.argsort(
                    -full_tensor[record_method_annotation_words.index(one),
                                 record_method_flat.index(two), :]).tolist()
                # recommended API list; drop APIs already present in the context
                api_rec_all = [record_api_dict[i] for i in rec_combine_api_key]
                for m in set(experiment_now_api):
                    if m in api_rec_all:
                        api_rec_all.remove(m)
        else:
            # the current question is not among the similar questions
            similar_questions_length += 1
            # skip questions for which no similar question was found
            if similar_questions_length == 1:
                continue
            recommend_num += 1
            # add the incoming query as a new annotation row
            record_method_annotation_words.append(query_words)
            print(len(record_method_annotation_words), len(record_method_flat),
                  len(record_api))
            # build the tensor (same construction as in the branch above)
            record_method_annotation_words_dict = dict(
                zip(range(len(record_method_annotation_words)),
                    record_method_annotation_words))
            record_method_flat_dict = dict(
                zip(range(len(record_method_flat)), record_method_flat))
            record_api_dict = dict(zip(range(len(record_api)), record_api))
            tensor = np.zeros((len(record_method_annotation_words),
                               len(record_method_flat), len(record_api)),
                              dtype=int)
            for record in similar_records_list:
                for concrete_api in record.method_api_sequence:
                    tensor[record_method_annotation_words.index(record.title_words),
                           record_method_flat.index(record.method_block_flat),
                           record_api.index(concrete_api)] = 1
            for api in experiment_now_api:
                if api in record_api:
                    tensor[record_method_annotation_words.index(query_words), :,
                           record_api.index(api)] = 1

            # handle degenerate cases where the data does not form a full tensor
            one = query_words
            if len(record_api) == 0:
                continue
            if (len(record_method_annotation_words) == 1
                    or len(record_method_flat) == 1 or len(record_api) == 1):
                if (len(record_method_annotation_words) == 1
                        and len(record_method_flat) == 1
                        or len(record_method_flat) == 1 and len(record_api) == 1
                        or len(record_api) == 1
                        and len(record_method_annotation_words) == 1):
                    # two modes collapse: fall back to recommending every known API
                    api_rec_all = record_api  # aliases record_api, as in the original
                    for m in set(experiment_now_api):
                        if m in api_rec_all:
                            api_rec_all.remove(m)
                elif len(record_api) == 1:
                    api_rec_all = record_api
                    for m in set(experiment_now_api):
                        if m in api_rec_all:
                            api_rec_all.remove(m)
                else:
                    if len(record_method_annotation_words) == 1:
                        # single annotation: NMF on the mode-1 unfolding
                        matrix = tl.unfold(tensor, mode=1)
                        nmf = nimfa.Nmf(matrix,
                                        max_iter=200,
                                        rank=round(min(matrix.shape) / 2),
                                        update='euclidean',
                                        objective='fro')
                        nmf_fit = nmf()
                        W = nmf_fit.basis()
                        H = nmf_fit.coef()
                        matrix = np.dot(W, H)
                        two = list(
                            similarity.get_topk_method_flat(
                                experiment_now_method_flat,
                                list(record_method_flat_dict.values()),
                                1, 1, -1, 1).values())[0]
                        rec_combine_api_key = np.argsort(
                            -matrix[record_method_flat.index(two), :]).tolist()[0]
                        api_rec_all = [
                            record_api_dict[i] for i in rec_combine_api_key
                        ]
                        for m in set(experiment_now_api):
                            if m in api_rec_all:
                                api_rec_all.remove(m)
                    elif len(record_method_flat) == 1:
                        # single code context: NMF on the mode-0 unfolding
                        matrix = tl.unfold(tensor, mode=0)
                        nmf = nimfa.Nmf(matrix,
                                        max_iter=200,
                                        rank=round(min(matrix.shape) / 2),
                                        update='euclidean',
                                        objective='fro')
                        nmf_fit = nmf()
                        W = nmf_fit.basis()
                        H = nmf_fit.coef()
                        matrix = np.dot(W, H)
                        rec_combine_api_key = np.argsort(
                            -matrix[record_method_annotation_words.index(one), :]
                        ).tolist()[0]
                        api_rec_all = [
                            record_api_dict[i] for i in rec_combine_api_key
                        ]
                        for m in set(experiment_now_api):
                            if m in api_rec_all:
                                api_rec_all.remove(m)
            else:
                # tensor decomposition (non-negative CP via block coordinate update)
                tf.reset_default_graph()
                tensor = tl.tensor(tensor).astype(np.float32)
                data_provider = Provider()
                data_provider.full_tensor = lambda: tensor
                env = Environment(data_provider, summary_path='/tensor/ncp_ml')
                ncp = NCP_BCU(env)
                arg = NCP_BCU.NCP_Args(
                    rank=round(
                        min(len(record_method_annotation_words),
                            len(record_method_flat), len(record_api)) / 2),
                    validation_internal=1)
                ncp.build_model(arg)
                loss_hist = ncp.train(100)
                factor_matrices = ncp.factors
                full_tensor = tl.kruskal_to_tensor(factor_matrices)
                two = list(
                    similarity.get_topk_method_flat(
                        experiment_now_method_flat,
                        list(record_method_flat_dict.values()),
                        1, 1, -1, 1).values())[0]
                rec_combine_api_key = np.argsort(
                    -full_tensor[record_method_annotation_words.index(one),
                                 record_method_flat.index(two), :]).tolist()
                # recommended API list
                api_rec_all = [record_api_dict[i] for i in rec_combine_api_key]
                for m in set(experiment_now_api):
                    if m in api_rec_all:
                        api_rec_all.remove(m)

        # check whether the ground truth appears among the recommendations,
        # comparing at class level: split('.')[-2] keeps each API's class name
        experiment_true_api = [
            true_api.split('.')[-2] for true_api in experiment_true_api
        ]
        experiment_true_api = removelist(experiment_true_api)
        experiment_now_api = [
            true_api.split('.')[-2] for true_api in experiment_now_api
        ]
        experiment_now_api = removelist(experiment_now_api)
        # remove experiment_now_api from the ground truth again at class level
        experiment_true_api = set(experiment_true_api) - set(experiment_now_api)
        record_api = [true_api.split('.')[-2] for true_api in record_api]
        record_api = removelist(record_api)
        api_rec_all = [true_api.split('.')[-2] for true_api in api_rec_all]
        api_rec_all = removelist(api_rec_all)
        for m in set(experiment_now_api):
            if m in api_rec_all:
                api_rec_all.remove(m)
        api_rec = api_rec_all[:rec_num]

        # ranking metrics over the top rec_num recommendations
        pos = -1
        tmp_map = 0.0
        hits = 0.0
        vector = list()
        for i, api in enumerate(api_rec_all[:rec_num]):
            if api in set(experiment_true_api) and pos == -1:
                pos = i + 1
            if api in set(experiment_true_api):
                vector.append(1)
                hits += 1
                tmp_map += hits / (i + 1)
            else:
                vector.append(0)
        tmp_map /= len(set(experiment_true_api))
        tmp_mrr = 0.0
        if pos != -1:
            tmp_mrr = 1.0 / pos
        map_sum += tmp_map
        mrr += tmp_mrr
        ndcg += calculateNDCG.ndcg_at_k(vector[:rec_num], rec_num)
        ground_truth_intersection = set(api_rec).intersection(
            set(experiment_true_api))
        if len(ground_truth_intersection) > 0:
            recommend_success_num += 1
        precision += len(ground_truth_intersection) / rec_num
        recall += len(ground_truth_intersection) / len(set(experiment_true_api))
        writer.writerow([
            experiment_method_annotation, api_rec, ground_truth_intersection,
            experiment_true_api
        ])

    writer.writerow(["recommend_num", "recommend_success_num"])
    writer.writerow([recommend_num, recommend_success_num])
    writer.writerow([
        "mrr/recommend_num", "recommend_num", "map/recommend_num",
        "success_rate@N", "precision@N/recommend_num",
        "recall@N/recommend_num", "ndcg/recommend_num"
    ])
    writer.writerow([
        mrr / recommend_num, recommend_num, map_sum / recommend_num,
        recommend_success_num / recommend_num, precision / recommend_num,
        recall / recommend_num, ndcg / recommend_num
    ])
    csvfile.close()
    end = time.perf_counter()
    print('Running time: %s Seconds' % (end - start))
    logging.info("Finish")
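# `removelist` is called in main() but not defined in this excerpt. A minimal
# sketch under the assumption that it deduplicates a list while preserving
# order (consistent with how its results are sliced and indexed above); the
# real project helper may differ.
def removelist(items):
    seen = set()
    deduped = []
    for item in items:
        if item not in seen:
            seen.add(item)
            deduped.append(item)
    return deduped


# Assumed entry point; the original excerpt defines main() but does not show
# how it is invoked.
if __name__ == '__main__':
    main()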