def ncp_run(N1, N2, N3, gR, dR, time):
    # Nonnegative CP (NCP_BCU) benchmark: decompose a synthetic rank-gR tensor of shape
    # N1 x N2 x N3 with decomposition rank dR; `time` indexes the repetition.
    X = synthetic_data_cp([N1, N2, N3], gR, 0)
    data_provider = Provider()
    data_provider.full_tensor = lambda: X
    env = Environment(data_provider, summary_path='/tmp/ncp_' + str(N1))
    ncp = NCP_BCU(env)
    args = NCP_BCU.NCP_Args(rank=dR, validation_internal=200)
    ncp.build_model(args)
    print('\n\nNCP with %dx%dx%d, gR=%d, dR=%d, time=%d' % (N1, N2, N3, gR, dR, time))
    loss_hist = ncp.train(6000)
    scale = str(N1) + '_' + str(gR) + '_' + str(dR)
    out_path = '/root/tensorD_f/data_out_tmp/python_out/ncp_' + scale + '_' + str(time) + '.txt'
    # write the loss history, one value per line
    with open(out_path, 'w') as out:
        for loss in loss_hist:
            out.write('%.6f\n' % loss)
def cp_run(N1, N2, N3, gR, dR, time):
    # CP-ALS benchmark: decompose a synthetic rank-gR tensor of shape N1 x N2 x N3
    # with decomposition rank dR; `time` indexes the repetition.
    X = synthetic_data_cp([N1, N2, N3], gR, 0)
    data_provider = Provider()
    data_provider.full_tensor = lambda: X
    env = Environment(data_provider, summary_path='/tmp/cp_' + str(N1))
    cp = CP_ALS(env)
    args = CP_ALS.CP_Args(rank=dR, validation_internal=50, tol=1.0e-4)
    cp.build_model(args)
    print('CP with %dx%dx%d, gR=%d, dR=%d, time=%d' % (N1, N2, N3, gR, dR, time))
    hist = cp.train(600)
    scale = str(N1) + '_' + str(gR) + '_' + str(dR)
    out_path = '/root/tensorD_f/data_out_tmp/python_out/cp_' + scale + '_' + str(time) + '.txt'
    # each output line holds the training loss and the relative residual
    with open(out_path, 'w') as out:
        for loss, rel_res in hist:
            out.write('%.10f, %.10f\n' % (loss, rel_res))
def tucker_run(N1, N2, N3, gR, dR, time):
    # Tucker (HOOI) benchmark: decompose a synthetic Tucker tensor with core ranks
    # [gR, gR, gR] using decomposition ranks [dR, dR, dR]; `time` indexes the repetition.
    X = synthetic_data_tucker([N1, N2, N3], [gR, gR, gR])
    data_provider = Provider()
    data_provider.full_tensor = lambda: X
    env = Environment(data_provider, summary_path='/tmp/tucker_' + str(N1))
    hooi = HOOI(env)
    args = HOOI.HOOI_Args(ranks=[dR, dR, dR], validation_internal=200)
    hooi.build_model(args)
    print('\n\nTucker with %dx%dx%d, gR=%d, dR=%d, time=%d' % (N1, N2, N3, gR, dR, time))
    loss_hist = hooi.train(6000)
    scale = str(N1) + '_' + str(gR) + '_' + str(dR)
    out_path = '/root/tensorD_f/data_out_tmp/python_out/tucker_' + scale + '_' + str(time) + '.txt'
    with open(out_path, 'w') as out:
        for loss in loss_hist:
            out.write('%.6f\n' % loss)
def ntucker_run(N1, N2, N3, gR, dR, time):
    # Nonnegative Tucker (NTUCKER_BCU) benchmark: decompose a synthetic Tucker tensor
    # with core ranks [gR, gR, gR] using decomposition ranks [dR, dR, dR].
    X = synthetic_data_tucker([N1, N2, N3], [gR, gR, gR], 0)
    data_provider = Provider()
    data_provider.full_tensor = lambda: X
    env = Environment(data_provider, summary_path='/tmp/ntucker_' + str(N1))
    ntucker = NTUCKER_BCU(env)
    args = NTUCKER_BCU.NTUCKER_Args(ranks=[dR, dR, dR], validation_internal=500, tol=1.0e-4)
    ntucker.build_model(args)
    print('\n\nNTucker with %dx%dx%d, gR=%d, dR=%d, time=%d' % (N1, N2, N3, gR, dR, time))
    loss_hist = ntucker.train(10000)
    scale = str(N1) + '_' + str(gR) + '_' + str(dR)
    out_path = '/root/tensorD_f/data_out_tmp/python_out/ntucker_' + scale + '_' + str(time) + '.txt'
    with open(out_path, 'w') as out:
        for loss in loss_hist:
            out.write('%.6f\n' % loss)
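# The four benchmark helpers above share the signature (N1, N2, N3, gR, dR, time) and each
# writes one loss-history file per repetition. Below is a minimal driver sketch for calling
# them; the tensor sizes, ranks, and repetition count are illustrative assumptions, not the
# values used in the original experiments.
if __name__ == '__main__':
    for N in (20, 50, 100):        # assumed cube sizes N x N x N
        for rep in range(1, 4):    # assumed number of repetitions per scale
            cp_run(N, N, N, gR=10, dR=10, time=rep)
            ncp_run(N, N, N, gR=10, dR=10, time=rep)
            tucker_run(N, N, N, gR=10, dR=10, time=rep)
            ntucker_run(N, N, N, gR=10, dR=10, time=rep)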
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/10/4 PM8:41
# @Author  : Shiloh Leung
# @Site    :
# @File    : ncp_demo.py
# @Software: PyCharm Community Edition

from tensorD.factorization.env import Environment
from tensorD.dataproc.provider import Provider
from tensorD.factorization.ncp import NCP_BCU
from tensorD.demo.DataGenerator import *

if __name__ == '__main__':
    print('=========Train=========')
    X = synthetic_data_cp([30, 30, 30], 10)
    data_provider = Provider()
    data_provider.full_tensor = lambda: X
    env = Environment(data_provider, summary_path='/tmp/ncp_demo_' + '30')
    ncp = NCP_BCU(env)
    args = NCP_BCU.NCP_Args(rank=10, validation_internal=1)
    ncp.build_model(args)
    ncp.train(100)
    factor_matrices = ncp.factors
    lambdas = ncp.lambdas
    print('Training ends.\n\n\n')
# @Time    : 2018/1/17 PM4:30
# @Author  : Shiloh Leung
# @Site    :
# @File    : ml_ncp.py
# @Software: PyCharm Community Edition

from tensorD.dataproc.reader import TensorReader
import tensorflow as tf
from tensorD.factorization.env import Environment
from tensorD.dataproc.provider import Provider
from tensorD.factorization.ncp import NCP_BCU
from tensorD.demo.DataGenerator import *

if __name__ == '__main__':
    full_shape = [943, 1682, 31]
    base = TensorReader('/root/tensorD_f/data_out_tmp/u1.base.csv')
    base.read(full_shape=full_shape)
    with tf.Session() as sess:
        rating_tensor = sess.run(base.full_data)
    data_provider = Provider()
    data_provider.full_tensor = lambda: rating_tensor
    env = Environment(data_provider, summary_path='/tmp/ncp_ml')
    ncp = NCP_BCU(env)
    args = NCP_BCU.NCP_Args(rank=20, validation_internal=1)
    ncp.build_model(args)
    loss_hist = ncp.train(100)
    out_path = '/root/tensorD_f/data_out_tmp/python_out/ncp_ml_20.txt'
    with open(out_path, 'w') as out:
        for loss in loss_hist:
            out.write('%.6f\n' % loss)
def main():
    """Recommend APIs for each query by building a (question, method, API) tensor from
    similar records and factorizing it (nonnegative CP via tensorD, or NMF on a matrix
    unfolding in the degenerate cases)."""
    w2v = gensim.models.Word2Vec.load('../data/skip_w2v_model_stemmed')  # pre-trained word embedding
    idf = pickle.load(open('../data/my_idf', 'rb'))  # pre-trained idf value of all words in the w2v dictionary
    records = pickle.load(open("../data/records_final.pkl", 'rb'))
    print(len(records))
    # questions for which recommendations are needed
    experiments = util.get_class_experiments()
    print(len(experiments))
    csvfile_path = os.path.join(args.output_path, "topclass_expand11-10.csv")  # output file for the results
    csvfile = open(csvfile_path, 'w', newline="")
    writer = csv.writer(csvfile)
    writer.writerow(["question_title", "top5", "ground_truth_intersection", "true_apis"])
    # the union of APIs over all similar questions; check whether the answer exists in this set
    # counters: how many questions can be recommended for, and how many recommendations succeed
    recommend_num = 0
    recommend_success_num = 0
    processnum = 0
    # evaluation metrics
    mrr = 0.0
    map = 0.0
    precision = 0
    recall = 0
    ndcg = 0.0
    rec_num = args.rec_num
    start = time.perf_counter()
    for experiment in experiments:
        experiment_method_annotation = experiment.method_annotation
        # print(experiment_method_annotation)
        experiment_now_method_flat = experiment.now_method_flat
        experiment_true_api = experiment.true_api
        experiment_now_api = experiment.now_api
        # set difference: drop APIs already present in the current context
        experiment_true_api = set(experiment_true_api) - set(experiment_now_api)

        query = experiment_method_annotation
        query_words = WordPunctTokenizer().tokenize(query.lower())
        query_words = [SnowballStemmer('english').stem(word) for word in query_words]
        query_matrix = similarity.init_doc_matrix(query_words, w2v)
        query_idf_vector = similarity.init_doc_idf_vector(query_words, idf)

        # retrieve the top-N similar questions
        top_questions = similarity.get_topk_questions(query_words, query_matrix,
                                                      query_idf_vector, records, 11, 0.0)
        # number of retrieved similar questions
        # print(top_questions)
        similar_questions_length = len(top_questions)
        # print("similar_questions_length:", similar_questions_length)

        # check whether the current question is among the similar questions; if not, it is
        # added later, otherwise the tensor is built directly from the similar questions
        flag = False
        similar_records_list = list(top_questions.keys())
        for record in similar_records_list:
            if record.title_words == query_words:
                flag = True
        processnum += 1

        record_method_annotation_words = list()
        record_method_flat = list()
        record_api = list()
        for record in similar_records_list:
            if record.title_words not in record_method_annotation_words:
                record_method_annotation_words.append(record.title_words)
            if record.method_block_flat not in record_method_flat:
                record_method_flat.append(record.method_block_flat)
            for api in record.method_api_sequence:
                if api not in record_api:
                    record_api.append(api)
        # add the APIs that already appear in the programming context
        for now_api in experiment_now_api:
            if now_api not in record_api:
                record_api.append(now_api)

        api_rec_all = []
        # the current question is among the similar questions
        if flag:
            recommend_num += 1
            # build the tensor
            print(len(record_method_annotation_words), len(record_method_flat), len(record_api))
            record_method_annotation_words_dict = dict(
                zip(range(len(record_method_annotation_words)), record_method_annotation_words))
            record_method_flat_dict = dict(zip(range(len(record_method_flat)), record_method_flat))
            record_api_dict = dict(zip(range(len(record_api)), record_api))
            tensor = np.zeros((len(record_method_annotation_words), len(record_method_flat),
                               len(record_api)), dtype=int)
            for record in similar_records_list:
                for concrete_api in record.method_api_sequence:
                    tensor[record_method_annotation_words.index(record.title_words),
                           record_method_flat.index(record.method_block_flat),
                           record_api.index(concrete_api)] = 1
            for api in experiment_now_api:
                if api in record_api:
                    tensor[record_method_annotation_words.index(query_words), :,
                           record_api.index(api)] = 1

            # handle the degenerate cases where no real 3-way tensor can be built
            one = query_words
            if len(record_api) == 0:
                continue
            if (len(record_method_annotation_words) == 1 or len(record_method_flat) == 1
                    or len(record_api) == 1):
                if (len(record_method_annotation_words) == 1 and len(record_method_flat) == 1
                        or len(record_method_flat) == 1 and len(record_api) == 1
                        or len(record_api) == 1 and len(record_method_annotation_words) == 1):
                    api_rec_all = record_api
                    for m in set(experiment_now_api):
                        if m in api_rec_all:
                            api_rec_all.remove(m)
                elif len(record_api) == 1:
                    api_rec_all = record_api
                    for m in set(experiment_now_api):
                        if m in api_rec_all:
                            api_rec_all.remove(m)
                else:
                    if len(record_method_annotation_words) == 1:
                        # only one question text: NMF on the (method, API) unfolding
                        matrix = tl.unfold(tensor, mode=1)
                        nmf = nimfa.Nmf(matrix, max_iter=200, rank=round(min(matrix.shape) / 2),
                                        update='euclidean', objective='fro')
                        nmf_fit = nmf()
                        W = nmf_fit.basis()
                        H = nmf_fit.coef()
                        matrix = np.dot(W, H)
                        two = list(similarity.get_topk_method_flat(
                            experiment_now_method_flat,
                            list(record_method_flat_dict.values()), 1, 1, -1, 1).values())[0]
                        rec_combine_api_key = np.argsort(
                            -matrix[record_method_flat.index(two), :]).tolist()[0]
                        api_rec_all = [record_api_dict[i] for i in rec_combine_api_key]
                        for m in set(experiment_now_api):
                            if m in api_rec_all:
                                api_rec_all.remove(m)
                    elif len(record_method_flat) == 1:
                        # only one method body: NMF on the (question, API) unfolding
                        matrix = tl.unfold(tensor, mode=0)
                        nmf = nimfa.Nmf(matrix, max_iter=200, rank=round(min(matrix.shape) / 2),
                                        update='euclidean', objective='fro')
                        nmf_fit = nmf()
                        W = nmf_fit.basis()
                        H = nmf_fit.coef()
                        matrix = np.dot(W, H)
                        rec_combine_api_key = np.argsort(
                            -matrix[record_method_annotation_words.index(one), :]).tolist()[0]
                        api_rec_all = [record_api_dict[i] for i in rec_combine_api_key]
                        for m in set(experiment_now_api):
                            if m in api_rec_all:
                                api_rec_all.remove(m)
            else:
                # tensor decomposition (nonnegative CP via tensorD)
                tf.reset_default_graph()
                tensor = tl.tensor(tensor).astype(np.float32)
                data_provider = Provider()
                data_provider.full_tensor = lambda: tensor
                env = Environment(data_provider, summary_path='/tensor/ncp_ml')
                ncp = NCP_BCU(env)
                arg = NCP_BCU.NCP_Args(
                    rank=round(min(len(record_method_annotation_words), len(record_method_flat),
                                   len(record_api)) / 2),
                    validation_internal=1)
                ncp.build_model(arg)
                loss_hist = ncp.train(100)
                factor_matrices = ncp.factors
                full_tensor = tl.kruskal_to_tensor(factor_matrices)
                two = list(similarity.get_topk_method_flat(
                    experiment_now_method_flat,
                    list(record_method_flat_dict.values()), 1, 1, -1, 1).values())[0]
                rec_combine_api_key = np.argsort(
                    -full_tensor[record_method_annotation_words.index(one),
                                 record_method_flat.index(two), :]).tolist()
                # recommended API list; remove APIs already present in the context
                api_rec_all = [record_api_dict[i] for i in rec_combine_api_key]
                for m in set(experiment_now_api):
                    if m in api_rec_all:
                        api_rec_all.remove(m)
        # the current question is NOT among the similar questions
        else:
            similar_questions_length += 1
            # skip questions for which no similar question was found
            if similar_questions_length == 1:
                continue
            recommend_num += 1
            # add the incoming query
            record_method_annotation_words.append(query_words)
            print(len(record_method_annotation_words), len(record_method_flat), len(record_api))
            # build the tensor
            record_method_annotation_words_dict = dict(
                zip(range(len(record_method_annotation_words)), record_method_annotation_words))
            record_method_flat_dict = dict(zip(range(len(record_method_flat)), record_method_flat))
            record_api_dict = dict(zip(range(len(record_api)), record_api))
            tensor = np.zeros((len(record_method_annotation_words), len(record_method_flat),
                               len(record_api)), dtype=int)
            for record in similar_records_list:
                for concrete_api in record.method_api_sequence:
                    tensor[record_method_annotation_words.index(record.title_words),
                           record_method_flat.index(record.method_block_flat),
                           record_api.index(concrete_api)] = 1
            for api in experiment_now_api:
                if api in record_api:
                    tensor[record_method_annotation_words.index(query_words), :,
                           record_api.index(api)] = 1

            # handle the cases where tensor decomposition does not apply
            one = query_words
            if len(record_api) == 0:
                continue
            if (len(record_method_annotation_words) == 1 or len(record_method_flat) == 1
                    or len(record_api) == 1):
                if (len(record_method_annotation_words) == 1 and len(record_method_flat) == 1
                        or len(record_method_flat) == 1 and len(record_api) == 1
                        or len(record_api) == 1 and len(record_method_annotation_words) == 1):
                    api_rec_all = record_api
                    for m in set(experiment_now_api):
                        if m in api_rec_all:
                            api_rec_all.remove(m)
                elif len(record_api) == 1:
                    api_rec_all = record_api
                    for m in set(experiment_now_api):
                        if m in api_rec_all:
                            api_rec_all.remove(m)
                else:
                    if len(record_method_annotation_words) == 1:
                        matrix = tl.unfold(tensor, mode=1)
                        nmf = nimfa.Nmf(matrix, max_iter=200, rank=round(min(matrix.shape) / 2),
                                        update='euclidean', objective='fro')
                        nmf_fit = nmf()
                        W = nmf_fit.basis()
                        H = nmf_fit.coef()
                        matrix = np.dot(W, H)
                        two = list(similarity.get_topk_method_flat(
                            experiment_now_method_flat,
                            list(record_method_flat_dict.values()), 1, 1, -1, 1).values())[0]
                        rec_combine_api_key = np.argsort(
                            -matrix[record_method_flat.index(two), :]).tolist()[0]
                        api_rec_all = [record_api_dict[i] for i in rec_combine_api_key]
                        for m in set(experiment_now_api):
                            if m in api_rec_all:
                                api_rec_all.remove(m)
                    elif len(record_method_flat) == 1:
                        matrix = tl.unfold(tensor, mode=0)
                        nmf = nimfa.Nmf(matrix, max_iter=200, rank=round(min(matrix.shape) / 2),
                                        update='euclidean', objective='fro')
                        nmf_fit = nmf()
                        W = nmf_fit.basis()
                        H = nmf_fit.coef()
                        matrix = np.dot(W, H)
                        rec_combine_api_key = np.argsort(
                            -matrix[record_method_annotation_words.index(one), :]).tolist()[0]
                        api_rec_all = [record_api_dict[i] for i in rec_combine_api_key]
                        for m in set(experiment_now_api):
                            if m in api_rec_all:
                                api_rec_all.remove(m)
            else:
                # tensor decomposition (nonnegative CP via tensorD)
                tf.reset_default_graph()
                tensor = tl.tensor(tensor).astype(np.float32)
                data_provider = Provider()
                data_provider.full_tensor = lambda: tensor
                env = Environment(data_provider, summary_path='/tensor/ncp_ml')
                ncp = NCP_BCU(env)
                arg = NCP_BCU.NCP_Args(
                    rank=round(min(len(record_method_annotation_words), len(record_method_flat),
                                   len(record_api)) / 2),
                    validation_internal=1)
                ncp.build_model(arg)
                loss_hist = ncp.train(100)
                factor_matrices = ncp.factors
                full_tensor = tl.kruskal_to_tensor(factor_matrices)
                # one = query_words
                two = list(similarity.get_topk_method_flat(
                    experiment_now_method_flat,
                    list(record_method_flat_dict.values()), 1, 1, -1, 1).values())[0]
                rec_combine_api_key = np.argsort(
                    -full_tensor[record_method_annotation_words.index(one),
                                 record_method_flat.index(two), :]).tolist()
                # recommended API list
                api_rec_all = [record_api_dict[i] for i in rec_combine_api_key]
                for m in set(experiment_now_api):
                    if m in api_rec_all:
                        api_rec_all.remove(m)

        # check whether the ground truth appears among the recommendations
        # print(experiment_true_api)
        # print('----------------------------------')
        experiment_true_api = [true_api.split('.')[-2] for true_api in experiment_true_api]
        experiment_true_api = removelist(experiment_true_api)
        experiment_now_api = [true_api.split('.')[-2] for true_api in experiment_now_api]
        experiment_now_api = removelist(experiment_now_api)
        # remove experiment_now_api
        experiment_true_api = set(experiment_true_api) - set(experiment_now_api)
        record_api = [true_api.split('.')[-2] for true_api in record_api]
        record_api = removelist(record_api)
        api_rec_all = [true_api.split('.')[-2] for true_api in api_rec_all]
        api_rec_all = removelist(api_rec_all)
        for m in set(experiment_now_api):
            if m in api_rec_all:
                api_rec_all.remove(m)
        api_rec = api_rec_all[:rec_num]

        # MAP / MRR / NDCG bookkeeping for this query
        pos = -1
        tmp_map = 0.0
        hits = 0.0
        vector = list()
        for i, api in enumerate(api_rec_all[:rec_num]):
            if api in set(experiment_true_api) and pos == -1:
                pos = i + 1
            if api in set(experiment_true_api):
                vector.append(1)
                hits += 1
                tmp_map += hits / (i + 1)
            else:
                vector.append(0)
        tmp_map /= len(set(experiment_true_api))
        tmp_mrr = 0.0
        if pos != -1:
            tmp_mrr = 1.0 / pos
        map += tmp_map
        mrr += tmp_mrr
        ndcg += calculateNDCG.ndcg_at_k(vector[:rec_num], rec_num)
        ground_truth_intersection = set(api_rec).intersection(set(experiment_true_api))
        if len(ground_truth_intersection) > 0:
            recommend_success_num += 1
        precision += len(ground_truth_intersection) / rec_num
        recall += len(ground_truth_intersection) / len(set(experiment_true_api))
        writer.writerow([experiment_method_annotation, api_rec, ground_truth_intersection,
                         experiment_true_api])

    writer.writerow(["recommend_num", "recommend_success_num"])
    writer.writerow([recommend_num, recommend_success_num])
    writer.writerow([
        "mrr/recommend_num", "recommend_num", "map/recommend_num", "success_rate@N",
        "precision@N/recommend_num", "recall@N/recommend_num", "ndcg/recommend_num"
    ])
    writer.writerow([
        mrr / recommend_num, recommend_num, map / recommend_num,
        recommend_success_num / recommend_num, precision / recommend_num,
        recall / recommend_num, ndcg / recommend_num
    ])
    csvfile.close()
    end = time.perf_counter()
    print('Running time: %s Seconds' % (end - start))
    logging.info("Finish")
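# The per-query metric accumulation inside main() above interleaves MAP, MRR, and NDCG
# bookkeeping with the recommendation logic. The small, self-contained sketch below shows
# the same average-precision / reciprocal-rank computation on a toy example; the function
# name and the sample API names are illustrative, not part of the original code.
def average_precision_and_rr(recommended, ground_truth):
    """Return (AP, reciprocal rank) of a ranked list against a set of relevant items."""
    hits = 0.0
    ap = 0.0
    rr = 0.0
    for i, api in enumerate(recommended, start=1):
        if api in ground_truth:
            hits += 1
            ap += hits / i           # precision at this hit's rank
            if rr == 0.0:
                rr = 1.0 / i         # reciprocal rank of the first hit
    ap = ap / len(ground_truth) if ground_truth else 0.0
    return ap, rr

print(average_precision_and_rr(['List.add', 'Map.get', 'String.split'], {'Map.get'}))
# -> (0.5, 0.5): the only relevant API appears at rank 2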
from tensorD.factorization.env import Environment
from tensorD.factorization.pitf_numpy import PITF_np
from tensorD.factorization.tucker import *
from tensorD.dataproc.provider import Provider
# from tensorD.dataproc.reader import TensorReader
import tensorflow as tf
import numpy as np

if __name__ == '__main__':
    data_provider = Provider()
    data_provider.full_tensor = lambda: tf.constant(np.random.rand(50, 50, 8) * 10, dtype=tf.float32)
    pitf_np_env = Environment(data_provider, summary_path='/tmp/tensord')
    pitf_np = PITF_np(pitf_np_env)
    sess_t = pitf_np_env.sess
    init_op = tf.global_variables_initializer()
    sess_t.run(init_op)
    tensor = pitf_np_env.full_data().eval(session=sess_t)
    args = PITF_np.PITF_np_Args(rank=5, delt=0.8, tao=12, sample_num=100,
                                validation_internal=1, verbose=False, steps=500)
    y, X_t, Y_t, Z_t, Ef_t, If_t, Rf_t = pitf_np.exact_recovery(args, tensor)
    y = tf.convert_to_tensor(y)
    X = tf.convert_to_tensor(X_t)
    Y = tf.convert_to_tensor(Y_t)
    Z = tf.convert_to_tensor(Z_t)
    Ef = tf.convert_to_tensor(Ef_t)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/10/3 PM4:16
# @Author  : Shiloh Leung
# @Site    :
# @File    : ntucker_demo.py
# @Software: PyCharm Community Edition

import tensorflow as tf
from tensorD.factorization.env import Environment
from tensorD.dataproc.provider import Provider
from tensorD.factorization.ntucker import NTUCKER_BCU
from tensorD.demo.DataGenerator import *

if __name__ == '__main__':
    print('=========Train=========')
    X = synthetic_data_tucker([20, 20, 20], [10, 10, 10])
    data_provider = Provider()
    data_provider.full_tensor = lambda: X
    env = Environment(data_provider, summary_path='/tmp/ntucker_demo')
    ntucker = NTUCKER_BCU(env)
    args = NTUCKER_BCU.NTUCKER_Args(ranks=[10, 10, 10], validation_internal=10)
    ntucker.build_model(args)
    ntucker.train(2000)
    factor_matrices = ntucker.factors
    core_tensor = ntucker.core
    print('Train ends.\n\n\n')
from tensorD.dataproc.reader import TensorReader
from tensorD.factorization.env import Environment
from tensorD.dataproc.provider import Provider
from tensorD.factorization.tucker import HOOI
from tensorD.factorization.tucker import HOSVD
from tensorD.demo.DataGenerator import *
import tensorflow as tf

if __name__ == '__main__':
    full_shape = [943, 1682, 31]
    base = TensorReader('/root/tensorD_f/data_out_tmp/u1.base.csv')
    base.read(full_shape=full_shape)
    with tf.Session() as sess:
        rating_tensor = sess.run(base.full_data)
    data_provider = Provider()
    data_provider.full_tensor = lambda: rating_tensor
    env = Environment(data_provider, summary_path='/tmp/tucker_ml')
    hooi = HOOI(env)
    args = HOOI.HOOI_Args(ranks=[20, 20, 20], validation_internal=1)
    hooi.build_model(args)
    hist = hooi.train(100)
    out_path = '/root/tensorD_f/data_out_tmp/python_out/hooi_ml_20.txt'
    # each output line holds the training loss and the relative residual
    with open(out_path, 'w') as out:
        for loss, rel_res in hist:
            out.write('%.10f, %.10f\n' % (loss, rel_res))
# @File    : tucker_test.py
# @Software: PyCharm Community Edition

import numpy as np
import tensorflow as tf
from numpy.random import rand
from tensorD.factorization.env import Environment
from tensorD.dataproc.provider import Provider
from tensorD.factorization.tucker import HOSVD
from tensorD.factorization.tucker import HOOI

if __name__ == '__main__':
    data_provider = Provider()
    X = np.arange(60).reshape(3, 4, 5)
    data_provider.full_tensor = lambda: X

    print('====HOSVD test====')
    hosvd_env = Environment(data_provider, summary_path='/tmp/tensord')
    hosvd = HOSVD(hosvd_env)
    args = HOSVD.HOSVD_Args(ranks=[2, 2, 2])
    hosvd.build_model(args)
    hosvd.train()
    print(hosvd.full - X)

    print('\n\n\n====HOOI test====')
    hooi_env = Environment(data_provider, summary_path='/tmp/tensord')
    hooi = HOOI(hooi_env)
    args = HOOI.HOOI_Args(ranks=[2, 2, 2], validation_internal=5)
    hooi.build_model(args)
    hooi.train(100)
    print(hooi.full - X)