def get_data(file_dir):
    session_item_file_path = file_dir + r"\session_item.txt"
    item_file_path = file_dir + r"\items.txt"
    # Read session_item_data and user_session_data ("data" below is session_item_data).
    data, user_sessions_data = get_session_item_and_user_data(session_item_file_path)
    all_data_items = list()
    item_file = open(item_file_path, 'r')
    try:
        line = item_file.readline()
        tmp = line.split(',')
        for item_str in tmp:
            if item_str != '':
                item = int(item_str)
                all_data_items.append(item)
    except Exception as e:
        print(e)
    finally:
        item_file.close()
    # Read item_session_data (build it from session_item_data and cache it if the file does not exist yet).
    item_session_file_path = file_dir + r"\item_session.txt"
    if os.path.exists(item_session_file_path):
        item_session_data = rff.get_data_lists(item_session_file_path)
    else:
        item_session_data = extract_item_data(data, all_data_items)
        # print("item_session_data: ", item_session_data)
        p2f.print_data_lists_to_file(item_session_data, item_session_file_path)
    print("finish get item session data")
    return user_sessions_data, data, item_session_data
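# extract_item_data() is referenced above but not defined in this file. A minimal
# sketch, assuming session_item_data rows look like [session, buy_items, click_only_items]
# and each output row is [item, buying_sessions, click_only_sessions], matching the
# example data shown in TestRLSO below:
def extract_item_data(session_item_data, all_data_items):
    buy_dic = {item: [] for item in all_data_items}
    click_dic = {item: [] for item in all_data_items}
    for session, buy_items, click_items in session_item_data:
        for item in buy_items:
            buy_dic[item].append(session)
        for item in click_items:
            click_dic[item].append(session)
    return [[item, buy_dic[item], click_dic[item]] for item in all_data_items]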
def read_test(test_file_path, groundtruth_path):
    data_lists = list()
    label_list = list()
    test_dic_data = list()
    test_file = open(test_file_path)
    try:
        for line in test_file:
            line = line.strip('\n')
            if line.startswith('@'):
                continue
            cur_list = list()
            tmp = line.split(',')
            session = int(tmp[0])
            item = int(tmp[1])
            dic = dict()
            dic[session] = item
            test_dic_data.append(dic)
            # Columns 2 .. len-2 are features; the last column is the label.
            for i in range(2, len(tmp) - 1):
                cur_list.append(float(tmp[i]))
            data_lists.append(cur_list)
    except Exception as e:
        print(e)
    finally:
        test_file.close()
    session_item_data = rff.get_data_lists(groundtruth_path)
    # Maps each session to its row index in session_item_data.
    session_idx_dic = dict()
    extract_session(session_item_data, session_idx_dic)
    extract_label(test_dic_data, session_item_data, session_idx_dic, label_list)
    return (array(data_lists), array(label_list), test_dic_data,
            session_item_data, session_idx_dic)
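# extract_session() and extract_label() are defined elsewhere. A minimal sketch of
# extract_session, assuming it only records the row index of each session inside
# session_item_data (which is what the comment above describes):
def extract_session(session_item_data, session_idx_dic):
    for idx, cur_data in enumerate(session_item_data):
        session_idx_dic[cur_data[0]] = idx

# A matching sketch of extract_label, under the assumption that a test
# (session, item) pair is labelled 1 when the item appears in the session's
# ground-truth buy list (row element [1]) and 0 otherwise:
def extract_label(test_dic_data, session_item_data, session_idx_dic, label_list):
    for dic in test_dic_data:
        for session, item in dic.items():
            buy_items = session_item_data[session_idx_dic[session]][1]
            label_list.append(1 if item in buy_items else 0)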
def data_selection(in_file_path, out_file_dir):
    out_data_file_path = out_file_dir + r'\session_item.txt'
    out_items_file_path = out_file_dir + r'\items.txt'
    data = rff.get_data_lists(in_file_path)
    selected_data = list()
    for cur_data in data:
        buy_items = cur_data[1]
        # Keep only sessions with fewer than two bought items.
        if len(buy_items) < 2:
            selected_data.append(cur_data)
    selected_items = extract_items(selected_data)
    p2f.print_data_lists_to_file(selected_data, out_data_file_path)
    p2f.print_list_to_file(selected_items, out_items_file_path)
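# extract_items() is used above and in test_data_selection below but not defined
# in this file. A minimal sketch, assuming it collects the distinct items that
# occur in the buy and click lists of the selected sessions:
def extract_items(selected_data):
    items = set()
    for cur_data in selected_data:
        items.update(cur_data[1])  # bought items
        items.update(cur_data[2])  # clicked-but-not-bought items
    return sorted(items)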
def sample_patition(rate, origin_file_dir, sampling_file_dir):
    # NOTE: "patition" is a typo for "partition"; the name is kept so existing callers still work.
    origin_file_path = origin_file_dir + r"\session_item.txt"
    if not os.path.exists(sampling_file_dir):
        os.makedirs(sampling_file_dir)
    data_write_path = sampling_file_dir + r"\session_item.txt"
    items_write_path = sampling_file_dir + r"\items.txt"
    # Read the full data set.
    all_data = rff.get_data_lists(origin_file_path)
    # Draw the sample.
    sample_data, sample_items = sample_partition_help(all_data, rate)
    # Write the sampled data out.
    p2f.print_data_lists_to_file(sample_data, data_write_path)
    p2f.print_list_to_file(sample_items, items_write_path)
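# sample_partition_help() is defined elsewhere. A minimal sketch, assuming it
# draws a uniform random sample of round(rate * len(all_data)) sessions and
# returns the sample together with the distinct items occurring in it:
import random

def sample_partition_help(all_data, rate):
    sample_data = random.sample(all_data, round(rate * len(all_data)))
    sample_items = set()
    for cur_data in sample_data:
        sample_items.update(cur_data[1])
        sample_items.update(cur_data[2])
    return sample_data, sorted(sample_items)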
def manual_evaluate():
    groundtruth_path = r'E:\recsyschallenge2015\mycode\ranking aggregation\classification\data\[email protected]\ranking aggregation\test\session_item.txt'
    solution_file = r'E:\recsyschallenge2015\mycode\result-data\solution.dat'
    session_item_data = rff.get_data_lists(groundtruth_path)
    session_idx_dic = dict()
    extract_session(session_item_data, session_idx_dic)
    solution = rff.get_solution(solution_file)
    # print(session_item_data)
    # print(solution)
    p1 = calc_precision_at_1(session_item_data, session_idx_dic, solution)
    p2 = calc_precision_at_2(session_item_data, session_idx_dic, solution)
    precision = calc_precision(session_item_data, session_idx_dic, solution)
    recall = calc_recall(session_item_data, session_idx_dic, solution)
    print('precision@1: ' + str(p1))
    print('precision@2: ' + str(p2))
    print('precision: ' + str(precision))
    print('recall: ' + str(recall))
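# The calc_* metrics above live in another module, and the format returned by
# rff.get_solution() is not shown here. A minimal sketch of calc_precision_at_1,
# under the assumption that `solution` maps each session to its ranked list of
# recommended items and that ground-truth row element [1] holds the bought items:
def calc_precision_at_1(session_item_data, session_idx_dic, solution):
    hit = 0
    for session, ranked_items in solution.items():
        buy_items = session_item_data[session_idx_dic[session]][1]
        if ranked_items and ranked_items[0] in buy_items:
            hit += 1
    return hit / len(solution)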
def dataFormat(data_path, write_file_path):
    data = rff.get_data_lists(data_path)
    rating_lists = list()
    buy_score = 1
    unbuy_score = 0.5
    session_set = set()
    item_set = set()  # distinct items, unordered
    for cur_data in data:
        session = cur_data[0]
        session_set.add(session)
        buy_items = cur_data[1]
        unbuy_items = cur_data[2]
        for item in buy_items:
            item_set.add(item)
            rating_lists.append([session, item, buy_score])
        for item in unbuy_items:
            item_set.add(item)
            rating_lists.append([session, item, unbuy_score])
    print('number of sessions in the current data:', len(session_set))
    print('number of distinct items in the current data:', len(item_set))
    print_rating_lists_to_file(rating_lists, write_file_path)
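# print_rating_lists_to_file() is defined elsewhere. A minimal sketch, assuming
# one comma-separated "session,item,score" line per rating, so the row
# [100, 10, 1] becomes the line "100,10,1":
def print_rating_lists_to_file(rating_lists, write_file_path):
    with open(write_file_path, 'w') as f:
        for session, item, score in rating_lists:
            f.write('{},{},{}\n'.format(session, item, score))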
def test_data_selection(out_file_dir, in_test_file_path, out_test_file_dir):
    train_items_file_path = out_file_dir + r'\items.txt'
    # A set makes the membership tests below O(1); semantics are unchanged.
    train_items = set(rff.get_int_list(train_items_file_path))
    test_data = rff.get_data_lists(in_test_file_path)
    test_data_selected = list()
    for cur_test_data in test_data:
        cur_items = cur_test_data[1] + cur_test_data[2]
        # Keep a test session only if every item in it also occurs in the training items.
        if all(item in train_items for item in cur_items):
            test_data_selected.append(cur_test_data)
    test_items_selected = extract_items(test_data_selected)
    out_test_data_file_path = out_test_file_dir + r'\session_item.txt'
    out_test_items_file_path = out_test_file_dir + r'\items.txt'
    p2f.print_data_lists_to_file(test_data_selected, out_test_data_file_path)
    p2f.print_list_to_file(test_items_selected, out_test_items_file_path)
def classifier_test():
    print("This run uses the GBRegression (gradient boosting regression) method")
    # setting
    # dataset_para = 'sampling@x@'+str(i)+'@partition'
    # Feature choice: time features: 'time'; new features: 'new'; time + new features: 'all'
    feature = 'all'
    # When using the new features, choose which of them to use.
    feature_para = (1, 2, 3, 4)

    # file directory
    # feature_dir = dataset_dir + r'\feature1'
    # ### Preprocessing: extract the part of the raw yoochoose-data that the
    # ### experiment data needs (selected by the sessions in the experiment data).
    # Input 1 (experiment data): dataset_dir\train\session_item.txt, .\test\session_item.txt
    # Input 2 (yoochoose-data): yoochoose_data_dir\yoochoose-clicks.dat, .\yoochoose-buys.dat, .\yoochoose-test.dat
    # Output: dataset_dir\yoochoose-selected\yoochoose-clicks-selected.dat, .\yoochoose-buys-selected.dat, .\yoochoose-test-selected.dat
    dataset_dir = r'I:\Papers\consumer\codeandpaper\PreprocessData\alldata\sampling@alldata@partition'
    #yoochoose_data_dir = r'E:\recsyschallenge2015\mycode\yoochoose-data'
    # Output path
    #yoochoose_selected_dir = dataset_dir + r'\yoochoose-selected'
    # Create the output folder if it does not exist.
    # if not os.path.exists(yoochoose_selected_dir):
    #     os.makedirs(yoochoose_selected_dir)
    # Preprocess2.extract_data(dataset_dir, yoochoose_data_dir, yoochoose_selected_dir)

    # ### Feature extraction
    # Input: yoochoose selected data (and the ground truth)
    # Folder that holds the extracted features
    feature_dir = r'I:\Papers\consumer\codeandpaper\PreprocessData\alldata\sampling@alldata@partition\feature'
    # feature_dir = dataset_dir + r'\feature1'
    # Create the output folder if it does not exist.
    """
    if not os.path.exists(feature_dir):
        os.makedirs(feature_dir)
    if feature == 'time':
        print('feature:', feature)
        Feature5.go(dataset_dir, feature_dir)
    elif feature == 'new':
        print('feature:', feature, 'feature_para:', feature_para)
        Feature4.go(dataset_dir, feature_dir, feature_para)
    else:
        print('feature:', feature)
        Feature6.go(dataset_dir, feature_dir)
    """

    # Read the features
    # Training file path
    train_file_path = feature_dir + r'\click-buy-train.arff'
    # Test file path
    test_file_path = feature_dir + r'\click-buy-test-BR.txt'
    groundtruth_path = dataset_dir + r'\test\session_item.txt'
    X_train, y_train = Input2.read_train(train_file_path)
    X_test, y_test, test_dic_data = Input2.read_test(test_file_path, groundtruth_path)
    groundtruth_path = dataset_dir + r'\test\session_item.txt'
    session_item_data = rff.get_data_lists(groundtruth_path)

    # Model training
    # ########## LR method
    # print('model: LogisticRegressionClassifier')
    # model_LR = LogisticRegression()
    # model_LR.fit(X_train, y_train)
    # # Take the second column, i.e. the score for class 1.
    # score_LR = model_LR.predict_proba(X_test)[:, 1]
    # session_item_score_dic_LR = extract_score_by_session2(score_LR, test_dic_data)

    # ########## GB method
    # print('model: GradientBoostingClassifier')
    # model_GB = GradientBoostingClassifier()
    # model_GB.fit(X_train, y_train)
    # # Take the second column, i.e. the score for class 1.
    # score_GB = model_GB.predict_proba(X_test)[:, 1]
    # session_item_score_dic_GB = extract_score_by_session2(score_GB, test_dic_data)

    # ########## LinearRegression method
    # print('model: LinearRegression')
    # model_LRegress = LinearRegression()
    # model_LRegress.fit(X_train, y_train)
    # score_LRegress = model_LRegress.predict(X_test)
    # session_item_score_dic_LRegress = extract_score_by_session2(score_LRegress, test_dic_data)

    # ########## GBRegression method
    print('model: GBRegressor')
    # NOTE: loss='ls' was renamed to 'squared_error' in scikit-learn >= 1.0.
    model_GBRegressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                                  max_depth=1, random_state=0, loss='ls')
    model_GBRegressor.fit(X_train, y_train)
    score_GBRegressor = model_GBRegressor.predict(X_test)
    session_item_score_dic_GBRegressor = extract_score_by_session2(
        score_GBRegressor, test_dic_data)

    # Zero: path where the experiment results are stored
    # (the folder name below had "&reg" mangled into "(R)" by entity decoding)
    res_dir = r'I:\Papers\consumer\codeandpaper\PreprocessData\alldata\result_classifier&regression'
    if not os.path.exists(res_dir):
        os.makedirs(res_dir)
    init_flag = 0
    if init_flag == 0:
        init_excel(res_dir)
        # The result table has been initialized once; do not initialize it again.
        init_flag = 1
    # Write the results to file.
    res_file_path = res_dir + r'\GBRegression.csv'
    file = open(res_file_path, 'a', newline='')
    writer = csv.writer(file)
    data = list()
    # Compute precision and MRR.
    for cur_data in session_item_data:
        precision4 = 0.0
        MRR4 = 0.0
        session = cur_data[0]
        n = len(cur_data[1])
        cur_buy_items = cur_data[1]
        print("number of bought items:", n)
        cur_item_prob = session_item_score_dic_GBRegressor[session]
        # precision@n: hits among the n highest-scored items, divided by n.
        for i in range(n):
            if cur_item_prob[i][0] in cur_buy_items:
                precision4 += 1 / n
        # MRR: reciprocal rank of the first bought item in the ranking.
        for i in range(len(cur_item_prob)):
            if cur_item_prob[i][0] in cur_buy_items:
                MRR4 += 1.0 / (i + 1)
                break
        # The session ID is an integer; writing it with '%.4f' produced values
        # like "1234.0000", so it is written as-is here.
        data = [str(session), str('%.4f' % precision4), str('%.4f' % MRR4)]
        writer.writerow(data)
    file.close()
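# extract_score_by_session2() is defined elsewhere. The evaluation loop above
# requires its result to map each session to (item, score) pairs sorted by
# descending score, so that index i is the rank. A minimal sketch under that
# assumption (test_dic_data rows are one-entry {session: item} dicts, aligned
# row-for-row with the score array, as built in read_test above):
def extract_score_by_session2(score, test_dic_data):
    session_item_score_dic = dict()
    for row_score, dic in zip(score, test_dic_data):
        for session, item in dic.items():
            session_item_score_dic.setdefault(session, []).append((item, row_score))
    for session in session_item_score_dic:
        session_item_score_dic[session].sort(key=lambda pair: pair[1], reverse=True)
    return session_item_score_dic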
            cur_all_val.append(val)
        y_axis.append(cur_all_val)
    pyplot.boxplot(y_axis, labels=x_axis)
    pyplot.show()


if __name__ == '__main__':
    # The box-plot data here are all sessions from the complete raw data set.
    main_dir = r'E:\ranking aggregation\dataset\yoochoose\Full1'
    # Extract the data files. X-axis data.
    print(1)
    sampling_para = 'extracted1'
    data_path = main_dir + '\\' + sampling_para + r"\session_item_xxxxxxxxxxxxxxxxx.txt"
    # items_path = main_dir + '\\' + sampling_para + r"\items_xxxxxxxxxxxxxxxxx.txt"
    data = rff.get_data_lists(data_path)
    # all_items = rff.get_a_list(items_path)
    # X-axis data: number of bought items per session.
    buyNum_sessionList_dic = buyNum_sessionList_statistic(data)
    print(2)
    # Raw data files
    click_file_path = main_dir + r"\yoochoose-clicks_xxxxxxxxxxxxxxxxx.dat"
    buys_file_path = main_dir + r"\yoochoose-buys_xxxxxxxxxxxxxxxxx.dat"
    # Y-axis data: length of each session (distinct items, not click counts).
    # session_len_dic = feature4.get_session_len(click_file_path)
    # Y-axis data: session duration.
    # session_lastTime_dic = feature5.get_session_lastTime(click_file_path)
    # Y-axis data: similarity (unfinished).
    # item_category_dic = get_item_category(click_file_path)
    # session_simiList_dic = calc_similarity(data, item_category_dic)
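# buyNum_sessionList_statistic() is defined elsewhere. A minimal sketch, assuming
# it groups the sessions by how many items they bought, so the result maps
# buy-count -> list of sessions (the x-axis buckets of the box plot above):
def buyNum_sessionList_statistic(data):
    buyNum_sessionList_dic = dict()
    for cur_data in data:
        session, buy_items = cur_data[0], cur_data[1]
        buyNum_sessionList_dic.setdefault(len(buy_items), []).append(session)
    return buyNum_sessionList_dic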
def TestRLSO():
    # # Example model-training input
    # # User/session data -- which sessions belong to which user
    # user_sessions_data = [[100, 101, 102], ]
    # # For each session, the bought items and the clicked-but-not-bought items,
    # # in the order they appear in the data set
    # session_item_data = [[100, [10, 11], [12, 13]],
    #                      [101, [11, 12], [10, 14]],
    #                      [102, [10, 13, 14], [11, ]]]
    # # For each item, the sessions that bought it and the sessions that clicked it
    # # without buying (item_session_data is determined by session_item_data)
    # item_session_data = [[10, [100, 102], [101, ]],
    #                      [11, [100, 101], [102, ]],
    #                      [12, [101, ], [100, ]],
    #                      [13, [102, ], [100, ]],
    #                      [14, [102, ], [101, ]]]
    # # parameter: the number of aspects (that is K)
    # aspects_num = 5

    # K
    aspects_num = 5
    # \result\yoochoose\Full\D1_partition\sampling@x@2@partition\train\likelihood.txt has entries 0-199,
    # \result\yoochoose\Full\D2_partition\sampling@x@2@partition\train\likelihood.txt has entries 0-149.
    # Number of training iterations (approximate iterations to convergence:
    # D1_partition: 200, D2_partition: 150, D3_partition: 100, D4_partition: 100,
    # D5_partition: 100, D6_partition: 50)
    ITERATION = 50  # ???????
    # Path of the current data sample
    main_dir = r"I:\Papers\consumer\codeandpaper"
    # Output path for the model parameters and the experiment results
    out_file_dir = r"I:\Papers\consumer\codeandpaper\code\result\yoochoose\Full"
    # Number of aggregate methods designed so far (only change this when the
    # aggregate methods themselves change)
    aggregate_num = 9  # number of aggregate methods ?????
    # Which part of the data set to use -- depends on the number of bought items per session
    #part_para_list = ['D1_partition', 'D2_partition', 'D3_partition', 'D4_partition', 'D5_partition', 'D6_partition']
    part_para_list = ['D1_partition']
    # Final experiment data sets (Zero: not all 50 samples are used; 10 of them
    # were selected for each of D1-D6)
    selection_index = [0,  # placeholder so the list can be indexed from 1; entry 0 is unused
                       [2, 8, 18, 19, 22, 28, 36, 40, 44, 49],
                       [3, 7, 12, 18, 21, 35, 37, 44, 46, 49],
                       [1, 6, 8, 9, 11, 24, 33, 45, 47, 50],
                       [3, 4, 5, 18, 20, 25, 29, 39, 45, 49],
                       [18, 19, 25, 28, 34, 38, 39, 40, 45, 47],
                       [5, 6, 9, 12, 24, 28, 33, 39, 40, 49]]
    for part_para in part_para_list:
        # Initialize the experiment-result table once per part
        init_flag = 0
        # The data type of the current part determines N for precision@N
        part_num = int(part_para[1])
        # The sample numbers used for the experiment on the current part
        selection = selection_index[part_num]
        for i in selection:
            number = i
            print("part_para:", part_para, ", number:", number)
            dataset_para = "sampling@x" + '@' + str(number) + '@partition'
            dataset_dir = main_dir + r"\Full" + "\\" + part_para + "\\" + dataset_para
            # i.e. \Full\D1_partition\sampling@x@1@partition
            # Raw yoochoose data path: the D1, D2, ... folders inside Full
            yoochoose_data_dir = main_dir + r"\Full"
            # Training data path of the current sample: \Full\D1_partition\sampling@x@1@partition\train
            train_file_dir = dataset_dir + r"\train"
            # Test data path of the current sample
            test_data_dir = dataset_dir + r"\test"
            # Click data files of the current training and test data
            yoochoose_selected_dir = dataset_dir + r'\yoochoose-selected'
            if not os.path.exists(yoochoose_selected_dir):
                os.makedirs(yoochoose_selected_dir)
            # Used when computing the item ICR of the training data
            # (ICR: item conversion rate, i.e. the number of sessions that bought
            # an item divided by the number of sessions that viewed it).
            # The original model does not need the ICR; only the strategies use it.
            click_file_path = yoochoose_selected_dir + r'\yoochoose-clicks-selected.dat'
            buy_file_path = yoochoose_selected_dir + r'\yoochoose-buys-selected.dat'
            # Model-parameter path
            write_file_dir = out_file_dir + "\\" + part_para + "\\" + dataset_para + r"\train"
            if not os.path.exists(write_file_dir):
                os.makedirs(write_file_dir)
            # Experiment-result path
            res_dir = out_file_dir + "\\" + part_para + r"\experiment result"
            if not os.path.exists(res_dir):
                os.makedirs(res_dir)

            # # Training (retrain the model when no trained parameter files exist yet)
            # # Generate the click data files for this sample (generated once and then
            # # saved; after that this step can be disabled to save running time).
            # print("NOTE: extract_yoochoose_selected_data has been disabled!!!")
            # # print("extract_yoochoose_selected_data..")
            # # Preprocess11.extract_data(train_file_dir, test_data_dir, yoochoose_data_dir, yoochoose_selected_dir)
            # # print("finish extract_yoochoose_selected_data..")
            # # user_sessions_data, session_item_data, item_session_data = Input.get_data(train_file_dir)
            # print("finish getting data")
            # # Start timing
            # start = time.time()
            # U, V, theta, likelihood = RLSO5.go(user_sessions_data, session_item_data,
            #                                    item_session_data, aspects_num, ITERATION)
            # c = time.time() - start
            # print("total running time: %0.2f" % c, 's')
            #
            # # Create the output folder if it does not exist
            # if not os.path.exists(write_file_dir):
            #     os.makedirs(write_file_dir)
            # print2file_list = [[theta], likelihood]
            # # Write the results to files
            # file_name = ["theta.txt", "likelihood.txt"]
            # idx = 0
            # for cur_list in print2file_list:
            #     cur_file_path = write_file_dir + "\\" + file_name[idx]
            #     p2f.print_list_to_file(cur_list, cur_file_path)
            #     idx += 1
            # U_file_path = write_file_dir + "\\" + "U.txt"
            # p2f.print_list_dict_to_file(U, U_file_path)
            # V_file_path = write_file_dir + "\\" + "V.txt"
            # p2f.print_list_dict_to_file(V, V_file_path)
            # # Plot the likelihood over the iterations
            # # pyplot.plot(range(len(likelihood)), likelihood)
            # # pyplot.show()

            # write_file_dir: model-parameter path
            # (model already trained) read the trained model parameters from file
            theta_file_path = write_file_dir + "\\" + "theta.txt"
            [theta] = rff.get_float_list(theta_file_path)  # rff: read from file
            U_file_path = write_file_dir + "\\" + "U.txt"
            U = rff.get_float_list_dict(U_file_path)
            V_file_path = write_file_dir + "\\" + "V.txt"
            V = rff.get_float_list_dict(V_file_path)

            # Testing
            # test data / ground truth
            # test_data_dir = dataset_dir + r"\test" is the test data path of the current sample,
            # e.g. dataset_dir = I:\Papers\consumer\codeandpaper\Full\D1_partition\sampling@x@1@partition
            test_data_path = test_data_dir + r'\session_item.txt'
            session_item_data = rff.get_data_lists(test_data_path)
            # Click-stream data of the test data (repeated clicks on an item are kept)
            test_click_stream_path = test_data_dir + r'\session_click_stream.txt'
            # "session_click_stream.txt" may not exist yet -- it is generated below when missing.
            # Click data file of the test data
            test_file_path = yoochoose_selected_dir + r'\yoochoose-test-selected.dat'
            # Items clicked by each test session, in click order (only distinct items,
            # no repeated clicks -- this is the difference from session_click_stream)
            dic, sessions, items_set = real_data.get_session_itemList(test_file_path)
            # Number of sessions each item occurs in (an old static feature, unused in the click-stream scenario)
            item_session_times_dic = feature4.get_item_session_times(test_file_path)
            if os.path.exists(test_click_stream_path):
                session_click_stream = rff.get_int_list_dict(test_click_stream_path)
            else:
                session_click_stream = calcCorrelation.extract_click_stream(test_file_path)
            # res_path = res_dir + '\\' + dataset_para + '.txt'
            # Initialize the result table at the start: write the row/column names etc.
            if init_flag == 0:
                init_excel(res_dir, aggregate_num)
                # The table has been initialized once; do not initialize it again.
                init_flag = 1
            # Non-early-predict experiments (the various aggregation strategies).
            # "Non-early-predict" is the experiment reported in the paper; an
            # early-predict variant was also planned, but its results were poor,
            # so it was dropped.
            Recommendation22_aggregate.generate(click_file_path, buy_file_path, test_file_path,
                                                U, V, theta, aspects_num, session_item_data,
                                                dic, item_session_times_dic, session_click_stream,
                                                res_dir, part_num, aggregate_num)
            # Non-early-predict experiments (the model's original scoring method).
            # The data below are loaded a second time because Recommendation22_aggregate.generate()
            # appears to mutate some of its arguments, which would otherwise change
            # the results of the code that follows (this effect is known).
            test_data_path = test_data_dir + r'\session_item.txt'
            session_item_data = rff.get_data_lists(test_data_path)
            # Click-stream data of the test data (repeated clicks on an item are kept)
            test_click_stream_path = test_data_dir + r'\session_click_stream.txt'
            # Click data file of the test data
            test_file_path = yoochoose_selected_dir + r'\yoochoose-test-selected.dat'
            dic, sessions, items_set = real_data.get_session_itemList(test_file_path)
            item_session_times_dic = feature4.get_item_session_times(test_file_path)
            if os.path.exists(test_click_stream_path):
                session_click_stream = rff.get_int_list_dict(test_click_stream_path)
            else:
                session_click_stream = calcCorrelation.extract_click_stream(test_file_path)
            # Non-early-predict experiment (original method)
            Recommendation11.generate(click_file_path, buy_file_path, test_file_path,
                                      U, V, theta, aspects_num, session_item_data,
                                      dic, item_session_times_dic, session_click_stream,
                                      res_dir, part_num)
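# The ICR (item conversion rate) used by the strategies above is the number of
# sessions that bought an item divided by the number of sessions that clicked it.
# A minimal sketch of that computation, assuming the standard YOOCHOOSE file layout
# (clicks: "session,timestamp,item,category"; buys: "session,timestamp,item,price,quantity");
# the helper name calc_icr is hypothetical:
def calc_icr(click_file_path, buy_file_path):
    click_sessions = dict()  # item -> sessions that clicked it
    buy_sessions = dict()    # item -> sessions that bought it
    with open(click_file_path) as f:
        for line in f:
            parts = line.strip().split(',')
            session, item = int(parts[0]), int(parts[2])
            click_sessions.setdefault(item, set()).add(session)
    with open(buy_file_path) as f:
        for line in f:
            parts = line.strip().split(',')
            session, item = int(parts[0]), int(parts[2])
            buy_sessions.setdefault(item, set()).add(session)
    return {item: len(buy_sessions.get(item, set())) / len(sessions)
            for item, sessions in click_sessions.items()}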
def classifier_test():
    #[5,6,9,12,24,28,33,39,40,49]
    L = [18, 19, 25, 28, 34, 38, 39, 40, 45, 47]
    for i in L:
        """
        D1: 2,8,18,19,22,28,36,40,44,49
        D2: 3,7,12,18,21,35,37,44,46,49
        D3: 1,6,8,9,11,24,33,45,47,50
        D4: 3,4,5,18,20,25,29,39,45,49
        """
        #if not (i==3 or i==4 or i==5 or i==18 or i==20 or i==25 or i==29 or i==39 or i==45 or i==49):
        #    continue
        print(i)
        # setting
        dataset_para = 'sampling@x@' + str(i) + '@partition'
        # Feature choice: time features: 'time'; new features: 'new'; time + new features: 'all'
        feature = 'all'
        # When using the new features, choose which of them to use.
        feature_para = (1, 2, 3, 4)

        # file directory
        # feature_dir = dataset_dir + r'\feature1'
        # ### Preprocessing: extract the part of the raw yoochoose-data that the
        # ### experiment data needs (selected by the sessions in the experiment data).
        # Input 1 (experiment data): dataset_dir\train\session_item.txt, .\test\session_item.txt
        # Input 2 (yoochoose-data): yoochoose_data_dir\yoochoose-clicks.dat, .\yoochoose-buys.dat, .\yoochoose-test.dat
        # Output: dataset_dir\yoochoose-selected\yoochoose-clicks-selected.dat, .\yoochoose-buys-selected.dat, .\yoochoose-test-selected.dat
        dataset_dir = r'F:\skyline recommendation\data4\D5_partition' + '\\' + dataset_para
        #yoochoose_data_dir = r'E:\recsyschallenge2015\mycode\yoochoose-data'
        # Output path
        #yoochoose_selected_dir = dataset_dir + r'\yoochoose-selected'
        # Create the output folder if it does not exist.
        # if not os.path.exists(yoochoose_selected_dir):
        #     os.makedirs(yoochoose_selected_dir)
        # Preprocess2.extract_data(dataset_dir, yoochoose_data_dir, yoochoose_selected_dir)

        # ### Feature extraction
        # Input: yoochoose selected data (and the ground truth)
        # Folder that holds the extracted features
        feature_dir = r'F:\skyline recommendation\data4\D5_partition' + '\\' + dataset_para + '\\feature'
        # feature_dir = dataset_dir + r'\feature1'
        # Create the output folder if it does not exist.
        """
        if not os.path.exists(feature_dir):
            os.makedirs(feature_dir)
        if feature == 'time':
            print('feature:', feature)
            Feature5.go(dataset_dir, feature_dir)
        elif feature == 'new':
            print('feature:', feature, 'feature_para:', feature_para)
            Feature4.go(dataset_dir, feature_dir, feature_para)
        else:
            print('feature:', feature)
            Feature6.go(dataset_dir, feature_dir)
        """

        # Read the features
        # Training file path
        train_file_path = feature_dir + r'\click-buy-train.arff'
        # Test file path
        test_file_path = feature_dir + r'\click-buy-test-BR.txt'
        groundtruth_path = dataset_dir + r'\test\session_item.txt'
        X_train, y_train = Input2.read_train(train_file_path)
        X_test, y_test, test_dic_data = Input2.read_test(test_file_path, groundtruth_path)
        groundtruth_path = dataset_dir + r'\test\session_item.txt'
        session_item_data = rff.get_data_lists(groundtruth_path)

        # Model training
        # NOTE: the original print said 'GradientBoostingClassifier', but the model
        # actually fitted here is LogisticRegression.
        print('model: LogisticRegression')
        model = LogisticRegression()
        model.fit(X_train, y_train)
        # Take the second column, i.e. the score for class 1.
        score = model.predict_proba(X_test)[:, 1]
        #y_predict = model.predict(X_test)
        # solution: from the predictions, generate the items bought by each session
        #solution = Solution.generate(test_dic_data, y_predict)
        session_item_score_dic = extract_score_by_session2(score, test_dic_data)
        #print("test**************************************")
        p, MRR = recommendation1.evaluate(session_item_data, session_item_score_dic)
        # p1 = calc_precision_at_1(session_score_dic_data, session_item_dic_data, session_item_data, session_idx_dic)
        # p2 = calc_precision_at_2(session_score_dic_data, session_item_dic_data, session_item_data, session_idx_dic)
        # MRR = calc_MRR(session_score_dic_data, session_item_dic_data, session_item_data, session_idx_dic)
        # print('p1: ' + ('%.4f' % p1))
        # print('p2: ' + ('%.4f' % p2))
        # print('MRR: ' + ('%.4f' % MRR))
        # print precision
        f = open("F:\\skyline recommendation\\data4\\D5_partition\\D5_MRR.csv", "a")
        writer = csv.writer(f)
        writer.writerow([i, ('%.4f' % p), ('%.4f' % MRR)])
        f.close()
        # The CSV above is initialized only once; when the code is run again,
        # the new results are appended after the previous ones.
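# recommendation1.evaluate() is defined elsewhere. A minimal sketch, assuming it
# averages per-session precision@n and MRR in the same way as the explicit loop
# in the GBRegression variant above (n = number of bought items per session):
def evaluate(session_item_data, session_item_score_dic):
    precision_sum, mrr_sum = 0.0, 0.0
    for cur_data in session_item_data:
        session, buy_items = cur_data[0], cur_data[1]
        ranked = session_item_score_dic[session]  # (item, score) pairs, best first
        n = len(buy_items)
        # precision@n: hits among the n highest-scored items, divided by n.
        precision_sum += sum(1 for item, _ in ranked[:n] if item in buy_items) / n
        # MRR: reciprocal rank of the first bought item in the ranking.
        for rank, (item, _) in enumerate(ranked, start=1):
            if item in buy_items:
                mrr_sum += 1.0 / rank
                break
    num = len(session_item_data)
    return precision_sum / num, mrr_sum / num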