Example #1
def get_data(file_dir):
    session_item_file_path = file_dir + r"\session_item.txt"
    item_file_path = file_dir + r"\items.txt"
    # Load session_item_data and user_session_data (here, `data` is session_item_data)
    # rff = read-from-file helper module; p2f presumably print-to-file (project modules)
    data, user_sessions_data = get_session_item_and_user_data(
        session_item_file_path)
    all_data_items = list()
    item_file = open(item_file_path, 'r')
    try:
        line = item_file.readline()
        tmp = line.split(',')
        for item_str in tmp:
            if item_str != '':
                item = int(item_str)
                all_data_items.append(item)
    except Exception as e:
        print(e)
    finally:
        item_file.close()
    # Load item_session_data from file if it exists; otherwise build and cache it
    item_session_file_path = file_dir + r"\item_session.txt"
    if os.path.exists(item_session_file_path):
        item_session_data = rff.get_data_lists(item_session_file_path)
    else:
        item_session_data = extract_item_data(data, all_data_items)
        # print("item_session_data: ", item_session_data)
        p2f.print_data_lists_to_file(item_session_data,
                                     item_session_file_path)
    print("finish get item session data")
    return user_sessions_data, data, item_session_data
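The helper extract_item_data is not shown in these examples. Below is a minimal sketch of what it plausibly does, assuming the layouts spelled out in Example #10's comments (session_item_data rows are [session, [bought items], [clicked-but-not-bought items]]; item_session_data rows are [item, [buying sessions], [click-only sessions]]) and that every referenced item appears in all_data_items:

def extract_item_data(session_item_data, all_data_items):
    # Hypothetical sketch: invert [session, bought, clicked-only] rows
    # into [item, buying sessions, click-only sessions] rows.
    buy_sessions = {item: [] for item in all_data_items}
    click_sessions = {item: [] for item in all_data_items}
    for session, bought, clicked_only in session_item_data:
        for item in bought:
            buy_sessions[item].append(session)
        for item in clicked_only:
            click_sessions[item].append(session)
    return [[item, buy_sessions[item], click_sessions[item]]
            for item in all_data_items]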
Example #2
def read_test(test_file_path, groundtruth_path):
    data_lists = list()
    label_list = list()
    test_dic_data = list()
    test_file = open(test_file_path)
    try:
        for line in test_file:
            line = line.strip('\n')
            # Skip ARFF-style header lines
            if line.startswith('@'):
                continue
            else:
                cur_list = list()
                tmp = line.split(',')
                session = int(tmp[0])
                item = int(tmp[1])
                dic = dict()
                dic[session] = item
                test_dic_data.append(dic)
                # Columns 2 .. len-2 hold the feature values; the last column is dropped
                for i in range(2, len(tmp) - 1):
                    cur_list.append(float(tmp[i]))
                data_lists.append(cur_list)
    except Exception as e:
        print(e)
    finally:
        test_file.close()
    session_item_data = rff.get_data_lists(groundtruth_path)
    # Store each session's position within session_item_data
    session_idx_dic = dict()
    extract_session(session_item_data, session_idx_dic)
    extract_label(test_dic_data, session_item_data, session_idx_dic,
                  label_list)
    # `array` presumably comes from numpy (from numpy import array)
    return array(data_lists), array(
        label_list), test_dic_data, session_item_data, session_idx_dic
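The test file read here is ARFF-like: lines starting with '@' are headers, and each data line is a session ID, an item ID, the feature values, and a trailing column that the `len(tmp) - 1` bound drops. A toy line parsed the same way (values made up for illustration):

line = "42,1001,0.5,1.25,3.0,1"   # session, item, features..., trailing column
tmp = line.strip('\n').split(',')
session, item = int(tmp[0]), int(tmp[1])
features = [float(x) for x in tmp[2:len(tmp) - 1]]  # last column excluded, as above
print(session, item, features)  # 42 1001 [0.5, 1.25, 3.0]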
Example #3
def data_selection(in_file_path, out_file_dir):
    out_data_file_path = out_file_dir + r'\session_item.txt'
    out_items_file_path = out_file_dir + r'\items.txt'
    data = rff.get_data_lists(in_file_path)
    selected_data = list()
    for cur_data in data:
        buy_items = cur_data[1]
        # Keep only sessions with fewer than two bought items
        if len(buy_items) < 2:
            selected_data.append(cur_data)
    selected_items = extract_items(selected_data)
    p2f.print_data_lists_to_file(selected_data, out_data_file_path)
    p2f.print_list_to_file(selected_items, out_items_file_path)
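extract_items is not shown in these examples; judging by how its result is written to items.txt, it plausibly collects every item mentioned by the selected sessions. A minimal sketch under that assumption:

def extract_items(data_lists):
    # Hypothetical sketch: gather every item in the bought or clicked-only lists
    items = set()
    for session, buy_items, unbuy_items in data_lists:
        items.update(buy_items)
        items.update(unbuy_items)
    return sorted(items)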
Example #4
def sample_partition(rate, origin_file_dir, sampling_file_dir):

    origin_file_path = origin_file_dir + r"\session_item.txt"

    if not os.path.exists(sampling_file_dir):
        os.makedirs(sampling_file_dir)
    data_write_path = sampling_file_dir + r"\session_item.txt"
    items_write_path = sampling_file_dir + r"\items.txt"

    # Read the full dataset
    all_data = rff.get_data_lists(origin_file_path)
    # Draw the sample
    sample_data, sample_items = sample_partition_help(all_data, rate)
    # Write the sampled data out
    p2f.print_data_lists_to_file(sample_data, data_write_path)
    p2f.print_list_to_file(sample_items, items_write_path)
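sample_partition_help is also not shown. A minimal sketch, assuming it draws a fraction `rate` of the sessions uniformly at random and returns them together with the items they mention:

import random

def sample_partition_help(all_data, rate):
    # Hypothetical sketch: sample len(all_data) * rate sessions, collect their items
    sample_data = random.sample(all_data, int(len(all_data) * rate))
    sample_items = set()
    for session, buy_items, unbuy_items in sample_data:
        sample_items.update(buy_items)
        sample_items.update(unbuy_items)
    return sample_data, sorted(sample_items)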
Example #5
def manual_evaluate():
    groundtruth_path = r'E:\recsyschallenge2015\mycode\ranking aggregation\classification\data\[email protected]\ranking aggregation\test\session_item.txt'
    solution_file = r'E:\recsyschallenge2015\mycode\result-data\solution.dat'

    session_item_data = rff.get_data_lists(groundtruth_path)
    session_idx_dic = dict()
    extract_session(session_item_data, session_idx_dic)
    solution = rff.get_solution(solution_file)
    # print(session_item_data)
    # print(solution)
    p1 = calc_precision_at_1(session_item_data, session_idx_dic, solution)
    p2 = calc_precision_at_2(session_item_data, session_idx_dic, solution)
    precision = calc_precision(session_item_data, session_idx_dic, solution)
    recall = calc_recall(session_item_data, session_idx_dic, solution)
    print('precision@1: ' + str(p1))
    print('precision@2: ' + str(p2))
    print('precision: ' + str(precision))
    print('recall: ' + str(recall))
Example #6
def dataFormat(data_path, write_file_path):
    data = rff.get_data_lists(data_path)
    rating_lists = list()
    buy_score = 1
    unbuy_score = 0.5
    session_set = set()
    item_set = set()  # unordered set of unique items
    for cur_data in data:
        session = cur_data[0]
        session_set.add(session)
        buy_items = cur_data[1]
        unbuy_items = cur_data[2]
        for item in buy_items:
            item_set.add(item)
            rating_lists.append([session, item, buy_score])
        for item in unbuy_items:
            item_set.add(item)
            rating_lists.append([session, item, unbuy_score])
    print('Number of sessions in current data:', len(session_set))
    print('Total number of items in current data:', len(item_set))
    print_rating_lists_to_file(rating_lists, write_file_path)
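A quick, self-contained illustration of the transformation dataFormat performs, using the [session, bought, clicked-but-not-bought] layout from Example #10's comments:

data = [[100, [10, 11], [12, 13]]]   # toy input
rating_lists = []
for session, buy_items, unbuy_items in data:
    rating_lists += [[session, item, 1] for item in buy_items]
    rating_lists += [[session, item, 0.5] for item in unbuy_items]
print(rating_lists)
# [[100, 10, 1], [100, 11, 1], [100, 12, 0.5], [100, 13, 0.5]]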
Example #7
def test_data_selection(out_file_dir, in_test_file_path, out_test_file_dir):
    train_items_file_path = out_file_dir + r'\items.txt'
    # A set makes the membership tests below O(1) instead of O(n)
    train_items = set(rff.get_int_list(train_items_file_path))
    test_data = rff.get_data_lists(in_test_file_path)
    test_data_selected = list()

    # Keep only test sessions whose items (bought and clicked) all appear in training
    for cur_test_data in test_data:
        cur_items = cur_test_data[1] + cur_test_data[2]
        if all(item in train_items for item in cur_items):
            test_data_selected.append(cur_test_data)
    test_items_selected = extract_items(test_data_selected)
    out_test_data_file_path = out_test_file_dir + r'\session_item.txt'
    out_test_items_file_path = out_test_file_dir + r'\items.txt'
    p2f.print_data_lists_to_file(test_data_selected, out_test_data_file_path)
    p2f.print_list_to_file(test_items_selected, out_test_items_file_path)
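A toy run of the selection logic, with made-up IDs: a test session survives only if every item it touched is known from training:

train_items = {10, 11, 12}
test_data = [[200, [10], [11, 12]],  # every item known: kept
             [201, [10], [13]]]      # item 13 unseen in training: dropped
kept = [d for d in test_data if all(it in train_items for it in d[1] + d[2])]
print([d[0] for d in kept])  # [200]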
Example #8
def classifier_test():

    print("This is the GBRegression regression method")
    # setting
    # dataset_para = 'sampling@x@'+str(i)+'@partition'
    # Feature selection: time features: time; new features: new; time + new features: all
    feature = 'all'

    # If using new features, choose which ones
    feature_para = (1, 2, 3, 4)

    # file directory
    # feature_dir = dataset_dir + r'\feature1'

    # ### Preprocessing: extract the data the experiments need from the raw
    # ### yoochoose-data (extraction is keyed on the experiment sessions)
    # Input 1: (experiment data) dataset_dir\train\session_item.txt  .\test\session_item.txt
    # Input 2: (yoochoose-data) yoochoose_data_dir\yoochoose-clicks.dat  .\yoochoose-buys.dat  .\yoochoose-test.dat
    # Output: dataset_dir\yoochoose-selected\yoochoose-clicks-selected.dat  .\yoochoose-buys-selected.dat  .\yoochoose-test-selected.dat
    dataset_dir = r'I:\Papers\consumer\codeandpaper\PreprocessData\alldata\sampling@alldata@partition'
    #yoochoose_data_dir = r'E:\recsyschallenge2015\mycode\yoochoose-data'
    # Output path
    #yoochoose_selected_dir = dataset_dir + r'\yoochoose-selected'
    # Create the output folder if it does not exist
    # if not os.path.exists(yoochoose_selected_dir):
    #     os.makedirs(yoochoose_selected_dir)
    # Preprocess2.extract_data(dataset_dir, yoochoose_data_dir, yoochoose_selected_dir)

    # ### Feature extraction
    # Input: yoochoose selected data (and groundtruth)
    # Folder for the extracted features

    feature_dir = r'I:\Papers\consumer\codeandpaper\PreprocessData\alldata\sampling@alldata@partition\feature'
    # feature_dir = dataset_dir + r'\feature1'
    # Create the output folder if it does not exist
    """
    if not os.path.exists(feature_dir):
        os.makedirs(feature_dir)
    if feature == 'time':
        print('feature:', feature)
        Feature5.go(dataset_dir, feature_dir)
    elif feature == 'new':
        print('feature:', feature, 'feature_para:', feature_para)
        Feature4.go(dataset_dir, feature_dir, feature_para)
    else:
        print('feature:', feature)
        Feature6.go(dataset_dir, feature_dir)
    """
    # Read the features
    # Training file path
    train_file_path = feature_dir + r'\click-buy-train.arff'
    # Test file path
    test_file_path = feature_dir + r'\click-buy-test-BR.txt'
    groundtruth_path = dataset_dir + r'\test\session_item.txt'

    X_train, y_train = Input2.read_train(train_file_path)
    X_test, y_test, test_dic_data = Input2.read_test(test_file_path,
                                                     groundtruth_path)

    groundtruth_path = dataset_dir + r'\test\session_item.txt'
    session_item_data = rff.get_data_lists(groundtruth_path)

    # Model training

    # ########## The LR method
    # print('model: LogisticRegressionClassifier')
    # model_LR = LogisticRegression()  # LogisticRegression()
    # model_LR.fit(X_train, y_train)
    # # Take the second column, i.e. the scores for class 1
    # score_LR = model_LR.predict_proba(X_test)[:, 1]
    # session_item_score_dic_LR = extract_score_by_session2(score_LR, test_dic_data)

    # ########## The GB method
    # print('model: GradientBoostingClassifier')
    # model_GB = GradientBoostingClassifier()  # GradientBoostingClassifier()
    # model_GB.fit(X_train, y_train)
    # # Take the second column, i.e. the scores for class 1
    # score_GB = model_GB.predict_proba(X_test)[:, 1]
    # session_item_score_dic_GB = extract_score_by_session2(score_GB, test_dic_data)

    # ########## The LinearRegression method
    # print('model: LinearRegression')
    # model_LRegress = LinearRegression()
    # model_LRegress.fit(X_train, y_train)
    # # predict() returns the regression scores directly
    # score_LRegress = model_LRegress.predict(X_test)
    # session_item_score_dic_LRegress = extract_score_by_session2(score_LRegress, test_dic_data)

    ########## The GBRegression method
    print('model: GBRegressor')
    # Note: loss='ls' was renamed to 'squared_error' in newer scikit-learn releases
    model_GBRegressor = GradientBoostingRegressor(n_estimators=100,
                                                  learning_rate=0.1,
                                                  max_depth=1,
                                                  random_state=0,
                                                  loss='ls')
    model_GBRegressor.fit(X_train, y_train)
    # Regression scores for the test items (a regressor's predict() returns the
    # scores directly; there is no class-1 column as with the classifiers above)
    score_GBRegressor = model_GBRegressor.predict(X_test)
    session_item_score_dic_GBRegressor = extract_score_by_session2(
        score_GBRegressor, test_dic_data)

    # Zero: output path for experiment results
    res_dir = r'I:\Papers\consumer\codeandpaper\PreprocessData\alldata\result_classifier&regression'
    if not os.path.exists(res_dir):
        os.makedirs(res_dir)

    init_flag = 0
    if init_flag == 0:
        init_excel(res_dir)
        # The spreadsheet is now initialized; do not initialize it again
        init_flag = 1

    # Write the results to a file
    res_file_path = res_dir + r'\GBRegression.csv'
    file = open(res_file_path, 'a', newline='')
    writer = csv.writer(file)
    data = list()

    # Compute precision and MRR
    for cur_data in session_item_data:

        precision4 = 0.0
        MRR4 = 0.0

        session = cur_data[0]
        n = len(cur_data[1])
        cur_buy_items = cur_data[1]
        print("number of purchases:", n)

        cur_item_prob = session_item_score_dic_GBRegressor[session]
        # precision@n: fraction of the top-n ranked items that were actually bought
        for i in range(n):
            if cur_item_prob[i][0] in cur_buy_items:
                precision4 += 1 / n
        # MRR: reciprocal rank of the first bought item in the ranking
        for i in range(len(cur_item_prob)):
            if cur_item_prob[i][0] in cur_buy_items:
                MRR4 += 1.0 / (i + 1)
                break

        data = [
            str(session),
            str('%.4f' % precision4),
            str('%.4f' % MRR4)
        ]
        writer.writerow(data)

    file.close()
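The per-session metric loops above are easy to get wrong by one, so here is the same computation as a standalone, testable sketch; ranked_items stands in for cur_item_prob (a list of (item, score) pairs assumed sorted by descending score) and bought for cur_buy_items:

def precision_and_mrr(ranked_items, bought):
    # precision@n over the top-n ranked items, n = number of bought items
    n = len(bought)
    hits = sum(1 for item, _ in ranked_items[:n] if item in bought)
    precision = hits / n if n else 0.0
    # MRR: reciprocal rank of the first bought item in the ranking
    mrr = 0.0
    for rank, (item, _) in enumerate(ranked_items, start=1):
        if item in bought:
            mrr = 1.0 / rank
            break
    return precision, mrr

print(precision_and_mrr([(7, .9), (3, .8), (5, .1)], {3, 5}))  # (0.5, 0.5)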
Example #9
            cur_all_val.append(val)
        y_axis.append(cur_all_val)
    pyplot.boxplot(y_axis, labels=x_axis)
    pyplot.show()


if __name__ == '__main__':
    # The boxplot here covers all data from the full original dataset
    main_dir = r'E:\ranking aggregation\dataset\yoochoose\Full1'

    # Extracted data file; x-axis data
    print(1)
    sampling_para = 'extracted1'
    data_path = main_dir + '\\' + sampling_para + r"\session_item_xxxxxxxxxxxxxxxxx.txt"
    # items_path = main_dir + '\\' + sampling_para + r"\items_xxxxxxxxxxxxxxxxx.txt"
    data = rff.get_data_lists(data_path)
    # # all_items = rff.get_a_list(items_path)
    # x-axis data: number of purchased items
    buyNum_sessionList_dic = buyNum_sessionList_statistic(data)

    print(2)
    # Raw data files
    click_file_path = main_dir + r"\yoochoose-clicks_xxxxxxxxxxxxxxxxx.dat"
    buys_file_path = main_dir + r"\yoochoose-buys_xxxxxxxxxxxxxxxxx.dat"
    # y-axis data: length of each session (distinct items, not click count)
    # session_len_dic = feature4.get_session_len(click_file_path)
    # y-axis data: session duration
    # session_lastTime_dic = feature5.get_session_lastTime(click_file_path)
    # y-axis data: similarity (unfinished)
    # item_category_dic = get_item_category(click_file_path)
    # session_simiList_dic = calc_similarity(data, item_category_dic)
Example #10
def TestRLSO():

    # # Example of the model-training inputs
    # # User/session data: which sessions belong to which user
    # user_sessions_data = [[100, 101, 102],]
    # # For each session, the items bought and the items clicked but not bought,
    # # in the order they appear in the dataset
    # session_item_data = [[100, [10, 11], [12, 13]],
    #                      [101, [11, 12], [10, 14]],
    #                      [102, [10, 13, 14], [11, ]]]
    # # For each item, the sessions that bought it and the sessions that clicked it
    # # without buying (item_session_data is determined by session_item_data)
    # item_session_data = [[10, [100, 102], [101, ]],
    #                      [11, [100, 101], [102, ]],
    #                      [12, [101, ], [100, ]],
    #                      [13, [102, ], [100, ]],
    #                      [14, [102, ], [101, ]]]
    # # parameter: the number of aspects (that's K)
    # aspects_num = 5

    # K
    aspects_num = 5

    # likelihood.txt in \result\yoochoose\Full\D1_partition\sampling@x@2@partition\train has entries 0-199,
    # likelihood.txt in \result\yoochoose\Full\D2_partition\sampling@x@2@partition\train has entries 0-149,
    # Number of training iterations (approximate counts to convergence:
    # D1_partition: 200, D2_partition: 150, D3_partition: 100, D4_partition: 100,
    # D5_partition: 100, D6_partition: 50)
    ITERATION = 50  # ???????

    # Path of the current data sample
    main_dir = r"I:\Papers\consumer\codeandpaper"

    # Output path for model parameters and experiment results
    out_file_dir = r"I:\Papers\consumer\codeandpaper\code\result\yoochoose\Full"

    # Total number of aggregate methods designed so far (only needs updating
    # when the aggregate methods change)
    aggregate_num = 9    ##### number of aggregate methods ?????

    # Which part of the dataset to use, determined by the number of purchased items per session
    #part_para_list = ['D1_partition', 'D2_partition', 'D3_partition', 'D4_partition', 'D5_partition', 'D6_partition']
    part_para_list = ['D1_partition']
    # Final experiment datasets (Zero: not all 50 samples are used; 10 were chosen from each of D1-D6)
    selection_index = [0,  # like array indices this starts at 0, but entry 0 is unused
                       [2, 8, 18, 19, 22, 28, 36, 40, 44, 49],
                       [3, 7, 12, 18, 21, 35, 37, 44, 46, 49],
                       [1, 6, 8, 9, 11, 24, 33, 45, 47, 50],
                       [3, 4, 5, 18, 20, 25, 29, 39, 45, 49],
                       [18, 19, 25, 28, 34, 38, 39, 40, 45, 47],
                       [5, 6, 9, 12, 24, 28, 33, 39, 40, 49]]

    for part_para in part_para_list:
        # Flag for initializing the experiment-result spreadsheet
        init_flag = 0

        # Data type of the current dataset; decides N when computing precision@N
        part_num = int(part_para[1])
        # Sample numbers selected for experiments on the current dataset
        selection = selection_index[part_num]

        for i in selection:
            number = i
            print("part_para:", part_para, ",   number:", number)
            dataset_para = "sampling@x" + '@' + str(number) + '@partition'
            dataset_dir = main_dir + r"\Full" + "\\" + part_para + "\\" + dataset_para
            # i.e. \Full\D1_partition\sampling@x@1@partition

            # Path of the raw yoochoose data: D1, D2, ... inside the Full folder
            yoochoose_data_dir = main_dir + r"\Full"
            # Training-data path of the current sample: \Full\D1_partition\sampling@x@1@partition\train
            train_file_dir = dataset_dir + r"\train"
            # Test-data path of the current sample
            test_data_dir = dataset_dir + r"\test"
            # Click-data files for the current training and test data
            yoochoose_selected_dir = dataset_dir + r'\yoochoose-selected'
            if not os.path.exists(yoochoose_selected_dir):
                os.makedirs(yoochoose_selected_dir)

            # Used when computing item ICR on the training data
            # (ICR: item conversion rate = buyers of the item / viewers of the item)
            # The original model does not use ICR; only the strategies do

            click_file_path = yoochoose_selected_dir + r'\yoochoose-clicks-selected.dat'
            buy_file_path = yoochoose_selected_dir + r'\yoochoose-buys-selected.dat'

            # Model-parameter path
            write_file_dir = out_file_dir + "\\" + part_para + "\\" + dataset_para + r"\train"
            if not os.path.exists(write_file_dir):
                os.makedirs(write_file_dir)

            # Experiment-result path
            res_dir = out_file_dir + "\\" + part_para + r"\experiment result"
            if not os.path.exists(res_dir):
                os.makedirs(res_dir)

            # # Training (retrain the model if no trained parameter files exist yet)
            # # Generate the click-data files for this dataset (only needs to run once,
            # # the result is saved; once generated, this step can be disabled to save time)
            # print("NOTE: extract_yoochoose_selected_data has been force-disabled!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            # # print("extract_yoochoose_selected_data..")
            # # Preprocess11.extract_data(train_file_dir, test_data_dir, yoochoose_data_dir, yoochoose_selected_dir)
            # # print("finish extract_yoochoose_selected_data..")
            #
            # user_sessions_data, session_item_data, item_session_data = Input.get_data(train_file_dir)
            # print("finish getting data")
            # # Start timing
            # start = time.time()
            # U, V, theta, likelihood = RLSO5.go(user_sessions_data, session_item_data, item_session_data, aspects_num, ITERATION)
            # c = time.time() - start
            # print("Total running time: %0.2f" % c, 's')
            #
            # # Create the output folder if it does not exist
            # if not os.path.exists(write_file_dir):
            #     os.makedirs(write_file_dir)
            # print2file_list = [[theta], likelihood]
            # # Write the results to files
            # file_name = ["theta.txt", "likelihood.txt"]
            # idx = 0
            # for cur_list in print2file_list:
            #     cur_file_path = write_file_dir + "\\" + file_name[idx]
            #     p2f.print_list_to_file(cur_list, cur_file_path)
            #     idx += 1
            # U_file_path = write_file_dir + "\\" + "U.txt"
            # p2f.print_list_dict_to_file(U, U_file_path)
            # V_file_path = write_file_dir + "\\" + "V.txt"
            # p2f.print_list_dict_to_file(V, V_file_path)
            # # Plot the likelihood over iterations
            # # pyplot.plot(range(len(likelihood)), likelihood)
            # # pyplot.show()


            # write_file_dir: model-parameter path

            # (Model already trained) read the trained model parameters from files
            theta_file_path = write_file_dir + "\\" + "theta.txt"
            [theta] = rff.get_float_list(theta_file_path)   # rff: read from file
            U_file_path = write_file_dir + "\\" + "U.txt"
            U = rff.get_float_list_dict(U_file_path)
            V_file_path = write_file_dir + "\\" + "V.txt"
            V = rff.get_float_list_dict(V_file_path)

            # Testing
            # Test data / groundtruth

            # test_data_dir = dataset_dir + r"\test" is the test-data path of the current sample
            # dataset_dir = I:\Papers\consumer\codeandpaper\Full\D1_partition\sampling@x@1@partition
            test_data_path = test_data_dir + r'\session_item.txt'
            session_item_data = rff.get_data_lists(test_data_path)
            # Click-stream data of the test data (repeated clicks on an item are kept)
            test_click_stream_path = test_data_dir + r'\session_click_stream.txt'
            # If session_click_stream.txt is not found, it is generated below
            # Click-data file of the test data
            test_file_path = yoochoose_selected_dir + r'\yoochoose-test-selected.dat'
            # Items clicked by each test session, stored in click order (distinct items
            # only; repeated clicks are ignored, which distinguishes this from session_click_stream)
            dic, sessions, items_set = real_data.get_session_itemList(test_file_path)
            # How often each item appears across sessions (an old static feature,
            # unused in the click-stream scenario)
            item_session_times_dic = feature4.get_item_session_times(test_file_path)
            if os.path.exists(test_click_stream_path):
                session_click_stream = rff.get_int_list_dict(test_click_stream_path)
            else:
                session_click_stream = calcCorrelation.extract_click_stream(test_file_path)

            # res_path = res_dir + '\\' + dataset_para + '.txt'

            # Initialize the experiment-result spreadsheet on the first pass:
            # write row/column names and other header info
            if init_flag == 0:
                init_excel(res_dir, aggregate_num)
                # The spreadsheet is now initialized; do not initialize it again
                init_flag = 1

            # Non-early-predict experiments (the various aggregation strategies).
            # The non-early-predict part is what our paper reports; an early-predict
            # variant was also attempted but dropped because it did not work well.
            Recommendation22_aggregate.generate(click_file_path, buy_file_path, test_file_path,
                                                U, V, theta, aspects_num, session_item_data, dic, item_session_times_dic,
                                                session_click_stream, res_dir, part_num, aggregate_num)

            # (Non-early-predict experiments, original model scoring)
            # The data below is reloaded because Recommendation22_aggregate.generate()
            # appears to mutate some of its arguments, which would otherwise change
            # the results of the code that runs afterwards (this is known to happen).
            # Testing
            # Test data / groundtruth
            test_data_path = test_data_dir + r'\session_item.txt'
            session_item_data = rff.get_data_lists(test_data_path)
            # Click-stream data of the test data (repeated clicks on an item are kept)
            test_click_stream_path = test_data_dir + r'\session_click_stream.txt'
            # Click-data file of the test data
            test_file_path = yoochoose_selected_dir + r'\yoochoose-test-selected.dat'
            # Items clicked by each test session, in click order (distinct items only)
            dic, sessions, items_set = real_data.get_session_itemList(test_file_path)
            # How often each item appears across sessions (unused in the click-stream scenario)
            item_session_times_dic = feature4.get_item_session_times(test_file_path)
            if os.path.exists(test_click_stream_path):
                session_click_stream = rff.get_int_list_dict(test_click_stream_path)
            else:
                session_click_stream = calcCorrelation.extract_click_stream(test_file_path)

            # Non-early-predict experiments (original method)
            Recommendation11.generate(click_file_path, buy_file_path, test_file_path,
                                      U, V, theta, aspects_num, session_item_data, dic, item_session_times_dic,
                                      session_click_stream, res_dir, part_num)
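Reloading all the test data from disk works around the mutation, but if generate() mutating its arguments is the only problem, passing deep copies is a cheaper defense. A generic sketch using only the standard library (the wrapper name is ours, not the project's):

import copy

def call_without_mutation(fn, *args, **kwargs):
    # Pass deep copies so the callee cannot mutate the caller's data
    return fn(*[copy.deepcopy(a) for a in args],
              **{k: copy.deepcopy(v) for k, v in kwargs.items()})

# e.g. call_without_mutation(Recommendation22_aggregate.generate, click_file_path, ...)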
Example #11
def classifier_test():
    #[5,6,9,12,24,28,33,39,40,49]#
    L = [18, 19, 25, 28, 34, 38, 39, 40, 45, 47]
    for i in L:
        """
        D1:
        2,8,18,19,22,28,36,40,44,49

        D2:
        3,7,12,18,21,35,37,44,46,49

        D3:
        1,6,8,9,11,24,33,45,47,50

        D4:
        3,4,5,18,20,25,29,39,45,49

        """
        #if not (i==3 or i==4 or i==5 or i==18 or i==20 or i==25 or i==29 or i==39 or i==45 or i==49):
        #     continue
        print(i)
        # setting
        dataset_para = 'sampling@x@' + str(i) + '@partition'
        # Feature selection: time features: time; new features: new; time + new features: all
        feature = 'all'

        # If using new features, choose which ones
        feature_para = (1, 2, 3, 4)

        # file directory
        # feature_dir = dataset_dir + r'\feature1'

        # ### Preprocessing: extract the data the experiments need from the raw
        # ### yoochoose-data (extraction is keyed on the experiment sessions)
        # Input 1: (experiment data) dataset_dir\train\session_item.txt  .\test\session_item.txt
        # Input 2: (yoochoose-data) yoochoose_data_dir\yoochoose-clicks.dat  .\yoochoose-buys.dat  .\yoochoose-test.dat
        # Output: dataset_dir\yoochoose-selected\yoochoose-clicks-selected.dat  .\yoochoose-buys-selected.dat  .\yoochoose-test-selected.dat
        dataset_dir = r'F:\skyline recommendation\data4\D5_partition' + '\\' + dataset_para
        #yoochoose_data_dir = r'E:\recsyschallenge2015\mycode\yoochoose-data'
        # Output path
        #yoochoose_selected_dir = dataset_dir + r'\yoochoose-selected'
        # Create the output folder if it does not exist
        # if not os.path.exists(yoochoose_selected_dir):
        #     os.makedirs(yoochoose_selected_dir)
        # Preprocess2.extract_data(dataset_dir, yoochoose_data_dir, yoochoose_selected_dir)

        # ### Feature extraction
        # Input: yoochoose selected data (and groundtruth)
        # Folder for the extracted features

        feature_dir = r'F:\skyline recommendation\data4\D5_partition' + '\\' + dataset_para + '\\feature'
        # feature_dir = dataset_dir + r'\feature1'
        # Create the output folder if it does not exist
        """
        if not os.path.exists(feature_dir):
            os.makedirs(feature_dir)
        if feature == 'time':
            print('feature:', feature)
            Feature5.go(dataset_dir, feature_dir)
        elif feature == 'new':
            print('feature:', feature, 'feature_para:', feature_para)
            Feature4.go(dataset_dir, feature_dir, feature_para)
        else:
            print('feature:', feature)
            Feature6.go(dataset_dir, feature_dir)
        """
        # Read the features
        # Training file path
        train_file_path = feature_dir + r'\click-buy-train.arff'
        # Test file path
        test_file_path = feature_dir + r'\click-buy-test-BR.txt'
        groundtruth_path = dataset_dir + r'\test\session_item.txt'

        X_train, y_train = Input2.read_train(train_file_path)
        X_test, y_test, test_dic_data = Input2.read_test(
            test_file_path, groundtruth_path)

        groundtruth_path = dataset_dir + r'\test\session_item.txt'
        session_item_data = rff.get_data_lists(groundtruth_path)

        # Model training
        print('model: LogisticRegression')
        model = LogisticRegression()  # LogisticRegression()
        model.fit(X_train, y_train)

        # Take the second column, i.e. the scores for class 1
        score = model.predict_proba(X_test)[:, 1]

        #y_predict = model.predict(X_test)
        # solution: from the predictions, generate the purchased items for each session
        #solution = Solution.generate(test_dic_data, y_predict)
        session_item_score_dic = extract_score_by_session2(
            score, test_dic_data)
        #print("test**************************************")
        p, MRR = recommendation1.evaluate(session_item_data,
                                          session_item_score_dic)
        # p1 = calc_precision_at_1(session_score_dic_data, session_item_dic_data, session_item_data, session_idx_dic)
        # p2 = calc_precision_at_2(session_score_dic_data, session_item_dic_data, session_item_data, session_idx_dic)
        # MRR = calc_MRR(session_score_dic_data, session_item_dic_data, session_item_data, session_idx_dic)
        # print('p1: ' + ('%.4f' % p1))
        # print('p2: ' + ('%.4f' % p2))
        # print('MRR: ' + ('%.4f' % MRR))
        #print precision
        f = open("F:\\skyline recommendation\\data4\\D5_partition\\D5_MRR.csv",
                 "a")

        writer = csv.writer(f)

        writer.writerow([i, ('%.4f' % p), ('%.4f' % MRR)])
        f.close()
        # Note: the CSV is only initialized once; re-running the code appends
        # new results after the previous run's output.