Python LetorDataset示例，dataset.LetorDataset.LetorDataset Python示例

示例#1

0

显示文件

文件： run_PDGD_groups_mixed.py 项目： ArvinZhuang/OLTR


if __name__ == "__main__":
    # Experiment configuration. These stay as module-level names because the
    # worker function `job` is defined elsewhere and may read them as globals.
    FEATURE_SIZE = 105
    NUM_INTERACTION = 2000000
    click_models = ["informational", "navigational", "perfect"]
    # click_models = ["perfect"]
    Learning_rate = 0.1

    num_groups = 4

    dataset_path = "datasets/clueweb09_intent_change.txt"
    intent_path = "intents"
    output_fold = "results/SDBN/PDGD/group_mixed_2m"

    # Load the training data once; the same object is handed to every worker.
    train_set = LetorDataset(dataset_path, FEATURE_SIZE,
                             query_level_norm=True, binary_label=True)

    # Intent files are named 1.txt through 4.txt inside `intent_path`.
    intent_paths = ["{}/{}.txt".format(intent_path, n) for n in range(1, 5)]

    # Launch one process per click model. The literal 1 is passed as the
    # fourth positional argument of `job`; its meaning is defined by `job`,
    # which is not visible here.
    for click_model in click_models:
        worker_args = (click_model, Learning_rate, NUM_INTERACTION, 1,
                       train_set, intent_paths, output_fold, num_groups)
        worker = mp.Process(target=job, args=worker_args)
        worker.start()

示例#2

0

显示文件

                .format(f, model_type, 2, r), "wb") as fp:
            pickle.dump(cndcg_scores2, fp)
        with open(
                "../results/reduction/mq2007/PDGD/fold{}/{}_ranker{}_run{}_final_weight.txt"
                .format(f, model_type, 2, r), "wb") as fp:
            pickle.dump(final_weight2, fp)
        print("PDGD tau{} fold{} {} run{} finished!".format(
            tau, f, model_type, r))


if __name__ == "__main__":
    # Experiment configuration. Names are kept as module-level globals since
    # `job` (defined elsewhere) may refer to them directly.
    FEATURE_SIZE = 46
    NUM_INTERACTION = 10000
    # click_models = ["informational", "navigational", "perfect"]
    click_model = "informational"
    Learning_rate = 0.1
    dataset_fold = "../datasets/2007_mq_dataset"
    output_fold = "mq2007"
    # taus = [0.1, 0.5, 1.0, 5.0, 10.0]
    tau = 1

    # Folds 1 through 5: load each fold's train/test split and start one
    # worker process for it.
    for f in range(1, 6):
        fold_dir = "{}/Fold{}".format(dataset_fold, f)
        training_path = fold_dir + "/train.txt"
        test_path = fold_dir + "/test.txt"
        train_set = LetorDataset(training_path, FEATURE_SIZE)
        test_set = LetorDataset(test_path, FEATURE_SIZE)

        worker = mp.Process(target=job,
                            args=(click_model, f, train_set, test_set, tau))
        worker.start()

示例#3

0

显示文件

文件： run_PDGD_batch_update.py 项目： ArvinZhuang/OLTR

                    if click_model == "perfect":
                        pc = [0.0, 0.2, 0.4, 0.8, 1.0]
                        ps = [0.0, 0.0, 0.0, 0.0, 0.0]
                    elif click_model == "navigational":
                        pc = [0.05, 0.3, 0.5, 0.7, 0.95]
                        ps = [0.2, 0.3, 0.5, 0.7, 0.9]
                    elif click_model == "informational":
                        pc = [0.4, 0.6, 0.7, 0.8, 0.9]
                        ps = [0.1, 0.2, 0.3, 0.4, 0.5]

                for f in fold_range:
                    training_path = "{}/Fold{}/train.txt".format(
                        dataset_fold, f)
                    test_path = "{}/Fold{}/test.txt".format(dataset_fold, f)
                    train_set = LetorDataset(training_path,
                                             FEATURE_SIZE,
                                             query_level_norm=norm,
                                             cache_root="../datasets/cache")
                    test_set = LetorDataset(test_path,
                                            FEATURE_SIZE,
                                            query_level_norm=norm,
                                            cache_root="../datasets/cache")

                    print(dataset_fold, click_model, f, batch_size)
                    p = mp.Process(target=job,
                                   args=(click_model, f, train_set, test_set,
                                         output_fold, batch_size, pc, ps))
                    p.start()
                    processors.append(p)
            for p in processors:
                p.join()

示例#4

0

显示文件

文件： generate_click_dataset.py 项目： ljxalpha/OLTR-AIStudio

        if cm.name == "Mixed":
            line += s_name + " "
        line += "\n"
        f.write(line)
        # if index % 10000 == 0:
        # print("write %d/%d queries" % (index, num_queries))
    f.close()
    print(cm.name, "unseen_set finished!")


# %%
if __name__ == "__main__":
    # %%
    train_path = "../datasets/ltrc_yahoo/set1.train.txt"
    print("loading training set.......")
    train_set = LetorDataset(train_path, 700)

    # %%
    # pc = [0.4, 0.6, 0.7, 0.8, 0.9]
    # ps = [0.1, 0.2, 0.3, 0.4, 0.5]
    pc = [0.05, 0.3, 0.5, 0.7, 0.95]
    ps = [0.2, 0.3, 0.5, 0.7, 0.9]

    # click_models = [DCTR(pc), SDBN(pc, ps), UBM(pc), SDBN_reverse(pc, ps)]
    # Mixed_model = Mixed(click_models)
    # simulators = [DCTR(pc), SDBN(pc, ps), UBM(pc), SDBN_reverse(pc, ps), Mixed_model]
    simulators = [SDBN_reverse(pc, ps)]

    for id in range(2, 16):
        pool = []
        for cm in simulators:

示例#5

0

显示文件

文件： run_PMGD.py 项目： ArvinZhuang/OLTR

    NUM_INTERACTION = 100000
    click_models = ["informational", "perfect"]
    # click_models = ["perfect"]
    # dataset_fold = "../datasets/MSLR10K"
    # dataset_fold = "../datasets/2007_mq_dataset"
    output_fold = "results/istella/DBGD"
    # output_fold = "results/mslr10k/DBGD"
    # output_fold = "results/yahoo/PMGD"
    # taus = [0.1, 0.5, 1.0, 5.0, 10.0]
    alpha = 0.01
    delta = 1
    num_rankers = 1

    # for 5 folds
    for f in range(1, 2):
        # training_path = "{}/Fold{}/train.txt".format(dataset_fold, f)
        # test_path = "{}/Fold{}/test.txt".format(dataset_fold, f)
        training_path = "../datasets/istella/train.txt"
        test_path = "../datasets/istella/test.txt"
        print("loading dataset.....")
        # training_path = "../datasets/ltrc_yahoo/set1.train.txt"
        # test_path = "../datasets/ltrc_yahoo/set1.test.txt"
        train_set = LetorDataset(training_path, FEATURE_SIZE, query_level_norm=True)
        test_set = LetorDataset(test_path, FEATURE_SIZE, query_level_norm=True)

        # for 3 click_models
        for click_model in click_models:
            p = mp.Process(target=job, args=(click_model, f, train_set, test_set,
                                             delta, alpha, FEATURE_SIZE, num_rankers, output_fold))
            p.start()

示例#6

0

显示文件

文件： run_LSH_multiranker.py 项目： hscells/OLTR

 Learning_rate = 0.1
 dataset_fold = "../datasets/2007_mq_dataset"
 output_fold = "mq2007"
 tau = 1.0
 # for 5 folds
 # for f in range(1, 6):
 #     training_path = "{}/Fold{}/train.txt".format(dataset_fold, f)
 #     test_path = "{}/Fold{}/test.txt".format(dataset_fold, f)
 #     train_set = LetorDataset(training_path, FEATURE_SIZE)
 #     test_set = LetorDataset(test_path, FEATURE_SIZE)
 #
 #     # for 3 click_models
 #     for click_model in click_models:
 #         mp.Process(target=job, args=(click_model, f, train_set, test_set, tau)).start()
 training_path = "{}/Fold{}/train.txt".format(dataset_fold, 1)
 train_set = LetorDataset(training_path, FEATURE_SIZE)
 lsh = LSHash(2, FEATURE_SIZE, num_hashtables=2)
 ranker1 = []
 ranker2 = []
 ranker3 = []
 ranker4 = []
 print(len(train_set.get_all_querys()))
 for q in train_set.get_all_querys():
     query = np.mean(train_set.get_all_features_by_query(q), axis=0)
     code = lsh._hash(lsh.uniform_planes[0], query)
     print(code)
     print(lsh._hash(lsh.uniform_planes[1], query))
     print()
     if code == '00':
         ranker1.append(q)
     elif code == '01':

示例#7

0

显示文件

                    exist_ok=True)  # create directory if not exist
        with open(
                "{}/fold{}/{}_run{}_ndcg.txt".format(output_fold, f, model_type, r),
                "wb") as fp:
            pickle.dump(ndcg_scores, fp)
        with open(
                "{}/fold{}/{}_run{}_cndcg.txt".format(output_fold, f, model_type, r),
                "wb") as fp:
            pickle.dump(cndcg_scores, fp)
        with open(
                "{}/fold{}/{}_run{}_weights.txt".format(output_fold, f, model_type, r),
                "wb") as fp:
            pickle.dump(final_weights, fp)
        print("PDGD fold{} {} run{} finished!".format(f, model_type, r))


if __name__ == "__main__":
    # click_models = ["informational", "navigational", "perfect"]
    click_models = ["perfect", "informational"]
    dataset_fold = "../datasets/MSLR10k"
    output_fold = "../results/exploration/PDGD/MSLR10K/random"

    # Keyword arguments shared by the train and test dataset loaders.
    # NOTE(review): binary_label=3 looks like a relevance threshold rather
    # than a boolean flag -- confirm against LetorDataset.
    dataset_options = dict(query_level_norm=True,
                           cache_root="../datasets/cache",
                           binary_label=3)

    # range(1, 2) runs fold 1 only; widen the range to cover more folds.
    for f in range(1, 2):
        training_path = "{}/Fold{}/train.txt".format(dataset_fold, f)
        test_path = "{}/Fold{}/test.txt".format(dataset_fold, f)
        train_set = LetorDataset(training_path, FEATURE_SIZE, **dataset_options)
        test_set = LetorDataset(test_path, FEATURE_SIZE, **dataset_options)

        # One worker process per click model, all sharing this fold's data.
        for click_model in click_models:
            worker = mp.Process(
                target=job,
                args=(click_model, f, train_set, test_set, output_fold))
            worker.start()