# Script: fit an LSTMv2 click model on a click log and print its MSE on a
# random sample of a second click log, scored against a DCTR simulator.
from dataset import LetorDataset
import numpy as np
from clickModel.LSTMv2 import LSTMv2
from utils import read_file as rf
from clickModel.DCTR import DCTR

# NOTE(review): the variable is called "train" but the path points at
# test_set.txt -- confirm this is intentional.
train_path = "../datasets/ltrc_yahoo/test_set.txt"
print("loading training set.......")
# 700 is presumably the feature dimensionality -- TODO confirm against LetorDataset.
train_set = LetorDataset(train_path, 700)

# Click log used for fitting, and a separate one used for evaluation.
click_log_path = "../datasets/ltrc_yahoo/test_click_log.txt"
test_click_log_path = "../datasets/ltrc_yahoo/test_click_log_test.txt"
click_log = rf.read_click_log(click_log_path)
test_click_log = rf.read_click_log(test_click_log_path)

# Simulator parameters; only `pc` is passed to the DCTR simulator below.
pc = [0.05, 0.3, 0.5, 0.7, 0.95]
ps = [0.2, 0.3, 0.5, 0.7, 0.9]
simulator = DCTR(pc)

print(click_log.shape)
print(test_click_log.shape)
#
click_model = LSTMv2(700, 1024, train_set)
click_model.train(click_log)

# Draw 100 row indices (with replacement, numpy's default) from the
# evaluation log and report the MSE on those rows.
sample_rows = np.random.choice(test_click_log.shape[0], 100)
sample_mse = click_model.get_MSE(test_click_log[sample_rows], train_set, simulator)
print(sample_mse)
if __name__ == "__main__":
    # Entry point: load the train/test feature sets once, then launch one
    # worker process per (run, click simulator) pair; every worker calls
    # generate_dataset(train_set, test_set, simulator, output_dir, run_id).
    # %%
    train_path = "../datasets/ltrc_yahoo/set1.train.txt"
    test_path = "../datasets/ltrc_yahoo/set1.test.txt"
    print("loading training set.......")
    # 700 is presumably the feature dimensionality -- TODO confirm against LetorDataset.
    train_set = LetorDataset(train_path, 700)
    print("loading testing set.......")
    test_set = LetorDataset(test_path, 700)
    # %%
    # Alternative parameter set, kept for reference:
    # pc = [0.4, 0.6, 0.7, 0.8, 0.9]
    # ps = [0.1, 0.2, 0.3, 0.4, 0.5]
    pc = [0.05, 0.3, 0.5, 0.7, 0.95]
    ps = [0.2, 0.3, 0.5, 0.7, 0.9]

    workers = []
    # `run_id` (previously `id`) no longer shadows the builtin id().
    for run_id in range(1, 16):
        # Fresh simulator instances per run, constructed in the original order.
        jobs = [
            (DCTR(pc), "../feature_click_datasets/DCTR/"),
            (CM(pc), "../feature_click_datasets/CM/"),
            (SDBN(pc, ps), "../feature_click_datasets/SDBN/"),
            (SDCM(pc), "../feature_click_datasets/SDCM/"),
        ]
        batch = [
            mp.Process(
                target=generate_dataset,
                args=(train_set, test_set, simulator, output_dir, run_id),
            )
            for simulator, output_dir in jobs
        ]
        # All four processes of a run are created before any is started,
        # matching the original start order.
        for worker in batch:
            worker.start()
        workers.extend(batch)

    # Explicitly wait for every worker instead of relying on the implicit
    # join of non-daemonic children at interpreter exit.
    for worker in workers:
        worker.join()
# %%
if __name__ == "__main__":
    # Entry point: load the training feature set, then for every
    # (simulator name, simulator) pair read the generated click logs and
    # query frequencies and build the list of click models.
    # %%
    train_path = "../datasets/ltrc_yahoo/set1.train.txt"
    test_path = "../datasets/ltrc_yahoo/set1.test.txt"
    print("loading training set.......")
    # 700 is presumably the feature dimensionality -- TODO confirm against LetorDataset.
    train_set = LetorDataset(train_path, 700)
    # %%
    # print("loading testing set.......")
    # test_set = LetorDataset(test_path, 700)
    pc = [0.05, 0.3, 0.5, 0.7, 0.95]
    ps = [0.2, 0.3, 0.5, 0.7, 0.9]

    # Directory name under ../feature_click_datasets/ paired with the
    # simulator that is configured with the parameters above.
    datasets_simulator = [
        ('SDBN', SDBN(pc, ps)),
        ('SDCM', SDCM(pc)),
        ('CM', CM(pc)),
        ('DCTR', DCTR(pc)),
    ]
    # datasets = ['CM']
    for dataset, simulator in datasets_simulator:
        # range(1, 2) visits only run number 1.
        for id in range(1, 2):
            click_log_path = "../feature_click_datasets/{}/train_set{}.txt".format(
                dataset, id)
            test_click_log_path = "../feature_click_datasets/{}/seen_set{}.txt".format(
                dataset, id)
            query_frequency_path = "../feature_click_datasets/{}/query_frequency{}.txt".format(
                dataset, id)

            click_log = rf.read_click_log(click_log_path)
            test_click_log = rf.read_click_log(test_click_log_path)
            query_frequency = rf.read_query_frequency(query_frequency_path)

            # Click models constructed with their default arguments.
            # NOTE(review): nesting level of this statement is inferred -- confirm.
            click_models = [SDBN(), SDCM(), CM(), DCTR()]
# %%
if __name__ == "__main__":
    # Entry point: unpickle the training feature set, then walk the click
    # logs of runs 2..15 for every enabled simulator.
    # %%
    train_path = "../datasets/ltrc_yahoo/set1.LetorDataset.txt"
    print("loading training set.......")
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use this with a dataset file that was produced locally and is trusted.
    with open(train_path, "rb") as fp:
        train_set = pickle.load(fp)
    # %%
    pc = [0.05, 0.3, 0.5, 0.7, 0.95]
    ps = [0.2, 0.3, 0.5, 0.7, 0.9]

    # Directory name under ../click_logs/ paired with its simulator; the
    # commented entries are currently disabled.
    datasets_simulator = [
        ('SDBN', SDBN(pc, ps)),
        # ('SDCM', SDCM(pc)),
        # ('CM', CM(pc)),
        ('DCTR', DCTR(pc)),
        ('UBM', UBM(pc)),
        # ('SDBN_reverse', SDBN_reverse(pc, ps))
    ]
    progress = 0
    for dataset, simulator in datasets_simulator:
        # Run numbers 2 through 15 (the upper bound 16 is exclusive).
        for id in range(2, 16):
            click_log_path = "../click_logs/{}/train_set{}.txt".format(
                dataset, id)
            test_click_log_path = "../click_logs/{}/seen_set{}.txt".format(
                dataset, id)
            query_frequency_path = "../click_logs/{}/query_frequency{}.txt".format(
                dataset, id)

            click_log = rf.read_click_log(click_log_path)
            test_click_log = rf.read_click_log(test_click_log_path)
# Train and evaluate an LSTMv2 click model on click logs produced by the
# "Mixed" generator, then save the underlying model to disk.
generator = "Mixed"
click_log_path = "../feature_click_datasets/{}/train_set1.txt".format(generator)
test_click_log_path = "../feature_click_datasets/{}/seen_set1.txt".format(generator)
click_log = rf.read_click_log(click_log_path)
test_click_log = rf.read_click_log(test_click_log_path)
#
# dataset = tf.data.TFRecordDataset(filenames='../feature_click_datasets/{}/train_set1.tfrecord'.format(generator))
#
# # test_dataset = tf.data.TFRecordDataset(filenames='../feature_click_datasets/SDBN/seen_set1.tfrecord')
#
# #%%
pc = [0.05, 0.3, 0.5, 0.7, 0.95]
ps = [0.2, 0.3, 0.5, 0.7, 0.9]

# The Mixed simulator wraps the five individual click simulators below.
Mixed_models = [DCTR(pc), CM(pc), SDBN(pc, ps), SDCM(pc), UBM(pc)]
simulator = Mixed(Mixed_models)

print(click_log.shape)
print(test_click_log.shape)
#
# NOTE(review): `train_set` is not defined in the visible part of this
# script; it is presumably created earlier in the file -- confirm.
click_model = LSTMv2(700, 1024, train_set, batch_size=128, epoch=5)

# MSE before training, on 1000 rows sampled (with replacement, numpy's
# default) from the evaluation log.
rows_before = np.random.choice(test_click_log.shape[0], 1000)
print(click_model.get_MSE(test_click_log[rows_before], train_set, simulator))

# NOTE(review): the only visible assignment to `dataset` is commented out
# above; unless it is defined elsewhere this call raises NameError -- confirm.
click_model.train(dataset)

# MSE after training, on a freshly drawn sample of 1000 rows.
rows_after = np.random.choice(test_click_log.shape[0], 1000)
print(click_model.get_MSE(test_click_log[rows_after], train_set, simulator))

click_model.model.save("../click_model_results/LSTM_models/{}_train_set1.h5".format(generator))
# test model
for freq in frequencies: perplexities = click_model.get_perplexity(np.array(test_logs[freq])) MSEs = click_model.get_MSE(np.array(test_logs[freq]), dataset, simulator) perplexity_line = "Frequency " + freq + " perplexities:" MSEs_line = "Frequency " + freq + " MSE:" for perp in perplexities: perplexity_line += " " + str(perp) for MSE in MSEs: MSEs_line += " " + str(MSE) f.write(perplexity_line + "\n") f.write(MSEs_line + "\n") f.close() if __name__ == "__main__": pc = [0.05, 0.3, 0.5, 0.7, 0.95] ps = [0.2, 0.3, 0.5, 0.7, 0.9] Mixed_models = [DCTR(pc), SDBN(pc, ps), UBM(pc)] # simulators = [SDBN(pc, ps), Mixed(Mixed_models), DCTR(pc), UBM(pc)] simulators = [SDBN(pc, ps), DCTR(pc), UBM(pc)] dataset_path = "../datasets/ltrc_yahoo/set1.train.txt" print("loading training set.......") dataset = LetorDataset(dataset_path, 700) for r in range(1, 2): for simulator in simulators: run(simulator, dataset, r)
# Script: train a single RCTR click model on the training click logs of
# every listed simulator, runs 1..15.
from clickModel.RCTR import RCTR
from clickModel.Mixed import Mixed
from utils import read_file as rf
from utils import utility
from dataset import LetorDataset
# import matplotlib.pyplot as plt
import numpy as np
import multiprocessing as mp

train_path = "../datasets/ltrc_yahoo/set1.train.txt"
print("loading training set.......")
# 700 is presumably the feature dimensionality -- TODO confirm against LetorDataset.
train_set = LetorDataset(train_path, 700)

pc = [0.05, 0.3, 0.5, 0.7, 0.95]
ps = [0.2, 0.3, 0.5, 0.7, 0.9]

# NOTE(review): DCTR, SDBN, UBM and SDBN_reverse are not imported in the
# visible lines; presumably imported above this chunk -- confirm.
mixed_models = [DCTR(pc), SDBN(pc, ps), UBM(pc)]

# Directory name under ../click_logs/ paired with its simulator; the
# commented entries are currently disabled.
datasets_simulator = [
    ('SDBN', SDBN(pc, ps)),
    # ('SDCM', SDCM(pc)),
    # ('CM', CM(pc)),
    ('DCTR', DCTR(pc)),
    ('UBM', UBM(pc)),
    ('SDBN_reverse', SDBN_reverse(pc, ps)),
]

# One model instance receives every click log in turn.
click_model = RCTR()
for dataset, simulator in datasets_simulator:
    # Run numbers 1 through 15 (the upper bound 16 is exclusive).
    for id in range(1, 16):
        click_log_path = "../click_logs/{}/train_set{}.txt".format(dataset, id)
        click_log = rf.read_click_log(click_log_path)
        click_model.train(click_log)