def task_seg_ml_sklearn_crossval():
    """
    Run scikit-learn ML methods for task segmentation on the
    Sen et al. and Hagen et al. datasets
    """
    classifiers = methods.multiplelearners()

    print('\n\n--- Experiment with Sen et al. dataset')
    sen_aol = datasets.lucchese_aol(
        representation=datasets.representation().glove)
    sen_aol.load_sequential_queries()
    classifiers.run(sen_aol)

    dataset = datasets.lucchese_aol()
    dataset.load_sequential_pair()
    methods.task_rules(dataset=dataset).test()

    print('\n\n--- Experiment with Hagen et al. dataset')
    hagen_aol = datasets.hagen_aol(
        representation=datasets.representation().glove)
    hagen_aol.load_sequential_queries()
    classifiers.run(hagen_aol)

    dataset = datasets.hagen_aol()
    dataset.load_sequential_pair()
    methods.task_rules(dataset=dataset).test()

def task_ext(representation=datasets.representation(width=8).glove,
             save_dir='models/irdcs_model'):
    """
    Run IRDCS deep clustering on the Sen et al. and Volske et al. TREC
    datasets, with sequence pretraining, segmentation pretraining,
    and no pretraining
    """
    lambda_loss = 0.1
    cell = 'GRU'
    ui, lr, batch, mi = 1, 1e-5, 128, 150

    # Datasets
    qdatasets = []
    qdataset = datasets.sen_aol(representation=representation)
    qdataset.load_augmented(textdata=False)
    qdatasets.append(qdataset)
    qdataset = datasets.volske_trek(representation=representation)
    qdataset.load_augmented_filter_user(textdata=False)
    qdatasets.append(qdataset)
    qdatasets_names = ['Sen et al.', 'Volske et al. Trec']

    # Clustering
    for sen_aol, ds_name in zip(qdatasets, qdatasets_names):
        print('\n\n--- Experiment RDC ' + ds_name
              + ' dataset, sequence pretrain')
        hagen_aol = datasets.hagen_aol(representation=representation)
        hagen_aol.load_random_pair_dual()
        pe = 10
        plr = 1e-5
        methods.dc_rnn(pretrain_dataset=hagen_aol, dataset=sen_aol,
                       save_dir=save_dir, rnn=deep_clustering.IRDCS,
                       pretrain_epochs=pe, batch_size=batch, maxiter=mi,
                       update_interval=ui, learning_rate=lr,
                       pretrain=True, pretrain_lr=plr, cell=cell)

        print('\n\n--- Experiment RDC ' + ds_name
              + ' dataset, segmentation pretrain')
        hagen_aol = datasets.hagen_aol(representation=representation)
        hagen_aol.load_sequential_pair_dual()
        pe = 30
        plr = 1e-4
        methods.dc_rnn(pretrain_dataset=hagen_aol, dataset=sen_aol,
                       save_dir=save_dir, rnn=deep_clustering.IRDCS,
                       pretrain_epochs=pe, batch_size=batch, maxiter=mi,
                       update_interval=ui, learning_rate=lr,
                       pretrain=True, pretrain_lr=plr, cell=cell)

        print('\n\n--- Experiment RDC ' + ds_name
              + ' dataset, no pretrain')
        methods.dc_rnn(pretrain_dataset=hagen_aol, dataset=sen_aol,
                       save_dir=save_dir, rnn=deep_clustering.IRDCS,
                       pretrain_epochs=pe, batch_size=batch, maxiter=mi,
                       update_interval=ui, learning_rate=lr,
                       pretrain=False, pretrain_lr=plr, cell=cell,
                       lambda_loss=lambda_loss)

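# The three `methods.dc_rnn` calls per dataset in `task_ext` differ only in
# the pretraining loader, the pretraining epochs/learning rate, and the
# `pretrain` flag. A minimal sketch of a helper that makes that variation
# explicit: the name `run_irdcs_variant` is hypothetical, it only forwards
# the `methods.dc_rnn` arguments already used above, and passing
# `lambda_loss` unconditionally (the original only passes it in the
# no-pretrain run) is an assumption.
def run_irdcs_variant(dataset, pretrain_dataset, save_dir, pretrain,
                      pretrain_epochs, pretrain_lr, cell='GRU',
                      batch_size=128, maxiter=150, update_interval=1,
                      learning_rate=1e-5, lambda_loss=0.1):
    """Run a single IRDCS deep-clustering configuration (sketch)."""
    methods.dc_rnn(pretrain_dataset=pretrain_dataset, dataset=dataset,
                   save_dir=save_dir, rnn=deep_clustering.IRDCS,
                   pretrain_epochs=pretrain_epochs, batch_size=batch_size,
                   maxiter=maxiter, update_interval=update_interval,
                   learning_rate=learning_rate, pretrain=pretrain,
                   pretrain_lr=pretrain_lr, cell=cell,
                   lambda_loss=lambda_loss)
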
def task_seg_context():
    """
    Use adjacent queries to add context to the query pair.
    Use the Hagen et al., 2013 dataset and the BRNN architecture
    with GRU cells
    """
    for m in range(0, 6):
        hagen_aol = datasets.hagen_aol(
            representation=datasets.representation().glove)
        hagen_aol.load_sequential_queries(m=m, n=m+1)
        hagen_aol.kfold()
        brnn = methods.brnn()
        brnn.MODEL_DIR = 'models/rnn_model_context'
        brnn.ITERATIONS = 60000
        brnn.CELL = 'GRU'
        brnn.BATCH_SIZE = 256
        description = '\n\n--- Experiment with ' + str(brnn.HIDDEN_UNITS) + \
            brnn.CELL + '. Hagen et al. dataset. m,n= ' + \
            str(m) + ',' + str(m+1)
        print(description, file=sys.stderr)
        print(description)
        brnn.crossval(hagen_aol)
        del hagen_aol

        if m == 0:
            continue

        hagen_aol = datasets.hagen_aol(
            representation=datasets.representation().glove)
        hagen_aol.load_sequential_queries(m=m, n=1)
        hagen_aol.kfold()
        brnn = methods.brnn()
        brnn.MODEL_DIR = 'models/rnn_model_context'
        brnn.ITERATIONS = 60000
        brnn.CELL = 'GRU'
        brnn.BATCH_SIZE = 256
        description = '\n\n--- Experiment with ' + str(brnn.HIDDEN_UNITS) + \
            brnn.CELL + '. Hagen et al. dataset. m,n= ' + \
            str(m) + ',' + str(1)
        print(description, file=sys.stderr)
        print(description)
        brnn.crossval(hagen_aol)
        del hagen_aol

def task_seg_crossval_brnn():
    """
    Run an experiment with the bidirectional RNN to detect session changes
    in the AOL dataset from (Sen et al., 2018) and (Hagen et al., 2013)
    """
    m = 0
    hagen_aol = datasets.hagen_aol(
        representation=datasets.representation().glove)
    hagen_aol.load_sequential_queries(m=m, n=m+1)
    hagen_aol.kfold()
    brnn = methods.brnn()
    brnn.MODEL_DIR = 'models/rnn_model_transfer'
    brnn.ITERATIONS = 60000
    brnn.CELL = 'LSTM'
    brnn.BATCH_SIZE = 256
    description = '\n\n--- Experiment with ' + str(brnn.HIDDEN_UNITS) + \
        brnn.CELL + '. Hagen et al. dataset. m,n= ' + str(m) + ',' + str(m+1)
    print(description, file=sys.stderr)
    print(description)
    brnn.crossval(hagen_aol)
    del hagen_aol

    m = 0
    sen_aol = datasets.lucchese_aol(
        representation=datasets.representation().glove)
    sen_aol.load_sequential_queries(m=m, n=m+1)
    sen_aol.kfold()
    brnn = methods.brnn()
    brnn.MODEL_DIR = 'models/rnn_model_transfer'
    brnn.ITERATIONS = 20000
    brnn.CELL = 'LSTM'
    brnn.BATCH_SIZE = 256
    description = '\n\n--- Experiment with ' + str(brnn.HIDDEN_UNITS) + \
        brnn.CELL + '. Sen et al. dataset. m,n= ' + str(m) + ',' + str(m+1)
    print(description, file=sys.stderr)
    print(description)
    brnn.crossval(sen_aol, train_eval_runs=1)
    del sen_aol

    m = 0
    hagen_aol = datasets.hagen_aol(
        representation=datasets.representation().glove)
    hagen_aol.load_sequential_queries(m=m, n=m+1)
    hagen_aol.kfold()
    brnn = methods.brnn()
    brnn.MODEL_DIR = 'models/rnn_model_transfer'
    brnn.ITERATIONS = 60000
    brnn.CELL = 'GRU'
    brnn.BATCH_SIZE = 256
    description = '\n\n--- Experiment with ' + str(brnn.HIDDEN_UNITS) + \
        brnn.CELL + '. Hagen et al. dataset. m,n= ' + str(m) + ',' + str(m+1)
    print(description, file=sys.stderr)
    print(description)
    brnn.crossval(hagen_aol)
    del hagen_aol

    m = 0
    sen_aol = datasets.lucchese_aol(
        representation=datasets.representation().glove)
    sen_aol.load_sequential_queries(m=m, n=m+1)
    sen_aol.kfold()
    brnn = methods.brnn()
    brnn.MODEL_DIR = 'models/rnn_model_transfer'
    brnn.ITERATIONS = 20000
    brnn.CELL = 'GRU'
    brnn.BATCH_SIZE = 256
    description = '\n\n--- Experiment with ' + str(brnn.HIDDEN_UNITS) + \
        brnn.CELL + '. Sen et al. dataset. m,n= ' + str(m) + ',' + str(m+1)
    print(description, file=sys.stderr)
    print(description)
    brnn.crossval(sen_aol, train_eval_runs=1)
    del sen_aol

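# The BRNN cross-validation runs in this module differ only in dataset,
# cell type, iteration budget, and model directory, so the shared setup
# could be factored out. A minimal sketch, assuming only the `methods.brnn`
# attributes already set above; the name `make_brnn` is hypothetical.
def make_brnn(model_dir, iterations, cell, batch_size=256):
    """Build a BRNN configured like the experiments in this module (sketch)."""
    brnn = methods.brnn()
    brnn.MODEL_DIR = model_dir      # checkpoint/output directory
    brnn.ITERATIONS = iterations    # training iterations
    brnn.CELL = cell                # 'GRU' or 'LSTM'
    brnn.BATCH_SIZE = batch_size
    return brnn
# Example: brnn = make_brnn('models/rnn_model_transfer', 60000, 'GRU')
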
def task_seg_transfer_learning():
    """
    Pretrain the BRNN on the Hagen et al., 2013 dataset. Then, fine-tune it
    for Sen et al., 2018 session segmentation
    """
    model_dir = 'models/rnn_model'
    pretrain_model_dir = 'models/rnn_model_transfer_pretrain'

    hagen_aol = datasets.hagen_aol(
        representation=datasets.representation().glove)
    hagen_aol.load_sequential_queries()
    brnn = methods.brnn()
    brnn.MODEL_DIR = pretrain_model_dir
    brnn.ITERATIONS = 40000
    brnn.CELL = 'GRU'
    brnn.BATCH_SIZE = 256
    description = '\n\n--- Experiment with ' + str(brnn.HIDDEN_UNITS) + \
        brnn.CELL + \
        '. Sen et al. dataset. Pretraining with Hagen et al., test set 10%'
    print(description, file=sys.stderr)
    print(description)
    brnn.train_test(hagen_aol, test_size=0.1)
    del hagen_aol

    sen_aol = datasets.lucchese_aol(
        representation=datasets.representation().glove)
    sen_aol.load_sequential_queries()
    sen_aol.kfold()
    brnn = methods.brnn()
    brnn.MODEL_DIR = model_dir
    brnn.ITERATIONS = 40000 + 20000
    brnn.CELL = 'GRU'
    brnn.BATCH_SIZE = 256
    description = '\n\n--- Experiment with ' + str(brnn.HIDDEN_UNITS) + \
        brnn.CELL + '. Sen et al. dataset, crossval, fine tuning'
    print(description, file=sys.stderr)
    print(description)
    brnn.crossval(sen_aol, transfer=True, pretrain_dir=pretrain_model_dir)
    del sen_aol

    hagen_aol = datasets.hagen_aol(
        representation=datasets.representation().glove)
    hagen_aol.load_sequential_queries()
    brnn = methods.brnn()
    brnn.MODEL_DIR = pretrain_model_dir
    brnn.ITERATIONS = 40000
    brnn.CELL = 'LSTM'
    brnn.BATCH_SIZE = 256
    description = '\n\n--- Experiment with ' + str(brnn.HIDDEN_UNITS) + \
        brnn.CELL + \
        '. Sen et al. dataset. Pretraining with Hagen et al., test set 10%'
    print(description, file=sys.stderr)
    print(description)
    brnn.train_test(hagen_aol, test_size=0.1)
    del hagen_aol

    sen_aol = datasets.lucchese_aol(
        representation=datasets.representation().glove)
    sen_aol.load_sequential_queries()
    sen_aol.kfold()
    brnn = methods.brnn()
    brnn.MODEL_DIR = model_dir
    brnn.ITERATIONS = 40000 + 20000
    brnn.CELL = 'LSTM'
    brnn.BATCH_SIZE = 256
    description = '\n\n--- Experiment with ' + str(brnn.HIDDEN_UNITS) + \
        brnn.CELL + '. Sen et al. dataset, crossval, fine tuning'
    print(description, file=sys.stderr)
    print(description)
    brnn.crossval(sen_aol, transfer=True, pretrain_dir=pretrain_model_dir)
    del sen_aol