Пример #1
0
def task_seg_ml_sklearn_crossval():
  """
  Run the scikit-learn learners and the ``methods.task_rules`` test for task
  segmentation on the logs reported as "Sen et al." and "Hagen et al.".

  For each log, a copy built with the ``glove`` representation is loaded as
  sequential queries and passed to the shared ``methods.multiplelearners``
  object. A second copy built with the default constructor arguments is then
  loaded as sequential pairs and passed to ``methods.task_rules(...).test()``.
  """
  learners = methods.multiplelearners()

  # (label shown in the banner, dataset class). The log reported as
  # "Sen et al." is built with `datasets.lucchese_aol`.
  corpora = (
    ('Sen et al.', datasets.lucchese_aol),
    ('Hagen et al.', datasets.hagen_aol),
  )

  for label, make_dataset in corpora:
    print('\n\n--- Experiment with ' + label + ' dataset ')

    # Learners work on the embedded sequential queries.
    embedded = make_dataset(representation=datasets.representation().glove)
    embedded.load_sequential_queries()
    learners.run(embedded)

    # The rules test works on sequential pairs of a freshly built dataset.
    pairs = make_dataset()
    pairs.load_sequential_pair()
    methods.task_rules(dataset=pairs).test()
Пример #2
0
def task_ext(representation=datasets.representation(width=8).glove,
  save_dir='models/irdcs_model'):
  """
  Run the IRDCS deep-clustering experiments on two target query logs.

  Each target log (printed as "Sen et al." and "Volske et al. Trec ") goes
  through ``methods.dc_rnn`` three times: pretrained on Hagen et al. random
  pairs, pretrained on Hagen et al. sequential pairs, and with
  ``pretrain=False``.

  Args:
    representation: representation object handed to every dataset
      constructor.
    save_dir: value forwarded to ``methods.dc_rnn`` as ``save_dir``.
  """
  # Settings shared by every clustering run.
  loss_weight = 0.1
  cell_type = 'GRU'
  update_every, rate, batch_size, max_iterations = 1, 1e-5, 128, 150

  def cluster(pretrain_pairs, target, epochs, pretrain_rate, **extra):
    # One dc_rnn call; only the pretraining inputs and `extra` vary per run.
    methods.dc_rnn(pretrain_dataset=pretrain_pairs, dataset=target,
      save_dir=save_dir, rnn=deep_clustering.IRDCS,
      pretrain_epochs=epochs, batch_size=batch_size,
      maxiter=max_iterations, update_interval=update_every,
      learning_rate=rate, pretrain_lr=pretrain_rate, cell=cell_type,
      **extra)

  # Target datasets, paired with the label used in the banners.
  aol_log = datasets.sen_aol(representation=representation)
  aol_log.load_augmented(textdata=False)
  trec_log = datasets.volske_trek(representation=representation)
  trec_log.load_augmented_filter_user(textdata=False)
  targets = [('Sen et al.', aol_log), ('Volske et al. Trec ', trec_log)]

  for label, target in targets:
    print('\n\n--- Experiment RDC ' + label + ' dataset, sequence pretrain')
    hagen_pairs = datasets.hagen_aol(representation=representation)
    hagen_pairs.load_random_pair_dual()
    cluster(hagen_pairs, target, 10, 1e-5, pretrain=True)

    print('\n\n--- Experiment RDC ' + label + ' dataset, segmentation pretrain')
    hagen_pairs = datasets.hagen_aol(representation=representation)
    hagen_pairs.load_sequential_pair_dual()
    cluster(hagen_pairs, target, 30, 1e-4, pretrain=True)

    # The run without pretraining still receives the sequential-pair dataset
    # and its pretraining settings; it is the only run given lambda_loss.
    print('\n\n--- Experiment RDC ' + label + ' dataset, no pretrain')
    cluster(hagen_pairs, target, 30, 1e-4, pretrain=False,
      lambda_loss=loss_weight)
Пример #3
0
def task_seg_context():
  """
  Add context from adjacent queries to the query pair, using the Hagen et
  al., 2013 dataset and the BRNN architecture with GRU cells.

  For every m in 0..5 the model is cross-validated with the context
  arguments (m, m+1); for m > 0 it is cross-validated again with (m, 1).
  The pair is forwarded as ``m`` and ``n`` to ``load_sequential_queries``.
  """

  def crossval_with_context(m, n):
    # Build the dataset for this (m, n) setting and split it into folds.
    queries = datasets.hagen_aol(
      representation=datasets.representation().glove)
    queries.load_sequential_queries(m=m, n=n)
    queries.kfold()

    model = methods.brnn()
    model.MODEL_DIR = 'models/rnn_model_context'
    model.ITERATIONS = 60000
    model.CELL = 'GRU'
    model.BATCH_SIZE = 256

    # The same banner goes to stderr and stdout.
    banner = ''.join([
      '\n\n--- Experiment with ', str(model.HIDDEN_UNITS), model.CELL,
      '. Hagen et al. dataset. m,n= ', str(m), ',', str(n)])
    print(banner, file=sys.stderr)
    print(banner)

    model.crossval(queries)
    del queries

  for m in range(6):
    crossval_with_context(m, m + 1)
    # (0, 1) was already covered by the call above.
    if m > 0:
      crossval_with_context(m, 1)
Пример #4
0
def task_seg_crossval_brnn():
  """
  Cross-validate the bidirectional RNN for detecting session changes in the
  AOL datasets from (Sen et al., 2018) and (Hagen et al., 2013).

  Four runs are made in this order: LSTM on Hagen, LSTM on Sen, GRU on
  Hagen, GRU on Sen. Hagen runs use 60000 iterations; Sen runs use 20000
  iterations and pass ``train_eval_runs=1`` to ``crossval``.
  """
  m = 0

  # (banner label, dataset class, iterations, extra crossval keyword args).
  # The dataset labelled "Sen et al." is built with `datasets.lucchese_aol`.
  corpora = (
    ('Hagen et al.', datasets.hagen_aol, 60000, {}),
    ('Sen et al.', datasets.lucchese_aol, 20000, {'train_eval_runs': 1}),
  )

  for cell in ('LSTM', 'GRU'):
    for label, make_dataset, iterations, crossval_kwargs in corpora:
      queries = make_dataset(representation=datasets.representation().glove)
      queries.load_sequential_queries(m=m, n=m + 1)
      queries.kfold()

      model = methods.brnn()
      model.MODEL_DIR = 'models/rnn_model_transfer'
      model.ITERATIONS = iterations
      model.CELL = cell
      model.BATCH_SIZE = 256

      # The same banner goes to stderr and stdout.
      banner = ('\n\n--- Experiment with ' + str(model.HIDDEN_UNITS)
        + model.CELL + '. ' + label + ' dataset. m,n= '
        + str(m) + ',' + str(m + 1))
      print(banner, file=sys.stderr)
      print(banner)

      model.crossval(queries, **crossval_kwargs)
      del queries
Пример #5
0
def task_seg_transfer_learning():
  """
  Pretrain the BRNN on the Hagen et al., 2013 dataset, then fine tune it for
  Sen et al., 2018 session segmentation.

  The pretrain/fine-tune pair is executed twice, first with GRU cells and
  then with LSTM cells. Pretraining calls ``train_test`` with a 10% test
  split; fine tuning calls ``crossval`` with ``transfer=True`` and the
  pretraining directory.
  """
  model_dir = 'models/rnn_model'
  pretrain_model_dir = 'models/rnn_model_transfer_pretrain'
  pretrain_iterations = 40000
  finetune_iterations = pretrain_iterations + 20000

  def build_model(directory, iterations, cell):
    # Create a BRNN and apply the settings that differ between stages.
    network = methods.brnn()
    network.MODEL_DIR = directory
    network.ITERATIONS = iterations
    network.CELL = cell
    network.BATCH_SIZE = 256
    return network

  def announce(network, suffix):
    # The same banner goes to stderr and stdout.
    banner = ('\n\n--- Experiment with ' + str(network.HIDDEN_UNITS)
      + network.CELL + suffix)
    print(banner, file=sys.stderr)
    print(banner)

  # NOTE(review): both cell types reuse the same two model directories;
  # confirm that leftovers from the GRU pass cannot affect the LSTM pass.
  for cell in ('GRU', 'LSTM'):
    # Stage 1: pretraining on Hagen et al.
    hagen_queries = datasets.hagen_aol(
      representation=datasets.representation().glove)
    hagen_queries.load_sequential_queries()
    model = build_model(pretrain_model_dir, pretrain_iterations, cell)
    announce(model,
      '. Sen et al. dataset. Pretraining with Hagen et al., test set 10%')
    model.train_test(hagen_queries, test_size=0.1)
    del hagen_queries

    # Stage 2: cross-validated fine tuning on Sen et al.
    sen_queries = datasets.lucchese_aol(
      representation=datasets.representation().glove)
    sen_queries.load_sequential_queries()
    sen_queries.kfold()
    model = build_model(model_dir, finetune_iterations, cell)
    announce(model, '. Sen et al. dataset, crossval, fine tuning')
    model.crossval(sen_queries, transfer=True,
      pretrain_dir=pretrain_model_dir)
    del sen_queries