Example #1
    def load_pretrained_model(self, model_path=None):
        """
        Loads pretrained model weights into the current model.

        Args:
            model_path: optional string containing the path to load the model
                from. Default is generated from the config yaml file.
        """
        if model_path is None:
            model_path = "{}/model.params".format(
                utils.get_lm_path_of_args(self.args))
        tqdm.write("Loading model from {}".format(model_path))

        state_dict = torch.load(model_path, map_location=self.args['device'])
        self.load_state_dict(state_dict)
        self.to(self.args['device'])
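For orientation, here is a hedged usage sketch of the method above. It assumes the method lives on a model class (an nn.Module) that stores the parsed config in self.args; the class name ExampleModel and the checkpoint path below are illustrative placeholders, not taken from the source.

# Hypothetical usage of load_pretrained_model; class name and paths are placeholders.
model = ExampleModel(args)                               # args: dict parsed from the config yaml
model.load_pretrained_model()                            # default path via utils.get_lm_path_of_args(args)
model.load_pretrained_model("runs/exp1/model.params")    # or an explicit checkpoint path
model.eval()                                             # weights are loaded and moved to args['device']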
Example #2
def main():
    argp = ArgumentParser()
    argp.add_argument('config')
    argp.add_argument('--train', action="store_true")
    argp.add_argument('--train-seq2seq', action="store_true")
    argp.add_argument('--train-truncation', action="store_true")
    argp_args = argp.parse_args()
    args = yaml.safe_load(open(argp_args.config))
    args['train-seq2seq'] = argp_args.train_seq2seq
    args['train-truncation'] = argp_args.train_truncation
    args['train'] = (argp_args.train or argp_args.train_truncation
                     or argp_args.train_seq2seq)
    args['device'] = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu")

    # set random seed
    torch.manual_seed(args['lm']['seed'])
    np.random.seed(args['lm']['seed'])

    # prepare results and model directories
    output_dir = utils.get_results_dir_of_args(args)
    tqdm.write("Writing results to {}".format(output_dir))
    os.makedirs(output_dir, exist_ok=True)
    copyfile(argp_args.config, "{}/{}".format(output_dir, "config.yaml"))

    # seq2seq
    model_dir = utils.get_lm_path_of_args(args)
    if not model_dir.endswith(".params"):
        os.makedirs(model_dir, exist_ok=True)

    # Search for dataset
    dataset = dataset_lookup[args['data']['dataset_type']](args)

    # Run whichever experiment is needed
    if args['lm']['lm_type'] == 'rnnlm':
        run_lm(args, dataset)
    elif args['lm']['lm_type'] in seq2seq_lookup:
        # truncation
        model_dir = utils.get_trunc_model_path_of_args(args)
        if not model_dir.endswith(".params"):
            os.makedirs(model_dir, exist_ok=True)
        run_seq2seq(args, dataset)
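As a rough guide to the config this script expects, the keys that main() itself reads are sketched below. The values are placeholders, and the utils.*_of_args helpers, dataset constructors, and training code presumably require further keys not visible in this snippet.

# Illustrative minimal structure for the yaml config consumed above; values are placeholders.
args = {
    'lm': {
        'seed': 0,                 # seeds torch.manual_seed and np.random.seed
        'lm_type': 'rnnlm',        # 'rnnlm' -> run_lm; a key of seq2seq_lookup -> run_seq2seq
    },
    'data': {
        'dataset_type': 'dyckmk',  # must be a key of dataset_lookup
    },
}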
Example #3
    argp = ArgumentParser()
    argp.add_argument('config')
    args = argp.parse_args()
    args = yaml.safe_load(open(args.config))

    # Determine whether CUDA is available
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    args['device'] = device

    # Construct the language model and dataset objects
    dataset = Dataset(args)
    input_size = args['lm']['embedding_dim']
    hidden_size = args['lm']['hidden_dim']
    recurrent_model = rnn.PytorchRecurrentModel(args, input_size, hidden_size,
                                                args['lm']['num_layers'])
    lm_model = lm.TraditionalLanguageModel(args, recurrent_model)

    # Prepare to write results
    output_dir = utils.get_results_dir_of_args(args)
    tqdm.write('Writing results to {}'.format(output_dir))
    os.makedirs(output_dir, exist_ok=True)

    # Train and load most recent parameters
    train(args, lm_model, dataset.get_train_dataloader(),
          dataset.get_dev_dataloader())
    lm_model.load_state_dict(torch.load(utils.get_lm_path_of_args(args)))

    # Evaluate language model
    reporter.run_evals(args, lm_model, dataset, 'dev')
    reporter.run_evals(args, lm_model, dataset, 'test')
def train(args, lm, train_batches, dev_batches):
  """Trains the language model with Adam,

  Uses a learning rate annealing-on-plateau scheme,
  early stopping after 3 consecutive epochs bearing no improvement.

  Arguments:
    lm: a LanguageModel object
    train_batches: PyTorch DataLoader of training data from Dataset
    dev_batches: PyTorch DataLoader of dev data from Dataset
  """
  lm_params_path = utils.get_lm_path_of_args(args)
  optimizer = optim.Adam(lm.parameters(), args['training']['learning_rate'])
  scheduler_patience = 0
  max_epochs = args['training']['max_epochs']
  scheduler = optim.lr_scheduler.ReduceLROnPlateau(
      optimizer, mode='min', factor=0.5, patience=scheduler_patience)
  steps_between_evals = len(train_batches)
  min_dev_loss = sys.maxsize
  min_dev_loss_epoch = -1
  loss = nn.CrossEntropyLoss()
  torch.save(lm.state_dict(), lm_params_path)
  total_gradient_steps = 0
  for epoch_index in tqdm(range(max_epochs), desc='[training]'):
    epoch_train_loss = 0
    epoch_dev_loss = 0
    train_batch_count = 0
    for observation_batch, label_batch, _ in tqdm(train_batches):
      # Compute forward, backward, and take gradient step
      train_batch_count += 1
      lm.train()
      batch_size, seq_len = label_batch.size()
      logits, _ = lm(observation_batch)
      logits = logits.view(batch_size*seq_len, -1)
      label_batch = label_batch.view(batch_size*seq_len)
      batch_loss = loss(logits, label_batch)
      batch_loss.backward()
      optimizer.step()
      epoch_train_loss += batch_loss.detach().cpu().numpy()
      optimizer.zero_grad()
      total_gradient_steps += 1
      # Determine whether it's time to evaluate on dev data
      if total_gradient_steps % steps_between_evals == 0 and total_gradient_steps > 1:
        dev_batch_count = 0
        # Compute dev loss
        for observation_batch, label_batch, _ in tqdm(dev_batches, desc='[dev batch]', smoothing=0.01):
          dev_batch_count += 1
          optimizer.zero_grad()
          lm.eval()
          batch_size, seq_len = label_batch.size()
          logits, _ = lm(observation_batch)
          logits = logits.view(batch_size*seq_len, -1)
          label_batch = label_batch.view(batch_size*seq_len)
          batch_loss = loss(logits, label_batch)
          epoch_dev_loss += batch_loss.detach().cpu().numpy()
        scheduler.step(epoch_dev_loss)
        epoch_dev_loss = epoch_dev_loss / dev_batch_count
        epoch_train_loss = epoch_train_loss / train_batch_count
        tqdm.write('[epoch {}] Train loss: {}, Dev loss: {}'.format(
            epoch_index, epoch_train_loss, epoch_dev_loss))
        # If new best dev loss, save parameters.
        if epoch_dev_loss < min_dev_loss - 0.0001:
          torch.save(lm.state_dict(), lm_params_path)
          min_dev_loss = epoch_dev_loss
          min_dev_loss_epoch = epoch_index
          tqdm.write('Saving lm parameters')
        elif min_dev_loss_epoch < epoch_index - 2:
          tqdm.write('Early stopping')
          tqdm.write("Min dev loss: {}".format(math.pow(min_dev_loss,2)))
          return
    tqdm.write("Min dev loss: {}".format(math.pow(min_dev_loss,2)))
Example #5
def train(args, lm, data, batches_fn):
    """
    Trains given seq2seq model and saves it to disk.

    The Adam optimizer and cross-entropy loss are used.

    Args:
        args (dict): arguments from yaml config
        lm (models.Seq2Seq): model to train
        data (data.Seq2SeqDataset): contains the training and dev data
        batches_fn (callable): runs one pass over a dataloader and returns
            a (summed loss, batch count) tuple
    """
    # Set up
    train_batches = data.get_train_dataloader(shuffle=True)
    dev_batches = data.get_dev_dataloader(shuffle=False)
    min_dev_ppl = float('inf')
    min_dev_loss_epoch = -1
    params_file = "{}/model.params".format(utils.get_lm_path_of_args(args))
    max_epochs = 100000

    # create optimizer
    optimizer = torch.optim.Adam(lm.parameters(), lr=args['lm']['lr'])
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode='min',
                                                           factor=0.1,
                                                           patience=0)

    # create loss function
    if args['data']['dataset_type'] == "dyckmk":
        pad_idx = data.vocab.word2idx['<pad>']
    else:
        pad_idx = data.target_vocab.word2idx['<pad>']
    criterion = nn.CrossEntropyLoss(
        ignore_index=pad_idx)  # have to fill with params

    # calculate and report initial dev loss (ppl should be ~10)
    epoch_train_loss, epoch_train_loss_count = (0, 1)
    with torch.no_grad():
        epoch_dev_loss, epoch_dev_loss_count = batches_fn(
            dev_batches, lm, optimizer, criterion, "dev")
    train_ppl = np.exp(epoch_train_loss / epoch_train_loss_count)
    dev_ppl = np.exp(epoch_dev_loss / epoch_dev_loss_count)
    tqdm.write("[Epoch: {}] Train ppl: {}, Dev ppl: {}".format(
        -1, train_ppl, dev_ppl))

    for epoch_idx in tqdm(range(max_epochs), desc='[training seq2seq]'):
        # run training and development epochs
        epoch_train_loss, epoch_train_loss_count = batches_fn(
            train_batches, lm, optimizer, criterion, "train")
        with torch.no_grad():
            epoch_dev_loss, epoch_dev_loss_count = batches_fn(
                dev_batches, lm, optimizer, criterion, "dev")

        # report training progress, update best model if needed
        train_ppl = np.exp(epoch_train_loss / epoch_train_loss_count)
        dev_ppl = np.exp(epoch_dev_loss / epoch_dev_loss_count)
        tqdm.write(
            "[Epoch: {}] Train ppl: {:.8f}, Dev ppl: {:.8f} ntrain:{}".format(
                epoch_idx, train_ppl, dev_ppl, epoch_train_loss_count))

        if dev_ppl < min_dev_ppl - 1e-4:
            min_dev_ppl = dev_ppl
            min_dev_loss_epoch = epoch_idx
            torch.save(lm.state_dict(), params_file)
            tqdm.write("Saving lm parameters")

        scheduler.step(epoch_dev_loss)

        # early stopping
        if min_dev_loss_epoch < epoch_idx - 5:
            tqdm.write("Early stopping")
            tqdm.write("Min dev ppl: {}".format(min_dev_ppl))
            return

    tqdm.write("Min dev ppl: {}".format(min_dev_ppl))