Example No. 1
def build_model(model_opt, opt, fields, checkpoint):
    """ Build the Model """
    logger.info('Building model...')
    model = build_base_model(model_opt, fields,
                             use_gpu(opt), checkpoint)
    logger.info(model)
    return model
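The use_gpu(opt) helper called above is not shown in these examples. A minimal sketch of a compatible helper, assuming opt may carry a gpu_ranks list (training) and/or a gpu index (translation); both attribute names are assumptions for this sketch:

def use_gpu(opt):
    """Return True if the options indicate that at least one GPU should be used.

    Assumes opt may expose 'gpu_ranks' (a list of GPU ranks) or 'gpu'
    (a single device index); both names are assumptions in this sketch.
    """
    return (hasattr(opt, 'gpu_ranks') and len(opt.gpu_ranks) > 0) or \
           (hasattr(opt, 'gpu') and opt.gpu > -1)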
Example No. 2
def load_test_model(opt, dummy_opt):
    shared_fields = None
    shared_model_opt = None
    models = []
    for model_path in opt.models:
        checkpoint = torch.load(model_path,
                                map_location=lambda storage, loc: storage)
        fields = load_fields_from_vocab(checkpoint['vocab'])

        model_opt = checkpoint['opt']

        for arg in dummy_opt:
            if arg not in model_opt:
                model_opt.__dict__[arg] = dummy_opt[arg]

        model = build_base_model(model_opt, fields, use_gpu(opt), checkpoint)
        model.eval()
        model.generator.eval()
        if shared_fields is None:
            shared_fields = fields
        if shared_model_opt is None:
            shared_model_opt = model_opt
        models.append(model)
    ensemble_model = EnsembleModel(models)
    return shared_fields, ensemble_model
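The inner loop backfills options that are missing from the checkpoint's stored Namespace using defaults from dummy_opt. The same pattern can be exercised on its own; a minimal, self-contained sketch (the option names below are made up for illustration):

from argparse import Namespace

# Pretend this Namespace came out of an old checkpoint that predates newer options.
model_opt = Namespace(rnn_size=500, layers=2)

# Defaults for options the old checkpoint does not know about (names are illustrative).
dummy_opt = {'copy_attn': False, 'label_smoothing': 0.0}

for arg in dummy_opt:
    if arg not in model_opt:
        model_opt.__dict__[arg] = dummy_opt[arg]

print(model_opt.copy_attn)   # False, filled in from dummy_opt
print(model_opt.rnn_size)    # 500, untouched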
Example No. 3
def build_loss_compute(model, tgt_vocab, opt, train=True):
    """
  Returns a LossCompute subclass which wraps around an nn.Module subclass
  (such as nn.NLLLoss) which defines the loss criterion. The LossCompute
  object allows this loss to be computed in shards and passes the relevant
  data to a Statistics object which handles training/validation logging.
  Currently, the NMTLossCompute class handles all loss computation except
  for when using a copy mechanism. Despite their name, LossCompute objects
  do not merely compute the loss but also perform the backward pass inside
  their sharded_compute_loss method.
  """
    device = torch.device("cuda" if use_gpu(opt) else "cpu")

    padding_idx = tgt_vocab.stoi[Constants.PAD_WORD]
    if opt.label_smoothing > 0 and train:
        criterion = LabelSmoothingLoss(opt.label_smoothing,
                                       len(tgt_vocab),
                                       ignore_index=padding_idx)
    else:
        criterion = nn.NLLLoss(ignore_index=padding_idx, reduction='sum')

    # if the loss function operates on vectors of raw logits instead of
    # probabilities, only the first part of the generator needs to be
    # passed to the NMTLossCompute. At the moment, the only supported
    # loss function of this kind is the sparsemax loss.
    loss_gen = model.generator
    compute = NMTLossCompute(criterion, loss_gen)
    compute.to(device)

    return compute
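The criterion built above expects log-probabilities and skips padding positions via ignore_index. A small, self-contained sketch of the same nn.NLLLoss configuration on toy data (the vocabulary size and padding index are made up):

import torch
import torch.nn as nn

padding_idx = 0        # assumed padding id for this sketch
vocab_size = 5

criterion = nn.NLLLoss(ignore_index=padding_idx, reduction='sum')

# Two real target tokens plus one padding token; log-probabilities over the vocab.
log_probs = torch.log_softmax(torch.randn(3, vocab_size), dim=-1)
targets = torch.tensor([2, 4, padding_idx])

loss = criterion(log_probs, targets)  # the padding position contributes nothing
print(loss.item())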
Example No. 4
def load_test_model(opt, dummy_opt, model_path=None):
    if model_path is None:
        model_path = opt.models[0]
    checkpoint = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
    fields = load_fields_from_vocab(checkpoint['vocab'])

    model_opt = checkpoint['opt']

    for arg in dummy_opt:
        if arg not in model_opt:
            model_opt.__dict__[arg] = dummy_opt[arg]
    model = build_base_model(model_opt, fields, use_gpu(opt), checkpoint)
    model.eval()
    model.generator.eval()
    return fields, model
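The map_location=lambda storage, loc: storage argument maps every tensor in the checkpoint onto the CPU, so a GPU-trained checkpoint can be opened on a CPU-only machine. A minimal, self-contained sketch of that loading pattern (the checkpoint contents are made up):

import os
import tempfile
import torch

# Write a toy checkpoint to disk so the example is self-contained.
ckpt = {'opt': {'rnn_size': 500}, 'model': {'weight': torch.randn(2, 2)}}
path = os.path.join(tempfile.mkdtemp(), 'toy_model.pt')
torch.save(ckpt, path)

# Load it back, mapping every storage to the CPU regardless of where it was saved.
checkpoint = torch.load(path, map_location=lambda storage, loc: storage)
print(checkpoint['model']['weight'].device)  # cpu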
Example No. 5
def build_optim(model, opt, checkpoint):
    """ Build optimizer """
    saved_optimizer_state_dict = None

    if opt.train_from and opt.reset_optim != 'all':
        optim = checkpoint['optim']
        # We need to save a copy of optim.optimizer.state_dict() so that we
        # can restore the optimizer state later, in Stage 2 of this method,
        # because the call to optim.set_parameters() below overwrites
        # optim.optimizer, and with it the values stored in
        # optim.optimizer.state_dict()
        if opt.reset_optim != 'states':
            saved_optimizer_state_dict = optim.optimizer.state_dict()
            if opt.reset_optim == 'keep_states':
                optim.method = opt.optim
                optim.learning_rate = opt.learning_rate
                optim.original_lr = opt.learning_rate
                optim.max_grad_norm = opt.max_grad_norm
                optim.lr_decay = opt.learning_rate_decay
                optim.start_decay_steps = opt.start_decay_steps
                optim.decay_steps = opt.decay_steps
                optim.betas = [opt.adam_beta1, opt.adam_beta2]
                optim.adagrad_accum = opt.adagrad_accumulator_init
                optim.decay_method = opt.decay_method
                optim.warmup_steps = opt.warmup_steps
                optim.model_size = opt.dec_rnn_size
    else:
        optim = Optimizer(opt.optim,
                          opt.learning_rate,
                          opt.max_grad_norm,
                          lr_decay=opt.learning_rate_decay,
                          start_decay_steps=opt.start_decay_steps,
                          decay_steps=opt.decay_steps,
                          beta1=opt.adam_beta1,
                          beta2=opt.adam_beta2,
                          adagrad_accum=opt.adagrad_accumulator_init,
                          decay_method=opt.decay_method,
                          warmup_steps=opt.warmup_steps,
                          model_size=opt.dec_rnn_size)

    # Stage 1:
    # Essentially, optim.set_parameters() (re-)creates an optimizer using
    # model.named_parameters() as the parameters that will be stored in the
    # optim.optimizer.param_groups field of the torch optimizer class.
    # Importantly, this method does not yet load the optimizer state: it
    # builds a new optimizer with an empty state and the parameters taken
    # from the model.
    optim.set_parameters(model.named_parameters())

    if opt.train_from and (opt.reset_optim in ['none', 'keep_states']):
        # Stage 2: This stage is only performed when loading an optimizer
        # from a checkpoint. We load saved_optimizer_state_dict into the
        # re-created optimizer, setting the optim.optimizer.state field,
        # which was previously empty.
        # See also: https://github.com/pytorch/pytorch/issues/2830
        optim.optimizer.load_state_dict(saved_optimizer_state_dict)
        # Convert back the state values to cuda type if applicable
        if use_gpu(opt):
            for state in optim.optimizer.state.values():
                for k, v in state.items():
                    if torch.is_tensor(v):
                        state[k] = v.cuda()

        # We want to make sure that indeed we have a non-empty optimizer state
        # when we loaded an existing model. This should be at least the case
        # for Adam, which saves "exp_avg" and "exp_avg_sq" state
        # (Exponential moving average of gradient and squared gradient values)
        if (optim.method == 'adam') and (len(optim.optimizer.state) < 1):
            raise RuntimeError(
                "Error: loaded Adam optimizer from existing model" +
                " but optimizer state is empty")

    return optim
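The two-stage pattern above (rebuild the optimizer, then restore its state and move the state tensors back to the GPU) can be reproduced with a plain torch optimizer. A minimal, self-contained sketch, using a toy nn.Linear model in place of the real NMT model:

import torch
import torch.nn as nn

# Toy model and optimizer; take one step so the Adam state is non-empty.
model = nn.Linear(4, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
model(torch.randn(8, 4)).sum().backward()
optimizer.step()
saved_state = optimizer.state_dict()   # would normally come from a checkpoint

# Stage 1: re-create the optimizer; its state starts out empty.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Stage 2: restore the saved state into the fresh optimizer.
optimizer.load_state_dict(saved_state)

# Move restored state tensors to the GPU if one is available,
# mirroring the cuda conversion loop in build_optim above.
if torch.cuda.is_available():
    for state in optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.cuda()

assert len(optimizer.state) > 0        # e.g. Adam's exp_avg / exp_avg_sq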