def build_model(model_opt, opt, fields, checkpoint):
    """
    Construct the model and log its structure.

    Delegates the actual construction to ``build_base_model`` and emits
    two INFO log records: one before building and one containing the
    resulting model object.

    Args:
        model_opt: model options forwarded to ``build_base_model``.
        opt: run options; only used to decide GPU usage via ``use_gpu``.
        fields: fields object forwarded to ``build_base_model``.
        checkpoint: checkpoint object forwarded to ``build_base_model``.

    Returns:
        Whatever ``build_base_model`` returns.
    """
    logger.info('Building model...')
    on_gpu = use_gpu(opt)
    built = build_base_model(model_opt, fields, on_gpu, checkpoint)
    logger.info(built)
    return built
def load_test_model(opt, dummy_opt):
    """
    Load every checkpoint listed in ``opt.models`` and wrap them in an
    ``EnsembleModel``.

    For each path the checkpoint is loaded, its vocabulary is turned into
    fields, options missing from the checkpointed options are back-filled
    from ``dummy_opt``, and the model is built and switched to eval mode.

    Args:
        opt: run options; ``opt.models`` is iterated for checkpoint paths
            and ``opt`` is passed to ``use_gpu``.
        dummy_opt: mapping of option name to default value, used for
            options absent from a checkpoint's saved options.

    Returns:
        A ``(fields, ensemble_model)`` tuple. ``fields`` comes from the
        first checkpoint that yields a non-``None`` fields object, and is
        ``None`` when ``opt.models`` is empty.
    """
    shared_fields = None
    models = []
    for model_path in opt.models:
        # Returning the storage unchanged from map_location keeps the
        # tensors on the CPU instead of the device they were saved from.
        checkpoint = torch.load(model_path,
                                map_location=lambda storage, loc: storage)
        fields = load_fields_from_vocab(checkpoint['vocab'])

        # Options introduced after the checkpoint was written are missing
        # from it; fill them in without overriding saved values.
        model_opt = checkpoint['opt']
        for arg in dummy_opt:
            if arg not in model_opt:
                model_opt.__dict__[arg] = dummy_opt[arg]

        model = build_base_model(model_opt, fields, use_gpu(opt), checkpoint)
        model.eval()
        model.generator.eval()

        # The ensemble shares the fields of the first loaded model.
        if shared_fields is None:
            shared_fields = fields
        models.append(model)

    ensemble_model = EnsembleModel(models)
    return shared_fields, ensemble_model
def build_loss_compute(model, tgt_vocab, opt, train=True):
    """
    Build the ``NMTLossCompute`` object wrapping the loss criterion.

    The criterion ignores the target padding index. It is a
    ``LabelSmoothingLoss`` when ``opt.label_smoothing`` is positive and
    ``train`` is true, and a summed ``nn.NLLLoss`` otherwise. The
    criterion is paired with ``model.generator`` inside an
    ``NMTLossCompute``, which is then moved to ``cuda`` when
    ``use_gpu(opt)`` holds and to ``cpu`` otherwise.

    As noted in the original documentation, LossCompute objects allow the
    loss to be computed in shards, feed a Statistics object used for
    training/validation logging, and also run the backward pass inside
    their ``sharded_compute_loss`` method.

    Args:
        model: model whose ``generator`` attribute is handed to the loss.
        tgt_vocab: target vocabulary; supplies ``stoi`` and ``len()``.
        opt: options; ``label_smoothing`` is read and ``opt`` is passed
            to ``use_gpu``.
        train: when false, label smoothing is never applied.

    Returns:
        The ``NMTLossCompute`` instance, after ``.to(device)``.
    """
    device_name = "cuda" if use_gpu(opt) else "cpu"
    pad_index = tgt_vocab.stoi[Constants.PAD_WORD]

    smoothing_enabled = opt.label_smoothing > 0 and train
    if not smoothing_enabled:
        criterion = nn.NLLLoss(ignore_index=pad_index, reduction='sum')
    else:
        criterion = LabelSmoothingLoss(
            opt.label_smoothing,
            len(tgt_vocab),
            ignore_index=pad_index,
        )

    # The whole generator is passed through; no loss that operates on raw
    # logits (e.g. sparsemax) is special-cased here.
    compute = NMTLossCompute(criterion, model.generator)
    compute.to(torch.device(device_name))
    return compute
def load_test_model(opt, dummy_opt, model_path=None):
    """
    Load a single checkpoint and return its fields and eval-mode model.

    Args:
        opt: run options; ``opt.models[0]`` is the default checkpoint
            path and ``opt`` is passed to ``use_gpu``.
        dummy_opt: mapping of option name to default value, used for
            options absent from the checkpoint's saved options.
        model_path: checkpoint path; when ``None`` the first entry of
            ``opt.models`` is used.

    Returns:
        A ``(fields, model)`` tuple.
    """
    path = opt.models[0] if model_path is None else model_path

    # Returning the storage unchanged from map_location keeps the tensors
    # on the CPU instead of the device they were saved from.
    checkpoint = torch.load(path, map_location=lambda storage, loc: storage)
    fields = load_fields_from_vocab(checkpoint['vocab'])

    # Back-fill options the checkpoint does not know about; values that
    # were saved with the checkpoint are left untouched.
    model_opt = checkpoint['opt']
    for name in dummy_opt:
        if name not in model_opt:
            model_opt.__dict__[name] = dummy_opt[name]

    model = build_base_model(model_opt, fields, use_gpu(opt), checkpoint)
    model.eval()
    model.generator.eval()
    return fields, model
def build_optim(model, opt, checkpoint):
    """
    Build the optimizer wrapper, optionally resuming it from a checkpoint.

    Behaviour depends on ``opt.train_from`` and ``opt.reset_optim``:

    * ``train_from`` unset, or ``reset_optim == 'all'``: a fresh
      ``Optimizer`` is created from the options in ``opt``.
    * ``reset_optim == 'states'``: the wrapper stored in
      ``checkpoint['optim']`` is reused, but its saved torch optimizer
      state is not restored.
    * ``reset_optim == 'keep_states'``: the checkpointed wrapper is
      reused, its hyper-parameters are overwritten from ``opt``, and the
      saved torch optimizer state is restored.
    * ``reset_optim == 'none'``: the checkpointed wrapper is reused as is
      and the saved torch optimizer state is restored.

    Args:
        model: model providing ``named_parameters()``.
        opt: training options.
        checkpoint: mapping holding the saved wrapper under ``'optim'``;
            only read when resuming.

    Returns:
        The optimizer wrapper with its parameters set.

    Raises:
        RuntimeError: if state was restored for an ``'adam'`` optimizer
            but the resulting optimizer state is empty.
    """
    stashed_state = None
    reuse_checkpoint_optim = opt.train_from and opt.reset_optim != 'all'

    if not reuse_checkpoint_optim:
        optim = Optimizer(
            opt.optim,
            opt.learning_rate,
            opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_steps=opt.start_decay_steps,
            decay_steps=opt.decay_steps,
            beta1=opt.adam_beta1,
            beta2=opt.adam_beta2,
            adagrad_accum=opt.adagrad_accumulator_init,
            decay_method=opt.decay_method,
            warmup_steps=opt.warmup_steps,
            model_size=opt.dec_rnn_size,
        )
    else:
        optim = checkpoint['optim']

        if opt.reset_optim != 'states':
            # Capture the torch optimizer state now: set_parameters()
            # below replaces optim.optimizer, and with it the values
            # reachable through optim.optimizer.state_dict(). The copy is
            # loaded back in Stage 2.
            stashed_state = optim.optimizer.state_dict()

        if opt.reset_optim == 'keep_states':
            # Keep the saved state but take every hyper-parameter from
            # the current options.
            optim.method = opt.optim
            optim.learning_rate = opt.learning_rate
            optim.original_lr = opt.learning_rate
            optim.max_grad_norm = opt.max_grad_norm
            optim.lr_decay = opt.learning_rate_decay
            optim.start_decay_steps = opt.start_decay_steps
            optim.decay_steps = opt.decay_steps
            optim.betas = [opt.adam_beta1, opt.adam_beta2]
            optim.adagrad_accum = opt.adagrad_accumulator_init
            optim.decay_method = opt.decay_method
            optim.warmup_steps = opt.warmup_steps
            optim.model_size = opt.dec_rnn_size

    # Stage 1: set_parameters() (re-)creates the torch optimizer from the
    # model's parameters, which end up in optim.optimizer.param_groups.
    # It does not load any optimizer state: the new optimizer starts with
    # an empty state.
    optim.set_parameters(model.named_parameters())

    restore_state = (
        opt.train_from and opt.reset_optim in ('none', 'keep_states'))
    if not restore_state:
        return optim

    # Stage 2 (checkpoint resume only): fill the still-empty
    # optim.optimizer.state of the re-created optimizer from the state
    # dict captured above.
    # See also: https://github.com/pytorch/pytorch/issues/2830
    optim.optimizer.load_state_dict(stashed_state)

    # Convert the state tensors back to CUDA when running on GPU.
    if use_gpu(opt):
        for param_state in optim.optimizer.state.values():
            for key, value in param_state.items():
                if torch.is_tensor(value):
                    param_state[key] = value.cuda()

    # A resumed Adam optimizer must carry state: it stores "exp_avg" and
    # "exp_avg_sq" (exponential moving averages of the gradient and of
    # the squared gradient).
    if optim.method == 'adam' and len(optim.optimizer.state) < 1:
        raise RuntimeError(
            "Error: loaded Adam optimizer from existing model"
            " but optimizer state is empty")

    return optim