Example #1
def save(epoch_idx: int, best_accuracy: float, gen_model: nn.Module,
         dis_model: nn.Module, gen_opt: torch.optim.Optimizer,
         dis_opt: torch.optim.Optimizer):
    checkpoint = {
        'epoch': epoch_idx,
        'best_acc': best_accuracy,
        'lr': scheduler.gen_lr,  # `scheduler` is assumed to be a module-level object exposing the generator LR
        'generator': gen_model.state_dict(),
        'discriminator': dis_model.state_dict(),
        'gen_optimizer': gen_opt.state_dict(),
        'dis_optimizer': dis_opt.state_dict()
    }
    # `ds.SAVE_DIR` comes from the surrounding module; os.path.join avoids a missing-slash bug
    torch.save(checkpoint, os.path.join(ds.SAVE_DIR, 'weights', 'checkpoint'))
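A load counterpart is not shown in the source; the sketch below simply reverses the dictionary written above, reusing its key names (the function itself and its signature are assumptions).

def load(path: str, gen_model: nn.Module, dis_model: nn.Module,
         gen_opt: torch.optim.Optimizer, dis_opt: torch.optim.Optimizer):
    # Hedged sketch: restore the GAN checkpoint produced by save() in Example #1.
    checkpoint = torch.load(path, map_location='cpu')
    gen_model.load_state_dict(checkpoint['generator'])
    dis_model.load_state_dict(checkpoint['discriminator'])
    gen_opt.load_state_dict(checkpoint['gen_optimizer'])
    dis_opt.load_state_dict(checkpoint['dis_optimizer'])
    return checkpoint['epoch'], checkpoint['best_acc']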
Example #2
    def save(self, model: nn.Module, step: int,
             optimizer: torch.optim.Optimizer, epoch: int, **kwargs):
        if self.prefix:
            checkpoint_path = os.path.join(self.dir,
                                           f"{self.prefix}_{step}.pth")
        else:
            checkpoint_path = os.path.join(self.dir, f"{step}.pth")
        save_state = {
            'epoch': epoch + 1,
            'optimizer': optimizer.state_dict(),
        }
        if self.only_weights:
            save_state['state_dict'] = model.state_dict()
        else:
            save_state['model'] = model
        save_state = dict(save_state, **kwargs)
        torch.save(save_state, checkpoint_path)
        print(f"Saved to {checkpoint_path}")
        # `self.checkpoints` is assumed to be a bounded buffer whose append() returns
        # the path it evicts once full (a plain list.append would return None).
        popped = self.checkpoints.append(checkpoint_path)
        if popped:
            try:
                os.remove(popped)
                print(f"Removed old checkpoint {popped}")
            except OSError:
                pass
Example #3
def save_results(output_dir: str, model: nn.Module,
                 optimizer: torch.optim.Optimizer) -> None:
    if distributed.is_main_process():
        logger.info("Dump the last model")
        torch.save(model.state_dict(), os.path.join(output_dir, "model.pt"))
        torch.save(optimizer.state_dict(),
                   os.path.join(output_dir, "optimizer.pt"))
Example #4
def save_checkpoint(epoch: int, update: int, samples_seen: int,
                    model: torch.nn.Module, optimizer: torch.optim.Optimizer,
                    scheduler: Optional[ReduceLROnPlateau],
                    numpy_epoch_random_state: Tuple, train_loss: float,
                    best_valid_loss: float, best_valid_loss_index: int,
                    best_valid_acc: float, filename: str) -> None:
    torch.save(
        {
            "epoch": epoch,
            "update": update,
            "samples_seen": samples_seen,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "scheduler_state_dict": scheduler.state_dict() if scheduler is not None else None,
            "train_loss": train_loss,
            "best_valid_loss": best_valid_loss,
            "best_valid_loss_index": best_valid_loss_index,
            "best_valid_acc": best_valid_acc,
            "numpy_epoch_random_state": numpy_epoch_random_state,
            "numpy_last_random_state": np.random.get_state(),
            "torch_last_random_state": torch.random.get_rng_state()
        }, filename)
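The checkpoint above also captures the NumPy and torch RNG states; a hedged resume sketch (not part of the source; function and argument names assumed, imports as in Example #4) restores them alongside the model:

def load_checkpoint(filename: str, model: torch.nn.Module,
                    optimizer: torch.optim.Optimizer,
                    scheduler: Optional[ReduceLROnPlateau]) -> dict:
    # Reverse save_checkpoint() above: re-apply the state dicts and RNG streams.
    checkpoint = torch.load(filename, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    if scheduler is not None and checkpoint['scheduler_state_dict'] is not None:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    # Restoring the RNG states keeps shuffling/augmentation deterministic after resume.
    np.random.set_state(checkpoint['numpy_last_random_state'])
    torch.random.set_rng_state(checkpoint['torch_last_random_state'])
    return checkpoint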
Example #5
    def save_checkpoint(self,
                        model: torch.nn.Module,
                        optimizer: torch.optim.Optimizer = None,
                        is_best: bool = False):
        """
        Save checkpoint under the path.

        Parameters
        ----------
        model: ``torch.nn.Module``, required.
            The model to be saved
        optimizer: ``torch.optim.Optimizer``, optional.
            The optimizer to be saved (if provided)
        is_best: bool, optional (default=False).
            If False, the checkpoint is only saved as ``checkpoint_#counter.th``;
            otherwise it is also saved as ``best.th``.
        """
        s_dict = {'model': model.state_dict()}
        if optimizer is not None:
            s_dict['optimizer'] = optimizer.state_dict()

        if is_best:
            torch.save(s_dict, os.path.join(self.path, 'best.th'))

        torch.save(
            s_dict,
            os.path.join(self.path, 'checkpoint_{}.th'.format(self.counter)))
        self.counter += 1
        if self.counter > self.checkpoints_to_keep:
            os.remove(
                os.path.join(
                    self.path,
                    'checkpoint_{}.th'.format(self.counter -
                                              self.checkpoints_to_keep - 1)))
Example #6
    def save_model_and_optimizer_with_info(self, model: torch.nn.Module,
                                           optimizer: torch.optim.Optimizer,
                                           info: dict) -> None:
        """Save model and optimizer state dictionaries to file given epoch info

        This is called automatically during :func:`update_for_epoch`. Does not save if
        there is no directory to save to (i.e. ``self.state_dir is None``). Format
        strings from ``self.params`` are formatted with the values from `info` to
        construct the base names of each file

        Parameters
        ----------
        model : torch.nn.Module
        optimizer : torch.optim.Optimizer
        info : dict
            A dictionary with the entries "epoch", "es_resume_cd",
            "es_patience_cd", "rlr_resume_cd", "rlr_patience_cd", "lr",
            "train_met", "val_met", and any entries specified through
            :func:`add_entry`
        """
        if self.state_dir is None:
            return
        if not os.path.isdir(self.state_dir):
            os.makedirs(self.state_dir)
        model_basename = self.params.saved_model_fmt.format(**info)
        optimizer_basename = self.params.saved_optimizer_fmt.format(**info)
        torch.save(
            model.state_dict(),
            os.path.join(self.state_dir, model_basename),
        )
        torch.save(
            optimizer.state_dict(),
            os.path.join(self.state_dir, optimizer_basename),
        )
Example #7
def train(model: torch.nn.Module, data_loader, epoch: int, num_iterations: int,
          batch_size: int, optimizer: torch.optim.Optimizer,
          criterion: torch.nn.Module, device: torch.device):
    model.train()
    epoch_loss = 0

    # TODO pass epoch to loader
    for batch_id, data in enumerate(data_loader(num_iterations, batch_size)):
        inputs, targets = data[0].to(device), data[1].to(device)

        optimizer.zero_grad()

        outputs = model(inputs)

        loss = criterion(outputs, targets)
        print(f'Loss on batch: {loss.item():.4f}')

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()

        if (batch_id % 1000) == 0:
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'batch_loss': loss.item()
                },
                os.path.join(CHECKPOINTS_DIR,
                             f'model_LRW_train_{batch_id}.tar'))

    return epoch_loss / num_iterations
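A hedged sketch (not part of the source) of resuming from one of the periodic checkpoints written by train() above; the key names match that torch.save call, while the function and argument names are assumptions.

def resume_from_checkpoint(batch_id: int, model: torch.nn.Module,
                           optimizer: torch.optim.Optimizer,
                           device: torch.device) -> int:
    # Load a checkpoint saved every 1000 batches in train() and return its epoch.
    path = os.path.join(CHECKPOINTS_DIR, f'model_LRW_train_{batch_id}.tar')
    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    model.to(device)
    return checkpoint['epoch']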
Example #8
def save_checkpoint(
    filename: str,
    epoch: int,
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    f1_score: float,
    vocabs: Dict[str, Any],
    cfg: DictConfig,
) -> None:
    model.cpu()
    path = os.path.join("checkpoints", filename)
    torch.save(
        {
            "cfg": conf2dict(cfg),
            "epoch": epoch,
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
            "f1_score": f1_score,
            "vocabs": vocabs,
        },
        path,
    )
    log.info("Checkpoint saved to %s", path)
    device, _ = get_device()
    model.to(device)
Example #9
    def save(self, best: bool, epoch: int, optimizer: torch.optim.Optimizer):
        filename = 'best.tar' if best else 'last.tar'
        print("Saving model as {}...".format(filename), end=' ')
        torch.save({'epoch': epoch,
                    'model_state_dict': self.model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict()},
                   os.path.join(self.config.checkpoint_dir, filename))
        print("Model saved.")
Example #10
def save_checkpoint(model: torch.nn.Module, optim: torch.optim.Optimizer,
                    config: Dict[str, Any], path: Path) -> None:
    torch.save(
        {
            'model': model.state_dict(),
            'optim': optim.state_dict(),
            'config': config
        }, str(path))
Example #11
def save(model: torch.nn.Module, optimizer: torch.optim.Optimizer, epoch: int,
         path: str):
    params = {
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'epoch': epoch
    }
    torch.save(params, path)
Example #12
def save_train_state(epoch: int, model: nn.Module, optimizer: torch.optim.Optimizer, scheduler, best_score: float,
                     file_path):
    torch.save({
        'epoch': epoch,
        'model_state': model.state_dict(),
        'optimizer_state': optimizer.state_dict(),
        'scheduler': scheduler.state_dict(),
        'best_score': best_score,
    }, file_path)
Example #13
def make_checkpoint(epoch: int,
                    model: LSTM,
                    loss_function: Union[SplitCrossEntropyLoss,
                                         CrossEntropyLoss],
                    optimizer: torch.optim.Optimizer,
                    use_apex=False,
                    amp=None,
                    prior: Union[str, nn.Module] = None,
                    **kwargs):
    """
    Packages network parameters into a picklable dictionary containing keys
    * epoch: current epoch
    * model: the network model
    * loss: the loss function
    * optimizer: the torch optimizer
    * use_apex: use nvidia apex for AMP or not
    * amp: the nvidia AMP object

    Parameters
    ----------
    epoch : int
        The current epoch of training
    model : LSTM
        The network model
    loss_function : SplitCrossEntropyLoss or CrossEntropyLoss
        The loss function
    optimizer : torch.optim.Optimizer
        The optimizer function
    use_apex : bool
        If mixed precision mode is activated. If this is true, the `amp` argument should be supplied as well.
        The default value is False.
    amp :
        The nvidia apex amp object, should contain information about state of training
    prior : str or nn.Module, optional
        If given as a module (rather than a path string), it is stored in the checkpoint as well
    kwargs :
        Not used

    Returns
    -------
    checkpoint: dict
        A picklable dict containing the checkpoint

    """
    checkpoint = {
        'epoch': epoch,
        'model': model.state_dict(),
        'loss': loss_function.state_dict(),
        'optimizer': optimizer.state_dict(),
    }
    if use_apex:
        checkpoint['amp'] = amp.state_dict()

    if prior is not None and not isinstance(prior, str):
        checkpoint['prior'] = prior

    return checkpoint
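make_checkpoint() only builds the dictionary; a hedged usage sketch (names and call site assumed, not from the source) of persisting and restoring it:

def save_and_restore_checkpoint(path, epoch, model, loss_function, optimizer):
    # Write the picklable dict to disk, then load it back and re-apply the state dicts.
    torch.save(make_checkpoint(epoch, model, loss_function, optimizer), path)
    checkpoint = torch.load(path, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    loss_function.load_state_dict(checkpoint['loss'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return checkpoint['epoch']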
Example #14
def checkpoint_model(
    net: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    memory: deque,
    path: str,
):
    torch.save({
        'model_state_dict': net.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'memory': memory,
    }, path)
Example #15
def dump_optimizer_state(optimizer: torch.optim.Optimizer):
    with torch.no_grad():
        flat_metadata, flat_tensors = [], []
        for elem in nested_flatten(optimizer.state_dict()):
            if isinstance(elem, torch.Tensor):
                flat_metadata.append(
                    dict(type='tensor', index=len(flat_tensors)))
                flat_tensors.append(elem.cpu())
            else:
                flat_metadata.append(dict(type='value', value=elem))
        return flat_metadata, flat_tensors
Example #16
def save_checkpoint(model: torch.nn.Module, optimizer: torch.optim.Optimizer,
                    path: str, epoch: int) -> None:
    """Save a training checkpoint."""
    checkpoint_path = _get_checkpoint_path(path, epoch)
    print(f"Saving checkpoint to {checkpoint_path}", flush=True)
    torch.save(
        {
            "epoch": epoch + 1,
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        }, checkpoint_path)
Example #17
def save_checkpoint(path: str, model: nn.Module,
                    optimizer: torch.optim.Optimizer,
                    lr_scheduler: "learning rate policy", epoch: int) -> None:
    path = os.path.join(path, "model-optim-lr_sch-epoch.tar")
    torch.save(
        {
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "lr_scheduler": lr_scheduler.state_dict(),
            "epoch": epoch
        }, path)
Example #18
    def save_model(self, model: torch.nn.Module,
                   optimizer: torch.optim.Optimizer, epoch: int, acc: float,
                   is_best: bool):
        path = self.__get_model_path(self.__get_model_filename(epoch, is_best))
        # torch.save writes binary data, so the file must be opened in 'wb' mode.
        with open(path, 'wb') as f:
            torch.save(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'acc': acc,
                    'optimizer': optimizer.state_dict()
                }, f)
Example #19
def load_optimizer_state(optimizer: torch.optim.Optimizer, flat_metadata: Dict,
                         flat_tensors: Sequence[torch.Tensor]):
    flat_optimizer_state = []
    for elem in flat_metadata:
        if elem.get('type') == 'tensor' and isinstance(elem.get('index'), int):
            flat_optimizer_state.append(flat_tensors[elem['index']])
        elif elem.get('type') == 'value' and 'value' in elem:
            flat_optimizer_state.append(elem['value'])
    with torch.no_grad():
        return optimizer.load_state_dict(
            nested_pack(flat_optimizer_state,
                        structure=optimizer.state_dict()))
Example #20
def dump_optimizer_state(opt: torch.optim.Optimizer):
    """ Convert optimizer state into a format of DecentralizedAverager's get_current_state/load_state_from_peers """
    with torch.no_grad():
        flat_metadata, flat_tensors = [], []
        for elem in nested_flatten(opt.state_dict()):
            if isinstance(elem, torch.Tensor):
                flat_metadata.append(
                    dict(type='tensor', index=len(flat_tensors)))
                flat_tensors.append(elem.cpu())
            else:
                flat_metadata.append(dict(type='value', value=elem))
        return flat_metadata, flat_tensors
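Examples #19 and #20 are two halves of the same round trip; a hedged usage sketch (the helper below is illustrative and assumes both functions are in scope) ties them together:

def roundtrip_optimizer_state(source: torch.optim.Optimizer,
                              target: torch.optim.Optimizer) -> None:
    # Serialize one optimizer's state with dump_optimizer_state (Example #20)
    # and restore it into another optimizer with load_optimizer_state (Example #19).
    flat_metadata, flat_tensors = dump_optimizer_state(source)
    load_optimizer_state(target, flat_metadata, flat_tensors)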
Example #21
def save_checkpoint(
    command_history: CommandHistory,
    epoch: int,
    model: torch.jit.ScriptModule,
    optim: torch.optim.Optimizer,
    game_params: GameParams,
    model_params: ModelParams,
    optim_params: OptimParams,
    simulation_params: SimulationParams,
    execution_params: ExecutionParams,
    executor: ThreadPoolExecutor = None,
) -> None:
    checkpoint_dir = execution_params.checkpoint_dir
    save_uncompressed = execution_params.save_uncompressed
    checkpoint_name = f"checkpoint_{epoch}"
    checkpoint = {
        "command_history": command_history,
        "epoch": epoch,
        "model_state_dict": {
            k: v.cpu().clone()
            if isinstance(v, torch.Tensor) else copy.deepcopy(v)
            for k, v in model.state_dict().items()
        },
        "optim_state_dict": {
            k: v.cpu().clone()
            if isinstance(v, torch.Tensor) else copy.deepcopy(v)
            for k, v in optim.state_dict().items()
        },
        "game_params": game_params,
        "model_params": model_params,
        "optim_params": optim_params,
        "simulation_params": simulation_params,
        "execution_params": execution_params,
    }

    def saveit():
        nonlocal save_uncompressed
        nonlocal checkpoint
        nonlocal checkpoint_dir
        if save_uncompressed:
            torch.save(checkpoint, checkpoint_dir / f"{checkpoint_name}.pt")
        else:
            # with zipfile.ZipFile(Path(checkpoint_dir) / f"{checkpoint_name}.zip", "w", allowZip64=True) as z:
            #    with z.open(f"{checkpoint_name}.pt", "w", force_zip64=True) as f:
            #        torch.save(checkpoint, f)
            with gzip.open(checkpoint_dir / f"{checkpoint_name}.pt.gz",
                           "wb") as f:
                torch.save(checkpoint, f)

    if executor is not None:
        # Despite the `-> None` annotation, this branch hands back the submitted Future
        # so the caller can wait for the asynchronous save to finish.
        return executor.submit(saveit)
    else:
        saveit()
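The matching loader is not shown in the source; a hedged sketch that mirrors the gzip/uncompressed branches above (the function name, epoch argument, and flag default are assumptions):

def load_checkpoint_file(checkpoint_dir, epoch: int, save_uncompressed: bool = False):
    # Read back a checkpoint written by save_checkpoint() above; decompressing into
    # memory (io.BytesIO) gives torch.load the random access it expects.
    import io
    checkpoint_name = f"checkpoint_{epoch}"
    if save_uncompressed:
        return torch.load(checkpoint_dir / f"{checkpoint_name}.pt", map_location='cpu')
    with gzip.open(checkpoint_dir / f"{checkpoint_name}.pt.gz", "rb") as f:
        return torch.load(io.BytesIO(f.read()), map_location='cpu')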
Example #22
def save_model_checkpoint(model: torch.nn.Module,
                          optimizer: torch.optim.Optimizer,
                          criterion,
                          epochs,
                          replay_buffer,
                          filename='./saved_model.pth'):
    torch.save(
        {
            'epoch': epochs,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': criterion,
            'replay_buffer': replay_buffer
        }, filename)
Example #23
    def _save_model(self, optimizer: torch.optim.Optimizer, epoch: int,
                    loss: float, best_bleu: float) -> None:
        """
        Save model.
        """
        state = {
            "epoch": epoch,
            "best_bleu_score": best_bleu,
            "loss": loss,
            "model_state": self.model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
        }
        model_path = os.path.join(self.experiment_path, "best_model.pt")
        torch.save(state, model_path)
        self.logger.info(f"New best BLEU! Model saved to {model_path}")
Example #24
    def save_model(self, model: torch.nn.Module,
                   optimizer: torch.optim.Optimizer, epoch: int, acc: float,
                   is_best: bool):
        path = self.__get_model_path(self.__get_model_filename(epoch, is_best))
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'wb') as f:
            torch.save(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'acc': acc,
                    'optimizer': optimizer.state_dict()
                }, f)
        # A best checkpoint is also written under the regular (non-best) filename.
        if is_best:
            self.save_model(model, optimizer, epoch, acc, False)
Example #25
def saveModel ( model_filename: str, schedulers: typing.List, epoch: int, network: torch.nn.Module,
                optimizer: torch.optim.Optimizer, validationLoss: typing.List[float], trainingLoss: typing.List[float],
                trainingDifference: typing.List[typing.Tuple[float, float]],
                validationDifference: typing.List[typing.Tuple[float, float]],
                uncertainty: typing.List[typing.Tuple[float, float]],
                testingDifference: typing.List[typing.Tuple[float, float]],
                anees: typing.List[typing.Tuple[float, float]] ) -> None:
	"""

	Saves the given model to the given file. Also saves the scheduler, last epoch, optimizer, and previous losses.

	:param model_filename: The file to save the model to.
	:param schedulers: The schedulers to save.
	:param epoch: The last epoch that the model was trained on.
	:param network: The network that is being saved.
	:param optimizer: The optimizer that is to be saved with the network.
	:param validationLoss: The history of validation losses.
	:param trainingLoss: The history of training losses.
	:param trainingDifference: The training differences over the epochs.
	:param validationDifference: The validation differences over the epochs.
	:param uncertainty: The uncertainty history of the network.
	:param testingDifference: The testing differences of the network over the epochs.
	:param anees: The history of Average Normalized Estimation Error Squared (ANEES) values of the network.
	"""

	# Don't save if we don't want to.
	if Config.getArgs ().dont_save:
		return

	# Save network to file.
	if "{}" in model_filename:
		model_filename = model_filename.format ( Config.version, epoch, Config.getArgs ().model_number )

	Logger.log ( "Saving model to " + model_filename + ".", logger = "min" )
	saveCheckpoint ( filepath = model_filename,
	                 currModel = { "model":                network.state_dict (),
	                               "epoch":                epoch,
	                               "optimizer":            optimizer.state_dict (),
	                               "schedulers":           schedulers,
	                               "version":              Config.version,
	                               "trainingLoss":         trainingLoss,
	                               "validationLoss":       validationLoss,
	                               "validationDifference": validationDifference,
	                               "trainingDifference":   trainingDifference,
	                               "uncertainty":          uncertainty,
	                               "testingDifference":    testingDifference,
	                               "anees":                anees
	                               } )
Example #26
def save_checkpoint(
    epoch: int,
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    models_path: Path,
    exp_name: str,
) -> None:
    save_state = {
        "epoch": epoch + 1,  # increment epoch (to not repeat then resume)
        "state_dict": get_state_dict(model, unwrap_model),
        "optimizer": optimizer.state_dict(),
    }
    torch.save(
        save_state,
        f"{models_path}/{exp_name}.pth",
    )
Example #27
def save(epoch: int, model, optimizer: torch.optim.Optimizer,
         scheduler: torch.optim.lr_scheduler, config):
    """
    Pickles the models to hdd
    """
    now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
    out_dir = config.output_dir
    save_name = os.path.join(out_dir, 'epoch_{}_{}.pth'.format(epoch, now))
    save_dict = {
        'epoch': epoch,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict(),
    }
    torch.save(save_dict, save_name)
    print("Saved the model to hdd")
Example #28
    def run_model_selection(self, net: torch.nn.Module, optimizer: torch.optim.Optimizer,
                            monitor_metrics: dict, epoch: int):

        # take the mean over all selection criteria in each epoch
        non_nan_scores = np.mean(
            np.array([[0 if (ii is None or np.isnan(ii)) else ii
                       for ii in monitor_metrics['val'][sc]]
                      for sc in self.cf.model_selection_criteria]), 0)
        epochs_scores = [ii for ii in non_nan_scores[1:]]
        # ranking of epochs according to model_selection_criterion
        epoch_ranking = np.argsort(epochs_scores, kind="stable")[::-1] + 1 #epochs start at 1
        # if set in configs, epochs < min_save_thresh are discarded from saving process.
        epoch_ranking = epoch_ranking[epoch_ranking >= self.cf.min_save_thresh]

        # check if current epoch is among the top-k epochs.
        if epoch in epoch_ranking[:self.cf.save_n_models]:

            save_dir = os.path.join(self.cf.fold_dir, '{}_best_checkpoint'.format(epoch))
            if not os.path.exists(save_dir):
                os.mkdir(save_dir)

            torch.save(net.state_dict(), os.path.join(save_dir, 'params.pth'))
            with open(os.path.join(save_dir, 'monitor_metrics.pickle'), 'wb') as handle:
                pickle.dump(monitor_metrics, handle)
            # save epoch_ranking to keep info for inference.
            np.save(os.path.join(self.cf.fold_dir, 'epoch_ranking'), epoch_ranking[:self.cf.save_n_models])
            np.save(os.path.join(save_dir, 'epoch_ranking'), epoch_ranking[:self.cf.save_n_models])

            self.logger.info(
                "saving current epoch {} at rank {}".format(epoch, np.argwhere(epoch_ranking == epoch)))
            # delete params of the epoch that just fell out of the top-k epochs.
            for se in [int(ii.split('_')[0]) for ii in os.listdir(self.cf.fold_dir) if 'best_checkpoint' in ii]:
                if se in epoch_ranking[self.cf.save_n_models:]:
                    subprocess.call('rm -rf {}'.format(os.path.join(self.cf.fold_dir, '{}_best_checkpoint'.format(se))), shell=True)
                    self.logger.info('deleting epoch {} at rank {}'.format(se, np.argwhere(epoch_ranking == se)))

        state = {
            'epoch': epoch,
            'state_dict': net.state_dict(),
            'optimizer': optimizer.state_dict(),
        }

        # save checkpoint of current epoch.
        save_dir = os.path.join(self.cf.fold_dir, 'last_checkpoint')
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        torch.save(state, os.path.join(save_dir, 'params.pth'))
        np.save(os.path.join(save_dir, 'epoch_ranking'), epoch_ranking[:self.cf.save_n_models])
        with open(os.path.join(save_dir, 'monitor_metrics.pickle'), 'wb') as handle:
            pickle.dump(monitor_metrics, handle)
Example #29
    def save_model(
        path: str,
        model: torch.nn.Module,
        optimizer: torch.optim.Optimizer
    ):
        """
        Save a torch model to given output path.
        :param path: The path.
        :param model: The model to save.
        :param optimizer: The optimizer to save.
        """
        data_path = os.path.join(path, 'checkpoint.pth.tar')
        torch.save(model, os.path.join(path, 'model.pth'))

        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, data_path)
Example #30
    def __init__(self,
                 model: torch.nn.Module,
                 optimizer: torch.optim.Optimizer,
                 criterion: torch.nn.Module,
                 logger: Logger,
                 grad_clip: float = None):
        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion
        self.logger = logger
        self.grad_clip = grad_clip
        self.history = {'lrs': [], 'losses': []}

        torch.save(
            {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict()
            }, 'init_params.pt')
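The constructor above snapshots the initial parameters to 'init_params.pt'; a hedged sketch of a matching reset step (the method name and its placement on this class are assumptions) reloads that snapshot so repeated runs start from identical weights:

    def reset(self):
        # Restore the snapshot written in __init__ and clear the recorded history.
        state = torch.load('init_params.pt', map_location='cpu')
        self.model.load_state_dict(state['model'])
        self.optimizer.load_state_dict(state['optimizer'])
        self.history = {'lrs': [], 'losses': []}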