def load(self, save_path, model: torch.nn.Module = None, optimizer: Optimizer = None):
    """Restore a saved checkpoint from disk.

    Args:
        save_path: Path to the saved checkpoint (.pth). If a directory is
            provided instead, ``model-best.pth`` inside it is used.
        model: Torch model to restore weights to (skipped when None or when
            the checkpoint has no 'state_dict' entry).
        optimizer: Optimizer to restore state to (skipped when None or when
            the checkpoint has no 'optimizer' entry).

    Returns:
        The global step stored in the checkpoint, or 0 if absent.
    """
    if os.path.isdir(save_path):
        save_path = os.path.join(save_path, 'model-best.pth')

    state = torch.load(save_path)

    # Missing keys are tolerated: each piece is restored only if present.
    step = state.get('step', 0)
    if model is not None and 'state_dict' in state:
        model.load_state_dict(state['state_dict'])
    if optimizer is not None and 'optimizer' in state:
        optimizer.load_state_dict(state['optimizer'])

    self._logger.info('Loaded models from {}'.format(save_path))
    return step
def lr_find(model: UNet, data_loader, optimizer: Optimizer, criterion, use_gpu,
            min_lr=0.0001, max_lr=0.1):
    """Run a learning-rate range test and plot loss vs. LR (log-x scale).

    Trains for up to one rising half-cycle of a cyclic LR schedule between
    ``min_lr`` and ``max_lr``, recording the loss at each LR, then restores
    the model and optimizer to their initial states.

    Args:
        model: the UNet to probe (trained in place, then restored)
        data_loader: yields (data, target, class_ids) batches
        optimizer: optimizer whose param groups the scheduler drives
        criterion: loss function applied to (output, target)
        use_gpu: move batches/outputs to CUDA when True
        min_lr: lower bound of the LR sweep
        max_lr: upper bound of the LR sweep
    """
    import copy

    # BUGFIX: state_dict() returns references to the live tensors, so saving
    # it directly means the "restore" below would be a no-op once training
    # mutates the weights. Deep-copy to take a real snapshot.
    model_state = copy.deepcopy(model.state_dict())
    optimizer_state = copy.deepcopy(optimizer.state_dict())

    losses = []
    lrs = []
    scheduler = CyclicExpLR(optimizer, min_lr, max_lr, step_size_up=100,
                            mode='triangular', cycle_momentum=True)
    model.train()
    for data, target, class_ids in data_loader:
        if use_gpu:
            data = data.cuda()
            target = target.cuda()
        optimizer.zero_grad()
        output_raw = model(data)
        # This step is specific for this project
        output = torch.zeros(output_raw.shape[0], 1,
                             output_raw.shape[2], output_raw.shape[3])
        if use_gpu:
            output = output.cuda()
        # This step is specific for this project: keep only the channel that
        # corresponds to each sample's class (class ids appear to be 1-based).
        for idx, (raw_o, class_id) in enumerate(zip(output_raw, class_ids)):
            output[idx] = raw_o[class_id - 1]
        loss = criterion(output, target)
        loss.backward()
        current_lr = optimizer.param_groups[0]['lr']
        # Stop once the cyclic schedule turns around and the LR decreases.
        if lrs and current_lr < lrs[-1]:
            break
        lrs.append(current_lr)
        losses.append(loss.item())
        optimizer.step()
        scheduler.step()

    # Plot in log scale
    plt.plot(lrs, losses)
    plt.xscale('log')
    plt.show()

    # Revert to the pre-sweep training state.
    model.load_state_dict(model_state)
    optimizer.load_state_dict(optimizer_state)
def load_optimizer(path: str, optimizer: Optimizer, map_location: Union[None, str] = "cpu"):
    """
    Load the state dict into an optimizer from a given file.

    :param path: the path to the pth file to load the state dict from
    :param optimizer: the optimizer to load the state dict into
    :param map_location: the location to map the values to when loading the
        state dict; defaults to "cpu"
    :return: the epoch saved in the file, if any (None otherwise)
    """
    model_dict = torch.load(path, map_location=map_location)
    optimizer.load_state_dict(model_dict["optimizer"])
    # BUGFIX: the docstring promised the stored epoch, but the original
    # returned nothing. Use .get so files without an "epoch" key yield None.
    return model_dict.get("epoch")
def load_ckpt(checkpoint_path: str, model: nn.Module, optim: optimizer.Optimizer) -> Tuple[int, float]:
    """Loads a training checkpoint and restores model/optimizer state in place.

    :param checkpoint_path: path to checkpoint file
    :param model: model to update state
    :param optim: optimizer to update state
    :return: tuple of (starting epoch id, best checkpoint score)
    """
    ckpt = torch.load(checkpoint_path)
    model.load_state_dict(ckpt[_MODEL_STATE_DICT])
    optim.load_state_dict(ckpt[_OPTIMIZER_STATE_DICT])
    # Resume from the epoch after the one that was saved.
    return ckpt[_EPOCH] + 1, ckpt[_BEST_SCORE]
def resume_checkpoint(
    model: nn.Module = None,
    optimizer: optim.Optimizer = None,
    scheduler: sche._LRScheduler = None,
    exp_name: str = "",
    load_path: str = "",
    mode: str = "all",
):
    """Restore training state from a saved checkpoint.

    Args:
        model (nn.Module): model object
        optimizer (optim.Optimizer): optimizer object
        scheduler (sche._LRScheduler): scheduler object
        exp_name (str): experiment name; must equal the checkpoint's "arch"
            entry when ``mode == "all"``
        load_path (str): path of the checkpoint file
        mode (str): which restore mode to use:
            - 'all': restore the full training state, including in-training
              parameters (optimizer, scheduler, epoch counter);
            - 'onlynet': restore only the model's weight parameters.

    Returns:
        mode == 'all': the epoch to resume training at;
        mode == 'onlynet': None.
    """
    # Guard clause: bail out early on a missing/invalid checkpoint path.
    if not (os.path.exists(load_path) and os.path.isfile(load_path)):
        raise Exception(f"{load_path}路径不正常,请检查")

    construct_print(f"Loading checkpoint '{load_path}'")
    checkpoint = torch.load(load_path)
    if mode == "all":
        if exp_name != checkpoint["arch"]:
            raise Exception(f"{load_path} does not match.")
        start_epoch = checkpoint["epoch"]
        model.load_state_dict(checkpoint["net_state"])
        optimizer.load_state_dict(checkpoint["opti_state"])
        scheduler.load_state_dict(checkpoint["sche_state"])
        construct_print(f"Loaded '{load_path}' "
                        f"(will train at epoch"
                        f" {checkpoint['epoch']})")
        return start_epoch
    elif mode == "onlynet":
        # Here the checkpoint file is assumed to be a bare state dict.
        model.load_state_dict(checkpoint)
        construct_print(f"Loaded checkpoint '{load_path}' "
                        f"(only has the model's weight params)")
    else:
        raise NotImplementedError
def load_checkpoint(model: nn.Module, optim: optimizer.Optimizer,
                    checkpoint_path="./result/fr_en/checkpoint.tar") -> Tuple[int, int, float, float]:
    """Loads a training checkpoint and restores model/optimizer state in place.

    :param model: model to update state
    :param optim: optimizer to update state
    :param checkpoint_path: path to the checkpoint file
    :return: tuple of (starting epoch id, starting step id, best checkpoint
        score, loss stored in the checkpoint)
    """
    ckpt = torch.load(checkpoint_path)
    model.load_state_dict(ckpt[_MODEL_STATE_DICT])
    optim.load_state_dict(ckpt[_OPTIMIZER_STATE_DICT])
    # Resume one past the saved epoch/step counters.
    return ckpt[_EPOCH] + 1, ckpt[_STEP] + 1, ckpt[_BEST_SCORE], ckpt[_LOSS]
def load_checkpoint(checkpoint_dir: str, model: nn.Module, optim: optimizer.Optimizer) -> Tuple[int, int, float]:
    """Loads a training checkpoint from ``checkpoint_dir/checkpoint.tar``.

    :param checkpoint_dir: directory containing ``checkpoint.tar``
    :param model: model to update state
    :param optim: optimizer to update state
    :return: tuple of (starting epoch id, starting step id, best checkpoint score)
    :raises FileNotFoundError: if ``checkpoint_dir`` does not exist
    """
    if not os.path.exists(checkpoint_dir):
        # BUGFIX: the original raised a bare string, which itself raises
        # "TypeError: exceptions must derive from BaseException" in Python 3.
        raise FileNotFoundError("File doesn't exist {}".format(checkpoint_dir))
    checkpoint_path = os.path.join(checkpoint_dir, 'checkpoint.tar')
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint[_MODEL_STATE_DICT])
    optim.load_state_dict(checkpoint[_OPTIMIZER_STATE_DICT])
    # Resume one past the saved epoch/step counters.
    start_epoch_id = checkpoint[_EPOCH] + 1
    step = checkpoint[_STEP] + 1
    best_score = checkpoint[_BEST_SCORE]
    return start_epoch_id, step, best_score
def load_model(checkpoint: str, model: Module, optimizer: Optimizer) -> Tuple[int, int, List[dict]]:
    """Restore model/optimizer state from ``checkpoint`` if the file exists.

    For a fresh model (no checkpoint file) the defaults are returned:
    ``epoch == 0``, ``iteration == -1`` and an empty stats list.

    :return: (epoch, iteration, log_stats)
    :raises RuntimeError: if the loaded metadata has the wrong types
    """
    epoch = 0
    iteration = -1
    log_stats: List[dict] = []
    if path.isfile(checkpoint):
        ckpt = tr.load(checkpoint)
        model.load_state_dict(ckpt['model_state_dict'])
        optimizer.load_state_dict(ckpt['optimizer_state_dict'])
        model.eval()
        # Missing metadata keys fall back to the fresh-model defaults.
        epoch = ckpt.get('epoch', epoch)
        iteration = ckpt.get('iteration', iteration)
        log_stats = ckpt.get('log_stats', log_stats)
    types_ok = (isinstance(log_stats, list)
                and isinstance(epoch, int)
                and isinstance(iteration, int))
    if not types_ok:
        raise RuntimeError('Loaded log_stats should be a list, epoch and iteration should be int.')
    return epoch, iteration, log_stats
def resume_checkpoint(
    model: nn.Module = None,
    optimizer: optim.Optimizer = None,
    scheduler: sche._LRScheduler = None,
    amp=None,
    exp_name: str = "",
    load_path: str = "",
    mode: str = "all",
):
    """Restore training state from a saved checkpoint.

    Args:
        model (nn.Module): model object (may be wrapped, i.e. have ``.module``)
        optimizer (optim.Optimizer): optimizer object
        scheduler (sche._LRScheduler): scheduler object
        amp (): apex.amp handle, if mixed precision is used
        exp_name (str): experiment name; when non-empty it must match the
            checkpoint's "arch" entry in ``mode == "all"``
        load_path (str): path of the checkpoint file
        mode (str): which restore mode to use:
            - 'all': restore the full training state, including in-training
              parameters;
            - 'onlynet': restore only the model's weight parameters.

    Returns:
        mode == 'all': the epoch to resume training at;
        mode == 'onlynet': None.
    """
    # Guard clause: bail out early on a missing/invalid checkpoint path.
    if not (os.path.exists(load_path) and os.path.isfile(load_path)):
        raise Exception(f"{load_path}路径不正常,请检查")

    construct_print(f"Loading checkpoint '{load_path}'")
    checkpoint = torch.load(load_path)
    if mode == "all":
        # An empty exp_name skips the arch check; otherwise it must match.
        if exp_name and exp_name != checkpoint["arch"]:
            raise Exception(
                f"We can not match {exp_name} with {load_path}.")
        start_epoch = checkpoint["epoch"]
        # Unwrap DataParallel-style wrappers before loading weights.
        target = model.module if hasattr(model, "module") else model
        target.load_state_dict(checkpoint["net_state"])
        optimizer.load_state_dict(checkpoint["opti_state"])
        scheduler.load_state_dict(checkpoint["sche_state"])
        amp_state = checkpoint.get("amp_state", None)
        if amp_state:
            if amp:
                amp.load_state_dict(amp_state)
            else:
                construct_print("You are not using amp.")
        else:
            construct_print("The state_dict of amp is None.")
        construct_print(f"Loaded '{load_path}' "
                        f"(will train at epoch"
                        f" {checkpoint['epoch']})")
        return start_epoch
    elif mode == "onlynet":
        # Here the checkpoint file is assumed to be a bare state dict.
        target = model.module if hasattr(model, "module") else model
        target.load_state_dict(checkpoint)
        construct_print(f"Loaded checkpoint '{load_path}' "
                        f"(only has the model's weight params)")
    else:
        raise NotImplementedError
def resume_checkpoint(
    model: nn.Module = None,
    optimizer: optim.Optimizer = None,
    amp=None,
    exp_name: str = "",
    load_path: str = "",
    mode: str = "all",
    local_rank: int = 0,
):
    """Restore training state from a saved checkpoint (distributed variant).

    Args:
        model (nn.Module): model object (may be wrapped, i.e. have ``.module``)
        optimizer (optim.Optimizer): optimizer object
        amp (): apex.amp handle, if mixed precision is used
        exp_name (str): experiment name; must equal the checkpoint's "arch"
            entry when ``mode == "all"``
        load_path (str): path of the checkpoint file
        mode (str): which restore mode to use:
            - 'all': restore the full training state, including in-training
              parameters;
            - 'onlynet': restore only the model's weight parameters.
        local_rank (int): target GPU the loaded weights are mapped onto

    Returns:
        mode == 'all': the epoch to resume training at;
        mode == 'onlynet': None.
    """
    # Guard clause: bail out early on a missing/invalid checkpoint path.
    if not (os.path.exists(load_path) and os.path.isfile(load_path)):
        raise Exception(f"{load_path}路径不正常,请检查")

    construct_print(f"Loading checkpoint '{load_path}'")
    # Remap tensors saved on cuda:0 onto this process's device.
    checkpoint = torch.load(load_path,
                            map_location={"cuda:0": f"cuda:{local_rank}"})
    if mode == "all":
        if exp_name != checkpoint["arch"]:
            raise Exception(f"{load_path} does not match.")
        start_epoch = checkpoint["epoch"]
        # Unwrap DataParallel/DistributedDataParallel-style wrappers.
        target = model.module if hasattr(model, "module") else model
        target.load_state_dict(checkpoint["net_state"])
        optimizer.load_state_dict(checkpoint["opti_state"])
        if checkpoint["amp_state"]:
            if amp:
                amp.load_state_dict(checkpoint["amp_state"])
            else:
                construct_print("You are not using amp.")
        else:
            construct_print("The state_dict of amp is None.")
        construct_print(f"Loaded '{load_path}' "
                        f"(will train at epoch"
                        f" {checkpoint['epoch']})")
        return start_epoch
    elif mode == "onlynet":
        target = model.module if hasattr(model, "module") else model
        target.load_state_dict(checkpoint["net_state"])
        construct_print(f"Loaded checkpoint '{load_path}' "
                        f"(only has the model's weight params)")
    else:
        raise NotImplementedError
def load_torch_state(model: nn.Module, optimizer: Optimizer, path: str, device_id: int):
    """Load model and optimizer state dicts from ``path`` onto one GPU.

    :param model: model whose weights are restored in place
    :param optimizer: optimizer whose state is restored in place
    :param path: checkpoint file with 'model_state_dict' and
        'optimizer_state_dict' entries
    :param device_id: CUDA device index the loaded tensors are mapped to
    """
    state = torch.load(path, map_location=f"cuda:{device_id}")
    model.load_state_dict(state['model_state_dict'])
    optimizer.load_state_dict(state['optimizer_state_dict'])