def _load_zero_checkpoint(self, load_dir, tag, load_optimizer_states=True):
    zero_checkpoint_name = self._get_zero_ckpt_name(load_dir, tag)

    # A missing ZeRO checkpoint is not fatal: warn and keep the optimizer's
    # current state instead of loading.
    if not os.path.exists(zero_checkpoint_name):
        logger.warn(
            'Client provided checkpoint load path: {} does not exist ... skip checkpoint load'
            .format(zero_checkpoint_name))
        return None

    # Load the partitioned optimizer state on CPU and hand it to the ZeRO optimizer.
    zero_sd = torch.load(zero_checkpoint_name, map_location='cpu')
    self.optimizer.load_state_dict(zero_sd['optimizer_state_dict'],
                                   load_optimizer_states=load_optimizer_states)
    logger.info('loading zero checkpoint {}'.format(zero_checkpoint_name))
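# --- Illustrative sketch, not part of DeepSpeed ---
# _get_zero_ckpt_name() above produces the per-rank ZeRO optimizer-state path.
# The helper below only sketches what such a layout could look like; the name
# `example_zero_ckpt_name` and the filename pattern are assumptions made for
# illustration and do not reflect DeepSpeed's actual naming scheme.
def example_zero_ckpt_name(load_dir, tag, dp_rank):
    import os
    # assumed pattern: "<load_dir>/<tag>/zero_ckpt_rank_00.pt"
    return os.path.join(load_dir, str(tag), 'zero_ckpt_rank_{:02d}.pt'.format(dp_rank))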
def _initialize_momentum(self,
                         optimizer,
                         cycle_min_mom,
                         cycle_max_mom,
                         decay_mom_rate,
                         last_batch_iteration):
    # Momentum is cycled through beta1 of Adam-style optimizers; if the wrapped
    # optimizer has no 'betas' in its defaults, momentum cycling is disabled.
    if 'betas' not in optimizer.defaults:
        optimizer_name = type(optimizer).__name__
        logger.warn(
            f"cycle_momentum is disabled because optimizer {optimizer_name} does not support momentum, no betas attribute in defaults"
        )
        self.cycle_momentum = False
        return

    self.decay_mom_rate = decay_mom_rate
    # One (beta1, beta2) pair per param group; beta2 stays fixed at 0.99 while
    # beta1 is swept between cycle_min_mom and cycle_max_mom.
    self.min_moms = [(cycle_min_mom, 0.99)] * len(optimizer.param_groups)
    self.max_moms = [(cycle_max_mom, 0.99)] * len(optimizer.param_groups)

    # Fresh schedule: start every param group at the cycle-minimum momentum.
    if last_batch_iteration == -1:
        for momentum, group in zip(self.min_moms, optimizer.param_groups):
            group['betas'] = momentum
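# --- Illustrative sketch, not part of DeepSpeed ---
# Shows how the (momentum, 0.99) tuples built in _initialize_momentum map onto
# an Adam-style optimizer's `betas`. The model, hyperparameter values, and the
# function name `_example_momentum_init` are made up for this example.
def _example_momentum_init():
    import torch

    model = torch.nn.Linear(4, 2)
    opt = torch.optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.99))

    cycle_min_mom = 0.85
    min_moms = [(cycle_min_mom, 0.99)] * len(opt.param_groups)

    # At cycle start (last_batch_iteration == -1) every group's beta1 is reset
    # to the cycle minimum, mirroring the final loop in _initialize_momentum.
    for momentum, group in zip(min_moms, opt.param_groups):
        group['betas'] = momentum
    return opt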
def _load_checkpoint(self,
                     load_dir,
                     tag,
                     load_module_strict=True,
                     load_optimizer_states=True,
                     load_lr_scheduler_states=True):
    load_path = self._get_ckpt_name(load_dir, tag)

    if not os.path.exists(load_path):
        logger.warn(
            'Client provided checkpoint load path: {} does not exist ... skip checkpoint load'
            .format(load_path))
        return None, None

    logger.info('Loading checkpoint: {}'.format(load_path))
    checkpoint = torch.load(load_path,
                            map_location=lambda storage, loc: storage)

    # Restore module weights; optimizer state is restored here only when ZeRO
    # is disabled (the ZeRO path loads its own partitioned checkpoint).
    self.load_module_state_dict(state_dict=checkpoint['module'],
                                strict=load_module_strict)
    if not self.zero_optimization():
        self.optimizer.load_state_dict(checkpoint['optimizer'],
                                       load_optimizer_states=load_optimizer_states)

    if load_lr_scheduler_states and self.lr_scheduler is not None:
        self.lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])

    self.csr_tensor_module_names = checkpoint['csr_tensor_module_names']
    self.global_steps = checkpoint['global_steps']
    self.skipped_steps = checkpoint['skipped_steps']

    # Anything the engine did not write itself is handed back to the caller
    # as client state.
    deepspeed_states = [
        'module',
        'optimizer',
        'lr_scheduler',
        'csr_tensor_module_names',
        'skipped_steps',
        'global_steps'
    ]
    client_state = {
        key: value
        for key, value in checkpoint.items() if key not in deepspeed_states
    }

    return load_path, client_state
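# --- Illustrative sketch, not part of DeepSpeed ---
# Demonstrates how _load_checkpoint separates client-supplied keys from the
# engine's own bookkeeping keys. The checkpoint contents and the function name
# `_example_client_state_split` are invented for this example.
def _example_client_state_split():
    checkpoint = {
        'module': {},           # model state_dict (empty here for brevity)
        'optimizer': {},        # optimizer state_dict
        'global_steps': 1000,
        'skipped_steps': 3,
        'epoch': 7,             # client key: returned to the caller untouched
    }
    deepspeed_states = [
        'module', 'optimizer', 'lr_scheduler', 'csr_tensor_module_names',
        'skipped_steps', 'global_steps'
    ]
    client_state = {k: v for k, v in checkpoint.items() if k not in deepspeed_states}
    assert client_state == {'epoch': 7}
    return client_state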