Example #1
    def _load_zero_checkpoint(self, load_dir, tag, load_optimizer_states=True):
        zero_checkpoint_name = self._get_zero_ckpt_name(load_dir, tag)

        if not os.path.exists(zero_checkpoint_name):
            logger.warning(
                'Client provided checkpoint load path: {} does not exist ... skipping checkpoint load'
                .format(zero_checkpoint_name))
            return None

        # Load onto CPU first so the ZeRO optimizer state is not materialized on the GPU.
        zero_sd = torch.load(zero_checkpoint_name, map_location='cpu')
        self.optimizer.load_state_dict(zero_sd['optimizer_state_dict'],
                                       load_optimizer_states=load_optimizer_states)
        logger.info('loaded zero checkpoint {}'.format(zero_checkpoint_name))
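
The guarded-load pattern above can be exercised outside the engine. Below is a minimal, hypothetical standalone sketch (the load_cpu_checkpoint name and the plain logging logger are assumptions for illustration, not DeepSpeed API):

    import logging
    import os

    import torch

    logger = logging.getLogger(__name__)

    def load_cpu_checkpoint(ckpt_path):
        # Guarded load: a missing file is skipped instead of raising.
        if not os.path.exists(ckpt_path):
            logger.warning('checkpoint %s does not exist ... skipping load', ckpt_path)
            return None
        # map_location='cpu' keeps GPU-saved tensors off the device at load time.
        return torch.load(ckpt_path, map_location='cpu')
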
Example #2
    def _initialize_momentum(self, optimizer, cycle_min_mom, cycle_max_mom,
                             decay_mom_rate, last_batch_iteration):
        if 'betas' not in optimizer.defaults:
            optimizer_name = type(optimizer).__name__
            logger.warning(
                f"cycle_momentum is disabled because optimizer {optimizer_name} "
                "does not support momentum (no 'betas' attribute in defaults)")
            self.cycle_momentum = False
            return

        self.decay_mom_rate = decay_mom_rate
        # One (beta1, beta2) pair per param group; beta2 stays fixed at 0.99 while
        # beta1 cycles between cycle_min_mom and cycle_max_mom.
        self.min_moms = [(cycle_min_mom, 0.99)] * len(optimizer.param_groups)
        self.max_moms = [(cycle_max_mom, 0.99)] * len(optimizer.param_groups)

        # Before the first batch, start every param group at the minimum momentum.
        if last_batch_iteration == -1:
            for momentum, group in zip(self.min_moms, optimizer.param_groups):
                group['betas'] = momentum
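
To see why the 'betas' guard matters: Adam-style optimizers expose momentum through defaults['betas'], while SGD exposes defaults['momentum'] instead, so cycling here is only wired up for the former. A minimal sketch of the distinction (the model and hyperparameters are made up for illustration):

    import torch

    model = torch.nn.Linear(4, 2)
    adam = torch.optim.Adam(model.parameters(), lr=1e-3)
    sgd = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)

    print('betas' in adam.defaults)  # True: beta1 can stand in for momentum
    print('betas' in sgd.defaults)   # False: the guard above disables cycling

    # Writing a new (beta1, beta2) pair into each param group, as the code above does:
    for group in adam.param_groups:
        group['betas'] = (0.85, 0.99)
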
Example #3
    def _load_checkpoint(self,
                         load_dir,
                         tag,
                         load_module_strict=True,
                         load_optimizer_states=True,
                         load_lr_scheduler_states=True):

        load_path = self._get_ckpt_name(load_dir, tag)

        if not os.path.exists(load_path):
            logger.warning(
                'Client provided checkpoint load path: {} does not exist ... skipping checkpoint load'
                .format(load_path))
            return None, None

        logger.info('Loading checkpoint: {}'.format(load_path))
        # The identity lambda leaves every storage on CPU instead of moving it
        # back to the device it was saved from.
        checkpoint = torch.load(load_path, map_location=lambda storage, loc: storage)

        self.load_module_state_dict(state_dict=checkpoint['module'],
                                    strict=load_module_strict)
        # Under ZeRO the optimizer state is partitioned and restored separately
        # (see _load_zero_checkpoint), so only load it from this file otherwise.
        if not self.zero_optimization():
            self.optimizer.load_state_dict(checkpoint['optimizer'],
                                           load_optimizer_states=load_optimizer_states)

        if load_lr_scheduler_states and self.lr_scheduler is not None:
            self.lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])

        self.csr_tensor_module_names = checkpoint['csr_tensor_module_names']
        self.global_steps = checkpoint['global_steps']
        self.skipped_steps = checkpoint['skipped_steps']
        # Keys owned by the engine; everything else is returned as client state.
        deepspeed_states = [
            'module',
            'optimizer',
            'lr_scheduler',
            'csr_tensor_module_names',
            'skipped_steps',
            'global_steps'
        ]
        client_state = {
            key: value
            for key, value in checkpoint.items() if key not in deepspeed_states
        }

        return load_path, client_state
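
The final comprehension splits engine-owned entries from whatever extra state the client saved alongside them. A small self-contained illustration (the checkpoint contents here are invented):

    # Hypothetical checkpoint contents, for illustration only.
    checkpoint = {
        'module': {},            # engine-owned
        'global_steps': 1000,    # engine-owned
        'epoch': 3,              # client-provided
        'best_eval_loss': 0.42,  # client-provided
    }

    deepspeed_states = ['module', 'optimizer', 'lr_scheduler',
                        'csr_tensor_module_names', 'skipped_steps', 'global_steps']

    client_state = {k: v for k, v in checkpoint.items() if k not in deepspeed_states}
    print(client_state)  # {'epoch': 3, 'best_eval_loss': 0.42}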