Example #1
    @classmethod
    def setup_optimizer(cls, args, params, **kwargs):
        """
        Args:
            args (omegaconf.DictConfig): fairseq args
            params (iterable): iterable of parameters to optimize
        """
        # Build the inner fp32 optimizer, then wrap it in this optimizer class.
        fp32_optimizer = optimizers.setup_optimizer(args, params)
        return cls(args, params, fp32_optimizer, **kwargs)
Example #2
    @classmethod
    def setup_optimizer(cls, args, params):
        """
        Args:
            args (argparse.Namespace): fairseq args
            params (iterable): iterable of parameters to optimize
        """
        # Build the wrapped fp16 optimizer directly over the given params,
        # without keeping a separate fp32 copy.
        fp16_optimizer = optimizers.setup_optimizer(args, params)
        return cls(args, params, fp16_optimizer)
Example #3
    @classmethod
    def setup_optimizer(cls, args, params):
        """
        Args:
            args (argparse.Namespace): fairseq args
            params (iterable): iterable of parameters to optimize
        """
        # Flatten gradients into a single buffer unless explicitly disabled.
        flatten = not args['common'].get('fp16_no_flatten_grads', False)
        if args['common'].get('bf16', False):
            flatten = False  # mixed precision is faster on TPUs without flat grads
        # Keep an fp32 master copy of the parameters for the inner optimizer.
        fp32_params = cls.build_fp32_params(args, params, flatten=flatten)
        if flatten:
            fp32_optimizer = optimizers.setup_optimizer(args, [fp32_params])
        else:
            fp32_optimizer = optimizers.setup_optimizer(args, fp32_params)
        if flatten and not fp32_optimizer.supports_flat_params:
            raise RuntimeError(
                'chosen optimizer does not support flat params, '
                'please set --fp16-no-flatten-grads')
        return cls(args, params, fp32_optimizer, fp32_params)
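Example #3 keeps a float32 master copy of the parameters, optionally flattened into one contiguous buffer so the inner optimizer sees a single tensor. The helper below is a hypothetical sketch of that flattening idea, not the actual build_fp32_params implementation:

    import torch

    def build_flat_fp32_copy(params):
        # Hypothetical sketch: copy all (possibly fp16) parameters into one
        # contiguous fp32 "master" tensor and pre-allocate its gradient buffer.
        params = list(params)
        total = sum(p.data.numel() for p in params)
        flat = torch.zeros(total, dtype=torch.float32, device=params[0].device)
        offset = 0
        for p in params:
            numel = p.data.numel()
            flat[offset:offset + numel].copy_(p.data.reshape(-1))
            offset += numel
        flat = torch.nn.Parameter(flat)
        flat.grad = torch.zeros_like(flat)
        return flat

The inner fp32 optimizer then steps on the flat tensor, and the trainer copies the updated values back into the model's low-precision parameters after each update.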
Example #4
    def _setup_optimizer(self, params=None):
        """
        Args:
            params (iterable, optional): parameters to optimize; defaults to
                all trainable model and criterion parameters.
        """
        if params is None:
            params = list(
                filter(
                    lambda p: p.requires_grad,
                    chain(self.model.parameters(),
                          self.criterion.parameters()),
                ))

        if (self.args['common']['fp16'] or self.args['common']['bf16']
                or self.args['common'].get('amp', False)):
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                LOGGER.info(
                    "NOTE: your device does NOT support faster training with fp16 or --amp, "
                    "please switch to FP32, which is likely to be faster")
            if (self.args['common']['memory_efficient_fp16']
                    or self.args['common']['memory_efficient_bf16']):
                self._optimizer = optimizers.MemoryEfficientFP16Optimizer.setup_optimizer(
                    self.args, params)
            elif self.args['common'].get('amp', False):
                self._optimizer = optimizers.AMPOptimizer.setup_optimizer(
                    self.args, params)
            else:
                self._optimizer = optimizers.FP16Optimizer.setup_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                LOGGER.info(
                    "NOTE: your device may support faster training with fp16 or --amp")
            self._optimizer = optimizers.setup_optimizer(self.args, params)

        if self.args['optimization']['use_bmuf']:
            self._optimizer = optimizers.NccBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_schedulers.build_lr_scheduler(
            self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
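Example #4 gates mixed precision on torch.cuda.get_device_capability, since fast fp16/amp matmuls need tensor cores, which require compute capability 7.0 (Volta) or newer. A minimal standalone sketch of that check (the helper name is mine):

    import torch

    def fp16_is_worthwhile(device_index: int = 0) -> bool:
        # Tensor cores, needed for fast fp16/amp training, require
        # compute capability >= 7.0 (Volta and newer GPUs).
        if not torch.cuda.is_available():
            return False
        major, _minor = torch.cuda.get_device_capability(device_index)
        return major >= 7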
Example #5
    def _setup_optimizer(self):
        # Apply weight decay to all parameters except biases and LayerNorm weights.
        no_decay = ['bias', 'LayerNorm.weight']
        params = [
            {
                'params': [p for n, p in self.model.named_parameters()
                           if not any(nd in n for nd in no_decay)],
                'weight_decay': 0.01,
            },
            {
                'params': [p for n, p in self.model.named_parameters()
                           if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
            },
        ]

        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            LOGGER.info(
                "NOTE: your device may support faster training with --fp16")
        self._optimizer = optimizers.setup_optimizer(self.args, params)

        if self.args['optimization']['use_bmuf']:
            self._optimizer = optimizers.NccBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_schedulers.build_lr_scheduler(
            self.args, self.optimizer)
        if getattr(self._lr_scheduler, 'period', None) == -1:
            # The update period was not configured; default to the total
            # number of updates (epochs * batches per epoch).
            import math
            self._lr_scheduler.period = (
                self.args['optimization']['max_epoch']
                * math.ceil(len(self.task.dataset('train'))
                            / self.args['dataset']['max_sentences']))
            LOGGER.warning(
                'Update period of {} has not been set; defaulting to {}.'.format(
                    self.lr_scheduler.__class__.__name__,
                    self._lr_scheduler.period))
        self._lr_scheduler.step_update(0)
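Example #5's parameter grouping is the common "no weight decay on biases and LayerNorm weights" pattern. A self-contained sketch of the same grouping with plain torch.optim.AdamW (the model and hyperparameters are placeholders):

    import torch
    from torch import nn

    class TinyClassifier(nn.Module):
        # Placeholder model; any nn.Module works the same way.
        def __init__(self):
            super().__init__()
            self.encoder = nn.Linear(128, 128)
            self.LayerNorm = nn.LayerNorm(128)
            self.head = nn.Linear(128, 2)

        def forward(self, x):
            return self.head(self.LayerNorm(self.encoder(x)))

    model = TinyClassifier()
    no_decay = ['bias', 'LayerNorm.weight']
    param_groups = [
        {
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01,
        },
        {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
        },
    ]
    optimizer = torch.optim.AdamW(param_groups, lr=5e-5)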