def setup_optimizer(cls, args, params, **kwargs):
    """
    Args:
        args (dict): fairseq-style nested configuration dict
        params (iterable): iterable of parameters to optimize
    """
    fp32_optimizer = optimizers.setup_optimizer(args, params)
    return cls(args, params, fp32_optimizer, **kwargs)
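# Illustrative sketch (not part of the project): an AMP optimizer wrapper built as
# above typically encapsulates PyTorch's native torch.cuda.amp machinery. The loop
# below shows the standard pattern it stands in for; `model`, `criterion`, `loader`
# and `optimizer` are placeholders for whatever the caller provides.
def _amp_training_loop_sketch(model, criterion, optimizer, loader):
    import torch
    scaler = torch.cuda.amp.GradScaler()
    for batch, target in loader:
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():        # forward pass in mixed precision
            loss = criterion(model(batch), target)
        scaler.scale(loss).backward()          # scale loss to avoid FP16 gradient underflow
        scaler.step(optimizer)                 # unscales grads; skips the step on inf/nan
        scaler.update()                        # adjust the loss scale for the next iteration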
def setup_optimizer(cls, args, params):
    """
    Args:
        args (dict): fairseq-style nested configuration dict
        params (iterable): iterable of parameters to optimize
    """
    fp16_optimizer = optimizers.setup_optimizer(args, params)
    return cls(args, params, fp16_optimizer)
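# Illustrative sketch (not the project's code): memory-efficient FP16 training
# usually relies on dynamic loss scaling instead of keeping FP32 master copies of
# every parameter. The class below only illustrates the grow/backoff logic such a
# wrapper typically applies; all names and default values are assumptions.
class _DynamicLossScalerSketch:
    def __init__(self, init_scale=2.0 ** 15, backoff_factor=0.5, growth_interval=2000):
        self.scale = init_scale
        self.backoff_factor = backoff_factor
        self.growth_interval = growth_interval
        self._steps_without_overflow = 0

    def update(self, found_overflow):
        if found_overflow:
            # inf/nan gradients: shrink the scale (the optimizer step is skipped)
            self.scale *= self.backoff_factor
            self._steps_without_overflow = 0
        else:
            self._steps_without_overflow += 1
            if self._steps_without_overflow % self.growth_interval == 0:
                # long stretch of stable steps: cautiously raise the scale again
                self.scale *= 2.0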
def setup_optimizer(cls, args, params):
    """
    Args:
        args (dict): fairseq-style nested configuration dict
        params (iterable): iterable of parameters to optimize
    """
    flatten = not args['common'].get('fp16_no_flatten_grads', False)
    if args['common'].get('bf16', False):
        flatten = False  # mixed precision is faster on TPUs without flat grads
    fp32_params = cls.build_fp32_params(args, params, flatten=flatten)
    if flatten:
        fp32_optimizer = optimizers.setup_optimizer(args, [fp32_params])
    else:
        fp32_optimizer = optimizers.setup_optimizer(args, fp32_params)
    if flatten and not fp32_optimizer.supports_flat_params:
        raise RuntimeError(
            'chosen optimizer does not support flat params, '
            'please set --fp16-no-flatten-grads'
        )
    return cls(args, params, fp32_optimizer, fp32_params)
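# Illustrative sketch (an assumption, not the project's build_fp32_params): with
# flatten=True, FP32 master parameters are conventionally built by copying every
# FP16 parameter into one contiguous FP32 tensor, so the wrapped optimizer only
# ever sees a single flat parameter with a pre-allocated flat gradient buffer.
def _build_flat_fp32_params_sketch(fp16_params):
    import torch
    fp16_params = list(fp16_params)
    total = sum(p.data.numel() for p in fp16_params)
    flat = torch.zeros(total, dtype=torch.float32, device=fp16_params[0].device)
    offset = 0
    for p in fp16_params:
        numel = p.data.numel()
        flat[offset:offset + numel].copy_(p.data.reshape(-1).float())
        offset += numel
    flat = torch.nn.Parameter(flat)
    flat.grad = flat.data.new_zeros(total)  # flat FP32 gradient buffer
    return flat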
def _setup_optimizer(self, params=None):
    """
    Args:
        params (iterable, optional): parameters to optimize; if None, defaults to
            all trainable parameters of the model and the criterion.
    """
    if params is None:
        params = list(
            filter(
                lambda p: p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters()),
            )
        )
    if (self.args['common']['fp16'] or self.args['common']['bf16']
            or self.args['common'].get('amp', False)):
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            LOGGER.info(
                "NOTE: your device does NOT support faster training with fp16 or --amp, "
                "please switch to FP32 which is likely to be faster"
            )
        # Pick the mixed-precision wrapper that matches the requested mode.
        if (self.args['common']['memory_efficient_fp16']
                or self.args['common']['memory_efficient_bf16']):
            self._optimizer = optimizers.MemoryEfficientFP16Optimizer.setup_optimizer(
                self.args, params)
        elif self.args['common'].get('amp', False):
            self._optimizer = optimizers.AMPOptimizer.setup_optimizer(
                self.args, params)
        else:
            self._optimizer = optimizers.FP16Optimizer.setup_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            LOGGER.info(
                "NOTE: your device may support faster training with fp16 or --amp"
            )
        self._optimizer = optimizers.setup_optimizer(self.args, params)

    if self.args['optimization']['use_bmuf']:
        self._optimizer = optimizers.NccBMUF(self.args, self._optimizer)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_schedulers.build_lr_scheduler(self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
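# Self-contained sketch mirroring the capability test above (not project code):
# tensor cores, and therefore fast FP16/AMP training, require CUDA compute
# capability >= 7.0 (Volta or newer), which is what the `< 7` / `>= 7` checks probe.
def _report_mixed_precision_support():
    import torch
    if not torch.cuda.is_available():
        print("no CUDA device; mixed-precision flags will have no effect")
        return
    major, minor = torch.cuda.get_device_capability(0)
    if major >= 7:
        print(f"compute capability {major}.{minor}: fp16/--amp should be fast here")
    else:
        print(f"compute capability {major}.{minor}: plain fp32 is likely faster")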
def _setup_optimizer(self):
    # Exclude biases and LayerNorm weights from weight decay (BERT-style grouping).
    no_decay = ['bias', 'LayerNorm.weight']
    params = [
        {
            'params': [
                p for n, p in self.model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.01,
        },
        {
            'params': [
                p for n, p in self.model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0,
        },
    ]
    if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
        LOGGER.info("NOTE: your device may support faster training with --fp16")
    self._optimizer = optimizers.setup_optimizer(self.args, params)

    if self.args['optimization']['use_bmuf']:
        self._optimizer = optimizers.NccBMUF(self.args, self._optimizer)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_schedulers.build_lr_scheduler(self.args, self.optimizer)
    if getattr(self._lr_scheduler, 'period', None) == -1:
        import math
        # The scheduler's period was left unset (-1): derive the total number of
        # update steps from max_epoch and the number of batches per epoch.
        self._lr_scheduler.period = (
            self.args['optimization']['max_epoch']
            * math.ceil(len(self.task.dataset('train'))
                        / self.args['dataset']['max_sentences'])
        )
        LOGGER.warning(
            'Update steps of {} have not been set; using {} as the default.'.format(
                self.lr_scheduler.__class__.__name__, self._lr_scheduler.period)
        )
    self._lr_scheduler.step_update(0)
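# Usage sketch (illustrative only): the two parameter groups above follow the common
# convention of exempting biases and LayerNorm weights from weight decay. The toy
# model and the plain torch.optim.AdamW below are placeholders; the project's
# optimizers.setup_optimizer is assumed to accept the same list-of-groups format.
def _weight_decay_groups_sketch():
    import torch
    import torch.nn as nn

    class Toy(nn.Module):
        def __init__(self):
            super().__init__()
            self.proj = nn.Linear(16, 16)
            self.LayerNorm = nn.LayerNorm(16)

    model = Toy()
    no_decay = ['bias', 'LayerNorm.weight']
    groups = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    # 'proj.weight' gets decay; 'proj.bias', 'LayerNorm.weight', 'LayerNorm.bias' do not.
    return torch.optim.AdamW(groups, lr=5e-5)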