def _build_optimizer(self):
    params = list(
        filter(
            lambda p: p.requires_grad,
            chain(self.model.parameters(), self.criterion.parameters()),
        )
    )
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            print('| WARNING: your device does NOT support faster training with --fp16, '
                  'please switch to FP32 which is likely to be faster')
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            print('| NOTICE: your device may support faster training with --fp16')
        self._optimizer = optim.build_optimizer(self.args, params)

    if self.args.use_bmuf:
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
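# --- Example (hedged sketch, not fairseq code): the trainable-parameter
# collection used above. `chain` concatenates the model's and criterion's
# parameter iterators; the filter drops anything with requires_grad=False.
# `model` and `criterion` below are toy stand-ins.
from itertools import chain

import torch.nn as nn

model = nn.Linear(4, 2)
criterion = nn.Linear(2, 1)        # stand-in for a criterion that owns parameters
model.bias.requires_grad = False   # frozen tensors are excluded by the filter

params = [p for p in chain(model.parameters(), criterion.parameters())
          if p.requires_grad]
assert len(params) == 3            # model.weight + criterion.weight + criterion.bias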
def _build_optimizer(self):
    if self.args.optimizer != 'adam_cbn':
        params = list(
            filter(
                lambda p: p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters()),
            ))
    else:
        # Select the Lagrangian-constraint parameters into their own group.
        from fairseq.modules.norms.constraint_bn_v2 import Constraint_Lagrangian
        constraint_param = []
        for m in self.model.modules():
            if isinstance(m, Constraint_Lagrangian):
                constraint_param.extend(list(map(id, m.parameters())))
        params_lag = list(
            filter(lambda p: id(p) in constraint_param,
                   chain(self.model.parameters())))
        params = list(
            filter(
                lambda p: id(p) not in constraint_param and p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters())))
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            print('| WARNING: your device does NOT support faster training with --fp16, '
                  'please switch to FP32 which is likely to be faster')
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            print('| NOTICE: your device may support faster training with --fp16')
        # check cbn: adam_cbn also receives the Lagrangian parameters
        if self.args.optimizer != 'adam_cbn':
            self._optimizer = optim.build_optimizer(self.args, params)
        else:
            self._optimizer = optim.build_optimizer(
                self.args, params, params_lag)

    if self.args.use_bmuf:
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
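# --- Example (hedged sketch): the id()-based partition used by the adam_cbn
# branch above. Parameter ids of "special" submodules are collected first,
# then all parameters are split into (special, rest) by id membership.
# nn.LayerNorm is a hypothetical stand-in for Constraint_Lagrangian.
import torch.nn as nn

model = nn.Sequential(nn.Linear(4, 4), nn.LayerNorm(4))
special_ids = {id(p) for m in model.modules()
               if isinstance(m, nn.LayerNorm) for p in m.parameters()}

params_special = [p for p in model.parameters() if id(p) in special_ids]
params_rest = [p for p in model.parameters()
               if id(p) not in special_ids and p.requires_grad]
assert len(params_special) == 2 and len(params_rest) == 2  # LN vs Linear weight/bias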
def _build_optimizer(self):
    params = list(
        filter(
            lambda p: p.requires_grad,
            chain(self.model.parameters(), self.criterion.parameters()),
        )
    )

    if self.cfg.common.fp16 or self.cfg.common.bf16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            logger.info(
                "NOTE: your device does NOT support faster training with --fp16, "
                "please switch to FP32 which is likely to be faster"
            )
        if (
            self.cfg.common.memory_efficient_fp16
            or self.cfg.common.memory_efficient_bf16
        ):
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.cfg, params
            )
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(self.cfg, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            logger.info("NOTE: your device may support faster training with --fp16")
        self._optimizer = optim.build_optimizer(self.cfg.optimizer, params)

    if self.cfg.optimization.use_bmuf:
        self._optimizer = optim.FairseqBMUF(
            self.cfg.bmuf,
            self._optimizer,
        )

    if self.cfg.distributed_training.zero_sharding == "os":
        if (
            self.cfg.common.fp16
            and not self.cfg.common.memory_efficient_fp16
            and not self.cfg.common.memory_efficient_bf16
            and not self.cfg.common.fp16_no_flatten_grads
        ):
            raise ValueError(
                "ZeRO is incompatible with fp16 and flattened grads. "
                "Please use --fp16-no-flatten-grads"
            )
        else:
            optim.shard_(self._optimizer, self.data_parallel_process_group)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.cfg.lr_scheduler,
        self.optimizer,
    )
    self._lr_scheduler.step_update(0)
def _build_optimizer(self):
    # Instead of the default flat list of trainable parameters, group them by
    # the manifold each parameter lives on, attaching a per-group lr rectifier
    # derived from the manifold's egrad2rgrad scaling.
    params_dict = {}
    _default_manifold = Euclidean()
    for name, p in chain(self.model.named_parameters(),
                         self.criterion.named_parameters()):
        if not p.requires_grad:
            continue
        if isinstance(p, (ManifoldParameter, ManifoldTensor)):
            _manifold = p.manifold
        else:
            _manifold = _default_manifold
        _manifold_name = _manifold.__class__.__name__
        if _manifold_name not in params_dict:
            ref_grad = _manifold.egrad2rgrad(p.new_zeros(1), p.new_ones(1))
            coef = 1 if ref_grad == 1 else 1  # NOTE: coef is always 1 as written
            # print(f"lr={self.args.lr}, ref={ref_grad.item()}")
            params_dict[_manifold_name] = dict(
                params=[],
                lr_rectifier=ref_grad.reciprocal().item() * coef)
        params_dict[_manifold_name]['params'].append(p)
    params = params_dict.values()

    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            logger.info(
                "NOTE: your device does NOT support faster training with --fp16, "
                "please switch to FP32 which is likely to be faster")
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            logger.info(
                "NOTE: your device may support faster training with --fp16")
        self._optimizer = optim.build_optimizer(self.args, params)

    if self.args.use_bmuf:
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
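# --- Example (hedged sketch, no geoopt dependency): the grouping pattern
# above, bucketing parameters into optimizer param groups keyed by some
# attribute. Tensor dimensionality stands in for the manifold class name;
# extra keys such as 'lr_rectifier' are carried along in each group dict,
# which torch.optim preserves.
import torch
import torch.nn as nn

model = nn.Linear(4, 2)
groups = {}
for name, p in model.named_parameters():
    key = p.dim()                  # stand-in for _manifold_name
    if key not in groups:
        groups[key] = dict(params=[], lr_rectifier=1.0)
    groups[key]['params'].append(p)

opt = torch.optim.SGD(list(groups.values()), lr=0.1)
assert sorted(groups) == [1, 2]    # bias (1-d) and weight (2-d) buckets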
def _build_optimizer(self):
    if self.ulmfit:
        # ULMFiT-style discriminative learning rates: every trainable
        # parameter gets a multiplier that decays geometrically with depth.
        params = []
        multiplier_map = []
        for n, p in self.model.named_parameters():
            if p.requires_grad:
                params.append(p)
                param_name_split = n.split('.')
                if param_name_split[2] == 'lm_head':
                    # last layer
                    multiplier = 1.
                elif param_name_split[4].isdigit():
                    # encoder layer
                    layer = int(param_name_split[4])
                    multiplier = self.decay_rate_lrc ** -(self.num_layers - layer)
                else:
                    # first layer
                    multiplier = self.decay_rate_lrc ** -(self.num_layers + 1)
                multiplier_map.append(multiplier)
    else:
        params = list(
            filter(lambda p: p.requires_grad, self.model.parameters()))
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            print('| WARNING: your device does NOT support faster training with --fp16, '
                  'please switch to FP32 which is likely to be faster')
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            print('| NOTICE: your device may support faster training with --fp16')
        self._optimizer = optim.build_optimizer(self.args, params)

    if self.args.use_bmuf:
        self._optimizer = optim.FairseqBMUF(self.args, params, self._optimizer)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
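# --- Example: the geometric lr schedule implied by the ULMFiT branch above.
# With decay rate d and L layers, layer k trains at lr * d**-(L - k): the
# top layer gets the largest multiplier, lower layers geometrically smaller
# ones. The values below are hypothetical.
num_layers, decay_rate = 4, 2.6
for layer in range(num_layers):
    multiplier = decay_rate ** -(num_layers - layer)
    print(f"layer {layer}: lr multiplier {multiplier:.4f}")
# layer 3 -> 2.6**-1 ~ 0.3846; layer 0 -> 2.6**-4 ~ 0.0219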
def _build_optimizer(self):
    if self.freeze_bart:
        # Freeze every encoder parameter except the structured-attention
        # modules listed below.
        structure_att_params = [
            "encoder.structure_att.exparam",
            "encoder.structure_att.tp_linear.weight",
            "encoder.structure_att.tp_linear.bias",
            "encoder.structure_att.tc_linear.weight",
            "encoder.structure_att.tc_linear.bias",
            "encoder.structure_att.fi_linear.weight",
            "encoder.structure_att.bilinear._weight_matrix",
            "encoder.structure_att.bilinear._bias",
            "encoder.structure_att.fzlinear.weight",
            "encoder.structure_att.fzlinear.bias",
            "encoder.str_to_enc_linear.weight",
            "encoder.str_to_enc_linear.bias",
        ]
        for name, param in self.model.named_parameters():
            if name.startswith('encoder') and name not in structure_att_params:
                param.requires_grad = False
        print("Freezing parameters")
    params = list(
        filter(
            lambda p: p.requires_grad,
            chain(self.model.parameters(), self.criterion.parameters()),
        ))
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            logger.info(
                "NOTE: your device does NOT support faster training with --fp16, "
                "please switch to FP32 which is likely to be faster")
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            logger.info(
                "NOTE: your device may support faster training with --fp16")
        self._optimizer = optim.build_optimizer(self.args, params)

    if self.args.use_bmuf:
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
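# --- Example (hedged sketch): name-based freezing as in the freeze_bart
# branch above; everything under a prefix is frozen except an explicit
# allow-list. The module names and allow-list here are hypothetical.
import torch.nn as nn

model = nn.ModuleDict({'encoder': nn.Linear(4, 4), 'decoder': nn.Linear(4, 4)})
keep = {'encoder.bias'}            # hypothetical allow-list
for name, p in model.named_parameters():
    if name.startswith('encoder') and name not in keep:
        p.requires_grad = False

assert not model['encoder'].weight.requires_grad
assert model['encoder'].bias.requires_grad
assert model['decoder'].weight.requires_grad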
def _build_optimizer(self):
    from itertools import chain
    if hasattr(self.args, 'encoder_layers'):
        params = get_decayed_param_groups(
            chain(self.model.named_parameters(),
                  self.criterion.named_parameters()),
            num_layers=self.args.encoder_layers,
            weight_decay=self.args.weight_decay,
            weight_decay_exclude=self.args.weight_decay_exclude,
            freeze_encoder=self.args.freeze_encoder,
            freeze_embedding=self.args.freeze_embedding,
            lr=float(self.args.lr[0]),
            lr_decay=float(self.args.lr_decay),
        )
    else:
        params = list(
            filter(
                lambda p: p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters()),
            ))
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            logger.info(
                "NOTE: your device does NOT support faster training with --fp16, "
                "please switch to FP32 which is likely to be faster")
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            logger.info(
                "NOTE: your device may support faster training with --fp16")
        self._optimizer = optim.build_optimizer(self.args, params)

    if self.args.use_bmuf:
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
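# --- Example: a hypothetical minimal get_decayed_param_groups-style helper
# (the real one is project-specific and not shown in this snippet). Each
# layer k gets lr * lr_decay**(num_layers - k), and names matching the
# exclude tokens get zero weight decay.
def decayed_param_groups(named_params, num_layers, lr, lr_decay,
                         weight_decay, weight_decay_exclude=('bias', 'layer_norm')):
    groups = []
    for name, p in named_params:
        if not p.requires_grad:
            continue
        layer = num_layers                      # default: treat as top layer
        for part in name.split('.'):
            if part.isdigit():                  # e.g. 'encoder.layers.3.fc1.weight'
                layer = int(part)
                break
        wd = 0.0 if any(tok in name for tok in weight_decay_exclude) else weight_decay
        groups.append({'params': [p],
                       'lr': lr * lr_decay ** (num_layers - layer),
                       'weight_decay': wd})
    return groups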
def setup_model_loss_criterion(args, rank, is_cuda):
    """setup model, criterion and optimizer based on input args"""
    args.distributed_rank = rank
    distributed_utils.distributed_init(args)
    torch.manual_seed(1)
    model = Model(args.input_size, args.nb_classes)
    loss_fn = nn.CrossEntropyLoss()
    if is_cuda:
        model = model.cuda()
        loss_fn = loss_fn.cuda()
    optimizer = optim.sgd.SGD(args, model.parameters())
    optimizer = optim.FairseqBMUF(args, optimizer)

    return model, loss_fn, optimizer
def _build_optimizer(self):
    param_groups = self.task.get_task_params(self.model, self.criterion)
    if (not hasattr(self.args, "lr_list")) or (self.args.lr_list is None):
        lr_list = [self.args.lr[0] for _ in param_groups]
    else:
        lr_list = [
            float(lr.strip()) for lr in self.args.lr_list.split(",")
        ]
    for params, curr_lr in zip(param_groups, lr_list):
        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print('| WARNING: your device does NOT support faster training with --fp16, '
                      'please switch to FP32 which is likely to be faster')
            if self.args.memory_efficient_fp16:
                optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print('| NOTICE: your device may support faster training with --fp16')
            optimizer = optim.build_optimizer(self.args, params)
        if self.args.use_bmuf:
            # Wrap the optimizer built in this iteration.
            optimizer = optim.FairseqBMUF(self.args, optimizer)
        self._optimizers.append(optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self.args.lr = [curr_lr]
        lrs = lr_scheduler.build_lr_scheduler(self.args, optimizer)
        lrs.step_update(0)
        self._lr_schedulers.append(lrs)
    self.args.lr = None
    self.set_current_optimizer()
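# --- Example: the lr_list parsing used above, isolated. A comma-separated
# flag yields one learning rate per parameter group; when absent, every
# group falls back to the single shared lr. Names here are hypothetical.
def parse_lr_list(lr_list, default_lr, num_groups):
    if lr_list is None:
        return [default_lr] * num_groups
    return [float(lr.strip()) for lr in lr_list.split(",")]

assert parse_lr_list(None, 1e-4, 3) == [1e-4, 1e-4, 1e-4]
assert parse_lr_list("1e-3, 5e-4, 1e-4", 1e-4, 3) == [1e-3, 5e-4, 1e-4]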
def _build_optimizer(self):
    params = list(
        filter(
            lambda p: p.requires_grad,
            # chain() takes several iterables and returns one iterator that
            # yields the contents of each in sequence.
            chain(self.model.parameters(), self.criterion.parameters()),
        )
    )
    # Collect the parameters to optimize. They come from nn.Module.parameters(),
    # which recursively walks every submodule and its torch.nn.Parameter
    # objects; submodules and parameters are registered by type when they are
    # assigned as attributes on a Module.
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            print(
                "| WARNING: your device does NOT support faster training with --fp16, "
                "please switch to FP32 which is likely to be faster")
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            print(
                "| NOTICE: your device may support faster training with --fp16"
            )
        # optim.build_optimizer (constructed in optim/__init__.py) dispatches
        # on the choice given by args.optimizer and calls that optimizer's
        # constructor directly; e.g. if args.optimizer is 'adam', this calls
        # the FairseqAdam constructor.
        self._optimizer = optim.build_optimizer(self.args, params)

    if self.args.use_bmuf:
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
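# --- Example (hedged sketch, not fairseq's actual registry code): the
# name-based dispatch the comment above describes. build_optimizer looks the
# optimizer name up in a registry filled by a decorator and calls that
# class's constructor.
import types

_OPTIMIZER_REGISTRY = {}

def register_optimizer(name):
    def wrap(cls):
        _OPTIMIZER_REGISTRY[name] = cls
        return cls
    return wrap

@register_optimizer('sgd')
class MySGD:
    def __init__(self, args, params):
        self.params = list(params)

def build_optimizer(args, params):
    return _OPTIMIZER_REGISTRY[args.optimizer](args, params)

args = types.SimpleNamespace(optimizer='sgd')
assert isinstance(build_optimizer(args, []), MySGD)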
def _build_optimizer(self):
    # Split the model's parameters into two groups, identified by id(): the
    # newly added section/attention modules and everything else.
    new_params_id = (
        list(map(id, self.model.section_positions.parameters()))
        + list(map(id, self.model.section_layernorm_embedding.parameters()))
        + list(map(id, self.model.section.parameters()))
        + list(map(id, self.model.w_proj.parameters()))
        + list(map(id, self.model.w_context_vector.parameters()))
        + list(map(id, self.model.w_proj_layer_norm.parameters()))
    )
    base_params = list(
        filter(lambda p: id(p) not in new_params_id and p.requires_grad,
               self.model.parameters()))
    print("group1: ")
    print(len(base_params))
    new_params = list(
        filter(lambda p: id(p) in new_params_id and p.requires_grad,
               self.model.parameters()))
    print("group2: ")
    print(len(new_params))
    params = [
        {"params": base_params},
        {"params": new_params},
    ]  # optionally add "weight_decay": 0.01 per group
    # The 'balance' path (a separate params2 group for the section/attention
    # modules, fed to a second optimizer) is currently disabled.
    params2 = None

    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            logger.info(
                "NOTE: your device does NOT support faster training with --fp16, "
                "please switch to FP32 which is likely to be faster")
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(
                self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            logger.info(
                "NOTE: your device may support faster training with --fp16")
        self._optimizer = optim.build_optimizer(self.args, params)

    self._optimizer2 = None
    if self.args.balance and params2 is not None:
        self._optimizer2 = optim.build_optimizer(self.args, params2)

    if self.args.use_bmuf:
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(
        self.args, self.optimizer, self._optimizer2)
    self._lr_scheduler.step_update(0)
def _build_optimizer(self):
    params = list(
        filter(
            lambda p: p.requires_grad,
            chain(self.model.parameters(), self.criterion.parameters()),
        )
    )
    if getattr(self.args, 'multiple_lr', False):
        assert self.args.lr_scheduler == 'multi_lr_inverse_sqrt', \
            'only multi_lr_inverse_sqrt supports multiple_lr now'
        assert len(self.args.lr) == 3, \
            'Three learning rates for roberta, sents_encoder and decoder should be provided'
        named_params = [(n, p) for n, p in self.model.named_parameters() if p.requires_grad]
        encoder_params = [(n, p) for n, p in self.model.encoder.named_parameters() if p.requires_grad]
        decoder_params = [(n, p) for n, p in self.model.decoder.named_parameters() if p.requires_grad]
        if hasattr(self.model, 'decoder_perm'):
            decoder_params += [(n, p) for n, p in self.model.decoder_perm.named_parameters() if p.requires_grad]
        params = [
            {'params': [p for n, p in encoder_params if 'roberta' in n]},
            {'params': [p for n, p in encoder_params if 'roberta' not in n]},
            {'params': [p for n, p in decoder_params]},
        ]
        # Sanity check: the three groups must cover every trainable parameter.
        assert len(named_params) == sum(len(p['params']) for p in params), named_params
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            print(
                "| WARNING: your device does NOT support faster training with --fp16, "
                "please switch to FP32 which is likely to be faster"
            )
        if self.args.memory_efficient_fp16:
            if getattr(self.args, 'multiple_lr', False):
                self._optimizer = optim.ConcatOptimizer(self.args, [
                    optim.MemoryEfficientFP16Optimizer.build_optimizer(self.args, param['params'])
                    for param in params])
            else:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params
                )
        else:
            if getattr(self.args, 'multiple_lr', False):
                self._optimizer = optim.ConcatOptimizer(self.args, [
                    optim.FP16Optimizer.build_optimizer(self.args, param['params'])
                    for param in params])
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            print("| NOTICE: your device may support faster training with --fp16")
        if getattr(self.args, 'multiple_lr', False):
            self._optimizer = optim.ConcatOptimizer(self.args, [
                optim.build_optimizer(self.args, param['params']) for param in params])
        else:
            self._optimizer = optim.build_optimizer(self.args, params)

    if self.args.use_bmuf:
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
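# --- Example (hedged sketch): the coverage assertion used above. After
# splitting named parameters into groups, the group sizes must sum back to
# the total so that no parameter is silently dropped or duplicated. Module
# names here are hypothetical.
import torch.nn as nn

model = nn.ModuleDict({'roberta': nn.Linear(4, 4), 'head': nn.Linear(4, 2)})
named = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
groups = [
    {'params': [p for n, p in named if 'roberta' in n]},
    {'params': [p for n, p in named if 'roberta' not in n]},
]
assert len(named) == sum(len(g['params']) for g in groups)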
def _build_optimizer(self):
    # TODO: Rename 'optimizers' to param_groups
    use_param_groups = hasattr(self.args, 'optimizers') and len(self.args.optimizers) > 0
    freeze_prefix = getattr(self.args, 'freeze_prefix', None)
    if use_param_groups:
        params = list(
            filter(
                lambda np: np[1].requires_grad,
                chain(self.model.named_parameters(), self.criterion.named_parameters()),
            )
        )
        params = self._get_param_groups(params)
    elif freeze_prefix:
        params = list(
            filter(
                lambda np: np[1].requires_grad
                and not (np[0].startswith(freeze_prefix)
                         or np[0].startswith("module." + freeze_prefix)),
                chain(self.model.named_parameters(), self.criterion.named_parameters()),
            )
        )
        frozen_params = list(
            filter(
                lambda np: np[1].requires_grad
                and (np[0].startswith(freeze_prefix)
                     or np[0].startswith("module." + freeze_prefix)),
                chain(self.model.named_parameters(), self.criterion.named_parameters()),
            )
        )
        print('The following parameters are NOT FROZEN: %s' % (
            ','.join([param[0] for param in params]),
        ))
        print('The following parameters are FROZEN by prefix "%s": %s' % (
            freeze_prefix,
            ','.join([param[0] for param in frozen_params]),
        ))
        assert len(params) > 0
        assert len(frozen_params) > 0
        params = [param[1] for param in params]
    else:
        params = list(
            filter(
                lambda p: p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters()),
            )
        )
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            logger.info(
                "NOTE: your device does NOT support faster training with --fp16, "
                "please switch to FP32 which is likely to be faster"
            )
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                self.args, params
            )
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            logger.info("NOTE: your device may support faster training with --fp16")
        self._optimizer = optim.build_optimizer(self.args, params)

    if self.args.use_bmuf:
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    if use_param_groups:
        # HACK: The current implementation of LR schedulers works fine if we
        # replace the LR with numpy arrays of LRs. However, FairseqOptimizer
        # (the base class for the optimizer here) always returns just a single
        # LR (from the first param group). We change this behaviour on the
        # optimizer object, so we don't need to change the base class.
        self.optimizer.set_lr = set_lr_group.__get__(self.optimizer, FairseqOptimizer)
        self.optimizer.get_lr = get_lr_group.__get__(self.optimizer, FairseqOptimizer)
        assert len(self.args.lr) == 1
        args = copy.deepcopy(self.args)
        args.lr = [np.array([param_group.get('lr', self.args.lr[0])
                             for param_group in self.args.optimizers])]
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(args, self.optimizer)
    else:
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
    self._lr_scheduler.step_update(0)
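# --- Example (hedged sketch): the freeze_prefix partition above. Named
# parameters are split by prefix, also matching the 'module.'-wrapped names
# that DistributedDataParallel produces; only the unfrozen tensors go to the
# optimizer. Module names here are hypothetical.
import torch.nn as nn

model = nn.ModuleDict({'encoder': nn.Linear(4, 4), 'decoder': nn.Linear(4, 2)})
prefix = 'encoder'

def is_frozen(name):
    return name.startswith(prefix) or name.startswith('module.' + prefix)

named = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
trainable = [p for n, p in named if not is_frozen(n)]
frozen_names = [n for n, p in named if is_frozen(n)]
assert frozen_names == ['encoder.weight', 'encoder.bias'] and len(trainable) == 2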
def _build_optimizer(self):
    self.index = 0
    self.index2 = 0
    # NOTE: both branches currently build the same flat list of trainable
    # parameters; the audio_translation branch is kept as a hook for
    # task-specific filtering (e.g. freezing parts of the audio encoder).
    if self.args.task == "audio_translation":
        params = list(
            filter(
                lambda p: p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters()),
            ))
    else:
        params = list(
            filter(
                lambda p: p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters()),
            )
        )
    if self.args.fp16:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            print('| WARNING: your device does NOT support faster training with --fp16, '
                  'please switch to FP32 which is likely to be faster')
        if self.args.memory_efficient_fp16:
            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(self.args, params)
        else:
            self._optimizer = optim.FP16Optimizer.build_optimizer(self.args, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            print('| NOTICE: your device may support faster training with --fp16')
        self._optimizer = optim.build_optimizer(self.args, params)

    if self.args.use_bmuf:
        self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    self._lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
    self._lr_scheduler.step_update(0)