def getModel(path_config, gpu='0', fp16=False):
    print("load model......")
    torch.cuda.set_device(int(gpu))
    # os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    with open(path_config, 'r') as f:
        config = json.load(f)
    from tokenizations import tokenization_bert
    tokenizer_path = config['tokenizer_path']
    model_path = config['model_path']
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("use device:%s" % device)
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.to(device)
    model.eval()
    if fp16:
        optimizer = transformers.AdamW(model.parameters(), lr=0.1, correct_bias=True)
        from apex import amp
        fp16_opt_level = 'O1'
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)
    return model, tokenizer, config, device
def instantiate_model(config, tokenizer):
    configure_devices(config)
    model = Model(config)
    optimizer = transformers.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=0)
    metrics = None
    if config.continue_training:
        state_dict = torch.load(config.continue_training, map_location='cpu')
        model.load_state_dict(state_dict['model'])
        if 'optimizer_state_dict' in state_dict:
            optimizer.load_state_dict(state_dict['optimizer_state_dict'])
            for g in optimizer.param_groups:
                g['lr'] = config.learning_rate
        try:
            print(f"Loaded model:\nEpochs: {state_dict['epoch']}\nLoss: {state_dict['loss']}\n",
                  f"Recall: {state_dict['rec']}\nMRR: {state_dict['mrr']}")
        except:
            pass
    if config.use_cuda:
        model = model.cuda()
        optimizer_to(optimizer, config.device)
        model = torch.nn.DataParallel(model, device_ids=config.devices)
    return model, optimizer, metrics
def train_on_batch(self, batch):
    if self.optimizer is None:
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": float(self.config["decay"]),
            },
            {
                "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        self.optimizer = transformers.AdamW(optimizer_grouped_parameters, lr=float(self.config["learning_rate"]))
        self.scheduler = transformers.get_cosine_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=int(self.config["num_warmup_steps"]),
            num_training_steps=int(self.config["num_train_steps"]))
    self.optimizer.zero_grad()
    self.model.train()
    for k, v in batch.items():
        batch[k] = v.to(self.device)
    batch_loss = torch.mean(self.model(**batch)["loss"])
    batch_loss.backward()
    torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
    self.optimizer.step()
    self.scheduler.step()
    self.optimizer.zero_grad()
    return batch_loss.cpu().detach().numpy()
def _default_train_setup(opt, model, batch_loader):
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        },
        {
            'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }
    ]
    optimizer = transformers.AdamW(optimizer_grouped_parameters, lr=opt["learning_rate"])
    loss_function = torch.nn.CrossEntropyLoss()
    t_total = len(batch_loader) * opt["num_epochs"]
    warmup_steps = int(t_total * opt["warmup_ratio"])
    scheduler = transformers.optimization.get_linear_schedule_with_warmup(
        optimizer, warmup_steps, t_total)
    return loss_function, optimizer, scheduler
def _get_optimizer(self):
    parameters = self.model.parameters()
    optimizer = transformers.AdamW(params=parameters, lr=self._learning_rate)
    return optimizer
def _configure_training(self, n_batches_train):
    """
    Configures training components:
    1. optimizer
    2. scheduler

    Parameters
    ----------
    n_batches_train : int
        Number of batches of training data

    Returns
    -------
    optimizer : transformers.optimization.AdamW
        Optimizer
    scheduler : torch.optim.lr_scheduler.LambdaLR
        Scheduler
    """
    # Create optimizer
    params = filter(lambda x: x.requires_grad, self.model.parameters())
    optimizer = transformers.AdamW(params, lr=self.alpha, eps=1e-8)

    # Total number of training steps is number of batches * number of epochs.
    total_steps = n_batches_train * self.epochs

    # Create the learning rate scheduler
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=total_steps)
    return optimizer, scheduler
def configure_optimizers(self):
    model = self.model
    no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = transformers.AdamW(
        optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.total_steps)
    scheduler = {
        'scheduler': scheduler,
        'interval': 'step',
        'frequency': 1
    }
    return [optimizer], [scheduler]
def main():
    hvd.init()
    config = json.load(open('config.json'))
    torch.cuda.set_device(hvd.local_rank())
    writer = Logging(user='******', name=f'albert_mlm_{hvd.local_rank()}')
    writer.add_hparams(config)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    trainset = dataset.dataset.MaskedLMDataset(
        csv_file=config['mask_train_file'], config=config)
    trainsampler = torch.utils.data.distributed.DistributedSampler(
        trainset, num_replicas=hvd.size(), rank=hvd.rank())
    trainloader = torch.utils.data.DataLoader(
        dataset=trainset, batch_size=config['batch_size'],
        num_workers=config['num_workers'], sampler=trainsampler)
    validset = dataset.dataset.MaskedLMDataset(
        csv_file=config['mask_valid_file'], config=config)
    validsampler = torch.utils.data.distributed.DistributedSampler(
        validset, num_replicas=hvd.size(), rank=hvd.rank())
    validloader = torch.utils.data.DataLoader(
        dataset=validset, batch_size=config['batch_size'],
        num_workers=config['num_workers'], sampler=validsampler)
    net = model.bert.Model(config=config).to(device)
    optimizer = transformers.AdamW(
        params=net.parameters(), lr=config['start_lr'],
        weight_decay=config['weight_decay'])
    optimizer = hvd.DistributedOptimizer(
        optimizer=optimizer, named_parameters=net.named_parameters())
    scheduler_1 = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer=optimizer, T_max=config['T_max'], eta_min=config['eta_min'])
    scheduler_2 = torch.optim.lr_scheduler.CyclicLR(
        optimizer=optimizer, base_lr=config['base_lr'], max_lr=config['max_lr'],
        step_size_up=config['step_size_up'], step_size_down=config['step_size_down'],
        cycle_momentum=False)
    scheduler = [scheduler_1, scheduler_2]
    hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    training_method = engine.bert.TrainingClass(
        model=net, optimizer=optimizer, scheduler=scheduler, config=config,
        trainloader=trainloader, validloader=validloader, writer=writer,
        device=device, tokenizer=trainset.tokenizer)
def get_opt_and_sched(model):
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": 0,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = transformers.AdamW(optimizer_grouped_parameters, lr=start_lr, eps=1e-8)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_epochs * len(dataset),
    )
    return optimizer, scheduler
def __setup_model_data(self, dataset, lower_case):
    """ set up data/language model """
    if self.model is not None:
        return
    if self.args.is_trained:
        self.model = transformers.AutoModelForTokenClassification.from_pretrained(self.args.transformers_model)
        self.transforms = Transforms(self.args.transformers_model, cache_dir=self.cache_dir)
        self.label_to_id = self.model.config.label2id
        self.dataset_split, self.label_to_id, self.language, self.unseen_entity_set = get_dataset_ner(
            dataset, label_to_id=self.label_to_id, fix_label_dict=True, lower_case=lower_case)
        self.id_to_label = {v: str(k) for k, v in self.label_to_id.items()}
    else:
        self.dataset_split, self.label_to_id, self.language, self.unseen_entity_set = get_dataset_ner(
            dataset, lower_case=lower_case)
        self.id_to_label = {v: str(k) for k, v in self.label_to_id.items()}
        config = transformers.AutoConfig.from_pretrained(
            self.args.transformers_model,
            num_labels=len(self.label_to_id),
            id2label=self.id_to_label,
            label2id=self.label_to_id,
            cache_dir=self.cache_dir)
        self.model = transformers.AutoModelForTokenClassification.from_pretrained(
            self.args.transformers_model, config=config)
        self.transforms = Transforms(self.args.transformers_model, cache_dir=self.cache_dir)

    # optimizer
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
         "weight_decay": self.args.weight_decay},
        {"params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0}]
    self.optimizer = transformers.AdamW(optimizer_grouped_parameters, lr=self.args.lr, eps=1e-8)

    # scheduler
    self.scheduler = transformers.get_linear_schedule_with_warmup(
        self.optimizer, num_warmup_steps=self.args.warmup_step, num_training_steps=self.args.total_step)

    # GPU allocation
    self.model.to(self.device)

    # GPU mixed precision
    if self.args.fp16:
        try:
            from apex import amp  # noqa: F401
            self.model, self.optimizer = amp.initialize(
                self.model, self.optimizer, opt_level='O1', max_loss_scale=2 ** 13, min_loss_scale=1e-5)
            self.master_params = amp.master_params
            self.scale_loss = amp.scale_loss
            logging.info('using `apex.amp`')
        except ImportError:
            logging.exception("Skip apex: please install apex from https://www.github.com/nvidia/apex to use fp16")

    # multi-gpus
    if self.n_gpu > 1:
        # multi-gpu training (should be after apex fp16 initialization)
        self.model = torch.nn.DataParallel(self.model.cuda())
        logging.info('using `torch.nn.DataParallel`')
    logging.info('running on %i GPUs' % self.n_gpu)
def setup_optimizer_and_scheduler(self):
    def _filter_params(parameters: List, filters: List[str], exclude: bool = True) -> List[str]:
        if exclude:
            return [
                parameter for name, parameter in parameters
                if not any(param in name for param in filters)
            ]
        else:
            return [
                parameter for name, parameter in parameters
                if any(param in name for param in filters)
            ]

    model_params = list(self.model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_params = [{
        'params': _filter_params(parameters=model_params, filters=no_decay, exclude=True),
        'weight_decay': 0.001
    }, {
        'params': _filter_params(parameters=model_params, filters=no_decay, exclude=False),
        'weight_decay': 0.0
    }]
    self.optimizer = transformers.AdamW(optimizer_params, lr=1e-4)
    self.scheduler = transformers.get_linear_schedule_with_warmup(
        self.optimizer, num_warmup_steps=0, num_training_steps=self.num_training_steps)
def config_optimizer(
        model,
        learning_rate,
        adam_eps,
        freeze_decoder=False,
        freeze_bert=False,
        freeze_embeddings=False,
):
    for param in model.model.parameters():
        param.requires_grad = False
    if not freeze_decoder:
        for param in model.model.mt_model.model.decoder.parameters():
            param.requires_grad = True
    if not freeze_bert:
        for param in model.model.bert.parameters():
            param.requires_grad = True
    if freeze_embeddings:
        for param in model.model.mt_model.model.decoder.embed_tokens.parameters():
            param.requires_grad = False
        for param in model.model.bert.bert.embeddings.parameters():
            param.requires_grad = False
    # optimizer = AdamW(self.model.parameters(), lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    print(f'Optimizer will update parameters for the decoder:{not freeze_decoder} AND for bert:{not freeze_bert}. freeze_embeddings:{freeze_embeddings}')
    optimizer = transformers.AdamW(
        filter(lambda p: p.requires_grad, model.model.parameters()),
        lr=learning_rate, eps=adam_eps)
    return optimizer
def initialize_model(epochs):
    model = RegressionModel()
    model.to(device)
    optimizer = transformers.AdamW(model.parameters())
    total_steps = len(train_dataloader) * epochs
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    return model, optimizer, scheduler
def configure_optimizers(self):
    opt = transformers.AdamW(self.siamese_model.parameters(), **self.hparams["optimizer"])
    output = opt
    if "scheduler" in self.hparams:
        scheduler = get_linear_schedule_with_warmup(opt, **self.hparams["scheduler"])
        output = ([opt], [scheduler])
    return output
def train(tokenizer, model, train_dataset, validate_dataset, args):
    train_dataloader = DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True,
        num_workers=args.num_workers, collate_fn=collate_fn, drop_last=True)
    validate_dataloader = DataLoader(
        validate_dataset, batch_size=args.batch_size, shuffle=True,
        num_workers=args.num_workers, collate_fn=collate_fn, drop_last=True)
    early_stopping = EarlyStopping(args.patience, verbose=True, save_path=args.save_model_path)
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.epochs
    optimizer = transformers.AdamW(model.parameters(), lr=args.lr, eps=args.eps)
    num_warmup_steps = int(t_total * args.warmup_steps_rate)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)
    logger.info('starting training')

    # Record the training and validation loss of each epoch
    train_losses, validate_losses = [], []
    # Track the lowest validation loss
    best_val_loss = 10000
    # Start training
    train_iterator = trange(int(args.epochs), desc='Epoch', mininterval=0)
    for epoch in train_iterator:
        # ========== train ========== #
        train_loss = train_epoch(
            model=model, train_dataloader=train_dataloader, optimizer=optimizer,
            scheduler=scheduler, epoch=epoch, args=args)
        train_losses.append(train_loss)
        # ========== validate ========== #
        validate_loss = validate_epoch(
            model=model, validate_dataloader=validate_dataloader, epoch=epoch, args=args)
        validate_losses.append(validate_loss)

        # Save the model with the lowest perplexity so far
        if validate_loss < best_val_loss:
            best_val_loss = validate_loss
            logger.info('saving current best model for epoch {}'.format(epoch + 1))
            model_path = join(args.save_model_path, 'min_ppl_model')
            if not os.path.exists(model_path):
                os.mkdir(model_path)
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(model_path)
            tokenizer.save_pretrained(model_path)

        # If patience == 0, skip early stopping
        if args.patience == 0:
            continue
        early_stopping(validate_loss, model)
        if early_stopping.early_stop:
            logger.info("Early stopping")
            break
    logger.info('training finished')
    logger.info("train_losses:{}".format(train_losses))
    logger.info("validate_losses:{}".format(validate_losses))
def __init__(self, model: Any, model_name: str = None):
    super().__init__(model)
    self.model_name = model_name
    self.device = xm.xla_device()
    self.optimizer = transformers.AdamW(self.model.parameters(), lr=1e-4 * xm.xrt_world_size())
    self.criterion = nn.BCEWithLogitsLoss()
    self.early_stopping = utils.EarlyStopping(patience=5, verbose=True)
    self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        self.optimizer, mode="max", patience=5, factor=0.3, verbose=True)
def __init__(self, model: Any, model_name: str = None):
    super().__init__(model)
    self.model_name = model_name
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.optimizer = transformers.AdamW(self.model.parameters(), lr=1e-4)
    self.criterion = nn.BCEWithLogitsLoss()
    self.early_stopping = EarlyStopping(patience=5, verbose=True)
    self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        self.optimizer, mode="max", patience=5, factor=0.3, verbose=True)
def train(dataloader, model, device, total_steps=None):
    model.train()
    # Count total and trainable parameters
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logging.info(' *** start training, parameter total:{}, trainable:{} *** '.format(total, trainable))
    # Define the optimizer and scheduler
    optimizer = transformers.AdamW(model.parameters(), lr=args.lr, correct_bias=True)
    scheduler = transformers.get_linear_schedule_with_warmup(optimizer, args.warmup_step, total_steps)
    logging.info(' ----------- Start Training --------------')
    for epoch in range(args.num_epoch):
        total_loss = 0
        for i, batch_inputs in enumerate(dataloader):
            optimizer.zero_grad()
            batch_inputs = batch_inputs.to(device)  # Tensor.to() is not in-place
            outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs)
            loss, logits = outputs[:2]
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            total_loss += loss.item()
            optimizer.step()
            scheduler.step()
            if (i + 1) % 500 == 0 or i == 0:
                logging.info('[ {4}: {5}, Epoch {0}: {1}/{2} AVG_LOSS: {3} ]'.format(
                    epoch + 1, i + 1, len(dataloader), total_loss / (i + 1),
                    datetime.now().hour, datetime.now().minute))
        logging.info('\n *** In Epoch {0}, average loss: {1} *** \n'.format(
            epoch + 1, total_loss / len(dataloader)))
        logging.info('Saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(args.output_dir):
            os.mkdir(args.output_dir)
        epoch_model_dir = os.path.join(args.output_dir, 'model_epoch_{}'.format(epoch + 1 + args.pretrained_epoch))
        if not os.path.exists(epoch_model_dir):
            os.mkdir(epoch_model_dir)
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(epoch_model_dir)
def build_model(self):
    args = self.args
    output_dim = self.input_dim
    input_dim = self.input_dim
    self.best_model = None
    # TODO: change all modules param to single config,
    # change input_dim and output_dim to args.vocab_size
    self.model = models.AR.build(
        args, input_dim, output_dim, self.vocab, self.embeddings,
        self.pretrain_feature_model).to(self.device)
    self.optimizer = transformers.AdamW(
        self.model.parameters(), lr=args.lr, correct_bias=True,
        # self.optimizer = optim.AdamW(self.model.parameters(), lr=args.lr,
        # self.optimizer = toptim.Lamb(self.model.parameters(), lr=args.lr,
        weight_decay=args.weight_decay)
    self.logger.info(self.model)
    self.logger.info(
        f'The model has {utils.count_parameters(self.model):,} trainable parameters')
    if args.use_scheduler:
        # self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, 1.0, gamma=0.95)
        # self.scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(self.optimizer, 2)
        if args.warmup_steps == 0:
            self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                self.optimizer, mode='min', factor=0.5, min_lr=1.5e-4,
                patience=60, verbose=True)
        else:
            # XXX: scheduler will run once at start, even if it has no scheduler.step()
            total_steps = int(
                len(self.train_iter.dataset) * args.n_epochs
                / args.batch_size / args.gradient_accumulation)
            self.scheduler = transformers.get_linear_schedule_with_warmup(
                self.optimizer, num_warmup_steps=args.warmup_steps,
                num_training_steps=total_steps)
    if args.pretrained_fname is None:
        pass
        # pytorch modules will auto init_weights with uniform
        # self.model.apply(models.init_weights)
    else:
        self.logger.info('')
        self.logger.info(
            f'Load pretrained model {args.pretrained_fname}...')
        self.load_model()
def configure_optimizers(self):
    optimizer = transformers.AdamW(self.parameters(), lr=self.learning_rate)
    warmup_steps = self.steps_per_epoch // 3
    total_steps = self.steps_per_epoch * self.n_epochs - warmup_steps
    scheduler = transformers.get_cosine_schedule_with_warmup(
        optimizer, warmup_steps, total_steps)
    return [optimizer], [scheduler]
def build_model_and_get_results(encoded_plus_list, labels_torch, part, n_parts,
                                device, n_classes, learning_rate, epochs):
    val_len = len(encoded_plus_list) // n_parts
    start = val_len * part
    end = val_len * (part + 1)
    encoded_plus_val = encoded_plus_list[start:end]
    labels_val = labels_torch[start:end]
    encoded_plus_train = encoded_plus_list[:start] + encoded_plus_list[end:]
    labels_train = torch.cat([labels_torch[:start], labels_torch[end:]])
    model = Classifier(n_classes=n_classes).to(device)
    optimizer = transformers.AdamW(model.parameters(), lr=learning_rate, correct_bias=False)
    loss_fn = nn.CrossEntropyLoss().to(device)
    best_f1_score = -1
    for epoch in range(epochs):
        print(f"On epoch {epoch + 1} of {epochs}")
        train_acc, train_f1_score, train_loss = train_epoch(
            model, encoded_plus_train, labels_train, loss_fn, optimizer,
            device, len(encoded_plus_train))
        print(f'Train loss {train_loss} Train f1-score {train_f1_score} accuracy {train_acc}')
        val_acc, val_f1_score, val_loss, cf_matrix = eval_model(
            model, encoded_plus_val, labels_val, loss_fn, device, len(encoded_plus_val))
        print(f'Val loss {val_loss} f1-score {val_f1_score} accuracy {val_acc}')
        print(f"confusion matrix: {cf_matrix}")
        if val_f1_score > best_f1_score:
            best_model_dict = deepcopy(model.state_dict())
            best_cf_matrix = cf_matrix
            best_f1_score = val_f1_score
    torch.save(best_model_dict, p.NEW_MODEL_NAME.format(part))
    return best_f1_score, best_cf_matrix
def configure_optimizers(self):
    # The scheduler can be swapped for one without hard restarts
    optimizer = transformers.AdamW(self.parameters(), lr=self.learning_rate)
    # The first third of an epoch is warmup to speed up training
    warmup_steps = self.steps_per_epoch // 3
    total_steps = self.steps_per_epoch * self.n_epochs - warmup_steps
    # Uses the default of 1 hard restart
    scheduler = transformers.get_cosine_with_hard_restarts_schedule_with_warmup(
        optimizer, warmup_steps, total_steps)
    return [optimizer], [scheduler]
def __init__(self):
    self.max_seq = 100
    self.classifier = Net(classes=2)
    if torch.cuda.is_available():
        self.classifier.to('cuda')
    self.criterion = nn.CrossEntropyLoss()
    self.optim = transformers.AdamW(self.classifier.parameters(), lr=5e-5)
    self.writer = SummaryWriter('logs/')
    self.val_epoch_step = 0
    self.epochs = 10
    self.log_file = 'logs/logs.txt'
    self.prepare_dataset()
    self.start_epoch = 0
def init_optim(self):
    param_optimizer = list(self.model.named_parameters())  # list of named model parameters
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}]
    # self.optimizer = BertAdam(optimizer_grouped_parameters,
    #                           lr=self.opt['lr'],
    #                           warmup=self.opt['warmup'],
    #                           t_total=len(self.dataset_loader['train']) * self.epoch
    #                           )
    # self.optimizer = transformers.AdamW(self.model.parameters(), lr=self.opt['lr'])
    self.optimizer = transformers.AdamW(optimizer_grouped_parameters, lr=self.opt['lr'])
def configure_optimizers(self):
    # optimizer = torch.optim.Adam(self.parameters(), lr=self.config['lr'])
    optimizer = transformers.AdamW(
        self.parameters(), lr=self.config['lr'])  # , weight_decay=0.01
    scheduler = transformers.get_cosine_with_hard_restarts_schedule_with_warmup(
        optimizer, num_warmup_steps=350, num_training_steps=3000, num_cycles=1)
    schedulers = [{
        'scheduler': scheduler,
        'interval': 'step',
        'frequency': 1
    }]
    return [optimizer], schedulers
def _create_optimizer(self):
    """Create the optimizer instance used for training and wrap with the distributed library if needed.

    Return:
        True if optimizer instance is created successfully.
    """
    if self._args.distributed_impl == DistributedImpl.DDP:
        self._model = torch.nn.parallel.DistributedDataParallel(
            self._model, device_ids=[self._local_rank], output_device=self._local_rank)

    if self._optimizer_type == Optimizer.SGD:
        self._optimizer = torch.optim.SGD(
            self._model.parameters(), lr=1e-5, momentum=0.9, weight_decay=1e-4, nesterov=True)
    elif self._optimizer_type == Optimizer.ADAM:
        self._optimizer = torch.optim.Adam(
            self._model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08)
    elif self._optimizer_type == Optimizer.ADAMW:
        self._optimizer = transformers.AdamW(
            self._model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08)
    else:
        self._optimizer = None

    if not self._optimizer:
        logger.error(
            'Create optimizer failed - model: {}, optimizer type: {}.'.format(
                self._name, self._optimizer_type))
        return False

    if self._args.distributed_impl == DistributedImpl.HOROVOD:
        import horovod.torch as hvd
        self._optimizer = hvd.DistributedOptimizer(
            self._optimizer, named_parameters=self._model.named_parameters(),
            compression=hvd.Compression.none, op=hvd.Average)
        hvd.broadcast_parameters(self._model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(self._optimizer, root_rank=0)

    return True
def make_optimizer(model, optimizer_name="AdamW", sam=False):
    optimizer_grouped_parameters = get_optimizer_params(model)
    kwargs = {
        'lr': 5e-5,
        'weight_decay': 0.01,
        # 'betas': (0.9, 0.98),
        # 'eps': 1e-06
    }
    if sam:
        if optimizer_name == "LAMB":
            optimizer = Lamb(optimizer_grouped_parameters, **kwargs)
            return optimizer
        elif optimizer_name == "Adam":
            from torch.optim import Adam
            optimizer = Adam(optimizer_grouped_parameters, **kwargs)
            return optimizer
        elif optimizer_name == "AdamW":
            optimizer = transformers.AdamW(optimizer_grouped_parameters, **kwargs)
            return optimizer
        else:
            raise Exception('Unknown optimizer: {}'.format(optimizer_name))
    else:
        if optimizer_name == "LAMB":
            base_optimizer = Lamb
            optimizer = SAM(optimizer_grouped_parameters, base_optimizer, rho=0.05, **kwargs)
            return optimizer
        elif optimizer_name == "Adam":
            from torch.optim import Adam
            base_optimizer = Adam
            optimizer = SAM(optimizer_grouped_parameters, base_optimizer, rho=0.05, **kwargs)
            return optimizer
        elif optimizer_name == "AdamW":
            from transformers import AdamW
            base_optimizer = AdamW
            optimizer = SAM(optimizer_grouped_parameters, base_optimizer, rho=0.05, **kwargs)
            return optimizer
        else:
            raise Exception('Unknown optimizer: {}'.format(optimizer_name))
def init_optim(self):
    param_optimizer = list(self.model.bert.named_parameters())  # list of named BERT parameters
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer]
    }]
    fc_optimizer = list(self.model.fc.named_parameters())  # list of named FC-layer parameters
    optimizer_grouped_parameters += [{
        'params': [p for n, p in fc_optimizer],
        'lr': self.opt['lr_sasrec']
    }]
    # self.optimizer = transformers.AdamW(self.model.parameters(), lr=self.opt['lr'])
    self.optimizer = transformers.AdamW(optimizer_grouped_parameters, lr=self.opt['lr_bert'])
def configure_optimizer(m, h):
    prelim_groups = {}
    opt_config = h["opt_config"]
    for (name, param) in m.named_parameters():
        add_to_group(opt_config, prelim_groups, name, param)
    groups = []
    for _, gps in prelim_groups.items():
        for gp in gps:
            if len(gp["params"]) > 0:
                groups.append(gp)
    optimizer = transformers.AdamW(params=groups)
    lr_scheduler = transformers.get_constant_schedule_with_warmup(
        optimizer, num_warmup_steps=h["num_warmup_steps"])
    return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}
def get_optim(self, model):
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters()],
            "weight_decay": self.weight_decay
        },
    ]
    optimizer = transformers.AdamW(optimizer_grouped_parameters, lr=self.lr, weight_decay=self.weight_decay)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=self.warmup_steps, num_training_steps=self.total_steps)
    return optimizer, scheduler