def get_scheduler(settings: Dict[str, Any], optimizer: torch.optim.Optimizer):
    scheduler_name = settings.get('scheduler')
    scheduler: _LRScheduler
    if scheduler_name is None:
        return None
    elif scheduler_name == 'ExponentialDecay':
        # Note: despite the name, this lambda implements inverse-time
        # (1 / (1 + epoch)) decay, not exponential decay.
        scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer, lambda e: 1 / (1 + e), last_epoch=-1)
    elif scheduler_name == 'linear_decay':
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            0,
            settings['num_total_steps'],
        )
    elif scheduler_name == 'linear_decay_with_warmup':
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            settings['warmup_epochs'] * settings['num_batches'],
            settings['num_total_steps'],
        )
    elif scheduler_name == 'cosine_decay':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            0,
            settings['num_total_steps'],
        )
    elif scheduler_name == 'cosine_decay_with_warmup':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            settings['warmup_epochs'] * settings['num_batches'],
            settings['num_total_steps'],
        )
    else:
        # An unknown name previously fell through to `return scheduler`
        # and raised UnboundLocalError; fail with a clear message instead.
        raise ValueError('Unknown scheduler: {}'.format(scheduler_name))
    return scheduler
def set_optimizers(args, model):
    lr = args.lr
    if args.mode == 'finetune':
        lr_adapters = args.lr_adapters
    else:
        lr_adapters = args.lr_adapters * 0.1

    # update all layers
    named_params = dict(model.named_parameters())
    params_to_optimize_via_AdamW = []
    named_params_to_optimize_via_AdamW = []
    params_to_optimize_via_AdamW2 = []
    named_params_to_optimize_via_AdamW2 = []
    for name, param in named_params.items():
        if 'bert_saved' in name:
            continue
        if 'classifiers' in name:
            if '.{}.'.format(model.module.datasets.index(args.dataset)) in name:
                params_to_optimize_via_AdamW2.append(param)
                named_params_to_optimize_via_AdamW2.append(name)
                continue
        elif 'ada' in name:
            params_to_optimize_via_AdamW.append(param)
            named_params_to_optimize_via_AdamW.append(name)
            continue
        elif not any(ele in name for ele in args.shared_layers):
            continue
        else:
            params_to_optimize_via_AdamW2.append(param)
            named_params_to_optimize_via_AdamW2.append(name)

    # The original rebound the name `lr_adapters` to this optimizer, shadowing
    # the learning rate and then passing the optimizer itself as the lr below;
    # a distinct name keeps both usable.
    optimizer_adapters = AdamW(params_to_optimize_via_AdamW,
                               lr=lr_adapters,
                               weight_decay=1e-8)
    optimizer_network = AdamW(params_to_optimize_via_AdamW2,
                              lr=lr,
                              weight_decay=0.0)

    optimizers = Optimizers()
    optimizers.add(optimizer_network, lr)
    optimizers.add(optimizer_adapters, lr_adapters)

    scheduler_network = None
    if args.mode in ('finetune', 'prune'):  # both modes used the same schedule
        scheduler_network = get_cosine_schedule_with_warmup(
            optimizer_network, 0, args.training_steps)
    scheduler_adapters = get_cosine_schedule_with_warmup(
        optimizer_adapters, 0, int(args.training_steps * 1.1))

    schedulers = Schedulers()
    schedulers.add(scheduler_network)
    schedulers.add(scheduler_adapters)
    return optimizers, schedulers
def test_warmup_cosine_scheduler(self):
    scheduler = get_cosine_schedule_with_warmup(self.optimizer,
                                                num_warmup_steps=2,
                                                num_training_steps=10)
    lrs = unwrap_schedule(scheduler, self.num_steps)
    expected_learning_rates = [5.0, 10.0, 9.61, 8.53, 6.91, 5.0, 3.08, 1.46, 0.38, 0.0]
    self.assertEqual(len(lrs[0]), 1)
    self.assertListAlmostEqual([l[0] for l in lrs],
                               expected_learning_rates,
                               tol=1e-2)

    scheduler = get_cosine_schedule_with_warmup(self.optimizer,
                                                num_warmup_steps=2,
                                                num_training_steps=10)
    lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
    self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])
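# Aside (added sketch, not part of the test suite above): the expected values
# follow from the schedule's closed form -- lr = base_lr * step / num_warmup_steps
# during warmup, then lr = base_lr * 0.5 * (1 + cos(pi * progress)) with progress
# running from 0 to 1 over the remaining steps. Assuming the test optimizer's
# base lr is 10.0, this reproduces the list within the test's 1e-2 tolerance:
import math

def _cosine_with_warmup_lr(step, base_lr=10.0, num_warmup_steps=2, num_training_steps=10):
    if step < num_warmup_steps:
        return base_lr * step / max(1, num_warmup_steps)
    progress = (step - num_warmup_steps) / max(1, num_training_steps - num_warmup_steps)
    return base_lr * max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))

print([round(_cosine_with_warmup_lr(s), 2) for s in range(1, 11)])
# -> [5.0, 10.0, 9.62, 8.54, 6.91, 5.0, 3.09, 1.46, 0.38, 0.0]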
def init_opt(args, model, logger):
    if args.optimizer == 'adam':
        # Adam with transformer schedule has a different set of default hyperparameters:
        if args.lr_schedule == 'transformer':
            opt = torch.optim.Adam(model.params,
                                   lr=args.lr_multiply,
                                   betas=(0.9, 0.98),
                                   eps=1e-9,
                                   weight_decay=args.weight_decay)
        else:
            opt = torch.optim.Adam(model.params,
                                   lr=args.lr_multiply,
                                   betas=(args.beta0, 0.999),
                                   weight_decay=args.weight_decay)
    elif args.optimizer == 'adamw':
        opt = AdamW(model.params, lr=args.lr_multiply, weight_decay=args.weight_decay)
    elif args.optimizer == 'radam':
        import radam
        if args.warmup > 1:
            logger.warning('With RAdam optimizer, warmup is never applied')
        opt = radam.RAdam(model.params,
                          lr=args.lr_multiply,
                          betas=(args.beta0, 0.999),
                          weight_decay=args.weight_decay)
    else:
        assert args.optimizer == 'sgd'
        opt = torch.optim.SGD(model.params, lr=args.lr_multiply, weight_decay=args.weight_decay)

    if args.lr_schedule == 'transformer':
        lr_lambda = partial(get_transformer_learning_rate,
                            dimension=args.dimension,
                            warmup=args.warmup)
        scheduler = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda)
    elif args.lr_schedule == 'constant':
        scheduler = get_constant_schedule_with_warmup(opt, num_warmup_steps=args.warmup)
    elif args.lr_schedule == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            opt,
            num_training_steps=sum(args.train_iterations) // args.gradient_accumulation_steps,
            num_warmup_steps=args.warmup)
    elif args.lr_schedule == 'cosine':
        scheduler = get_cosine_schedule_with_warmup(
            opt,
            num_training_steps=sum(args.train_iterations) // args.gradient_accumulation_steps,
            num_warmup_steps=args.warmup,
            num_cycles=0.5)
    elif args.lr_schedule == 'sgd':
        lr_lambda = partial(get_sgd_learning_rate, warmup=args.warmup)
        scheduler = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda)
    else:
        raise ValueError('Invalid learning rate scheduler.')
    return opt, scheduler
def train_on_batch(self, batch):
    if self.optimizer is None:
        # Lazily build the optimizer and scheduler on the first batch;
        # biases and LayerNorm weights are excluded from weight decay.
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters()
                           if not any(nd in n for nd in no_decay)],
                "weight_decay": float(self.config["decay"]),
            },
            {
                "params": [p for n, p in self.model.named_parameters()
                           if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        self.optimizer = transformers.AdamW(optimizer_grouped_parameters,
                                            lr=float(self.config["learning_rate"]))
        self.scheduler = transformers.get_cosine_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=int(self.config["num_warmup_steps"]),
            num_training_steps=int(self.config["num_train_steps"]))

    self.optimizer.zero_grad()
    self.model.train()
    for k, v in batch.items():
        batch[k] = v.to(self.device)
    batch_loss = torch.mean(self.model(**batch)["loss"])
    batch_loss.backward()
    torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
    self.optimizer.step()
    self.scheduler.step()
    self.optimizer.zero_grad()
    return batch_loss.cpu().detach().numpy()
def init_scheduler(self, optimizer, total_steps, warmup_steps):
    """
    Initialization of lr scheduler.

    :param optimizer: The optimizer that is used for the training.
    :type optimizer: Optimizer
    :param total_steps: Total number of training steps.
    :type total_steps: int
    :param warmup_steps: Number of warmup steps.
    :type warmup_steps: int
    :return: Created scheduler, or None for an unknown scheduler name.
    :rtype: LambdaLR
    """
    last_epoch = -1
    if self.config["scheduler"] == "linear":
        scheduler = transformers.get_linear_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps,
            last_epoch=last_epoch)
    elif self.config["scheduler"] == "cosine":
        scheduler = transformers.get_cosine_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps,
            num_cycles=0.5,
            last_epoch=last_epoch)
    elif self.config["scheduler"] == "constant":
        scheduler = transformers.get_constant_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=warmup_steps,
            last_epoch=last_epoch)
    else:
        scheduler = None
    return scheduler
def get_scheduler(optimizer, scheduler: str, warmup_steps: int, num_total: int):
    assert scheduler in [
        "constantlr", "warmuplinear", "warmupconstant", "warmupcosine",
        "warmupcosinewithhardrestarts"
    ], ('scheduler should be one of ["constantlr", "warmuplinear", "warmupconstant", '
        '"warmupcosine", "warmupcosinewithhardrestarts"]')
    if scheduler == 'constantlr':
        return transformers.get_constant_schedule(optimizer)
    elif scheduler == 'warmupconstant':
        return transformers.get_constant_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps)
    elif scheduler == 'warmuplinear':
        return transformers.get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=num_total)
    elif scheduler == 'warmupcosine':
        return transformers.get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=num_total)
    elif scheduler == 'warmupcosinewithhardrestarts':
        return transformers.get_cosine_with_hard_restarts_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=num_total)
def get_scheduler(optimizer, warmup_size, total_steps):
    # `warmup_size` is a fraction of the total step budget, e.g. 0.1 -> 10% warmup.
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=round(total_steps * warmup_size),
        num_training_steps=total_steps,
    )
    return scheduler
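# Hypothetical usage of the helper above (the toy model and hyperparameters are
# illustrative assumptions, not taken from the original code): with
# warmup_size=0.1 and 1000 total steps, the first 100 steps ramp the lr
# linearly from 0 and the remaining 900 decay it along a half cosine.
import torch
from transformers import get_cosine_schedule_with_warmup

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
scheduler = get_scheduler(optimizer, warmup_size=0.1, total_steps=1000)
for _ in range(1000):
    optimizer.step()   # update weights first ...
    scheduler.step()   # ... then advance the schedule (post-step, per PyTorch convention)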
def __init__(self, model_cls, tokenizer, sampling_function, scoring_function, **params):
    """
    Initialize PGTrainer.

    Args:
        model_cls (type): class of pi_theta(x), the policy to be trained,
            e.g. a Hugging Face transformer GPT2 model with value head; the
            same class instantiates the original model a(x) in the equation above
        tokenizer, sampling_function, scoring_function: forwarded to the base trainer
        params (dict or None): Vanilla PG parameters for training. Can include the following keys:
            'lr' (float): Adam learning rate, default: 1.41e-5
            'batch_size' (int): Number of samples per optimisation step, default: 256
            'forward_batch_size' (int): Number of samples forward passed through model at a time, default: 16
            'minibatch_epochs' (int): Number of optimisation epochs per batch of samples, default: 4
    """
    super().__init__(tokenizer=tokenizer,
                     sampling_function=sampling_function,
                     scoring_function=scoring_function)
    self.params = self.default_params
    self.params.update(params)

    # pi_theta, the policy to be learned
    self.model = model_cls.from_pretrained(params['lm_name']).to(params['gpt2_device'])
    # original model for computing kl(pi || a)
    self.orig_model = model_cls.from_pretrained(params['lm_name']).to(params['gpt2_orig_device'])
    self.ref_model = self.orig_model
    self.is_policy_eval = True

    # optimizer
    self.optimizer = Adam(self.model.parameters(), lr=self.params['lr'])

    # scheduler
    scheduler_ = self.params['scheduler']
    assert scheduler_ in ['cosine', 'constant', 'linear'], \
        "unknown scheduler: {}".format(self.params['scheduler'])
    if scheduler_ == 'constant':
        self.scheduler = get_constant_schedule_with_warmup(
            self.optimizer, self.params['warmup_steps'])
    elif scheduler_ == 'cosine':
        print("Cosine scheduler...")
        self.scheduler = get_cosine_schedule_with_warmup(
            self.optimizer, self.params['warmup_steps'],
            self.params['steps'] // self.params['batch_size'])
    elif scheduler_ == 'linear':
        # The original call omitted the required num_training_steps argument
        # (a TypeError); mirroring the cosine branch's total here.
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer, self.params['warmup_steps'],
            self.params['steps'] // self.params['batch_size'])

    self.params['gradient_accumulation_steps'] = \
        self.params['batch_size'] // self.params['forward_batch_size']
def _get_scheduler(self, optimizer, scheduler: str, warmup_steps: int, t_total: int):
    """
    Returns the correct learning rate scheduler
    """
    scheduler = scheduler.lower()
    if scheduler == 'constantlr':
        return transformers.get_constant_schedule(optimizer)
    elif scheduler == 'warmupconstant':
        return transformers.get_constant_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps)
    elif scheduler == 'warmuplinear':
        return transformers.get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=t_total)
    elif scheduler == 'warmupcosine':
        return transformers.get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=t_total)
    elif scheduler == 'warmupcosinewithhardrestarts':
        return transformers.get_cosine_with_hard_restarts_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=t_total)
    else:
        raise ValueError("Unknown scheduler {}".format(scheduler))
def run(self, train_data, train_target, val_data, val_target, test_data, test_target, epochs: int):
    # prepare the data for binary classification
    train_data, train_mask, train_target = self.preprocess(
        train_data, train_target, self.max_label_len, self.target_columns)
    val_data, val_mask, val_target = self.preprocess(
        val_data, val_target, self.max_label_len, self.target_columns)
    test_data, test_mask, test_target = self.preprocess(
        test_data, test_target, self.max_label_len, self.target_columns)

    if self.args["model"] in ["distilbert", "bert", "xlnet", "lstm", "roberta", "distilroberta"]:
        if self.args["optimizer"] == "adam":
            self.optimizer = optim.Adam(self.model.parameters(), self.learningRate)
        elif self.args["optimizer"] == "sgd":
            self.optimizer = torch.optim.SGD(self.model.parameters(), self.learningRate)
        else:
            # use adam as default optimizer
            self.optimizer = optim.Adam(self.model.parameters(), self.learningRate)

        # learning rate scheduler to reduce the learning rate after a defined number of steps
        # (the original tested `~bool(...)`, which is always truthy; `not` is intended)
        if not self.learningRateScheduler and self.doLearningRateScheduler:
            num_train_steps = epochs * math.ceil(train_data.shape[0] / self.train_batchSize)
            self.learningRateScheduler = get_cosine_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=int(0.1 * num_train_steps),
                num_training_steps=num_train_steps)

        self.model.to(self.device)
        # train the model for the defined number of epochs; after each epoch do validation
        for i in range(epochs):
            print("epoch {}".format(i))
            self.train(train_data, train_mask, train_target, device=self.device)
            self.test_validate(val_data, val_mask, val_target, type="validate", device=self.device)
            self.test_validate(test_data, test_mask, test_target, type="test", device=self.device)
    else:
        # train sklearn based model without epochs
        self.train(train_data, train_mask, train_target, device=self.device)
        self.test_validate(val_data, val_mask, val_target, type="validate", device=self.device)
        self.test_validate(test_data, test_mask, test_target, type="test", device=self.device)
def make_scheduler(optimizer: torch.optim.Optimizer,
                   scheduler_name: str = 'linear',
                   num_training_steps: int = None,
                   num_warmup_steps: int = None):
    if scheduler_name == 'ReduceLROnPlateau':
        # The original started a second `if` chain after this branch, so a
        # 'ReduceLROnPlateau' request fell through to the final `else` and
        # raised; a single if/elif chain is intended.
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer)
    elif scheduler_name == 'step':
        scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[30, 60, 90], gamma=0.1)
    elif scheduler_name == 'cosine':
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_training_steps)
    elif scheduler_name == "cosine_warmup":
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps)
    elif scheduler_name == "linear":
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps)
    else:
        raise Exception('Unknown lr scheduler: {}'.format(scheduler_name))
    return scheduler
def optimizer_scheduler(model, batch_len):
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8)
    # scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=5, num_training_steps=EPOCH)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=WARMING_UP * batch_len,  # warmup epochs x batches per epoch
        num_training_steps=EPOCH * batch_len)
    # scheduler = get_constant_schedule(optimizer)
    return (optimizer, scheduler)
def _reset_opts(self):
    """Resets the optimizer and learning rate scheduler"""
    self.optimizer = AdamW(self.model.parameters(),
                           lr=self.params['lr'],
                           weight_decay=self.params['wd'])
    self.scheduler = get_cosine_schedule_with_warmup(
        self.optimizer,
        num_warmup_steps=self.params['warmup'],
        num_training_steps=self.params['epochs'] * len(self.train_loader),
    )
def configure_optimizers(self):
    optimizer = transformers.AdamW(self.parameters(), lr=self.learning_rate)
    warmup_steps = self.steps_per_epoch // 3
    total_steps = self.steps_per_epoch * self.n_epochs - warmup_steps
    scheduler = transformers.get_cosine_schedule_with_warmup(
        optimizer, warmup_steps, total_steps)
    return [optimizer], [scheduler]
def prepare_optimizer(self):
    # differential lr for each sub module first
    self.differential_lr()

    # optimizer
    if self.config.optimizer_name == "Adam":
        self.optimizer = torch.optim.Adam(self.optimizer_grouped_parameters,
                                          eps=self.config.adam_epsilon)
    elif self.config.optimizer_name == "Ranger":
        self.optimizer = Ranger(self.optimizer_grouped_parameters)
    elif self.config.optimizer_name == "AdamW":
        self.optimizer = AdamW(self.optimizer_grouped_parameters,
                               eps=self.config.adam_epsilon,
                               betas=(0.9, 0.999))
    elif self.config.optimizer_name == "FusedAdam":
        self.optimizer = FusedAdam(self.optimizer_grouped_parameters,
                                   bias_correction=False)
    else:
        raise NotImplementedError

    # lr scheduler
    if self.config.lr_scheduler_name == "WarmupCosineAnealing":
        num_train_optimization_steps = self.config.num_epoch * len(self.train_data_loader) \
            // self.config.accumulation_steps
        self.scheduler = get_cosine_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=self.config.warmup_steps,
            num_training_steps=num_train_optimization_steps)
        self.lr_scheduler_each_iter = True
    elif self.config.lr_scheduler_name == "WarmRestart":
        self.scheduler = WarmRestart(self.optimizer, T_max=5, T_mult=1, eta_min=1e-6)
        self.lr_scheduler_each_iter = False
    elif self.config.lr_scheduler_name == "WarmupLinear":
        num_train_optimization_steps = self.config.num_epoch * len(self.train_data_loader) \
            // self.config.accumulation_steps
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=self.config.warmup_steps,
            num_training_steps=num_train_optimization_steps)
        self.lr_scheduler_each_iter = True
    elif self.config.lr_scheduler_name == "ReduceLROnPlateau":
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, mode='max', factor=0.6, patience=1, min_lr=1e-7)
        self.lr_scheduler_each_iter = False
    elif self.config.lr_scheduler_name == "WarmupConstant":
        self.scheduler = get_constant_schedule_with_warmup(
            self.optimizer, num_warmup_steps=self.config.warmup_steps)
        self.lr_scheduler_each_iter = True
    else:
        raise NotImplementedError

    # lr scheduler step for checkpoints
    if self.lr_scheduler_each_iter:
        self.scheduler.step(self.step)
    else:
        self.scheduler.step(self.epoch)
def configure_optimizers(self): """Configures optimizer for pytorch lightning.""" optimizer_dict = { "sgd": optim.SGD, "adam": optim.Adam } optimizer = optimizer_dict[self.config['optimizer']] self.optimizer = optimizer(self.model.parameters(), self.config['lr']) self.scheduler = get_cosine_schedule_with_warmup(self.optimizer, num_training_steps=config['n_steps'], num_warmup_steps=int(0.10*config['n_steps'])) return [self.optimizer], self.scheduler
def configure_optimizers(self):
    optimizer = torch.optim.Adam(params=[
        {"params": self.model.roberta.parameters(), "lr": self.lr_base},
        {"params": self.model.classifier.parameters(), "lr": self.lr_linear},
    ])
    # hard-coded schedule: no warmup, 20 training steps
    scheduler = get_cosine_schedule_with_warmup(optimizer, 0, 20)
    return [optimizer], [scheduler]
def _initialise_lr_scheduler(self, optimizer):
    num_batches = len(self.datasets['train']) // self.hparams.batch_size
    num_training_steps = num_batches // self.hparams.accumulate_grad_batches * self.hparams.max_epochs
    warmup_steps = int(num_training_steps * self.hparams.warmup_proportion)

    if self.hparams.learning_rate_scheduler == 'linear_with_warmup':
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=num_training_steps)
    elif self.hparams.learning_rate_scheduler == 'cosine_with_hard_restarts_warmup':
        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=num_training_steps,
            num_cycles=1)
    elif self.hparams.learning_rate_scheduler == 'cosine_schedule_with_warmup':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=num_training_steps)
    elif self.hparams.learning_rate_scheduler == 'constant_schedule_with_warmup':
        scheduler = get_constant_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps)
    elif self.hparams.learning_rate_scheduler == 'cosine_annealing_warm_restarts':
        scheduler = CosineAnnealingWarmRestarts(optimizer, warmup_steps)
    elif self.hparams.learning_rate_scheduler == 'reduce_on_plateau':
        scheduler = ReduceLROnPlateau(optimizer)
    elif self.hparams.learning_rate_scheduler == 'constant':
        scheduler = StepLR(optimizer, 10, gamma=1.0)
    else:
        # (error text originally listed 'step_lr', which matched no branch)
        raise ValueError(
            f'learning_rate_scheduler needs to be one of '
            f'linear_with_warmup, cosine_with_hard_restarts_warmup, cosine_schedule_with_warmup, '
            f'constant_schedule_with_warmup, cosine_annealing_warm_restarts, reduce_on_plateau, '
            f'constant. '
            f'Given: {self.hparams.learning_rate_scheduler}')

    logger.info(f'SCHEDULER: {self.hparams.learning_rate_scheduler} '
                f'num_batches={num_batches} '
                f'num_training_steps={num_training_steps} '
                f'warmup_steps={warmup_steps}')

    return {
        'scheduler': scheduler,
        'monitor': 'valid_loss',
        'interval': 'step',
        'frequency': 1
    }
def configure_optimizers(self):
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in self.generator.named_parameters()
                       if not any(nd in n for nd in no_decay)]
                      + [p for n, p in self.discriminator.named_parameters()
                         if not any(nd in n for nd in no_decay)],
            "weight_decay": self.config.weight_decay,
        },
        {
            "params": [p for n, p in self.generator.named_parameters()
                       if any(nd in n for nd in no_decay)]
                      + [p for n, p in self.discriminator.named_parameters()
                         if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    t_total = self.config.num_steps
    optimizer = Lamb(optimizer_grouped_parameters,
                     lr=self.config.learning_rate,
                     eps=self.config.epsilon)
    if self.config.lr_schedule == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.config.warmup_steps,
            num_training_steps=t_total)
    elif self.config.lr_schedule == 'cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.config.warmup_steps,
            num_training_steps=t_total)
    elif self.config.lr_schedule == 'constant':
        scheduler = get_constant_schedule_with_warmup(
            optimizer, num_warmup_steps=self.config.warmup_steps)
    else:
        # An unknown schedule name previously left `scheduler` unbound.
        raise ValueError(f'Unknown lr_schedule: {self.config.lr_schedule}')
    scheduler_config = {'scheduler': scheduler, 'interval': 'step'}
    return [optimizer], [scheduler_config]
def configure_optimizers(self):
    print(self.lr)
    optimizer = Adam(self.parameters(), betas=(0.9, 0.98), lr=self.lr, eps=1e-9)
    # hard-coded total of 18020 training steps
    scheduler = get_cosine_schedule_with_warmup(optimizer, self.warmup_steps, 18020)
    return [optimizer], [{
        "scheduler": scheduler,
        "interval": "step",
        "frequency": 1,
        "monitor": "val_loss",
        "strict": True,
        "name": "lr",
    }]
def __init__(self,
             model: BertBinaryClassification,
             train_dataloader: DataLoader,
             validation_dataloader: DataLoader,
             test_dataloader: DataLoader,
             epochs: int = 3):
    """
    :param model: a BertBinaryClassification model
    :param train_dataloader: a torch dataloader for training data
    :param validation_dataloader: a torch dataloader for validation data
    :param test_dataloader: a torch dataloader for test data
    :param epochs: number of training epochs
    """
    self.model = model

    # Use CUDA if a CUDA-enabled GPU is available
    self.device = torch.device("cpu")
    if torch.cuda.is_available():
        self.device = torch.device("cuda")
        print('Using device:', torch.cuda.get_device_name(0))
    else:
        print('Using CPU')

    # Model is moved to device in-place, but tensors are not:
    # Source: https://discuss.pytorch.org/t/model-move-to-device-gpu/105620
    self.model.to(self.device)
    self.model.set_class_weights(self.model.class_weights.to(self.device))

    self.train_dataloader = train_dataloader
    self.validation_dataloader = validation_dataloader
    self.test_dataloader = test_dataloader

    self.optimizer = AdamW(
        self.model.parameters(),
        lr=2e-5,            # base learning rate (TODO: do HPO on this parameter)
        weight_decay=0.001  # weight decay (TODO: HPO)
    )
    self.epochs = epochs
    total_steps = len(self.train_dataloader) * self.epochs

    # Create the learning rate scheduler.
    # TODO: e.g. get_cosine_with_hard_restarts_schedule_with_warmup
    self.scheduler = get_cosine_schedule_with_warmup(
        self.optimizer,
        num_warmup_steps=0,  # warm start
        num_training_steps=total_steps)
def get_single_optim_sched(model, num_data, lr, total_epochs, batch_size,
                           grad_accum_step, warmup_frac, adam_decay_rate=0.01):
    num_train_steps = int(num_data * total_epochs / (batch_size * grad_accum_step))
    params = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    is_backbone = lambda n: 'bert' in n
    lr_transformer = lr
    lr_head = lr * 500  # much larger lr for the freshly initialised head
    optimizer_parameters = [
        {
            'params': [p for n, p in params
                       if is_backbone(n) and not any(nd in n for nd in no_decay)],
            'weight_decay': adam_decay_rate,
            'lr': lr_transformer,
        },
        {
            'params': [p for n, p in params
                       if is_backbone(n) and any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
            'lr': lr_transformer,
        },
        {
            'params': [p for n, p in params if not is_backbone(n)],
            'weight_decay': adam_decay_rate,
            'lr': lr_head,
        },
    ]
    optimizer = AdamW(optimizer_parameters)
    num_warmup_steps = int(num_train_steps * warmup_frac)
    lr_scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_steps)
    return optimizer, lr_scheduler
def configure_optimizers(self):
    # optimizer
    optimizer = optim.AdamW(
        self.model.parameters(),
        lr=self.hparams.lr,
        weight_decay=self.hparams.weight_decay,
    )
    # lr warmup scheduler
    self.warmup_steps = math.ceil(self.step_total * self.hparams.warmup_ratio)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=self.warmup_steps,
        num_training_steps=self.step_total,
    )
    # Lightning expects a pair of lists here; the bare tuple the original
    # returned is read as a list of optimizers.
    return [optimizer], [scheduler]
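# Hedged aside (not in the original snippet): Lightning advances schedulers
# once per *epoch* by default, while the warmup above is counted in optimizer
# *steps*. The dict form used elsewhere in this section makes the stepping
# interval explicit:
#
#     return [optimizer], [{"scheduler": scheduler, "interval": "step"}]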
def build_scheduler(optimizer, cfg, num_training_steps):
    scheduler_type = cfg['scheduler']
    warmup_steps = cfg['scheduler_warmup_steps']
    if scheduler_type == 'constant_schedule_with_warmup':
        return transformers.get_constant_schedule_with_warmup(optimizer, warmup_steps)
    elif scheduler_type == 'cosine_schedule_with_warmup':
        return transformers.get_cosine_schedule_with_warmup(
            optimizer, warmup_steps, num_training_steps)
    else:
        raise Exception(
            'Scheduler name invalid, choices are: "constant_schedule_with_warmup"\n'
            'or "cosine_schedule_with_warmup"')
def get_lr_scheduler(optimizer, scheduler_type, warmup_steps=None, num_steps=None, last_epoch=-1):
    if scheduler_type == "linear":
        scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, num_steps)
    elif scheduler_type == "constant":
        scheduler = get_constant_schedule(optimizer)
    elif scheduler_type == "cosine":
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, warmup_steps, num_steps, last_epoch=last_epoch)
    else:
        raise ValueError("Unknown scheduler_type: {}".format(scheduler_type))
    return scheduler
def train(folds, model, optimizer):
    cv = []  # stores the best accuracy of each fold
    for fold, (trn_idx, val_idx) in enumerate(folds):
        train_x = np.array(X)[trn_idx]
        train_y = np.array(y)[trn_idx]
        val_x = np.array(X)[val_idx]
        val_y = np.array(y)[val_idx]
        train_set = MyDataset(train_x, train_y)
        val_set = MyDataset(val_x, val_y)

        # num_workers controls how many subprocesses handle data loading;
        # 0 means all data is loaded in the main process (the default).
        train_loader = DataLoader(train_set,
                                  batch_size=conf['train_bs'],
                                  collate_fn=collate_fn,
                                  shuffle=True,
                                  num_workers=conf['num_workers'])
        val_loader = DataLoader(val_set,
                                batch_size=conf['valid_bs'],
                                collate_fn=collate_fn,
                                shuffle=False,
                                num_workers=conf['num_workers'])

        best_acc = 0
        # model = BertForMultipleChoice.from_pretrained(conf['model']).to(conf['device'])  # model
        scaler = GradScaler()
        # optimizer = AdamW(model.parameters(), lr=conf['lr'], weight_decay=conf['weight_decay'])  # AdamW optimizer
        criterion = nn.CrossEntropyLoss()

        # Warmup starts training with a small learning rate, quickly ramps up to
        # the full rate, and then applies the usual decay. With the
        # get_cosine_schedule_with_warmup policy, the lr warms up for one epoch
        # and then decays along a cosine curve.
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            len(train_loader) // conf['accum_iter'],
            conf['epochs'] * len(train_loader) // conf['accum_iter'])

        for epoch in range(conf['epochs']):
            print('epoch:', epoch)
            train_loss, train_acc = train_model(model, train_loader, optimizer,
                                                scheduler, criterion, scaler)
            val_loss, val_acc = test_model(model, val_loader, criterion)
            if val_acc > best_acc:
                best_acc = val_acc
                torch.save(model.state_dict(),
                           '../save/{}_fold_{}.pt'.format(conf['model'].split('/')[-1], fold))
        cv.append(best_acc)
    return cv  # per-fold best accuracies (the original built `cv` but never returned it)
def get_optim_sched_at(model, epoch_i, num_data,
                       lr_each_epochs=[1e-5, 1e-5, 5e-6, 3e-6],
                       use_sched_each_epochs=[False, False, True, True],
                       adam_decay_rate=0.01):
    num_train_steps = int(num_data / args.batch_size)
    params = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    is_backbone = lambda n: 'bert' in n
    optimizer_parameters = [
        {
            'params': [p for n, p in params
                       if is_backbone(n) and not any(nd in n for nd in no_decay)],
            'weight_decay': adam_decay_rate,
            'lr': lr_each_epochs[epoch_i],
        },
        {
            'params': [p for n, p in params
                       if is_backbone(n) and any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
            'lr': lr_each_epochs[epoch_i],
        },
        {
            'params': [p for n, p in params if not is_backbone(n)],
            'weight_decay': adam_decay_rate,
            'lr': lr_each_epochs[epoch_i] * 500,
        },
    ]
    optimizer = AdamW(optimizer_parameters)
    lr_scheduler = None
    if use_sched_each_epochs[epoch_i]:
        lr_scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
    print('[Epoch {}, lr {}, sche {}]'.format(epoch_i, lr_each_epochs[epoch_i], lr_scheduler))
    return optimizer, lr_scheduler
def make_scheduler(optimizer, decay_name='linear', t_max=None, warmup_steps=None):
    if decay_name == 'step':
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                   milestones=[30, 60, 90],
                                                   gamma=0.1)
    elif decay_name == 'cosine':
        scheduler = lrs.CosineAnnealingLR(optimizer, T_max=t_max)
    elif decay_name == "cosine_warmup":
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=t_max)
    elif decay_name == "linear":
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=t_max)
    else:
        # the original formatted an undefined `decay_type` here
        raise Exception('Unknown lr scheduler: {}'.format(decay_name))
    return scheduler
def my_fancy_optimizer(warmup_proportion=0.1):
    num_train_optimization_steps = len(train_dataset) * params.n_epochs
    param_optimizer = list(model.parameters())
    # param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # optimizer_grouped_parameters = [
    #     {'params': [p for n, p in param_optimizer if not any(
    #         nd in n for nd in no_decay)], 'weight_decay': 0.01},
    #     {'params': [p for n, p in param_optimizer if any(
    #         nd in n for nd in no_decay)], 'weight_decay': 0.0}
    # ]
    optimizer = AdamW(param_optimizer, lr=params.lr, correct_bias=True)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(warmup_proportion * num_train_optimization_steps),
        num_training_steps=num_train_optimization_steps)
    return optimizer, scheduler