def test_closed_form_lwca_lr_with_nz_start_lr_nz_eta_min():
    seed_everything()
    warmup_start_lr = 0.009
    base_lr = 0.07
    eta_min = 0.003
    warmup_epochs = 15
    max_epochs = 115
    multiplier = 32

    test_lr_scheduler = TestLRScheduler(base_lr=base_lr, multiplier=multiplier)
    scheduler = LinearWarmupCosineAnnealingLR(
        optimizer=test_lr_scheduler.optimizer,
        warmup_epochs=warmup_epochs,
        max_epochs=max_epochs,
        warmup_start_lr=warmup_start_lr,
        eta_min=eta_min,
    )
    closed_form_scheduler = LinearWarmupCosineAnnealingLR(
        optimizer=test_lr_scheduler.closed_form_opt,
        warmup_epochs=warmup_epochs,
        max_epochs=max_epochs,
        warmup_start_lr=warmup_start_lr,
        eta_min=eta_min,
    )
    test_lr_scheduler._test_against_closed_form(scheduler, closed_form_scheduler, epochs=max_epochs)
def test_closed_form_lwca_lr_with_nz_start_lr(tmpdir):
    seed_everything()
    warmup_start_lr = 0.2
    base_lr = 0.8
    eta_min = 0.0
    warmup_epochs = 9
    max_epochs = 28
    multiplier = 10

    test_lr_scheduler = TestLRScheduler(base_lr=base_lr, multiplier=multiplier)
    scheduler = LinearWarmupCosineAnnealingLR(
        optimizer=test_lr_scheduler.optimizer,
        warmup_epochs=warmup_epochs,
        max_epochs=max_epochs,
        warmup_start_lr=warmup_start_lr,
        eta_min=eta_min,
    )
    closed_form_scheduler = LinearWarmupCosineAnnealingLR(
        optimizer=test_lr_scheduler.closed_form_opt,
        warmup_epochs=warmup_epochs,
        max_epochs=max_epochs,
        warmup_start_lr=warmup_start_lr,
        eta_min=eta_min,
    )
    test_lr_scheduler._test_against_closed_form(
        scheduler, closed_form_scheduler, epochs=max_epochs
    )
def test_closed_form_lwca_lr_with_nz_eta_min(tmpdir):
    reset_seed()
    warmup_start_lr = 0.0
    base_lr = 0.04
    eta_min = 0.0001
    warmup_epochs = 15
    max_epochs = 47
    multiplier = 17

    test_lr_scheduler = TestLRScheduler(base_lr=base_lr, multiplier=multiplier)
    scheduler = LinearWarmupCosineAnnealingLR(
        optimizer=test_lr_scheduler.optimizer,
        warmup_epochs=warmup_epochs,
        max_epochs=max_epochs,
        warmup_start_lr=warmup_start_lr,
        eta_min=eta_min,
    )
    closed_form_scheduler = LinearWarmupCosineAnnealingLR(
        optimizer=test_lr_scheduler.closed_form_opt,
        warmup_epochs=warmup_epochs,
        max_epochs=max_epochs,
        warmup_start_lr=warmup_start_lr,
        eta_min=eta_min,
    )
    test_lr_scheduler._test_against_closed_form(
        scheduler, closed_form_scheduler, epochs=max_epochs
    )
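# The closed-form tests above rely on a TestLRScheduler helper that is not shown
# here. The sketch below illustrates what the closed-form comparison amounts to,
# under assumed names: one scheduler is advanced iteratively, the other is queried
# with an explicit epoch index (which routes through _get_closed_form_lr inside
# PyTorch's LRScheduler.step), and the two must produce the same learning rates.
import torch
from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR


def _sketch_closed_form_comparison():
    model = torch.nn.Linear(4, 4)
    opt_stepped = torch.optim.SGD(model.parameters(), lr=0.07)
    opt_closed = torch.optim.SGD(model.parameters(), lr=0.07)

    kwargs = dict(warmup_epochs=15, max_epochs=115, warmup_start_lr=0.009, eta_min=0.003)
    stepped = LinearWarmupCosineAnnealingLR(opt_stepped, **kwargs)
    closed = LinearWarmupCosineAnnealingLR(opt_closed, **kwargs)

    for epoch in range(115):
        # closed form: compute the LR for this epoch directly from the epoch index
        # (emits a deprecation warning on recent PyTorch, but this is exactly what
        # the closed-form tests exercise)
        opt_closed.step()
        closed.step(epoch)

        # the iteratively stepped scheduler already holds the LR for this epoch
        for a, b in zip(opt_stepped.param_groups, opt_closed.param_groups):
            assert abs(a["lr"] - b["lr"]) < 1e-6

        # advance the stepped scheduler by one epoch
        opt_stepped.step()
        stepped.step()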
def configure_optimizers(self):
    parameters = self.exclude_from_wt_decay(
        self.named_parameters(), weight_decay=self.hparams.opt_weight_decay
    )
    optimizer = LARSWrapper(Adam(parameters, lr=self.hparams.lr))

    self.hparams.warmup_epochs = self.hparams.warmup_epochs * self.train_iters_per_epoch
    max_epochs = self.trainer.max_epochs * self.train_iters_per_epoch

    linear_warmup_cosine_decay = LinearWarmupCosineAnnealingLR(
        optimizer,
        warmup_epochs=self.hparams.warmup_epochs,
        max_epochs=max_epochs,
        warmup_start_lr=0,
        eta_min=0,
    )

    scheduler = {
        'scheduler': linear_warmup_cosine_decay,
        'interval': 'step',
        'frequency': 1,
    }

    return [optimizer], [scheduler]
def configure_optimizers(self):
    # TRICK 1 (Use lars + filter weights)
    # exclude certain parameters
    parameters = self.exclude_from_wt_decay(
        self.named_parameters(), weight_decay=self.hparams.opt_weight_decay)

    optimizer = LARSWrapper(Adam(parameters, lr=self.hparams.learning_rate))

    # Trick 2 (after each step)
    self.hparams.warmup_epochs = self.hparams.warmup_epochs * self.train_iters_per_epoch
    max_epochs = self.trainer.max_epochs * self.train_iters_per_epoch

    linear_warmup_cosine_decay = LinearWarmupCosineAnnealingLR(
        optimizer,
        warmup_epochs=self.hparams.warmup_epochs,
        max_epochs=max_epochs,
        warmup_start_lr=0,
        eta_min=0)

    scheduler = {
        'scheduler': linear_warmup_cosine_decay,
        'interval': 'step',
        'frequency': 1
    }

    if self.perc == 0.01:
        return [optimizer], []
    else:
        return [optimizer], [scheduler]
def configure_optimizers(self) -> Any:
    """
    Configures the optimizer used for training: Adam wrapped in LARS, excluding
    certain parameters (batch norm and the bias of convolutions) from weight decay,
    and applies a linear warm-up followed by cosine annealing of the learning rate.
    """
    # TRICK 1 (Use lars + filter weights)
    # exclude certain parameters
    parameters = self.exclude_from_wt_decay(
        self.online_network.named_parameters(),
        weight_decay=self.hparams.weight_decay)  # type: ignore
    optimizer = LARSWrapper(Adam(parameters, lr=self.hparams.learning_rate))  # type: ignore

    # Trick 2 (after each step)
    self.hparams.warmup_epochs = self.hparams.warmup_epochs * self.train_iters_per_epoch  # type: ignore
    max_epochs = self.trainer.max_epochs * self.train_iters_per_epoch

    linear_warmup_cosine_decay = LinearWarmupCosineAnnealingLR(
        optimizer,
        warmup_epochs=self.hparams.warmup_epochs,  # type: ignore
        max_epochs=max_epochs,
        warmup_start_lr=0,
        eta_min=self.min_learning_rate,
    )

    scheduler = {'scheduler': linear_warmup_cosine_decay, 'interval': 'step', 'frequency': 1}
    return [optimizer], [scheduler]
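# Several of the configure_optimizers snippets above call an exclude_from_wt_decay
# helper that is not shown. A minimal sketch of the usual pattern (as found in
# pl_bolts' SimCLR module) follows; the method name matches the calls above, but the
# skip list and exact grouping are assumptions.
def exclude_from_wt_decay(self, named_params, weight_decay, skip_list=('bias', 'bn')):
    params = []
    excluded_params = []
    for name, param in named_params:
        if not param.requires_grad:
            continue
        if any(layer_name in name for layer_name in skip_list):
            # bias and batch-norm parameters: no weight decay
            excluded_params.append(param)
        else:
            params.append(param)
    # return two param groups so the optimizer applies weight decay selectively
    return [
        {'params': params, 'weight_decay': weight_decay},
        {'params': excluded_params, 'weight_decay': 0.0},
    ]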
def configure_optimizers(self):
    optimizer = Adam(
        self.parameters(),
        lr=self.hparams.learning_rate,
        weight_decay=self.hparams.weight_decay)
    scheduler = LinearWarmupCosineAnnealingLR(
        optimizer,
        warmup_epochs=self.hparams.warmup_epochs,
        max_epochs=self.hparams.max_epochs)
    return [optimizer], [scheduler]
def configure_optimizers(self) -> Any:
    # exclude certain parameters
    parameters = self.exclude_from_wt_decay(
        self.online_network.named_parameters(),
        weight_decay=self.hparams.weight_decay)  # type: ignore
    optimizer = Adam(
        parameters,
        lr=self.hparams.learning_rate,  # type: ignore
        weight_decay=self.hparams.weight_decay)  # type: ignore
    scheduler = LinearWarmupCosineAnnealingLR(
        optimizer,
        warmup_epochs=self.hparams.warmup_epochs,  # type: ignore
        max_epochs=self.hparams.max_epochs)  # type: ignore
    return [optimizer], [scheduler]
def test_lwca_lr(tmpdir):
    seed_everything()
    warmup_start_lr = 0.0
    base_lr = 0.4
    eta_min = 0.0
    warmup_epochs = 6
    max_epochs = 15
    multiplier = 10

    # define target schedule
    targets = []

    # param-group 1
    warmup_lr_schedule = np.linspace(warmup_start_lr, base_lr, warmup_epochs)
    iters = np.arange(max_epochs - warmup_epochs)
    cosine_lr_schedule = np.array([
        eta_min + 0.5 * (base_lr - eta_min) * (
            1 + math.cos(math.pi * t / (max_epochs - warmup_epochs))
        )
        for t in iters
    ])
    lr_schedule = np.concatenate((warmup_lr_schedule, cosine_lr_schedule))
    targets.append(list(lr_schedule))

    # param-group 2
    base_lr2 = base_lr * multiplier
    warmup_lr_schedule = np.linspace(warmup_start_lr, base_lr2, warmup_epochs)
    cosine_lr_schedule = np.array([
        eta_min + 0.5 * (base_lr2 - eta_min) * (
            1 + math.cos(math.pi * t / (max_epochs - warmup_epochs))
        )
        for t in iters
    ])
    lr_schedule = np.concatenate((warmup_lr_schedule, cosine_lr_schedule))
    targets.append(list(lr_schedule))

    test_lr_scheduler = TestLRScheduler(base_lr=base_lr, multiplier=multiplier)
    scheduler = LinearWarmupCosineAnnealingLR(
        optimizer=test_lr_scheduler.optimizer,
        warmup_epochs=warmup_epochs,
        max_epochs=max_epochs,
        warmup_start_lr=warmup_start_lr,
        eta_min=eta_min,
    )
    test_lr_scheduler._test_lr(scheduler, targets, epochs=max_epochs)
def configure_optimizers(self):
    # linear LR scaling with the effective batch size
    lr = self.hparams.learning_rate * (self.effective_bsz / 256)

    params = list(self.encoder_online.parameters()) + \
        list(self.predictor_theta_online.parameters()) + \
        list(self.proj_head_online.parameters())

    if self.hparams.optimiser == 'lars':
        models = [
            self.encoder_online,
            self.predictor_theta_online,
            self.proj_head_online
        ]
        param_list = collect_params(models, exclude_bias_and_bn=True)
        optimizer = LARSSGD(
            param_list, lr=lr, weight_decay=self.hparams.weight_decay,
            eta=0.001, nesterov=False)
    elif self.hparams.optimiser == 'adam':
        optimizer = Adam(params, lr=lr, weight_decay=self.hparams.weight_decay)
    elif self.hparams.optimiser == 'sgd':
        optimizer = SGD(
            params, lr=lr, weight_decay=self.hparams.weight_decay,
            momentum=0.9, nesterov=True)
    else:
        raise NotImplementedError('{} not setup.'.format(self.hparams.optimiser))

    scheduler = LinearWarmupCosineAnnealingLR(
        optimizer,
        warmup_epochs=self.hparams.warmup_epochs,
        max_epochs=self.hparams.max_epochs,
        warmup_start_lr=1e-3 * lr)

    return [optimizer], [scheduler]
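# collect_params and LARSSGD above come from the surrounding project and are not
# shown here. A common shape for such a helper in BYOL-style code is sketched below;
# the lars_exclude key is an assumption about how the custom LARS optimizer marks
# parameters that should skip weight decay and the LARS adaptation.
def collect_params(models, exclude_bias_and_bn=True):
    param_list = []
    for model in models:
        for name, param in model.named_parameters():
            if exclude_bias_and_bn and ('bn' in name or 'bias' in name):
                # bias / batch-norm: no weight decay, excluded from LARS adaptation
                param_list.append({'params': param, 'weight_decay': 0.0, 'lars_exclude': True})
            else:
                param_list.append({'params': param})
    return param_list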
def configure_optimizers(self) -> Tuple[list, list]:
    parameters = self.exclude_from_wt_decay(
        self.named_parameters(), weight_decay=self.config.opt_weight_decay)

    optimizer = torch.optim.Adam(
        parameters,
        lr=self.config.lr * math.sqrt(self.config.batch_size * self.config.num_of_mini_batch),
    )

    warmup_epochs = (self.config.warmup_epochs * self.train_iters_per_epoch
                     // self.config.num_of_mini_batch)

    # Override the scheduler's max epochs so that fine-tuned and fully supervised
    # models can be compared fairly.
    if ("lr_max_epochs" in self.config.keys()
            and self.config["lr_max_epochs"] is not None):
        max_epochs = (self.config["lr_max_epochs"] * self.train_iters_per_epoch
                      // self.config.num_of_mini_batch)
    else:
        max_epochs = (self.trainer.max_epochs * self.train_iters_per_epoch
                      // self.config.num_of_mini_batch)

    if self.config.optimizer == "LARS":
        optimizer = LARSWrapper(optimizer)
        scheduler = LinearWarmupCosineAnnealingLR(
            optimizer,
            warmup_epochs=warmup_epochs,
            max_epochs=max_epochs,
            warmup_start_lr=0,
            eta_min=0,
        )
    else:
        scheduler = CosineAnnealingLR(optimizer, T_max=max_epochs)

    scheduler = {
        "scheduler": scheduler,
        "interval": "step",
        "frequency": 1
    }
    return [optimizer], [scheduler]
def configure_optimizers(self):
    # Exclude certain parameters from weight decay: look through all the model's
    # parameters (e.g. encoder, projection head) and do not apply weight decay to
    # bias and batch-norm parameters.
    parameters = self.exclude_from_wt_decay(
        self.named_parameters(), weight_decay=self.hparams.opt_weight_decay)

    # TRICK 1 --> use LARS + filter weights
    optimizer = torch.optim.SGD(parameters, lr=self.hparams.lars_lr)
    optimizer_LARS = LARSWrapper(optimizer, eta=self.hparams.lars_eta)

    # TRICK 2 --> step the scheduler after each training step (iteration) rather than
    # each epoch, so warmup_epochs x train_iters_per_epoch is the total number of
    # warm-up steps.
    self.hparams.warmup_epochs = self.hparams.warmup_epochs * self.train_iters_per_epoch
    max_epochs = self.trainer.max_epochs * self.train_iters_per_epoch

    # The scheduler (from pl_bolts) ramps the learning rate linearly from
    # warmup_start_lr up to the optimizer's learning rate over the warm-up steps,
    # then follows a cosine decay for the remaining steps.
    linear_warmup_cosine_decay = LinearWarmupCosineAnnealingLR(
        optimizer,
        warmup_epochs=self.hparams.warmup_epochs,
        max_epochs=max_epochs,  # both values are expressed in steps here
        warmup_start_lr=0,
        eta_min=0  # final learning rate
    )

    # PyTorch Lightning steps schedulers once per epoch by default; this dictionary
    # overrides that so the scheduler is stepped every training step.
    scheduler = {
        'scheduler': linear_warmup_cosine_decay,
        'interval': 'step',
        # with frequency=1 the scheduler updates every step; a value of 5 would update every 5 steps
        'frequency': 1
    }

    # return lists because multiple optimizers or schedulers are allowed
    return [optimizer], [scheduler]
def configure_optimizers(self):
    parameters = self.exclude_from_wt_decay(
        self.named_parameters(), weight_decay=self.hparams.opt_weight_decay)
    optimizer = Adam(parameters, lr=self.hparams.lr)
    optimizer = LARSWrapper(optimizer)

    # Trick 2 (after each step)
    linear_warmup_cosine_decay = LinearWarmupCosineAnnealingLR(
        optimizer,
        warmup_epochs=self.hparams.warmup_epochs,
        max_epochs=self.hparams.max_epochs,
        warmup_start_lr=0,
        eta_min=0,
    )
    scheduler = {
        "scheduler": linear_warmup_cosine_decay,
        "interval": "step",
        "frequency": 1,
    }
    return [optimizer], [scheduler]
def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr=5e-4)
    scheduler = LinearWarmupCosineAnnealingLR(optimizer, warmup_epochs=1, max_epochs=40)
    return [optimizer], [scheduler]
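# Minimal standalone usage of the same scheduler outside of a LightningModule
# (a sketch; the pl_bolts import path is assumed, adjust if the class is vendored locally).
import torch
from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR

model = torch.nn.Linear(16, 4)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
scheduler = LinearWarmupCosineAnnealingLR(optimizer, warmup_epochs=1, max_epochs=40)

for epoch in range(40):
    # ... one epoch of training goes here ...
    optimizer.step()   # call optimizer.step() before scheduler.step()
    scheduler.step()   # warm-up for the first epoch, then cosine annealing toward eta_min (0 by default)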