def configure_optimizers(self):
    if self.exclude_bn_bias:
        params = self.exclude_from_wt_decay(
            self.named_parameters(), weight_decay=self.weight_decay
        )
    else:
        params = self.parameters()

    if self.optim == 'sgd':
        optimizer = torch.optim.SGD(
            params, lr=self.learning_rate, momentum=0.9, weight_decay=self.weight_decay
        )
    elif self.optim == 'adam':
        optimizer = torch.optim.Adam(
            params, lr=self.learning_rate, weight_decay=self.weight_decay
        )

    if self.lars_wrapper:
        optimizer = LARSWrapper(
            optimizer,
            eta=0.001,  # trust coefficient
            clip=False
        )

    return optimizer

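# NOTE (added): the configure_optimizers() variants in this file call
# self.exclude_from_wt_decay() without defining it. Below is a minimal sketch of that
# helper (a method on the LightningModule), assuming the pl_bolts SimCLR convention of
# skipping weight decay for biases and batch-norm parameters; the skip_list names are an
# assumption and may differ in the original projects.
def exclude_from_wt_decay(self, named_params, weight_decay, skip_list=("bias", "bn")):
    params = []
    excluded_params = []
    for name, param in named_params:
        if not param.requires_grad:
            continue  # frozen parameters are never optimized
        if any(layer_name in name for layer_name in skip_list):
            excluded_params.append(param)  # biases / batch-norm params: no weight decay
        else:
            params.append(param)
    # parameter groups in the format torch.optim optimizers accept directly
    return [
        {"params": params, "weight_decay": weight_decay},
        {"params": excluded_params, "weight_decay": 0.0},
    ]
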
def configure_optimizers(self):
    parameters = self.exclude_from_wt_decay(
        self.named_parameters(), weight_decay=self.hparams.opt_weight_decay
    )
    optimizer = LARSWrapper(Adam(parameters, lr=self.hparams.lr))

    self.hparams.warmup_epochs = self.hparams.warmup_epochs * self.train_iters_per_epoch
    max_epochs = self.trainer.max_epochs * self.train_iters_per_epoch

    linear_warmup_cosine_decay = LinearWarmupCosineAnnealingLR(
        optimizer,
        warmup_epochs=self.hparams.warmup_epochs,
        max_epochs=max_epochs,
        warmup_start_lr=0,
        eta_min=0
    )

    scheduler = {
        'scheduler': linear_warmup_cosine_decay,
        'interval': 'step',
        'frequency': 1
    }

    return [optimizer], [scheduler]

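# NOTE (added): imports assumed by the snippets above and below. The pl_bolts module
# paths are an assumption and depend on the installed lightning-bolts version
# (LARSWrapper was removed in later releases in favour of a standalone LARS optimizer).
import math
from typing import Any, Tuple

import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from pl_bolts.optimizers.lars_scheduling import LARSWrapper                  # assumed path
from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR   # assumed path
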
def configure_optimizers(self):
    # TRICK 1 (use LARS + filter weights)
    # exclude certain parameters
    parameters = self.exclude_from_wt_decay(
        self.named_parameters(), weight_decay=self.hparams.opt_weight_decay
    )
    optimizer = LARSWrapper(Adam(parameters, lr=self.hparams.learning_rate))

    # TRICK 2 (step the schedule after each optimizer step, not each epoch)
    self.hparams.warmup_epochs = self.hparams.warmup_epochs * self.train_iters_per_epoch
    max_epochs = self.trainer.max_epochs * self.train_iters_per_epoch

    linear_warmup_cosine_decay = LinearWarmupCosineAnnealingLR(
        optimizer,
        warmup_epochs=self.hparams.warmup_epochs,
        max_epochs=max_epochs,
        warmup_start_lr=0,
        eta_min=0
    )

    scheduler = {
        'scheduler': linear_warmup_cosine_decay,
        'interval': 'step',
        'frequency': 1
    }

    # with perc == 0.01 (1% of the data) train without the LR scheduler
    if self.perc == 0.01:
        return [optimizer], []
    return [optimizer], [scheduler]

def configure_optimizers(self) -> Any:
    """
    Configure the optimizer used for training: an Adam optimizer wrapped with LARS,
    excluding certain parameters (batch norm and convolution biases) from weight decay,
    with a linear warm-up followed by cosine annealing of the learning rate.
    """
    # TRICK 1 (use LARS + filter weights)
    # exclude certain parameters
    parameters = self.exclude_from_wt_decay(
        self.online_network.named_parameters(),
        weight_decay=self.hparams.weight_decay,  # type: ignore
    )
    optimizer = LARSWrapper(Adam(parameters, lr=self.hparams.learning_rate))  # type: ignore

    # TRICK 2 (step the schedule after each optimizer step, not each epoch)
    self.hparams.warmup_epochs = self.hparams.warmup_epochs * self.train_iters_per_epoch  # type: ignore
    max_epochs = self.trainer.max_epochs * self.train_iters_per_epoch

    linear_warmup_cosine_decay = LinearWarmupCosineAnnealingLR(
        optimizer,
        warmup_epochs=self.hparams.warmup_epochs,  # type: ignore
        max_epochs=max_epochs,
        warmup_start_lr=0,
        eta_min=self.min_learning_rate,
    )

    scheduler = {'scheduler': linear_warmup_cosine_decay, 'interval': 'step', 'frequency': 1}
    return [optimizer], [scheduler]

def configure_optimizers(self):
    if self.exclude_bn_bias:
        params = self.exclude_from_wt_decay(self.named_parameters(), weight_decay=self.weight_decay)
    else:
        params = self.parameters()

    # build explicit parameter groups: encoder + projector share one group and the
    # predictor gets its own group (this overrides `params` computed above)
    backbone_and_encoder_parameters = [param for name, param in self.online_network.encoder.named_parameters()]
    backbone_and_encoder_parameters += [param for name, param in self.online_network.projector.named_parameters()]
    lr = self.learning_rate
    params = [
        {
            'name': 'base',
            'params': backbone_and_encoder_parameters,
            'lr': lr
        },
        {
            'name': 'predictor',
            'params': [param for name, param in self.online_network.predictor.named_parameters()],
            'lr': lr
        },
    ]

    if self.optim == 'sgd':
        optimizer = torch.optim.SGD(params, lr=self.learning_rate, momentum=0.9, weight_decay=self.weight_decay)
    elif self.optim == 'adam':
        optimizer = torch.optim.Adam(params, lr=self.learning_rate, weight_decay=self.weight_decay)

    if self.lars_wrapper:
        optimizer = LARSWrapper(
            optimizer,
            eta=0.001,  # trust coefficient
            clip=False
        )

    return optimizer

def configure_optimizers(self):
    optimizer = Adam(
        self.parameters(),
        lr=self.hparams.learning_rate,
        weight_decay=self.hparams.weight_decay
    )
    optimizer = LARSWrapper(optimizer)

    scheduler = LinearWarmupCosineAnnealingLR(
        optimizer,
        warmup_epochs=self.hparams.warmup_epochs,
        max_epochs=self.hparams.max_epochs
    )

    return [optimizer], [scheduler]

def configure_optimizers(self) -> Tuple[list, list]:
    parameters = self.exclude_from_wt_decay(
        self.named_parameters(), weight_decay=self.config.opt_weight_decay
    )
    optimizer = torch.optim.Adam(
        parameters,
        lr=self.config.lr * math.sqrt(self.config.batch_size * self.config.num_of_mini_batch),
    )

    warmup_epochs = (
        self.config.warmup_epochs * self.train_iters_per_epoch // self.config.num_of_mini_batch
    )

    # Update the max epochs of the learning rate scheduler for a fair comparison of
    # fine-tuned and fully supervised models.
    if "lr_max_epochs" in self.config.keys() and self.config["lr_max_epochs"] is not None:
        max_epochs = (
            self.config["lr_max_epochs"] * self.train_iters_per_epoch // self.config.num_of_mini_batch
        )
    else:
        max_epochs = (
            self.trainer.max_epochs * self.train_iters_per_epoch // self.config.num_of_mini_batch
        )

    if self.config.optimizer == "LARS":
        optimizer = LARSWrapper(optimizer)
        scheduler = LinearWarmupCosineAnnealingLR(
            optimizer,
            warmup_epochs=warmup_epochs,
            max_epochs=max_epochs,
            warmup_start_lr=0,
            eta_min=0,
        )
    else:
        scheduler = CosineAnnealingLR(optimizer, T_max=max_epochs)

    scheduler = {
        "scheduler": scheduler,
        "interval": "step",
        "frequency": 1
    }
    return [optimizer], [scheduler]

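# NOTE (added): a worked example of the step accounting above, under the assumption of
# train_iters_per_epoch = 1000 loader batches per epoch and num_of_mini_batch = 4
# gradient-accumulation steps (the optimizer steps once every 4 batches):
#   warmup_epochs = 10   ->  10 * 1000 // 4 =  2,500 warm-up scheduler steps
#   max_epochs    = 100  -> 100 * 1000 // 4 = 25,000 scheduler steps in total
# which keeps the per-step schedule aligned with actual optimizer updates rather than
# with raw dataloader batches.
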
def configure_optimizers(self):
    parameters = self.exclude_from_wt_decay(
        self.named_parameters(), weight_decay=self.hparams.opt_weight_decay
    )
    optimizer = Adam(parameters, lr=self.hparams.lr)
    optimizer = LARSWrapper(optimizer)

    # Trick 2 (step the schedule after each optimizer step)
    linear_warmup_cosine_decay = LinearWarmupCosineAnnealingLR(
        optimizer,
        warmup_epochs=self.hparams.warmup_epochs,
        max_epochs=self.hparams.max_epochs,
        warmup_start_lr=0,
        eta_min=0,
    )
    scheduler = {
        "scheduler": linear_warmup_cosine_decay,
        "interval": "step",
        "frequency": 1,
    }
    return [optimizer], [scheduler]

def model(dict_params, _server, id):
    s = 1
    color_jitter = transforms.ColorJitter(0.8 * s, 0.8 * s, 0.8 * s, 0.2 * s)
    transform_train = transforms.Compose([
        transforms.Resize((250, 250)),
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomApply([color_jitter], p=0.8),
        transforms.RandomGrayscale(p=0.2),
        transforms.GaussianBlur(23, sigma=(0.1, 2.0)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # basic transformation on test images
    transform_test = transforms.Compose([
        transforms.Resize((224, 224)),                            # resize image
        transforms.ToTensor(),                                    # PIL image to tensor
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))    # normalize the image
    ])

    batch_size = dict_params["batch_size"]

    # train-set with augmentations (used for the self-supervised objective)
    dataset = Cub2011(root="CUB", train=True, transform=transform_train, all=True, augment=True)
    dataset_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=16, pin_memory=True)

    # train-set without augmentations (used to extract features)
    raw_train_dataset = Cub2011(root="CUB", train=True, transform=transform_test, augment=False)
    raw_train_loader = DataLoader(raw_train_dataset, batch_size=batch_size, shuffle=False, num_workers=16,
                                  pin_memory=True)

    # test-set
    raw_test_dataset = Cub2011(root="CUB", train=False, transform=transform_test, augment=False)
    raw_test_loader = DataLoader(raw_test_dataset, batch_size=batch_size, shuffle=False, num_workers=16,
                                 pin_memory=True)

    num_epochs = dict_params["epochs"]

    # pick the learning rate: an explicit float, linear scaling ("1") or square-root scaling ("2")
    if isinstance(dict_params["lr"], float):
        lr = dict_params["lr"]
    elif dict_params["lr"] == "1":
        lr = float(0.3 * batch_size / 256)
    elif dict_params["lr"] == "2":
        lr = float(0.075 * math.sqrt(batch_size))
    else:  # default: same as "1"
        lr = float(0.3 * batch_size / 256)

    devices = [7, 6] if _server == "dgx" else [3, 1, 0, 2]

    # create a new instance of the model
    simclr_model = SimCLRModel(PRE)
    simclr_model = nn.DataParallel(simclr_model, device_ids=devices, output_device=devices[0])

    # LARS wrapper on top of SGD with momentum
    base_optimizer = optim.SGD(simclr_model.parameters(), lr=lr, momentum=0.9)
    simclr_opt = LARSWrapper(base_optimizer, eta=dict_params["eta"], clip=True, eps=1e-8)

    train_features, train_labels, test_features, test_labels = self_supervised_training(
        simclr_model, simclr_opt, dataset_loader, raw_train_loader, raw_test_loader,
        dict_params, id, devices=devices, num_epochs=num_epochs, print_freq=30,
        device="cuda:{}".format(devices[0]), temperature=dict_params["temperature"], pre=PRE
    )

    return train_features, train_labels, test_features, test_labels

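# NOTE (added): illustrative arithmetic for the two learning-rate heuristics selectable
# above (both appear in the SimCLR papers); the numbers below are just examples.
#   "1" linear scaling:      lr = 0.3   * batch_size / 256  ->  0.3 at batch_size = 256,  1.2 at 1024
#   "2" square-root scaling: lr = 0.075 * sqrt(batch_size)  ->  1.2 at batch_size = 256,  2.4 at 1024
# The square-root rule yields larger rates at these batch sizes and grows more slowly as
# the batch size increases; the two rules coincide at batch_size = 4096.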