from typing import Iterable, Tuple


def Ranger(
    params: Iterable,
    betas: Tuple[float, float] = (0.95, 0.999),
    eps: float = 1e-5,
    k: int = 6,
    alpha: float = 0.5,
    **kwargs
):
    """Convenience method for `Lookahead` with `RAdam`."""
    # `Lookahead` and `RAdam` are expected to be imported alongside this helper.
    return Lookahead(RAdam(params, betas=betas, eps=eps, **kwargs), alpha=alpha, k=k)
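# Usage sketch (illustrative only, not part of the original code): Ranger forwards
# `lr` and any other keyword arguments to RAdam via **kwargs, then wraps the result
# in Lookahead with the given `alpha` and `k`. The function and values below are
# placeholders for demonstration.
def _ranger_usage_example():
    import torch
    import torch.nn as nn

    model = nn.Linear(10, 2)
    optimizer = Ranger(model.parameters(), lr=1e-3, weight_decay=1e-2)
    loss = model(torch.randn(4, 10)).sum()
    loss.backward()
    optimizer.step()       # inner RAdam step; Lookahead syncs slow weights every k steps
    optimizer.zero_grad()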
def create_optimizer(args, model, filter_bias_and_bn=True, classification_layer_name=None):
    opt_lower = args.opt.lower()
    weight_decay = args.weight_decay
    if 'adamw' in opt_lower or 'radam' in opt_lower:
        # Compensate for the way current AdamW and RAdam optimizers apply LR to the weight-decay.
        # I don't believe they follow the paper or original Torch7 impl, which schedules weight
        # decay based on the ratio of current_lr / initial_lr.
        weight_decay /= args.lr
    if weight_decay and filter_bias_and_bn:
        # Skip weight decay for batch norm and bias params
        if classification_layer_name is not None:
            parameters = set_lr_per_params(args, model, classification_layer_name, weight_decay)
        else:
            parameters = add_weight_decay(model, weight_decay)
        weight_decay = 0.  # reset to 0; decay is now carried by the parameter groups
    else:
        if classification_layer_name is not None:
            parameters = set_lr_per_params(args, model, classification_layer_name, weight_decay=0)
        else:
            parameters = model.parameters()

    if 'fused' in opt_lower:
        assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers'

    opt_split = opt_lower.split('_')
    opt_lower = opt_split[-1]
    if opt_lower == 'sgd' or opt_lower == 'nesterov':
        optimizer = optim.SGD(parameters, lr=args.lr, momentum=args.momentum,
                              weight_decay=weight_decay, nesterov=True)
    elif opt_lower == 'momentum':
        optimizer = optim.SGD(parameters, lr=args.lr, momentum=args.momentum,
                              weight_decay=weight_decay, nesterov=False)
    elif opt_lower == 'adam':
        optimizer = optim.Adam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'adamw':
        optimizer = AdamW(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'nadam':
        optimizer = Nadam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'radam':
        optimizer = RAdam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'adamp':
        optimizer = AdamP(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps,
                          delta=0.1, wd_ratio=0.01, nesterov=True)
    elif opt_lower == 'sgdp':
        optimizer = SGDP(parameters, lr=args.lr, momentum=args.momentum,
                         weight_decay=weight_decay, eps=args.opt_eps, nesterov=True)
    elif opt_lower == 'adadelta':
        optimizer = optim.Adadelta(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'rmsprop':
        optimizer = optim.RMSprop(parameters, lr=args.lr, alpha=0.9, eps=args.opt_eps,
                                  momentum=args.momentum, weight_decay=weight_decay)
    elif opt_lower == 'rmsproptf':
        optimizer = RMSpropTF(parameters, lr=args.lr, alpha=0.9, eps=args.opt_eps,
                              momentum=args.momentum, weight_decay=weight_decay)
    elif opt_lower == 'novograd':
        optimizer = NovoGrad(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'nvnovograd':
        optimizer = NvNovoGrad(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'fusedsgd':
        optimizer = FusedSGD(parameters, lr=args.lr, momentum=args.momentum,
                             weight_decay=weight_decay, nesterov=True)
    elif opt_lower == 'fusedmomentum':
        optimizer = FusedSGD(parameters, lr=args.lr, momentum=args.momentum,
                             weight_decay=weight_decay, nesterov=False)
    elif opt_lower == 'fusedadam':
        optimizer = FusedAdam(parameters, lr=args.lr, adam_w_mode=False,
                              weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'fusedadamw':
        optimizer = FusedAdam(parameters, lr=args.lr, adam_w_mode=True,
                              weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'fusedlamb':
        optimizer = FusedLAMB(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'fusednovograd':
        optimizer = FusedNovoGrad(parameters, lr=args.lr, betas=(0.95, 0.98),
                                  weight_decay=weight_decay, eps=args.opt_eps)
    else:
        raise ValueError(f"Invalid optimizer: {opt_lower}")

    if len(opt_split) > 1:
        if opt_split[0] == 'lookahead':
            optimizer = Lookahead(optimizer)

    return optimizer
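# Usage sketch (illustrative, not part of the original API): `create_optimizer` only
# reads `opt`, `lr`, `weight_decay`, `momentum`, and `opt_eps` from `args`, so a
# SimpleNamespace is enough for a quick test. An optimizer name prefixed with
# 'lookahead_' (e.g. 'lookahead_radam') selects the Lookahead wrapper.
def _create_optimizer_example(model):
    from types import SimpleNamespace

    args = SimpleNamespace(opt='lookahead_radam', lr=1e-3, weight_decay=1e-4,
                           momentum=0.9, opt_eps=1e-8)
    return create_optimizer(args, model)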
def create_optimizer(args, model, filter_bias_and_bn=True):
    opt_lower = args.opt.lower()
    weight_decay = args.weight_decay
    if opt_lower == 'adamw' or opt_lower == 'radam':
        # compensate for the way current AdamW and RAdam optimizers
        # apply the weight-decay
        weight_decay /= args.lr
    if weight_decay and filter_bias_and_bn:
        parameters = add_weight_decay(model, weight_decay)
        weight_decay = 0.
    else:
        parameters = model.parameters()

    opt_split = opt_lower.split('_')
    opt_lower = opt_split[-1]
    if opt_lower == 'sgd':
        optimizer = optim.SGD(parameters, lr=args.lr, momentum=args.momentum,
                              weight_decay=weight_decay, nesterov=True)
    elif opt_lower == 'adam':
        optimizer = optim.Adam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'adamw':
        optimizer = AdamW(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'nadam':
        optimizer = Nadam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'radam':
        optimizer = RAdam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'adadelta':
        optimizer = optim.Adadelta(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'rmsprop':
        optimizer = optim.RMSprop(parameters, lr=args.lr, alpha=0.9, eps=args.opt_eps,
                                  momentum=args.momentum, weight_decay=weight_decay)
    elif opt_lower == 'rmsproptf':
        optimizer = RMSpropTF(parameters, lr=args.lr, alpha=0.9, eps=args.opt_eps,
                              momentum=args.momentum, weight_decay=weight_decay)
    elif opt_lower == 'novograd':
        optimizer = NovoGrad(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    else:
        raise ValueError(f"Invalid optimizer: {opt_lower}")

    if len(opt_split) > 1:
        if opt_split[0] == 'lookahead':
            optimizer = Lookahead(optimizer)

    return optimizer
def create_optimizer(args, model, filter_bias_and_bn=True):
    opt_lower = args.opt.lower()
    weight_decay = args.weight_decay
    if 'adamw' in opt_lower or 'radam' in opt_lower:
        # Compensate for the way current AdamW and RAdam optimizers apply LR to the weight-decay.
        # I don't believe they follow the paper or original Torch7 impl, which schedules weight
        # decay based on the ratio of current_lr / initial_lr.
        weight_decay /= args.lr
    if weight_decay and filter_bias_and_bn:
        print("has weight decay and filter bias")
        parameters = add_weight_decay(model, weight_decay)
        weight_decay = 0.
    else:
        print("Comes here to unfrozen params inside optim")
        parameters = unfrozen_params(model)

    if 'fused' in opt_lower:
        assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers'

    opt_split = opt_lower.split('_')
    opt_lower = opt_split[-1]
    if opt_lower == 'sgd' or opt_lower == 'nesterov':
        optimizer = optim.SGD(parameters, lr=args.lr, momentum=args.momentum,
                              weight_decay=weight_decay, nesterov=True)
    elif opt_lower == 'momentum':
        optimizer = optim.SGD(parameters, lr=args.lr, momentum=args.momentum,
                              weight_decay=weight_decay, nesterov=False)
    elif opt_lower == 'adam':
        optimizer = optim.Adam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'adamw':
        optimizer = AdamW(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'nadam':
        optimizer = Nadam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'radam':
        optimizer = RAdam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'adadelta':
        optimizer = optim.Adadelta(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'rmsprop':
        optimizer = optim.RMSprop(parameters, lr=args.lr, alpha=0.9, eps=args.opt_eps,
                                  momentum=args.momentum, weight_decay=weight_decay)
    elif opt_lower == 'rmsproptf':
        optimizer = RMSpropTF(parameters, lr=args.lr, alpha=0.9, eps=args.opt_eps,
                              momentum=args.momentum, weight_decay=weight_decay)
    elif opt_lower == 'novograd':
        optimizer = NovoGrad(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'nvnovograd':
        optimizer = NvNovoGrad(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'fusedsgd':
        optimizer = FusedSGD(parameters, lr=args.lr, momentum=args.momentum,
                             weight_decay=weight_decay, nesterov=True)
    elif opt_lower == 'fusedmomentum':
        print("my optimizer")
        optimizer = FusedSGD(parameters, lr=args.lr, momentum=args.momentum,
                             weight_decay=weight_decay, nesterov=False)
    elif opt_lower == 'fusedadam':
        optimizer = FusedAdam(parameters, lr=args.lr, adam_w_mode=False,
                              weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'fusedadamw':
        optimizer = FusedAdam(parameters, lr=args.lr, adam_w_mode=True,
                              weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'fusedlamb':
        optimizer = FusedLAMB(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'fusednovograd':
        optimizer = FusedNovoGrad(parameters, lr=args.lr, betas=(0.95, 0.98),
                                  weight_decay=weight_decay, eps=args.opt_eps)
    else:
        raise ValueError(f"Invalid optimizer: {opt_lower}")

    if len(opt_split) > 1:
        if opt_split[0] == 'lookahead':
            optimizer = Lookahead(optimizer)

    return optimizer
def main(cfg: DictConfig):
    print('Cassava Leaf Disease Classification')
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)

    # Config  -------------------------------------------------------------------
    data_dir = './input'
    seed_everything(cfg.data.seed)

    # Comet_ml
    experiment = Experiment(api_key=cfg.comet_ml.api_key,
                            project_name=cfg.comet_ml.project_name,
                            auto_param_logging=False,
                            auto_metric_logging=False)

    # Log Parameters
    experiment.log_parameters(dict(cfg.data))
    experiment.log_parameters(dict(cfg.train))

    # Data Module  ---------------------------------------------------------------
    transform = get_transforms(transform_name=cfg.data.transform, img_size=cfg.data.img_size)
    cv = StratifiedKFold(n_splits=cfg.data.n_splits, shuffle=True, random_state=cfg.data.seed)
    dm = CassavaDataModule(data_dir, cfg, transform, cv, use_merge=True, sample=DEBUG)

    # Model  ----------------------------------------------------------------------
    net = Timm_model(cfg.train.model_type, pretrained=True)

    # Log Model Graph
    experiment.set_model_graph(str(net))

    # Loss fn  ---------------------------------------------------------------------
    df = pd.read_csv('./input/merged.csv')
    weight = df['label'].value_counts().sort_index().tolist()
    weight = [w / len(df) for w in weight]
    weight = torch.tensor(weight).cuda()
    del df
    criterion = get_loss_fn(cfg.train.loss_fn, weight=weight, smoothing=0.05)

    # Optimizer, Scheduler  --------------------------------------------------------
    if cfg.train.use_sam:
        base_optimizer = RAdam
        optimizer = SAM(net.parameters(), base_optimizer, lr=cfg.train.lr,
                        weight_decay=cfg.train.weight_decay)
    else:
        optimizer = RAdam(net.parameters(), lr=cfg.train.lr, weight_decay=cfg.train.weight_decay)
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=cfg.train.epoch, eta_min=0)

    # Lightning Module  -------------------------------------------------------------
    model = CassavaLightningSystem(net, cfg, criterion=criterion, optimizer=optimizer,
                                   scheduler=scheduler, experiment=experiment)

    # Trainer  -------------------------------------------------------------------------
    trainer = Trainer(
        logger=False,
        max_epochs=cfg.train.epoch,
        gpus=-1,
        amp_backend='apex',
        amp_level='O2',
        num_sanity_val_steps=0,  # Skip Sanity Check
        automatic_optimization=False if cfg.train.use_sam else True,
        # resume_from_checkpoint='./checkpoints/epoch=3-step=14047.ckpt'
    )

    # Train
    trainer.fit(model, datamodule=dm)
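# Entry point sketch (an assumption, since it is not shown above): in a Hydra script,
# `main` would normally be decorated with `@hydra.main(...)`; applying the decorator
# explicitly as below is equivalent. The config directory and file name are placeholders.
if __name__ == '__main__':
    hydra.main(config_path='config', config_name='config')(main)()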
def get_optimizer(model, optimizer_name, optimizer_params, scheduler_name, scheduler_params, n_epochs):
    opt_lower = optimizer_name.lower()
    opt_look_ahed = optimizer_params["lookahead"]
    if opt_lower == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=optimizer_params["lr"],
                              momentum=optimizer_params["momentum"],
                              weight_decay=optimizer_params["weight_decay"], nesterov=True)
    elif opt_lower == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=optimizer_params["lr"],
                               betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
    elif opt_lower == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), lr=optimizer_params["lr"],
                                      weight_decay=optimizer_params["weight_decay"],
                                      eps=optimizer_params["opt_eps"])
    elif opt_lower == 'nadam':
        # NAdam (note the capitalization) is the class provided by torch.optim (PyTorch >= 1.10)
        optimizer = torch.optim.NAdam(model.parameters(), lr=optimizer_params["lr"],
                                      weight_decay=optimizer_params["weight_decay"],
                                      eps=optimizer_params["opt_eps"])
    elif opt_lower == 'radam':
        optimizer = RAdam(model.parameters(), lr=optimizer_params["lr"],
                          weight_decay=optimizer_params["weight_decay"],
                          eps=optimizer_params["opt_eps"])
    elif opt_lower == "adabelief":
        optimizer = AdaBelief(model.parameters(), lr=optimizer_params["lr"], eps=1e-8,
                              weight_decay=optimizer_params["weight_decay"])
    elif opt_lower == "adamp":
        optimizer = AdamP(model.parameters(), lr=optimizer_params["lr"],
                          weight_decay=optimizer_params["weight_decay"])
    else:
        raise ValueError(f"Invalid optimizer: {optimizer_name}")

    if opt_look_ahed:
        optimizer = Lookahead(optimizer, alpha=0.5, k=5)

    if scheduler_name == "CosineAnnealingWarmRestarts":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer,
            eta_min=scheduler_params["eta_min"],
            T_0=scheduler_params["T_0"],
            T_mult=scheduler_params["T_multi"],
        )
    elif scheduler_name == "WarmRestart":
        scheduler = WarmRestart(optimizer, T_max=scheduler_params["T_max"],
                                T_mult=scheduler_params["T_mul"],
                                eta_min=scheduler_params["eta_min"])
    elif scheduler_name == "MultiStepLR":
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=scheduler_params["schedule"],
            gamma=scheduler_params["gamma"])

    if scheduler_params["warmup_factor"] > 0:
        scheduler = GradualWarmupSchedulerV2(
            optimizer, multiplier=scheduler_params["warmup_factor"],
            total_epoch=1, after_scheduler=scheduler)

    return optimizer, scheduler
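# Usage sketch (illustrative only): the dictionaries below mirror the keys that
# `get_optimizer` actually reads on the 'radam' / CosineAnnealingWarmRestarts path;
# the numeric values and the helper name are placeholders, not recommendations.
def _get_optimizer_example(model):
    optimizer_params = {'lr': 1e-3, 'weight_decay': 1e-4, 'opt_eps': 1e-8,
                        'momentum': 0.9, 'lookahead': True}
    scheduler_params = {'eta_min': 1e-6, 'T_0': 10, 'T_multi': 1,
                        'warmup_factor': 0}  # 0 skips the GradualWarmupSchedulerV2 wrapper
    return get_optimizer(model, 'radam', optimizer_params,
                         'CosineAnnealingWarmRestarts', scheduler_params, n_epochs=30)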