def create_optimizer(args, model, filter_bias_and_bn=True, classification_layer_name=None):
    opt_lower = args.opt.lower()
    weight_decay = args.weight_decay
    if 'adamw' in opt_lower or 'radam' in opt_lower:
        # Compensate for the way current AdamW and RAdam optimizers apply LR to the weight-decay
        # I don't believe they follow the paper or original Torch7 impl which schedules weight
        # decay based on the ratio of current_lr/initial_lr
        weight_decay /= args.lr
    if weight_decay and filter_bias_and_bn:
        # batch norm and bias params
        if classification_layer_name is not None:
            parameters = set_lr_per_params(args, model, classification_layer_name, weight_decay)
        else:
            parameters = add_weight_decay(model, weight_decay)
        weight_decay = 0.  # reset to 0
    else:
        if classification_layer_name is not None:
            parameters = set_lr_per_params(args, model, classification_layer_name, weight_decay=0)
        else:
            parameters = model.parameters()

    if 'fused' in opt_lower:
        assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers'

    opt_split = opt_lower.split('_')
    opt_lower = opt_split[-1]
    if opt_lower == 'sgd' or opt_lower == 'nesterov':
        optimizer = optim.SGD(parameters, lr=args.lr, momentum=args.momentum,
                              weight_decay=weight_decay, nesterov=True)
    elif opt_lower == 'momentum':
        optimizer = optim.SGD(parameters, lr=args.lr, momentum=args.momentum,
                              weight_decay=weight_decay, nesterov=False)
    elif opt_lower == 'adam':
        optimizer = optim.Adam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'adamw':
        optimizer = AdamW(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'nadam':
        optimizer = Nadam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'radam':
        optimizer = RAdam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'adamp':
        optimizer = AdamP(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps,
                          delta=0.1, wd_ratio=0.01, nesterov=True)
    elif opt_lower == 'sgdp':
        optimizer = SGDP(parameters, lr=args.lr, momentum=args.momentum,
                         weight_decay=weight_decay, eps=args.opt_eps, nesterov=True)
    elif opt_lower == 'adadelta':
        optimizer = optim.Adadelta(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'rmsprop':
        optimizer = optim.RMSprop(parameters, lr=args.lr, alpha=0.9, eps=args.opt_eps,
                                  momentum=args.momentum, weight_decay=weight_decay)
    elif opt_lower == 'rmsproptf':
        optimizer = RMSpropTF(parameters, lr=args.lr, alpha=0.9, eps=args.opt_eps,
                              momentum=args.momentum, weight_decay=weight_decay)
    elif opt_lower == 'novograd':
        optimizer = NovoGrad(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'nvnovograd':
        optimizer = NvNovoGrad(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'fusedsgd':
        optimizer = FusedSGD(parameters, lr=args.lr, momentum=args.momentum,
                             weight_decay=weight_decay, nesterov=True)
    elif opt_lower == 'fusedmomentum':
        optimizer = FusedSGD(parameters, lr=args.lr, momentum=args.momentum,
                             weight_decay=weight_decay, nesterov=False)
    elif opt_lower == 'fusedadam':
        optimizer = FusedAdam(parameters, lr=args.lr, adam_w_mode=False,
                              weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'fusedadamw':
        optimizer = FusedAdam(parameters, lr=args.lr, adam_w_mode=True,
                              weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'fusedlamb':
        optimizer = FusedLAMB(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'fusednovograd':
        optimizer = FusedNovoGrad(parameters, lr=args.lr, betas=(0.95, 0.98),
                                  weight_decay=weight_decay, eps=args.opt_eps)
    else:
        assert False and "Invalid optimizer"
        raise ValueError

    if len(opt_split) > 1:
        if opt_split[0] == 'lookahead':
            optimizer = Lookahead(optimizer)
    return optimizer
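# The add_weight_decay helper used above is not shown in this snippet. A minimal
# sketch, assuming the common timm-style behavior: 1-D tensors (biases, norm
# scales) and anything in skip_list go into a no-decay group, everything else
# keeps the configured decay. The skip_list argument is an assumption added here
# for illustration.
def add_weight_decay(model, weight_decay=1e-5, skip_list=()):
    decay, no_decay = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue  # frozen weights never reach the optimizer
        if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list:
            no_decay.append(param)
        else:
            decay.append(param)
    # decay is applied per param group, which is why the factory above resets
    # the global weight_decay to 0 after calling this helper
    return [
        {'params': no_decay, 'weight_decay': 0.},
        {'params': decay, 'weight_decay': weight_decay},
    ]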
def create_optimizer(args, model, filter_bias_and_bn=True):
    opt_lower = args.opt.lower()
    weight_decay = args.weight_decay
    if opt_lower == 'adamw' or opt_lower == 'radam':
        # compensate for the way current AdamW and RAdam optimizers
        # apply the weight-decay
        weight_decay /= args.lr
    if weight_decay and filter_bias_and_bn:
        parameters = add_weight_decay(model, weight_decay)
        weight_decay = 0.
    else:
        parameters = model.parameters()

    opt_split = opt_lower.split('_')
    opt_lower = opt_split[-1]
    if opt_lower == 'sgd':
        optimizer = optim.SGD(parameters, lr=args.lr, momentum=args.momentum,
                              weight_decay=weight_decay, nesterov=True)
    elif opt_lower == 'adam':
        optimizer = optim.Adam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'adamw':
        optimizer = AdamW(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'nadam':
        optimizer = Nadam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'radam':
        optimizer = RAdam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'adadelta':
        optimizer = optim.Adadelta(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'rmsprop':
        optimizer = optim.RMSprop(parameters, lr=args.lr, alpha=0.9, eps=args.opt_eps,
                                  momentum=args.momentum, weight_decay=weight_decay)
    elif opt_lower == 'rmsproptf':
        optimizer = RMSpropTF(parameters, lr=args.lr, alpha=0.9, eps=args.opt_eps,
                              momentum=args.momentum, weight_decay=weight_decay)
    elif opt_lower == 'novograd':
        optimizer = NovoGrad(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    else:
        assert False and "Invalid optimizer"
        raise ValueError

    if len(opt_split) > 1:
        if opt_split[0] == 'lookahead':
            optimizer = Lookahead(optimizer)
    return optimizer
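# Hedged usage sketch for the factory above. The argparse-style namespace and the
# tiny model are illustrative only; the field names mirror the attributes the
# function reads (opt, lr, weight_decay, momentum, opt_eps), and RAdam/Lookahead
# are assumed to be importable (e.g. from timm.optim) as in the factory itself.
from types import SimpleNamespace

import torch.nn as nn

_demo_model = nn.Linear(16, 4)
_demo_args = SimpleNamespace(
    opt='lookahead_radam',  # 'lookahead_' prefix wraps the base optimizer
    lr=1e-3,
    weight_decay=1e-2,      # divided by lr for the adamw/radam variants
    momentum=0.9,
    opt_eps=1e-8,
)
# opt_split == ['lookahead', 'radam']: RAdam is built first, then wrapped.
_demo_optimizer = create_optimizer(_demo_args, _demo_model)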
def create_optimizer(args, model, filter_bias_and_bn=True):
    opt_lower = args.opt.lower()
    weight_decay = args.weight_decay
    if 'adamw' in opt_lower or 'radam' in opt_lower:
        # Compensate for the way current AdamW and RAdam optimizers apply LR to the weight-decay
        # I don't believe they follow the paper or original Torch7 impl which schedules weight
        # decay based on the ratio of current_lr/initial_lr
        weight_decay /= args.lr
    if weight_decay and filter_bias_and_bn:
        print("has weight decay and filter bias")
        parameters = add_weight_decay(model, weight_decay)
        weight_decay = 0.
    else:
        print("Comes here to unfrozen params inside optim")
        parameters = unfrozen_params(model)

    if 'fused' in opt_lower:
        assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers'

    opt_split = opt_lower.split('_')
    opt_lower = opt_split[-1]
    if opt_lower == 'sgd' or opt_lower == 'nesterov':
        optimizer = optim.SGD(parameters, lr=args.lr, momentum=args.momentum,
                              weight_decay=weight_decay, nesterov=True)
    elif opt_lower == 'momentum':
        optimizer = optim.SGD(parameters, lr=args.lr, momentum=args.momentum,
                              weight_decay=weight_decay, nesterov=False)
    elif opt_lower == 'adam':
        optimizer = optim.Adam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'adamw':
        optimizer = AdamW(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'nadam':
        optimizer = Nadam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'radam':
        optimizer = RAdam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'adadelta':
        optimizer = optim.Adadelta(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'rmsprop':
        optimizer = optim.RMSprop(parameters, lr=args.lr, alpha=0.9, eps=args.opt_eps,
                                  momentum=args.momentum, weight_decay=weight_decay)
    elif opt_lower == 'rmsproptf':
        optimizer = RMSpropTF(parameters, lr=args.lr, alpha=0.9, eps=args.opt_eps,
                              momentum=args.momentum, weight_decay=weight_decay)
    elif opt_lower == 'novograd':
        optimizer = NovoGrad(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'nvnovograd':
        optimizer = NvNovoGrad(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'fusedsgd':
        optimizer = FusedSGD(parameters, lr=args.lr, momentum=args.momentum,
                             weight_decay=weight_decay, nesterov=True)
    elif opt_lower == 'fusedmomentum':
        print("my optimizer")
        optimizer = FusedSGD(parameters, lr=args.lr, momentum=args.momentum,
                             weight_decay=weight_decay, nesterov=False)
    elif opt_lower == 'fusedadam':
        optimizer = FusedAdam(parameters, lr=args.lr, adam_w_mode=False,
                              weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'fusedadamw':
        optimizer = FusedAdam(parameters, lr=args.lr, adam_w_mode=True,
                              weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'fusedlamb':
        optimizer = FusedLAMB(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'fusednovograd':
        optimizer = FusedNovoGrad(parameters, lr=args.lr, betas=(0.95, 0.98),
                                  weight_decay=weight_decay, eps=args.opt_eps)
    else:
        assert False and "Invalid optimizer"
        raise ValueError

    if len(opt_split) > 1:
        if opt_split[0] == 'lookahead':
            optimizer = Lookahead(optimizer)
    return optimizer
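# unfrozen_params is referenced above but not defined in the snippet. A plausible
# sketch, assuming it simply filters out frozen weights (e.g. a backbone frozen
# for fine-tuning) so the optimizer only receives trainable parameters:
def unfrozen_params(model):
    return [p for p in model.parameters() if p.requires_grad]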
def main():
    # setup config
    cfg = config()
    cfg['device'] = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    timestr = time.strftime("%Y%m%d-%H%M%S")
    cfg['logdir'] += f"{cfg['arch']}_"
    cfg['logdir'] += f"{cfg['exp_idx']}_"
    cfg['logdir'] += f"{cfg['input_size']}_"
    cfg['logdir'] += f"{cfg['criterion']}_"
    cfg['logdir'] += f"{cfg['optimizer']}_"
    cfg['logdir'] += f"split{cfg['data_split']}_"
    cfg['logdir'] += timestr

    set_global_seed(cfg['random_state'])
    pprint(cfg)

    # load data
    train_df = pd.read_csv(cfg['train_csv_path'])
    test_df = pd.read_csv(cfg['test_csv_path'])
    print(len(train_df), len(test_df))

    train_img_weights = compute_dataset_weights(train_df)

    train_transforms, test_transforms = get_transforms(cfg['input_size'])
    train_dataset = LeafDataset(
        img_root=cfg['img_root'],
        df=train_df,
        img_transforms=train_transforms,
        is_train=True,
    )
    test_dataset = LeafDataset(
        img_root=cfg['img_root'],
        df=test_df,
        img_transforms=test_transforms,
        is_train=False,
    )
    print(f"Training set size:{len(train_dataset)}, Test set size:{len(test_dataset)}")

    # prepare train and test loader
    if cfg['sampling'] == 'weighted':
        # image weight based on statistics
        train_img_weights = compute_dataset_weights(train_df)
        # weighted sampler
        weighted_sampler = WeightedRandomSampler(
            weights=train_img_weights, num_samples=len(train_img_weights), replacement=False)
        # batch sampler from weighted sampler
        batch_sampler = BatchSampler(
            weighted_sampler, batch_size=cfg['batch_size'], drop_last=True)
        # train loader
        train_loader = DataLoader(
            train_dataset, batch_sampler=batch_sampler, num_workers=4)
    elif cfg['sampling'] == 'normal':
        train_loader = DataLoader(
            train_dataset, cfg['batch_size'], shuffle=True, num_workers=2)

    test_loader = DataLoader(
        test_dataset, cfg['test_batch_size'], shuffle=False, num_workers=1, drop_last=True)

    loaders = {
        'train': train_loader,
        'valid': test_loader
    }

    # model setup
    model = timm.create_model(model_name=cfg['arch'], num_classes=len(cfg['class_names']),
                              drop_rate=0.5, pretrained=True)
    model.train()

    # loss
    if cfg['criterion'] == 'label_smooth':
        criterion = LabelSmoothingCrossEntropy()
    elif cfg['criterion'] == 'cross_entropy':
        criterion = nn.CrossEntropyLoss()

    # optimizer
    if cfg['optimizer'] == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=cfg['lr'], weight_decay=cfg['wd'])
    elif cfg['optimizer'] == 'adamw':
        optimizer = AdamW(model.parameters(), lr=cfg['lr'], weight_decay=cfg['wd'])
    elif cfg['optimizer'] == 'radam':
        optimizer = RAdam(model.parameters(), lr=cfg['lr'], weight_decay=cfg['wd'])

    # learning schedule
    if cfg['lr_schedule'] == 'reduce_plateau':
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=4)

    # trainer
    runner = SupervisedRunner(device=cfg['device'])
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        callbacks=[
            AccuracyCallback(
                num_classes=len(cfg['class_names']),
                threshold=0.5,
                activation="Softmax"
            ),
        ],
        logdir=cfg['logdir'],
        num_epochs=cfg['num_epochs'],
        verbose=cfg['verbose'],
        # set this true to run for 3 epochs only
        check=cfg['check'],
    )
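# compute_dataset_weights is not defined in this snippet. A hedged sketch of one
# plausible implementation: weight each image by the inverse frequency of its
# class so the WeightedRandomSampler above oversamples rare classes. The 'label'
# column name is an assumption about the CSV layout.
import numpy as np
import pandas as pd

def compute_dataset_weights(df: pd.DataFrame, label_col: str = 'label') -> np.ndarray:
    class_counts = df[label_col].value_counts()          # images per class
    per_image_counts = df[label_col].map(class_counts)   # size of each row's class
    return (1.0 / per_image_counts).to_numpy(dtype=np.float64)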
def create_optimizer(optimizer_config, model, master_params=None):
    if optimizer_config.get("classifier_lr", -1) != -1:
        # Separate classifier parameters from all others
        net_params = []
        classifier_params = []
        for k, v in model.named_parameters():
            if not v.requires_grad:
                continue
            if k.find("encoder") != -1:
                net_params.append(v)
            else:
                classifier_params.append(v)
        params = [
            {"params": net_params},
            {"params": classifier_params, "lr": optimizer_config["classifier_lr"]},
        ]
    else:
        if master_params:
            params = master_params
        else:
            params = model.parameters()

    if optimizer_config["type"] == "SGD":
        optimizer = optim.SGD(params,
                              lr=optimizer_config["learning_rate"],
                              momentum=optimizer_config["momentum"],
                              weight_decay=optimizer_config["weight_decay"],
                              nesterov=optimizer_config["nesterov"])
    elif optimizer_config["type"] == "FusedSGD":
        optimizer = FusedSGD(params,
                             lr=optimizer_config["learning_rate"],
                             momentum=optimizer_config["momentum"],
                             weight_decay=optimizer_config["weight_decay"],
                             nesterov=optimizer_config["nesterov"])
    elif optimizer_config["type"] == "Adam":
        optimizer = optim.Adam(params,
                               lr=optimizer_config["learning_rate"],
                               weight_decay=optimizer_config["weight_decay"])
    elif optimizer_config["type"] == "FusedAdam":
        optimizer = FusedAdam(params,
                              lr=optimizer_config["learning_rate"],
                              weight_decay=optimizer_config["weight_decay"])
    elif optimizer_config["type"] == "AdamW":
        optimizer = AdamW(params,
                          lr=optimizer_config["learning_rate"],
                          weight_decay=optimizer_config["weight_decay"])
    elif optimizer_config["type"] == "RmsProp":
        optimizer = RMSprop(params,
                            lr=optimizer_config["learning_rate"],
                            weight_decay=optimizer_config["weight_decay"])
    else:
        raise KeyError("unrecognized optimizer {}".format(optimizer_config["type"]))

    if optimizer_config["schedule"]["type"] == "step":
        scheduler = LRStepScheduler(optimizer, **optimizer_config["schedule"]["params"])
    elif optimizer_config["schedule"]["type"] == "clr":
        scheduler = CyclicLR(optimizer, **optimizer_config["schedule"]["params"])
    elif optimizer_config["schedule"]["type"] == "multistep":
        scheduler = MultiStepLR(optimizer, **optimizer_config["schedule"]["params"])
    elif optimizer_config["schedule"]["type"] == "exponential":
        scheduler = ExponentialLRScheduler(optimizer, **optimizer_config["schedule"]["params"])
    elif optimizer_config["schedule"]["type"] == "poly":
        scheduler = PolyLR(optimizer, **optimizer_config["schedule"]["params"])
    elif optimizer_config["schedule"]["type"] == "constant":
        scheduler = lr_scheduler.LambdaLR(optimizer, lambda epoch: 1.0)
    elif optimizer_config["schedule"]["type"] == "linear":
        def linear_lr(it):
            return it * optimizer_config["schedule"]["params"]["alpha"] \
                + optimizer_config["schedule"]["params"]["beta"]
        scheduler = lr_scheduler.LambdaLR(optimizer, linear_lr)

    return optimizer, scheduler
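# Hedged example of the optimizer_config dict this factory appears to expect.
# The keys mirror the lookups performed above; the specific values (and the use
# of a MultiStepLR schedule) are illustrative only.
example_optimizer_config = {
    "type": "SGD",
    "learning_rate": 0.01,
    "momentum": 0.9,
    "weight_decay": 1e-4,
    "nesterov": True,
    "classifier_lr": 0.1,   # omit (or set to -1) to keep all params in one group
    "schedule": {
        "type": "multistep",
        "params": {"milestones": [10, 20], "gamma": 0.1},
    },
}
# optimizer, scheduler = create_optimizer(example_optimizer_config, model)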
def create_optimizer(optimizer_config, model, master_params=None):
    """Creates optimizer and schedule from configuration

    Parameters
    ----------
    optimizer_config : dict
        Dictionary containing the configuration options for the optimizer.
    model : Model
        The network model.

    Returns
    -------
    optimizer : Optimizer
        The optimizer.
    scheduler : LRScheduler
        The learning rate scheduler.
    """
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight',
                "_bn0.weight", "_bn1.weight", "_bn2.weight"]

    def make_params(param_optimizer, lr=None):
        params = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': optimizer_config["weight_decay"]},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        for p in params:
            if lr is not None:
                p["lr"] = lr
        return params

    if optimizer_config.get("classifier_lr", -1) != -1:
        # Separate classifier parameters from all others
        net_params = []
        classifier_params = []
        for k, v in model.named_parameters():
            if not v.requires_grad:
                continue
            if k.find("encoder") != -1:
                net_params.append((k, v))
            else:
                classifier_params.append((k, v))
        params = []
        params.extend(make_params(classifier_params, optimizer_config["classifier_lr"]))
        params.extend(make_params(net_params))
        print("param_groups", len(params))
    else:
        param_optimizer = list(model.named_parameters())
        params = make_params(param_optimizer)
        print("param_groups", len(params))

    if optimizer_config["type"] == "SGD":
        optimizer = optim.SGD(params,
                              lr=optimizer_config["learning_rate"],
                              momentum=optimizer_config["momentum"],
                              nesterov=optimizer_config["nesterov"])
    elif optimizer_config["type"] == "Adam":
        optimizer = optim.Adam(params,
                               eps=optimizer_config.get("eps", 1e-8),
                               lr=optimizer_config["learning_rate"],
                               weight_decay=optimizer_config["weight_decay"])
    elif optimizer_config["type"] == "FusedAdam":
        optimizer = FusedAdam(params,
                              eps=optimizer_config.get("eps", 1e-8),
                              lr=optimizer_config["learning_rate"],
                              weight_decay=optimizer_config["weight_decay"])
    elif optimizer_config["type"] == "FusedNovoGrad":
        optimizer = FusedNovoGrad(params,
                                  eps=optimizer_config.get("eps", 1e-8),
                                  lr=optimizer_config["learning_rate"],
                                  weight_decay=optimizer_config["weight_decay"])
    elif optimizer_config["type"] == "AdamW":
        optimizer = AdamW(params,
                          eps=optimizer_config.get("eps", 1e-8),
                          lr=optimizer_config["learning_rate"],
                          weight_decay=optimizer_config["weight_decay"])
    elif optimizer_config["type"] == "RmsProp":
        optimizer = RMSprop(params,
                            lr=optimizer_config["learning_rate"],
                            weight_decay=optimizer_config["weight_decay"])
    else:
        raise KeyError("unrecognized optimizer {}".format(optimizer_config["type"]))

    if optimizer_config["schedule"]["type"] == "step":
        scheduler = LRStepScheduler(optimizer, **optimizer_config["schedule"]["params"])
    elif optimizer_config["schedule"]["type"] == "cosine":
        scheduler = CosineAnnealingLR(optimizer, **optimizer_config["schedule"]["params"])
    elif optimizer_config["schedule"]["type"] == "clr":
        scheduler = CyclicLR(optimizer, **optimizer_config["schedule"]["params"])
    elif optimizer_config["schedule"]["type"] == "multistep":
        scheduler = MultiStepLR(optimizer, **optimizer_config["schedule"]["params"])
    elif optimizer_config["schedule"]["type"] == "exponential":
        scheduler = ExponentialLRScheduler(optimizer, **optimizer_config["schedule"]["params"])
    elif optimizer_config["schedule"]["type"] == "poly":
        scheduler = PolyLR(optimizer, **optimizer_config["schedule"]["params"])
    elif optimizer_config["schedule"]["type"] == "constant":
        scheduler = lr_scheduler.LambdaLR(optimizer, lambda epoch: 1.0)
    elif optimizer_config["schedule"]["type"] == "linear":
        def linear_lr(it):
            return it * optimizer_config["schedule"]["params"]["alpha"] \
                + optimizer_config["schedule"]["params"]["beta"]
        scheduler = lr_scheduler.LambdaLR(optimizer, linear_lr)

    return optimizer, scheduler
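# Hedged illustration of the grouping performed by make_params above: parameter
# names matching the no_decay list land in a group with weight_decay=0.0, the
# rest keep the configured decay. The tiny model and config are illustrative and
# assume the optimizer/scheduler classes imported by the factory are available.
import torch.nn as nn

class _TinyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(8, 4)
        self.LayerNorm = nn.LayerNorm(4)
        self.head = nn.Linear(4, 2)

    def forward(self, x):
        return self.head(self.LayerNorm(self.encoder(x)))

_config = {
    "type": "AdamW",
    "learning_rate": 3e-4,
    "weight_decay": 1e-2,
    "schedule": {"type": "constant", "params": {}},
}
_optimizer, _scheduler = create_optimizer(_config, _TinyNet())
for _group in _optimizer.param_groups:
    print(_group["weight_decay"], len(_group["params"]))
# Expected: one group with decay 1e-2 (encoder.weight, head.weight) and one with
# 0.0 (all biases plus LayerNorm.weight).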