def build_optimizer(cls, cfg, model):
    """
    Build an optimizer from config.
    """
    params = get_default_optimizer_params(
        model,
        base_lr=cfg.SOLVER.BASE_LR,
        weight_decay=cfg.SOLVER.WEIGHT_DECAY,
        weight_decay_norm=cfg.SOLVER.WEIGHT_DECAY_NORM,
        bias_lr_factor=cfg.SOLVER.BIAS_LR_FACTOR,
        weight_decay_bias=cfg.SOLVER.WEIGHT_DECAY_BIAS,
    )
    optimizer_type = cfg.SOLVER.OPTIMIZER
    if optimizer_type == "SGD":
        return maybe_add_gradient_clipping(cfg, torch.optim.SGD)(
            params,
            cfg.SOLVER.BASE_LR,
            momentum=cfg.SOLVER.MOMENTUM,
            nesterov=cfg.SOLVER.NESTEROV,
        )
    elif optimizer_type == "ADAM":
        return maybe_add_gradient_clipping(cfg, torch.optim.Adam)(
            params, cfg.SOLVER.BASE_LR
        )
    else:
        raise NotImplementedError(f"no optimizer type {optimizer_type}")
def build_optimizer(cls, cfg: CfgNode, model: nn.Module):
    params = get_default_optimizer_params(
        model,
        base_lr=cfg.SOLVER.BASE_LR,
        weight_decay_norm=cfg.SOLVER.WEIGHT_DECAY_NORM,
        bias_lr_factor=cfg.SOLVER.BIAS_LR_FACTOR,
        weight_decay_bias=cfg.SOLVER.WEIGHT_DECAY_BIAS,
        overrides={
            "features": {
                "lr": cfg.SOLVER.BASE_LR * cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.FEATURES_LR_FACTOR,
            },
            "embeddings": {
                "lr": cfg.SOLVER.BASE_LR * cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDING_LR_FACTOR,
            },
        },
    )
    optimizer = torch.optim.SGD(
        params,
        cfg.SOLVER.BASE_LR,
        momentum=cfg.SOLVER.MOMENTUM,
        nesterov=cfg.SOLVER.NESTEROV,
        weight_decay=cfg.SOLVER.WEIGHT_DECAY,
    )
    return maybe_add_gradient_clipping(cfg, optimizer)
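# A minimal sketch (not from the snippets above) of how the `overrides` argument
# is expected to behave: detectron2's get_default_optimizer_params matches each
# key against a parameter's name within its owning module and merges the given
# options into that parameter group. ToyEmbedder and the lr/weight_decay values
# here are hypothetical, purely for illustration.
import torch
from detectron2.solver.build import get_default_optimizer_params

class ToyEmbedder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # parameter names deliberately mirror the keys used in `overrides`
        self.features = torch.nn.Parameter(torch.zeros(8))
        self.embeddings = torch.nn.Parameter(torch.zeros(8))

param_groups = get_default_optimizer_params(
    ToyEmbedder(),
    base_lr=0.01,
    weight_decay=1e-4,
    weight_decay_norm=0.0,
    overrides={"embeddings": {"lr": 0.001}},  # only this group gets the lower lr
)
# param_groups is a list of dicts such as {"params": [...], "lr": 0.01 or 0.001},
# ready to be passed to any torch.optim optimizer.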
def build_optimizer(cls, cfg, model):
    params: List[Dict[str, Any]] = []
    memo: Set[torch.nn.parameter.Parameter] = set()
    for key, value in model.named_parameters(recurse=True):
        if not value.requires_grad:
            continue
        # Avoid duplicating parameters
        if value in memo:
            continue
        memo.add(value)
        lr = cfg.SOLVER.BASE_LR
        weight_decay = cfg.SOLVER.WEIGHT_DECAY
        if "backbone" in key:
            lr = lr * cfg.SOLVER.BACKBONE_MULTIPLIER
        params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}]

    optimizer_type = cfg.SOLVER.OPTIMIZER
    if optimizer_type == "SGD":
        optimizer = torch.optim.SGD(params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM)
    elif optimizer_type == "ADAMW":
        optimizer = torch.optim.AdamW(params, cfg.SOLVER.BASE_LR)
    else:
        raise NotImplementedError(f"no optimizer type {optimizer_type}")
    if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model":
        optimizer = maybe_add_gradient_clipping(cfg, optimizer)
    return optimizer
def build_optimizer(cfg: CfgNode, model: torch.nn.Module) -> torch.optim.Optimizer:
    """
    Build an optimizer from config.
    """
    norm_module_types = (
        torch.nn.BatchNorm1d,
        torch.nn.BatchNorm2d,
        torch.nn.BatchNorm3d,
        torch.nn.SyncBatchNorm,
        # NaiveSyncBatchNorm inherits from BatchNorm2d
        torch.nn.GroupNorm,
        torch.nn.InstanceNorm1d,
        torch.nn.InstanceNorm2d,
        torch.nn.InstanceNorm3d,
        torch.nn.LayerNorm,
        torch.nn.LocalResponseNorm,
    )
    params: List[Dict[str, Any]] = []
    memo: Set[torch.nn.parameter.Parameter] = set()
    for module_name, module in model.named_modules():
        for key, value in module.named_parameters(recurse=False):
            if not value.requires_grad:
                continue
            # Avoid duplicating parameters
            if value in memo:
                continue
            memo.add(value)
            lr = cfg.SOLVER.BASE_LR
            weight_decay = cfg.SOLVER.WEIGHT_DECAY
            if isinstance(module, norm_module_types):
                weight_decay = cfg.SOLVER.WEIGHT_DECAY_NORM
            elif key == "bias":
                # NOTE: unlike Detectron v1, we now default BIAS_LR_FACTOR to 1.0
                # and WEIGHT_DECAY_BIAS to WEIGHT_DECAY so that bias optimizer
                # hyperparameters are by default exactly the same as for regular
                # weights.
                lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR
                weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS
            if 'oicr_predictors' in module_name or 'regression_branch' in module_name:
                logging.getLogger('detectron2').log(
                    logging.INFO,
                    "Setting learning rate of {} to {}".format(
                        module_name, lr * cfg.SOLVER.REFINEMENT_LR_FACTOR))
                lr = lr * cfg.SOLVER.REFINEMENT_LR_FACTOR
            params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}]

    optimizer = torch.optim.SGD(
        params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM, nesterov=cfg.SOLVER.NESTEROV
    )
    optimizer = maybe_add_gradient_clipping(cfg, optimizer)
    return optimizer
def build_optimizer(cls, cfg, model):
    import torch
    import itertools
    from typing import Any, Dict, List, Set
    from detectron2.solver.build import maybe_add_gradient_clipping

    params: List[Dict[str, Any]] = []
    memo: Set[torch.nn.parameter.Parameter] = set()
    for key, value in model.named_parameters(recurse=True):
        if not value.requires_grad:
            continue
        # Avoid duplicating parameters
        if value in memo:
            continue
        memo.add(value)
        lr = cfg.SOLVER.BASE_LR
        weight_decay = cfg.SOLVER.WEIGHT_DECAY
        if "backbone.0" in key:
            lr = lr * 0.1  # cfg.SOLVER.BACKBONE_MULTIPLIER
        params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}]

    def maybe_add_full_model_gradient_clipping(optim):
        # optim: the optimizer class
        # detectron2 doesn't have full model gradient clipping now
        clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE
        enable = (
            cfg.SOLVER.CLIP_GRADIENTS.ENABLED
            and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model"
            and clip_norm_val > 0.0
        )

        class FullModelGradientClippingOptimizer(optim):
            def step(self, closure=None):
                all_params = itertools.chain(*[x["params"] for x in self.param_groups])
                torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val)
                super().step(closure=closure)

        return FullModelGradientClippingOptimizer if enable else optim

    optimizer_type = cfg.SOLVER.OPTIMIZER
    if optimizer_type == "SGD":
        optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)(
            params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM
        )
    elif optimizer_type == "ADAMW":
        optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)(
            params, cfg.SOLVER.BASE_LR
        )
    else:
        raise NotImplementedError(f"no optimizer type {optimizer_type}")
    if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model":
        optimizer = maybe_add_gradient_clipping(cfg, optimizer)
    return optimizer
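# A small self-contained sketch of the "full model" gradient-clipping wrapper
# defined inline above, decoupled from cfg. The helper name
# with_full_model_grad_clip and the default clip value of 1.0 are assumptions
# made here for illustration only.
import itertools
import torch

def with_full_model_grad_clip(optim_cls, clip_norm_val=1.0):
    class _ClippedOptimizer(optim_cls):
        def step(self, closure=None):
            # clip the joint norm of ALL parameters, across every param group
            all_params = itertools.chain(*[g["params"] for g in self.param_groups])
            torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val)
            super().step(closure=closure)
    return _ClippedOptimizer

# Usage: ClippedSGD = with_full_model_grad_clip(torch.optim.SGD, 1.0)
#        optimizer = ClippedSGD(model.parameters(), lr=0.01, momentum=0.9)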
def build_optimizer(cls, cfg, model):
    """
    Returns:
        torch.optim.Optimizer:

    Builds the optimizer directly from config (instead of calling
    :func:`detectron2.solver.build_optimizer`). Overwrite it if you'd like a
    different optimizer.
    """
    norm_module_types = (
        torch.nn.BatchNorm1d,
        torch.nn.BatchNorm2d,
        torch.nn.BatchNorm3d,
        torch.nn.SyncBatchNorm,
        # NaiveSyncBatchNorm inherits from BatchNorm2d
        torch.nn.GroupNorm,
        torch.nn.InstanceNorm1d,
        torch.nn.InstanceNorm2d,
        torch.nn.InstanceNorm3d,
        torch.nn.LayerNorm,
        torch.nn.LocalResponseNorm,
    )
    params: List[Dict[str, Any]] = []
    memo: Set[torch.nn.parameter.Parameter] = set()
    for module in model.modules():
        for key, value in module.named_parameters(recurse=False):
            if not value.requires_grad:
                continue
            # Avoid duplicating parameters
            if value in memo:
                continue
            memo.add(value)
            lr = cfg.SOLVER.BASE_LR
            weight_decay = cfg.SOLVER.WEIGHT_DECAY
            if isinstance(module, norm_module_types):
                weight_decay = cfg.SOLVER.WEIGHT_DECAY_NORM
            elif key == "bias":
                # NOTE: unlike Detectron v1, we now default BIAS_LR_FACTOR to 1.0
                # and WEIGHT_DECAY_BIAS to WEIGHT_DECAY so that bias optimizer
                # hyperparameters are by default exactly the same as for regular
                # weights.
                lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR
                weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS
            params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}]

    assert cfg.SOLVER.OPTIM_NAME in ["RangerLars", "Adam", "SGD"]
    if cfg.SOLVER.OPTIM_NAME == "RangerLars":
        optimizer = RangerLars(params, lr=cfg.SOLVER.BASE_LR)
    if cfg.SOLVER.OPTIM_NAME == "Adam":
        optimizer = torch.optim.Adam(params, lr=cfg.SOLVER.BASE_LR)
    if cfg.SOLVER.OPTIM_NAME == "SGD":
        optimizer = torch.optim.SGD(
            params,
            lr=cfg.SOLVER.BASE_LR,
            momentum=cfg.SOLVER.MOMENTUM,
            nesterov=cfg.SOLVER.NESTEROV,
        )
    optimizer = maybe_add_gradient_clipping(cfg, optimizer)
    return optimizer
def build_optimizer(cls, cfg, model):
    """
    Build an optimizer from config.
    """
    params = get_default_optimizer_params(
        model,
        weight_decay=cfg.SOLVER.WEIGHT_DECAY,
        weight_decay_norm=cfg.SOLVER.WEIGHT_DECAY_NORM,
    )
    return maybe_add_gradient_clipping(cfg, torch.optim.Adam)(
        params, cfg.SOLVER.BASE_LR
    )
def build_optimizer(cls, cfg, model):
    """
    Build an optimizer from config.
    """
    norm_module_types = (
        torch.nn.BatchNorm1d,
        torch.nn.BatchNorm2d,
        torch.nn.BatchNorm3d,
        torch.nn.SyncBatchNorm,
        # NaiveSyncBatchNorm inherits from BatchNorm2d
        torch.nn.GroupNorm,
        torch.nn.InstanceNorm1d,
        torch.nn.InstanceNorm2d,
        torch.nn.InstanceNorm3d,
        torch.nn.LayerNorm,
        torch.nn.LocalResponseNorm,
    )
    params: List[Dict[str, Any]] = []
    memo: Set[torch.nn.parameter.Parameter] = set()
    for module in model.modules():
        for key, value in module.named_parameters(recurse=False):
            if not value.requires_grad:
                continue
            # Avoid duplicating parameters
            if value in memo:
                continue
            memo.add(value)
            lr = cfg.SOLVER.BASE_LR
            weight_decay = cfg.SOLVER.WEIGHT_DECAY
            if isinstance(module, norm_module_types):
                weight_decay = cfg.SOLVER.WEIGHT_DECAY_NORM
            elif key == "bias":
                lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR
                weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS
            params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}]

    optimizer_type = cfg.SOLVER.OPTIMIZER
    if optimizer_type == "SGD":
        optimizer = torch.optim.SGD(
            params,
            cfg.SOLVER.BASE_LR,
            momentum=cfg.SOLVER.MOMENTUM,
            nesterov=cfg.SOLVER.NESTEROV,
        )
    elif optimizer_type == "ADAM":
        optimizer = torch.optim.Adam(params, cfg.SOLVER.BASE_LR)
    else:
        raise NotImplementedError(f"no optimizer type {optimizer_type}")
    optimizer = maybe_add_gradient_clipping(cfg, optimizer)
    return optimizer
def build_optimizer(cls, cfg, model):
    params = get_default_optimizer_params(
        model,
        base_lr=cfg.SOLVER.BASE_LR,
        weight_decay=cfg.SOLVER.WEIGHT_DECAY,
        weight_decay_norm=cfg.SOLVER.WEIGHT_DECAY_NORM,
        bias_lr_factor=cfg.SOLVER.BIAS_LR_FACTOR,
        weight_decay_bias=cfg.SOLVER.WEIGHT_DECAY_BIAS,
    )

    def maybe_add_full_model_gradient_clipping(optim):
        # optim: the optimizer class
        # detectron2 doesn't have full model gradient clipping now
        clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE
        enable = (
            cfg.SOLVER.CLIP_GRADIENTS.ENABLED
            and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model"
            and clip_norm_val > 0.0
        )

        class FullModelGradientClippingOptimizer(optim):
            def step(self, closure=None):
                all_params = itertools.chain(*[x["params"] for x in self.param_groups])
                torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val)
                super().step(closure=closure)

        return FullModelGradientClippingOptimizer if enable else optim

    optimizer_type = cfg.SOLVER.OPTIMIZER
    if optimizer_type == "SGD":
        optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)(
            params,
            cfg.SOLVER.BASE_LR,
            momentum=cfg.SOLVER.MOMENTUM,
            nesterov=cfg.SOLVER.NESTEROV,
            weight_decay=cfg.SOLVER.WEIGHT_DECAY,
        )
    elif optimizer_type == "AdamW":
        optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)(
            params,
            cfg.SOLVER.BASE_LR,
            betas=(0.9, 0.999),
            weight_decay=cfg.SOLVER.WEIGHT_DECAY,
        )
    else:
        raise NotImplementedError(f"no optimizer type {optimizer_type}")
    return optimizer
def build_adam_optimizer(cfg: CfgNode, model: torch.nn.Module) -> torch.optim.Optimizer:
    """
    Build an optimizer from config.
    """
    norm_module_types = (
        torch.nn.BatchNorm1d,
        torch.nn.BatchNorm2d,
        torch.nn.BatchNorm3d,
        torch.nn.SyncBatchNorm,
        torch.nn.GroupNorm,
        torch.nn.InstanceNorm1d,
        torch.nn.InstanceNorm2d,
        torch.nn.InstanceNorm3d,
        torch.nn.LayerNorm,
        torch.nn.LocalResponseNorm,
    )
    params: List[Dict[str, Any]] = []
    memo: Set[torch.nn.parameter.Parameter] = set()
    for module in model.modules():
        for key, value in module.named_parameters(recurse=False):
            if not value.requires_grad:
                continue
            # Avoid duplicating parameters
            if value in memo:
                continue
            memo.add(value)
            lr = cfg.SOLVER.BASE_LR
            weight_decay = cfg.SOLVER.WEIGHT_DECAY
            if isinstance(module, norm_module_types):
                weight_decay = cfg.SOLVER.WEIGHT_DECAY_NORM
                if isinstance(module, NaiveSyncBatchNorm) or isinstance(module, MySyncBatchNorm):
                    lr = cfg.SOLVER.BASE_SYNCBN_LR
            elif key == "bias":
                lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR
                weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS
            params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}]

    optimizer = torch.optim.Adam(params, cfg.SOLVER.BASE_LR)
    optimizer = maybe_add_gradient_clipping(cfg, optimizer)
    return optimizer
def build_optimizer(cls, cfg, model):
    """
    Build an optimizer from config.
    """
    norm_module_types = (
        torch.nn.BatchNorm1d,
        torch.nn.BatchNorm2d,
        torch.nn.BatchNorm3d,
        torch.nn.SyncBatchNorm,
        # NaiveSyncBatchNorm inherits from BatchNorm2d
        torch.nn.GroupNorm,
        torch.nn.InstanceNorm1d,
        torch.nn.InstanceNorm2d,
        torch.nn.InstanceNorm3d,
        torch.nn.LayerNorm,
        torch.nn.LocalResponseNorm,
    )
    params: List[Dict[str, Any]] = []
    memo: Set[torch.nn.parameter.Parameter] = set()
    for name, _ in model.named_modules():
        print(name)
    for name, module in model.named_modules():
        for key, value in module.named_parameters(recurse=False):
            if not value.requires_grad:
                continue
            # Avoid duplicating parameters
            if value in memo:
                continue
            memo.add(value)
            lr = cfg.SOLVER.BASE_LR
            weight_decay = cfg.SOLVER.WEIGHT_DECAY
            optimizer_name = "SGD"
            if isinstance(module, norm_module_types):
                weight_decay = cfg.SOLVER.WEIGHT_DECAY_NORM
            elif key == "bias":
                # NOTE: unlike Detectron v1, we now default BIAS_LR_FACTOR to 1.0
                # and WEIGHT_DECAY_BIAS to WEIGHT_DECAY so that bias optimizer
                # hyperparameters are by default exactly the same as for regular
                # weights.
                lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR
                weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS
            if "bottom_up" in name:
                lr = lr * cfg.SOLVER.BOTTOM_UP_MULTIPLIER
            elif "transformer" in name:
                lr = lr * cfg.SOLVER.TRANSFORMER_MULTIPLIER
                optimizer_name = "ADAMW"
            params += [{
                "params": [value],
                "lr": lr,
                "weight_decay": weight_decay,
                "optimizer": optimizer_name,
            }]

    optimizer_type = cfg.SOLVER.OPTIMIZER
    if optimizer_type == "SGD":
        optimizer = torch.optim.SGD(params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM)
    elif optimizer_type == "ADAMW":
        optimizer = torch.optim.AdamW(params, cfg.SOLVER.BASE_LR)
    elif optimizer_type == "HYBRID":
        optimizer = HybridOptimizer(params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM)
    else:
        raise NotImplementedError(f"no optimizer type {optimizer_type}")
    if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model":
        optimizer = maybe_add_gradient_clipping(cfg, optimizer)
    return optimizer
def modify_optimizer_C4(cfg, model, train_only_weak=False, freezed_params=None):
    """
    Build an optimizer from config.
    """
    if freezed_params is None:
        freezed_params = []
    norm_module_types = (
        torch.nn.BatchNorm1d,
        torch.nn.BatchNorm2d,
        torch.nn.BatchNorm3d,
        torch.nn.SyncBatchNorm,
        # NaiveSyncBatchNorm inherits from BatchNorm2d
        torch.nn.GroupNorm,
        torch.nn.InstanceNorm1d,
        torch.nn.InstanceNorm2d,
        torch.nn.InstanceNorm3d,
        torch.nn.LayerNorm,
        torch.nn.LocalResponseNorm,
    )
    params: List[Dict[str, Any]] = []
    memo: Set[torch.nn.parameter.Parameter] = set()
    multi_box_head = cfg.MODEL.ROI_HEADS.MULTI_BOX_HEAD
    for module_name, module in model.named_modules():
        for key, value in module.named_parameters(recurse=False):
            if not value.requires_grad:
                if module_name not in freezed_params:
                    continue
            # Avoid duplicating parameters
            if value in memo:
                continue
            memo.add(value)
            lr = cfg.SOLVER.BASE_LR
            weight_decay = cfg.SOLVER.WEIGHT_DECAY
            if isinstance(module, norm_module_types):
                weight_decay = cfg.SOLVER.WEIGHT_DECAY_NORM
            elif key == "bias":
                # NOTE: unlike Detectron v1, we now default BIAS_LR_FACTOR to 1.0
                # and WEIGHT_DECAY_BIAS to WEIGHT_DECAY so that bias optimizer
                # hyperparameters are by default exactly the same as for regular
                # weights.
                lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR
                weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS
            if 'oicr_predictors' in module_name or 'regression_branch' in module_name:
                logging.getLogger('detectron2').log(
                    logging.INFO,
                    "Setting learning rate of {} to {}".format(
                        module_name, lr * cfg.SOLVER.REFINEMENT_LR_FACTOR))
                lr = lr * cfg.SOLVER.REFINEMENT_LR_FACTOR
            if 'classifier_stream' in module_name or 'detection_stream' in module_name:
                logging.getLogger('detectron2').log(
                    logging.INFO,
                    "Setting learning rate of {} to {}".format(
                        module_name, lr * cfg.SOLVER.MIL_LR_FACTOR))
                lr = lr * cfg.SOLVER.MIL_LR_FACTOR
            if 'cls_score_delta' in module_name or 'bbox_pred_delta' in module_name:
                logging.getLogger('detectron2').log(
                    logging.INFO,
                    "Setting learning rate of {} to {}".format(
                        module_name, lr * cfg.SOLVER.DELTA_LR_FACTOR))
                lr = lr * cfg.SOLVER.DELTA_LR_FACTOR
            if train_only_weak:
                if 'roi_heads' in module_name:
                    if 'weak' not in module_name:
                        if 'box_head' in module_name:
                            if multi_box_head:
                                value.requires_grad = False
                                freezed_params.append(module_name)
                                continue
                        else:
                            value.requires_grad = False
                            freezed_params.append(module_name)
                            continue
                    else:
                        value.requires_grad = True
            params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}]

    optimizer = torch.optim.SGD(
        params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM, nesterov=cfg.SOLVER.NESTEROV
    )
    optimizer = maybe_add_gradient_clipping(cfg, optimizer)
    return optimizer, freezed_params
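# Hedged wiring sketch: builders like the ones above are typically picked up by
# overriding build_optimizer on a detectron2 DefaultTrainer subclass. The
# Trainer class name is an assumption; the cfg keys follow detectron2 defaults
# and maybe_add_gradient_clipping is used in the class-wrapping form that some
# of the snippets above already rely on.
import torch
from detectron2.engine import DefaultTrainer
from detectron2.solver.build import get_default_optimizer_params, maybe_add_gradient_clipping

class Trainer(DefaultTrainer):
    @classmethod
    def build_optimizer(cls, cfg, model):
        params = get_default_optimizer_params(
            model,
            base_lr=cfg.SOLVER.BASE_LR,
            weight_decay=cfg.SOLVER.WEIGHT_DECAY,
        )
        # wrap the optimizer class so cfg-driven gradient clipping applies
        return maybe_add_gradient_clipping(cfg, torch.optim.SGD)(
            params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM
        )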