def __init__(self, context: PyTorchTrialContext) -> None:
    """Prepare data, model, optimizer, LR scheduler and gradient clipping.

    Args:
        context: Trial context that provides the hyperparameters, the
            distributed rank/size information and the ``wrap_*`` helpers
            used below.
    """
    self.context = context
    self.hparams = AttrDict(self.context.get_hparams())
    hparams = self.hparams

    # With the "local" backend the dataset has to be present on disk.
    if hparams.backend == "local":
        data_dir = hparams.data_dir
        if self.context.distributed.get_local_rank() != 0:
            # Every slot other than local rank 0 polls (every 10 s) for the
            # done.txt marker before continuing.
            # NOTE(review): nothing in this method writes done.txt --
            # presumably download_coco_from_source does; confirm, otherwise
            # these slots never leave the loop when the data already exists.
            done_marker = os.path.join(data_dir, "done.txt")
            while not os.path.isfile(done_marker):
                time.sleep(10)
        else:
            # Local rank 0 downloads unless both split folders are there.
            split_present = [
                os.path.isdir(os.path.join(data_dir, split))
                for split in ("train2017", "val2017")
            ]
            if not all(split_present):
                download_coco_from_source(data_dir)

    # Build the model together with its loss and evaluation postprocessors.
    world_size = self.context.distributed.get_size()
    model, self.criterion, self.postprocessors = build_model(
        hparams, world_size=world_size)
    self.model = self.context.wrap_model(model)

    # Report the number of trainable parameters of the unwrapped model.
    trainable_count = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_count += param.numel()
    print("number of params:", trainable_count)

    # Split trainable parameters into two optimizer groups: parameters whose
    # name contains "backbone" get their own learning rate, the rest use the
    # optimizer default.
    backbone_params = []
    other_params = []
    for name, param in self.model.named_parameters():
        if not param.requires_grad:
            continue
        if "backbone" in name:
            backbone_params.append(param)
        else:
            other_params.append(param)
    param_groups = [
        {"params": other_params},
        {"params": backbone_params, "lr": hparams.lr_backbone},
    ]

    base_optimizer = torch.optim.AdamW(
        param_groups,
        lr=hparams.lr,
        weight_decay=hparams.weight_decay,
    )
    self.optimizer = self.context.wrap_optimizer(base_optimizer)

    # StepLR with step size hparams.lr_drop, advanced once per epoch.
    step_scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer,
                                                     hparams.lr_drop)
    self.lr_scheduler = self.context.wrap_lr_scheduler(
        step_scheduler,
        step_mode=LRScheduler.StepMode.STEP_EVERY_EPOCH,
    )

    def _clip_grads(x):
        # Always callable: clips to clip_max_norm when it is positive and
        # otherwise does nothing and returns None. The threshold is read
        # from self.hparams at call time.
        if self.hparams.clip_max_norm > 0:
            return torch.nn.utils.clip_grad_norm_(
                x, self.hparams.clip_max_norm)
        return None

    self.clip_grads_fn = _clip_grads
def __init__(self, context: PyTorchTrialContext) -> None:
    """Prepare data, (optionally warm-started) model, optimizer and scheduler.

    Args:
        context: Trial context that provides the hyperparameters, the
            distributed size information and the ``wrap_*`` helpers used
            below.
    """
    self.context = context
    self.hparams = AttrDict(self.context.get_hparams())
    hparams = self.hparams

    # With the "local" backend the dataset has to be present on disk.
    if hparams.backend == "local":
        data_dir = hparams.data_dir
        lock_path = os.path.join(data_dir, "download.lock")
        # The file lock serializes workers sharing data_dir, so only one of
        # them performs the download; the others find the folders in place.
        # NOTE(review): assumes data_dir already exists so the lock file can
        # be created -- confirm.
        with filelock.FileLock(lock_path):
            split_present = [
                os.path.isdir(os.path.join(data_dir, split))
                for split in ("train2017", "val2017")
            ]
            if not all(split_present):
                download_coco_from_source(data_dir)

    self.cat_ids = []

    # Build the model together with its loss and evaluation postprocessors.
    world_size = self.context.distributed.get_size()
    model, self.criterion, self.postprocessors = build_model(
        hparams, world_size=world_size)

    # Optionally start from the published DETR checkpoint.
    if "warmstart" in hparams and hparams.warmstart:
        checkpoint = torch.hub.load_state_dict_from_url(
            url="https://dl.fbaipublicfiles.com/detr/detr-r50-e632da11.pth",
            map_location="cpu",
            check_hash=True,
        )
        state_dict = checkpoint["model"]
        # When fine-tuning on a category subset the classification head is
        # dropped from the checkpoint; strict=False below tolerates the
        # missing keys.
        if "cat_ids" in hparams and len(hparams.cat_ids) > 0:
            for head_key in ("class_embed.weight", "class_embed.bias"):
                del state_dict[head_key]
        model.load_state_dict(state_dict, strict=False)

    self.model = self.context.wrap_model(model)

    # Report the number of trainable parameters of the unwrapped model.
    trainable_count = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_count += param.numel()
    print("number of params:", trainable_count)

    # Split trainable parameters into two optimizer groups: parameters whose
    # name contains "backbone" get their own learning rate, the rest use the
    # optimizer default.
    backbone_params = []
    other_params = []
    for name, param in self.model.named_parameters():
        if not param.requires_grad:
            continue
        if "backbone" in name:
            backbone_params.append(param)
        else:
            other_params.append(param)
    param_groups = [
        {"params": other_params},
        {"params": backbone_params, "lr": hparams.lr_backbone},
    ]

    base_optimizer = torch.optim.AdamW(
        param_groups,
        lr=hparams.lr,
        weight_decay=hparams.weight_decay,
    )
    self.optimizer = self.context.wrap_optimizer(base_optimizer)

    # StepLR with step size hparams.lr_drop, advanced once per epoch.
    step_scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer,
                                                     hparams.lr_drop)
    self.lr_scheduler = self.context.wrap_lr_scheduler(
        step_scheduler,
        step_mode=LRScheduler.StepMode.STEP_EVERY_EPOCH,
    )

    def _clip_grads(x):
        # Always callable: clips to clip_max_norm when it is positive and
        # otherwise does nothing and returns None. The threshold is read
        # from self.hparams at call time.
        if self.hparams.clip_max_norm > 0:
            return torch.nn.utils.clip_grad_norm_(
                x, self.hparams.clip_max_norm)
        return None

    self.clip_grads_fn = _clip_grads
def __init__(self, context: PyTorchTrialContext) -> None:
    """Prepare data, (optionally warm-started) model, optimizer and scheduler.

    Args:
        context: Trial context that provides the hyperparameters, the
            distributed rank/size information and the ``wrap_*`` helpers
            used below.
    """
    self.context = context
    self.hparams = AttrDict(self.context.get_hparams())
    hparams = self.hparams

    # With the "local" backend the dataset has to be present on disk.
    if hparams.backend == "local":
        data_dir = hparams.data_dir
        if self.context.distributed.get_local_rank() == 0:
            # Local rank 0 downloads unless both split folders are there.
            if not all(
                    os.path.isdir(os.path.join(data_dir, split))
                    for split in ("train2017", "val2017")):
                download_coco_from_source(data_dir)
        else:
            # Every other slot polls (every 10 s) for the done.txt marker.
            # NOTE(review): nothing in this method writes done.txt --
            # presumably download_coco_from_source does; confirm, otherwise
            # these slots never leave the loop when the data already exists.
            done_marker = os.path.join(data_dir, "done.txt")
            while not os.path.isfile(done_marker):
                time.sleep(10)

    self.cat_ids = []

    # Build the model together with its loss and evaluation postprocessors.
    model, self.criterion, self.postprocessors = build_model(
        hparams, world_size=self.context.distributed.get_size())

    # Load pretrained weights downloaded in the startup-hook.sh from the
    # original repo.
    if "warmstart" in hparams and hparams.warmstart:
        # map_location="cpu" keeps the load independent of the device the
        # checkpoint was saved from: without it the tensors are restored to
        # their original device, which fails on hosts lacking that device
        # and makes every worker allocate on the same GPU.
        # load_state_dict copies the values into the model afterwards.
        # SECURITY: torch.load unpickles the file; only use checkpoints
        # from a trusted source.
        checkpoint = torch.load("model.ckpt", map_location="cpu")
        ckpt = checkpoint["model"]
        # When fine-tuning on a category subset, drop every classification
        # head entry; strict=False below tolerates the missing keys.
        if "cat_ids" in hparams and len(hparams.cat_ids):
            for head_key in [k for k in ckpt if "class_embed" in k]:
                del ckpt[head_key]
        model.load_state_dict(ckpt, strict=False)

    self.model = self.context.wrap_model(model)

    n_parameters = sum(
        p.numel() for p in self.model.parameters() if p.requires_grad)
    print("number of params:", n_parameters)

    # Classify each trainable parameter once by name. The two keyword checks
    # are applied independently, exactly like the three separate filters
    # they replace, so a name matching both lists still lands in both the
    # backbone and the linear-projection group.
    base_params = []
    backbone_params = []
    linear_proj_params = []
    for name, param in self.model.named_parameters():
        if not param.requires_grad:
            continue
        in_backbone = match_name_keywords(name, hparams.lr_backbone_names)
        in_linear_proj = match_name_keywords(name,
                                             hparams.lr_linear_proj_names)
        if in_backbone:
            backbone_params.append(param)
        if in_linear_proj:
            linear_proj_params.append(param)
        if not in_backbone and not in_linear_proj:
            base_params.append(param)

    param_dicts = [
        {"params": base_params, "lr": hparams.lr},
        {"params": backbone_params, "lr": hparams.lr_backbone},
        {
            "params": linear_proj_params,
            "lr": hparams.lr * hparams.lr_linear_proj_mult,
        },
    ]

    # hparams.sgd selects SGD with momentum 0.9; otherwise AdamW is used.
    if hparams.sgd:
        base_optimizer = torch.optim.SGD(
            param_dicts,
            lr=hparams.lr,
            momentum=0.9,
            weight_decay=hparams.weight_decay,
        )
    else:
        base_optimizer = torch.optim.AdamW(
            param_dicts,
            lr=hparams.lr,
            weight_decay=hparams.weight_decay,
        )
    self.optimizer = self.context.wrap_optimizer(base_optimizer)

    # StepLR with step size hparams.lr_drop, advanced once per epoch.
    self.lr_scheduler = self.context.wrap_lr_scheduler(
        torch.optim.lr_scheduler.StepLR(self.optimizer, hparams.lr_drop),
        step_mode=LRScheduler.StepMode.STEP_EVERY_EPOCH,
    )

    # Always a callable: it clips to clip_max_norm when that is positive and
    # otherwise does nothing and returns None. The threshold is read from
    # self.hparams at call time.
    self.clip_grads_fn = lambda x: (
        torch.nn.utils.clip_grad_norm_(x, self.hparams.clip_max_norm)
        if self.hparams.clip_max_norm > 0 else None)