def _get_logdir(self, config: Dict) -> str:
    timestamp = utils.get_utcnow_time()
    config_hash = utils.get_short_hash(config)
    logdir = f"{timestamp}.{config_hash}"
    distributed_rank = utils.get_rank()
    if distributed_rank > -1:
        logdir = f"{logdir}.rank{distributed_rank:02d}"
    return logdir
def objective(trial: optuna.Trial) -> float:
    trial, trial_config = _process_trial_config(trial, config.copy())
    experiment, runner, trial_config = utils.prepare_config_api_components(
        expdir=expdir, config=trial_config
    )
    # @TODO: here we need a better solution.
    experiment._trial = trial  # noqa: WPS437

    if experiment.logdir is not None and utils.get_rank() <= 0:
        utils.dump_environment(trial_config, experiment.logdir, args.configs)
        utils.dump_code(args.expdir, experiment.logdir)

    runner.run_experiment(experiment)

    return runner.best_valid_metrics[runner.main_metric]
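# Usage sketch (illustrative, not from the source): `objective` above is meant to be
# handed to an Optuna study. The study direction and trial count here are assumed
# values; the direction depends on whether `runner.main_metric` should be minimized
# or maximized.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
print(study.best_params)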
def _init(
    self,
    log_on_batch_end: bool = False,
    log_on_epoch_end: bool = True,
):
    self.log_on_batch_end = log_on_batch_end
    self.log_on_epoch_end = log_on_epoch_end
    self.is_distributed_worker = utils.get_rank() > 0

    if (self.log_on_batch_end and not self.log_on_epoch_end) \
            or (not self.log_on_batch_end and self.log_on_epoch_end):
        # only one logging mode is enabled, so metric names need no suffix
        self.batch_log_suffix = ""
        self.epoch_log_suffix = ""
    else:
        # both batch- and epoch-level logging: suffixes disambiguate metric names
        self.batch_log_suffix = "_batch"
        self.epoch_log_suffix = "_epoch"
def main_worker(args, unknown_args):
    """Runs the main worker thread for model training."""
    args, config = utils.parse_args_uargs(args, unknown_args)
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    config.setdefault("distributed_params", {})["apex"] = args.apex
    config.setdefault("distributed_params", {})["amp"] = args.amp

    experiment, runner, config = utils.prepare_config_api_components(
        expdir=Path(args.expdir), config=config
    )

    if experiment.logdir is not None and utils.get_rank() <= 0:
        utils.dump_environment(config, experiment.logdir, args.configs)
        utils.dump_code(args.expdir, experiment.logdir)

    runner.run_experiment(experiment)
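# Invocation sketch (hypothetical values): `main_worker` only touches the argument
# attributes used above, so a namespace like this can drive it outside the CLI entry
# point. Note that `utils.parse_args_uargs` may expect additional fields.
from argparse import Namespace

args = Namespace(
    expdir="src",
    configs=["configs/train.yml"],
    seed=42,
    deterministic=False,
    benchmark=True,
    apex=False,
    amp=False,
)
main_worker(args, unknown_args=[])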
def _get_optimizer(
    self, stage: str, model: Union[Model, Dict[str, Model]], **params
) -> Optimizer:
    # @TODO 1: refactoring; this method is too long
    # @TODO 2: load state dicts for schedulers & criterion
    layerwise_params = params.pop("layerwise_params", OrderedDict())
    no_bias_weight_decay = params.pop("no_bias_weight_decay", True)

    # linear scaling rule from https://arxiv.org/pdf/1706.02677.pdf
    lr_scaling_params = params.pop("lr_linear_scaling", None)
    if lr_scaling_params:
        data_params = dict(self.stages_config[stage]["data_params"])
        batch_size = data_params.get("batch_size")
        per_gpu_scaling = data_params.get("per_gpu_scaling", False)
        distributed_rank = utils.get_rank()
        distributed = distributed_rank > -1
        if per_gpu_scaling and not distributed:
            num_gpus = max(1, torch.cuda.device_count())
            batch_size *= num_gpus

        base_lr = lr_scaling_params.get("lr")
        base_batch_size = lr_scaling_params.get("base_batch_size", 256)
        lr_scaling = batch_size / base_batch_size
        params["lr"] = base_lr * lr_scaling  # scale default lr
    else:
        lr_scaling = 1.0

    # getting model parameters
    model_key = params.pop("_model", None)
    if model_key is None:
        assert isinstance(
            model, nn.Module
        ), "model is key-value, but optimizer has no specified model"
        model_params = utils.process_model_params(
            model, layerwise_params, no_bias_weight_decay, lr_scaling
        )
    elif isinstance(model_key, str):
        model_params = utils.process_model_params(
            model[model_key],
            layerwise_params,
            no_bias_weight_decay,
            lr_scaling,
        )
    elif isinstance(model_key, (list, tuple)):
        model_params = []
        for model_key_ in model_key:
            model_params_ = utils.process_model_params(
                model[model_key_],
                layerwise_params,
                no_bias_weight_decay,
                lr_scaling,
            )
            model_params.extend(model_params_)
    else:
        raise ValueError("unknown type of model_params")

    load_from_previous_stage = params.pop("load_from_previous_stage", False)
    optimizer_key = params.pop("optimizer_key", None)
    optimizer = OPTIMIZERS.get_from_params(**params, params=model_params)

    if load_from_previous_stage and self.stages.index(stage) != 0:
        checkpoint_path = f"{self.logdir}/checkpoints/best_full.pth"
        checkpoint = utils.load_checkpoint(checkpoint_path)

        dict2load = optimizer
        if optimizer_key is not None:
            dict2load = {optimizer_key: optimizer}
        utils.unpack_checkpoint(checkpoint, optimizer=dict2load)

        # move optimizer to device
        device = utils.get_device()
        for param in model_params:
            param = param["params"][0]
            state = optimizer.state[param]
            for key, value in state.items():
                state[key] = utils.any2device(value, device)

        # update optimizer params
        for key, value in params.items():
            for pg in optimizer.param_groups:
                pg[key] = value

    return optimizer
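# Worked example of the linear scaling rule applied above (assumed numbers, not from
# the source): with `lr_linear_scaling: {lr: 0.1, base_batch_size: 256}` and an
# effective batch size of 512, the optimizer lr is doubled.
base_lr, base_batch_size, batch_size = 0.1, 256, 512
scaled_lr = base_lr * (batch_size / base_batch_size)  # 0.1 * 2.0 -> 0.2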
def get_loaders(
    self, stage: str, epoch: int = None,
) -> "OrderedDict[str, DataLoader]":
    """Returns the loaders for a given stage."""
    data_params = dict(self.stages_config[stage]["data_params"])
    default_batch_size = data_params.pop("batch_size", 1)
    default_num_workers = data_params.pop("num_workers")
    drop_last = data_params.pop("drop_last", False)
    per_gpu_scaling = data_params.pop("per_gpu_scaling", False)
    distributed_rank = utils.get_rank()
    distributed = distributed_rank > -1

    # per-loader and per-sampler overrides are removed from `data_params`
    # before it is forwarded to `get_datasets`
    overridden_loaders_params = data_params.pop("loaders_params", {})
    assert isinstance(overridden_loaders_params, dict), (
        f"`loaders_params` should be a Dict. "
        f"Got: {overridden_loaders_params}"
    )
    samplers_params = data_params.pop("samplers_params", {})
    assert isinstance(
        samplers_params, dict
    ), f"`samplers_params` should be a Dict. Got: {samplers_params}"

    datasets = self.get_datasets(stage=stage, **data_params)

    loaders = OrderedDict()
    for name, ds_ in datasets.items():
        assert isinstance(
            ds_, (Dataset, dict)
        ), f"{ds_} should be Dataset or Dict"

        overridden_loader_params = overridden_loaders_params.pop(name, {})
        assert isinstance(
            overridden_loader_params, dict
        ), f"{overridden_loader_params} should be Dict"

        sampler_params = samplers_params.pop(name, None)
        if sampler_params is None:
            if isinstance(ds_, dict) and "sampler" in ds_:
                sampler = ds_.pop("sampler", None)
            else:
                sampler = None
        else:
            sampler = SAMPLERS.get_from_params(**sampler_params)
            if isinstance(ds_, dict) and "sampler" in ds_:
                ds_.pop("sampler", None)

        batch_size = overridden_loader_params.pop(
            "batch_size", default_batch_size
        )
        num_workers = overridden_loader_params.pop(
            "num_workers", default_num_workers
        )

        if per_gpu_scaling and not distributed:
            num_gpus = max(1, torch.cuda.device_count())
            batch_size *= num_gpus
            num_workers *= num_gpus

        loader_params = {
            "batch_size": batch_size,
            "num_workers": num_workers,
            "pin_memory": torch.cuda.is_available(),
            "drop_last": drop_last,
            **overridden_loader_params,
        }

        if isinstance(ds_, Dataset):
            loader_params["dataset"] = ds_
        elif isinstance(ds_, dict):
            assert (
                "dataset" in ds_
            ), "You need to specify dataset for dataloader"
            loader_params = utils.merge_dicts(ds_, loader_params)
        else:
            raise NotImplementedError

        if distributed:
            if sampler is not None:
                if not isinstance(sampler, DistributedSampler):
                    sampler = DistributedSamplerWrapper(sampler=sampler)
            else:
                sampler = DistributedSampler(dataset=loader_params["dataset"])

        loader_params["shuffle"] = name.startswith("train") and sampler is None
        loader_params["sampler"] = sampler

        if "batch_sampler" in loader_params:
            if distributed:
                raise ValueError(
                    "batch_sampler option is mutually exclusive with distributed"
                )
            for k in ("batch_size", "shuffle", "sampler", "drop_last"):
                loader_params.pop(k, None)

        if "worker_init_fn" not in loader_params:
            loader_params["worker_init_fn"] = lambda x: utils.set_global_seed(
                self.initial_seed + x
            )

        loaders[name] = DataLoader(**loader_params)

    return loaders
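# Configuration sketch for `get_loaders` (hypothetical values): the top-level keys are
# the stage-wide defaults popped above, while `loaders_params` overrides settings for
# individual loaders by name; any remaining keys are forwarded to `get_datasets`.
data_params = {
    "batch_size": 64,          # default for every loader
    "num_workers": 4,
    "per_gpu_scaling": True,
    "loaders_params": {"valid": {"batch_size": 256}},  # per-loader override
}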