def filter_callbacks_by_node(
    callbacks: Union[Dict, OrderedDict]
) -> Union[Dict, OrderedDict]:
    """
    Filters callbacks based on running node.

    Deletes worker-only callbacks from ``CallbackNode.Master``
    and master-only callbacks from ``CallbackNode.Worker``.

    Args:
        callbacks (Union[Dict, OrderedDict]): callbacks

    Returns:
        Union[Dict, OrderedDict]: filtered callbacks dictionary.
    """
    # distributed run setting
    output = callbacks.copy()
    rank = get_rank()
    if rank == 0:  # master node
        # remove worker-only callbacks on master node
        for k in list(filter(lambda c: output[c].node == CallbackNode.worker, output)):
            del output[k]
    elif rank > 0:  # worker node
        # remove master-only callbacks on worker nodes
        for k in list(filter(lambda c: output[c].node == CallbackNode.master, output)):
            del output[k]
    return output
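# A minimal usage sketch. ``filter_callbacks_by_node`` only inspects the ``.node``
# attribute of each callback, so lightweight stand-ins are enough to illustrate it
# (the callback names below are hypothetical, and the CallbackNode import path
# may differ between catalyst versions).
from collections import OrderedDict
from types import SimpleNamespace

from catalyst.core import CallbackNode

callbacks = OrderedDict(
    logger=SimpleNamespace(node=CallbackNode.master),  # master-only stand-in
    timer=SimpleNamespace(node=CallbackNode.worker),   # worker-only stand-in
)
# On rank 0 the worker-only "timer" entry is dropped, on rank > 0 the
# master-only "logger" entry is dropped, and without DDP (rank == -1)
# the dictionary is returned unchanged.
filtered = filter_callbacks_by_node(callbacks)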
def do_lr_linear_scaling(
    lr_scaling_params, batch_size: int, per_gpu_scaling: bool
) -> Tuple[float, float]:
    """
    Linear scaling rule from https://arxiv.org/pdf/1706.02677.pdf

    Args:
        lr_scaling_params: config parameters of lr linear scaling
        batch_size: batch size
        per_gpu_scaling: per-gpu-scaling flag

    Returns:
        lr, lr_scaling
    """
    distributed_rank = get_rank()
    distributed = distributed_rank > -1

    if per_gpu_scaling and not distributed:
        num_gpus = max(1, torch.cuda.device_count())
        batch_size *= num_gpus

    base_lr = lr_scaling_params.get("lr")
    base_batch_size = lr_scaling_params.get("base_batch_size", 256)
    lr_scaling = batch_size / base_batch_size
    lr = base_lr * lr_scaling  # scale default lr
    return lr, lr_scaling
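# Worked example of the linear scaling rule (values are illustrative):
# with base_lr=0.1 and base_batch_size=256, a global batch size of 1024
# gives lr_scaling = 1024 / 256 = 4.0 and lr = 0.1 * 4.0 = 0.4.
lr, lr_scaling = do_lr_linear_scaling(
    lr_scaling_params={"lr": 0.1, "base_batch_size": 256},
    batch_size=1024,
    per_gpu_scaling=False,
)
assert lr_scaling == 4.0 and abs(lr - 0.4) < 1e-9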
def validate_loaders(loaders: Dict[str, DataLoader]) -> Dict[str, DataLoader]:
    """
    Checks pytorch dataloaders for distributed setup.
    Transfers them to distributed mode if necessary.
    (Experimental feature)

    Args:
        loaders (Dict[str, DataLoader]): dictionary with pytorch dataloaders

    Returns:
        Dict[str, DataLoader]: dictionary with pytorch dataloaders
            (with distributed samplers if necessary)
    """
    from catalyst.data.sampler import DistributedSamplerWrapper

    rank = get_rank()
    if rank >= 0:
        for key, value in loaders.items():
            if not isinstance(
                value.sampler, (DistributedSampler, DistributedSamplerWrapper)
            ):
                warnings.warn(
                    "With distributed training setup, "
                    "you need ``DistributedSampler`` for your ``DataLoader``."
                )
                # loaders[key] = _force_make_distributed_loader(value)
    return loaders
def reset(self, num_batches: int, num_samples: int) -> None:
    """
    Reset metrics fields.

    Args:
        num_batches: expected number of batches
        num_samples: expected number of samples to accumulate
    """
    super().reset(num_batches, num_samples)
    assert get_rank() < 0, "No DDP support implemented yet"
def __init__(self, compute_on_call: bool = True, prefix: str = None, suffix: str = None):
    """Init."""
    super().__init__(compute_on_call=compute_on_call, prefix=prefix, suffix=suffix)
    self.metric_name = f"{self.prefix}auc{self.suffix}"
    self.scores = []
    self.targets = []
    self._is_ddp = get_rank() > -1
def config_main(args, unknown_args):
    """Yaml config catalyst-dl run entry point."""
    args, config = parse_args_uargs(args, unknown_args)
    set_global_seed(args.seed)
    prepare_cudnn(args.deterministic, args.benchmark)

    runner: ConfigRunner = get_config_runner(expdir=args.expdir, config=config)

    if get_rank() <= 0:
        dump_environment(logdir=runner.logdir, config=config, configs_path=args.configs)
        dump_code(expdir=args.expdir, logdir=runner.logdir)

    runner.run()
def objective(trial: optuna.trial):
    trial, trial_config = _process_trial_config(trial, config.copy())
    runner: ConfigRunner = get_config_runner(expdir=Path(args.expdir), config=trial_config)
    # @TODO: here we need better solution.
    runner._trial = trial  # noqa: WPS437

    if get_rank() <= 0:
        dump_environment(logdir=runner.logdir, config=config, configs_path=args.configs)
        dump_code(expdir=args.expdir, logdir=runner.logdir)

    runner.run()
    return trial.best_score
def objective(trial: optuna.trial):
    trial, trial_config = _process_trial_config(trial, config.copy())
    experiment, runner, trial_config = prepare_config_api_components(
        expdir=expdir, config=trial_config
    )
    # @TODO: here we need better solution.
    experiment._trial = trial  # noqa: WPS437

    if experiment.logdir is not None and get_rank() <= 0:
        dump_environment(trial_config, experiment.logdir, args.configs)
        dump_code(args.expdir, experiment.logdir)

    runner.run_experiment(experiment)
    return runner.best_valid_metrics[runner.main_metric]
def main_worker(cfg: DictConfig):
    set_global_seed(cfg.args.seed)
    prepare_cudnn(cfg.args.deterministic, cfg.args.benchmark)

    import_module(hydra.utils.to_absolute_path(cfg.args.expdir))

    experiment = hydra.utils.instantiate(cfg.experiment, cfg=cfg)
    runner = hydra.utils.instantiate(cfg.runner)

    if experiment.logdir is not None and get_rank() <= 0:
        dump_environment(cfg, experiment.logdir)
        dump_code(hydra.utils.to_absolute_path(cfg.args.expdir), experiment.logdir)

    runner.run_experiment(experiment)
def main_worker(args, unknown_args):
    """Runs the main worker thread for model training."""
    args, config = parse_args_uargs(args, unknown_args)
    set_global_seed(args.seed)
    prepare_cudnn(args.deterministic, args.benchmark)

    config.setdefault("distributed_params", {})["apex"] = args.apex
    config.setdefault("distributed_params", {})["amp"] = args.amp

    experiment, runner, config = prepare_config_api_components(
        expdir=Path(args.expdir), config=config
    )

    if experiment.logdir is not None and get_rank() <= 0:
        dump_environment(config, experiment.logdir, args.configs)
        dump_code(args.expdir, experiment.logdir)

    runner.run_experiment(experiment)
def main(cfg: DictConfig):
    """
    Hydra config catalyst-dl run entry point.

    Args:
        cfg (DictConfig): configuration
    """
    cfg = prepare_hydra_config(cfg)
    set_global_seed(cfg.args.seed)
    prepare_cudnn(cfg.args.deterministic, cfg.args.benchmark)

    import_module(hydra.utils.to_absolute_path(cfg.args.expdir))
    runner = hydra.utils.instantiate(cfg.runner, cfg=cfg)

    if get_rank() <= 0:
        dump_environment(logdir=runner.logdir, config=cfg)
        dump_code(
            expdir=hydra.utils.to_absolute_path(cfg.args.expdir),
            logdir=runner.logdir,
        )

    runner.run()
def _get_loader(
    dataset: Dataset,
    sampler: Sampler,
    initial_seed: int,
    params: DictConfig,
) -> DataLoader:
    params = OmegaConf.to_container(params, resolve=True)
    per_gpu_scaling = params.pop("per_gpu_scaling", False)
    params["dataset"] = dataset

    distributed_rank = get_rank()
    distributed = distributed_rank > -1

    if per_gpu_scaling and not distributed:
        num_gpus = max(1, torch.cuda.device_count())
        assert "batch_size" in params, "loader config must contain 'batch_size' key"
        assert "num_workers" in params, "loader config must contain 'num_workers' key"
        params["batch_size"] *= num_gpus
        params["num_workers"] *= num_gpus

    if distributed:
        if sampler is not None:
            if not isinstance(sampler, DistributedSampler):
                sampler = DistributedSamplerWrapper(sampler=sampler)
        else:
            sampler = DistributedSampler(dataset=params["dataset"])

    params["shuffle"] = params.get("shuffle", False) and sampler is None
    params["sampler"] = sampler

    worker_init_fn = params.pop("worker_init_fn", None)
    if worker_init_fn is None:
        params["worker_init_fn"] = lambda x: set_global_seed(initial_seed + x)
    else:
        params["worker_init_fn"] = hydra.utils.get_method(worker_init_fn)

    collate_fn = params.pop("collate_fn", None)
    if collate_fn is None:
        params["collate_fn"] = None
    else:
        params["collate_fn"] = hydra.utils.get_method(collate_fn)

    loader: DataLoader = DataLoader(**params)
    return loader
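# A usage sketch for the hydra-based loader factory above (dataset, shapes and
# parameter values are illustrative; assumes a single-process, non-distributed run).
import torch
from omegaconf import OmegaConf
from torch.utils.data import TensorDataset

dataset = TensorDataset(torch.randn(32, 4), torch.randint(0, 2, (32,)))
params = OmegaConf.create(
    {"batch_size": 8, "num_workers": 0, "shuffle": True, "per_gpu_scaling": False}
)
# sampler=None keeps plain shuffling; with DDP a DistributedSampler would be attached.
loader = _get_loader(dataset=dataset, sampler=None, initial_seed=42, params=params)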
def on_loader_end(self, runner: IRunner):
    eps = 1e-7
    ious_per_image = []

    # Gather statistics from all nodes
    all_gathered_scores_per_image = all_gather(self.scores_per_image)

    n = len(self.thresholds)
    all_scores_per_image = defaultdict(
        lambda: {"intersection": np.zeros(n), "union": np.zeros(n)}
    )
    for scores_per_image in all_gathered_scores_per_image:
        for image_id, values in scores_per_image.items():
            all_scores_per_image[image_id]["intersection"] += values["intersection"]
            all_scores_per_image[image_id]["union"] += values["union"]

    for image_id, values in all_scores_per_image.items():
        intersection = values["intersection"]
        union = values["union"]
        metric = intersection / (union + eps)
        ious_per_image.append(metric)

    thresholds = to_numpy(self.thresholds)
    iou = np.mean(ious_per_image, axis=0)
    assert len(iou) == len(thresholds)

    threshold_index = np.argmax(iou)
    iou_at_threshold = iou[threshold_index]
    threshold_value = thresholds[threshold_index]

    runner.loader_metrics[self.prefix + "/" + "threshold"] = float(threshold_value)
    runner.loader_metrics[self.prefix] = float(iou_at_threshold)

    if get_rank() in {-1, 0}:
        logger = get_tensorboard_logger(runner)
        logger.add_histogram(self.prefix, iou, global_step=runner.epoch)
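# A small numeric sketch of the threshold selection above (values are illustrative).
# Each row holds one image's IoU evaluated at several binarization thresholds;
# averaging over images and taking argmax picks the best global threshold.
import numpy as np

thresholds = np.array([0.3, 0.4, 0.5])
ious_per_image = np.array([
    [0.60, 0.72, 0.68],  # image 0
    [0.55, 0.70, 0.71],  # image 1
])
iou = ious_per_image.mean(axis=0)   # -> [0.575, 0.71, 0.695]
best = int(np.argmax(iou))
print(thresholds[best], iou[best])  # 0.4  0.71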
def process_components(
    model: Model,
    criterion: Criterion = None,
    optimizer: Optimizer = None,
    scheduler: Scheduler = None,
    distributed_params: Dict = None,
    device: Device = None,
) -> Tuple[Model, Criterion, Optimizer, Scheduler, Device]:
    """
    Returns the processed model, criterion, optimizer, scheduler and device.

    Args:
        model (Model): torch model
        criterion (Criterion): criterion function
        optimizer (Optimizer): optimizer
        scheduler (Scheduler): scheduler
        distributed_params (dict, optional): dict with the parameters
            for distributed and FP16 method
        device (Device, optional): device

    Returns:
        tuple with processed model, criterion, optimizer, scheduler and device.

    Raises:
        NotImplementedError: if model is not nn.Module or dict for multi-gpu,
            nn.ModuleDict for DataParallel not implemented yet
    """
    distributed_params = distributed_params or {}
    distributed_params = copy.deepcopy(distributed_params)
    distributed_params.update(get_distributed_params())

    if device is None:
        device = get_device()
    elif isinstance(device, str):
        device = torch.device(device)

    is_apex_available = distributed_params.pop("apex", True) and check_apex_available()

    model: Model = maybe_recursive_call(model, "to", device=device)

    if check_ddp_wrapped(model):
        pass
    # distributed data parallel run (ddp) (with apex support)
    elif get_rank() >= 0:
        assert isinstance(
            model, nn.Module
        ), "Distributed training is not available for KV model"

        local_rank = distributed_params.pop("local_rank", 0) or 0
        device = f"cuda:{local_rank}"
        model = maybe_recursive_call(model, "to", device=device)

        syncbn = distributed_params.pop("syncbn", False)

        if is_apex_available:
            import apex

            model, optimizer = initialize_apex(model, optimizer, **distributed_params)
            model = apex.parallel.DistributedDataParallel(model)

            if syncbn:
                model = apex.parallel.convert_syncbn_model(model)
        else:
            model = nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], output_device=local_rank
            )
    # data parallel run (dp) (with apex support)
    else:
        # apex issue https://github.com/deepset-ai/FARM/issues/210
        use_apex = (is_apex_available and torch.cuda.device_count() == 1) or (
            is_apex_available
            and torch.cuda.device_count() > 1
            and distributed_params.get("opt_level", "O0") == "O1"
        )

        if use_apex:
            assert isinstance(
                model, nn.Module
            ), "Apex training is not available for KV model"
            model, optimizer = initialize_apex(model, optimizer, **distributed_params)

        if torch.cuda.device_count() > 1 and device.type != "cpu" and device.index is None:
            if isinstance(model, nn.Module):
                model = nn.DataParallel(model)
            elif isinstance(model, dict):
                model = {k: nn.DataParallel(v) for k, v in model.items()}
            else:
                raise NotImplementedError()

    model: Model = maybe_recursive_call(model, "to", device=device)

    return model, criterion, optimizer, scheduler, device
def get_loaders_from_params(
    batch_size: int = 1,
    num_workers: int = 0,
    drop_last: bool = False,
    per_gpu_scaling: bool = False,
    loaders_params: Dict[str, Any] = None,
    samplers_params: Dict[str, Any] = None,
    initial_seed: int = 42,
    get_datasets_fn: Callable = None,
    **data_params,
) -> "OrderedDict[str, DataLoader]":
    """
    Creates pytorch dataloaders from datasets and additional parameters.

    Args:
        batch_size: ``batch_size`` parameter from ``torch.utils.data.DataLoader``
        num_workers: ``num_workers`` parameter from ``torch.utils.data.DataLoader``
        drop_last: ``drop_last`` parameter from ``torch.utils.data.DataLoader``
        per_gpu_scaling: boolean flag,
            if ``True``, uses ``batch_size=batch_size*num_available_gpus``
        loaders_params (Dict[str, Any]): additional loaders parameters
        samplers_params (Dict[str, Any]): additional sampler parameters
        initial_seed: initial seed for ``torch.utils.data.DataLoader`` workers
        get_datasets_fn (Callable): callable function to get dictionary with
            ``torch.utils.data.Datasets``
        **data_params: additional data parameters or dictionary with
            ``torch.utils.data.Datasets`` to use for pytorch dataloaders creation

    Returns:
        OrderedDict[str, DataLoader]: dictionary with ``torch.utils.data.DataLoader``

    Raises:
        NotImplementedError: if datasource is neither a ``Dataset`` nor a dict
        ValueError: if the ``batch_sampler`` option is used together with distributed training
    """
    from catalyst.data.sampler import DistributedSamplerWrapper

    default_batch_size = batch_size
    default_num_workers = num_workers
    loaders_params = loaders_params or {}
    assert isinstance(
        loaders_params, dict
    ), f"`loaders_params` should be a Dict. Got: {loaders_params}"
    samplers_params = samplers_params or {}
    assert isinstance(
        samplers_params, dict
    ), f"`samplers_params` should be a Dict. Got: {samplers_params}"

    distributed_rank = get_rank()
    distributed = distributed_rank > -1

    if get_datasets_fn is not None:
        datasets = get_datasets_fn(**data_params)
    else:
        datasets = dict(**data_params)

    loaders = OrderedDict()
    for name, datasource in datasets.items():  # noqa: WPS426
        assert isinstance(
            datasource, (Dataset, dict)
        ), f"{datasource} should be Dataset or Dict. Got: {datasource}"

        loader_params = loaders_params.pop(name, {})
        assert isinstance(loader_params, dict), f"{loader_params} should be Dict"

        sampler_params = samplers_params.pop(name, None)
        if sampler_params is None:
            if isinstance(datasource, dict) and "sampler" in datasource:
                sampler = datasource.pop("sampler", None)
            else:
                sampler = None
        else:
            sampler = SAMPLER.get_from_params(**sampler_params)
            if isinstance(datasource, dict) and "sampler" in datasource:
                datasource.pop("sampler", None)

        batch_size = loader_params.pop("batch_size", default_batch_size)
        num_workers = loader_params.pop("num_workers", default_num_workers)

        if per_gpu_scaling and not distributed:
            num_gpus = max(1, torch.cuda.device_count())
            batch_size *= num_gpus
            num_workers *= num_gpus

        loader_params = {
            "batch_size": batch_size,
            "num_workers": num_workers,
            "pin_memory": torch.cuda.is_available(),
            "drop_last": drop_last,
            **loader_params,
        }

        if isinstance(datasource, Dataset):
            loader_params["dataset"] = datasource
        elif isinstance(datasource, dict):
            assert "dataset" in datasource, "You need to specify dataset for dataloader"
            loader_params = merge_dicts(datasource, loader_params)
        else:
            raise NotImplementedError

        if distributed:
            if sampler is not None:
                if not isinstance(sampler, DistributedSampler):
                    sampler = DistributedSamplerWrapper(sampler=sampler)
            else:
                sampler = DistributedSampler(dataset=loader_params["dataset"])

        loader_params["shuffle"] = name.startswith("train") and sampler is None
        loader_params["sampler"] = sampler

        if "batch_sampler" in loader_params:
            if distributed:
                raise ValueError(
                    "batch_sampler option is mutually exclusive with distributed"
                )
            for k in ("batch_size", "shuffle", "sampler", "drop_last"):
                loader_params.pop(k, None)

        if "worker_init_fn" not in loader_params:
            loader_params["worker_init_fn"] = lambda x: set_global_seed(initial_seed + x)

        loaders[name] = DataLoader(**loader_params)

    return loaders
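# A usage sketch for the loader factory above (dataset shapes and parameter
# values are illustrative; assumes a single-process, non-distributed run).
import torch
from torch.utils.data import TensorDataset

train_ds = TensorDataset(torch.randn(64, 3), torch.randint(0, 2, (64,)))
valid_ds = TensorDataset(torch.randn(16, 3), torch.randint(0, 2, (16,)))

loaders = get_loaders_from_params(
    batch_size=8,
    num_workers=0,
    # per-loader override: only the train loader drops the last partial batch
    loaders_params={"train": {"drop_last": True}},
    train=train_ds,
    valid=valid_ds,
)
# "train" is shuffled (its name starts with "train" and no sampler was given);
# "valid" is not shuffled.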
def _get_optimizer(
    self, stage: str, model: Union[Model, Dict[str, Model]], **params
) -> Optimizer:
    # @TODO 1: refactoring; this method is too long
    # @TODO 2: load state dicts for schedulers & criterion
    layerwise_params = params.pop("layerwise_params", OrderedDict())
    no_bias_weight_decay = params.pop("no_bias_weight_decay", True)

    # linear scaling rule from https://arxiv.org/pdf/1706.02677.pdf
    lr_scaling_params = params.pop("lr_linear_scaling", None)
    if lr_scaling_params:
        data_params = dict(self.stages_config[stage]["data_params"])
        batch_size = data_params.get("batch_size")
        per_gpu_scaling = data_params.get("per_gpu_scaling", False)
        distributed_rank = get_rank()
        distributed = distributed_rank > -1
        if per_gpu_scaling and not distributed:
            num_gpus = max(1, torch.cuda.device_count())
            batch_size *= num_gpus

        base_lr = lr_scaling_params.get("lr")
        base_batch_size = lr_scaling_params.get("base_batch_size", 256)
        lr_scaling = batch_size / base_batch_size
        params["lr"] = base_lr * lr_scaling  # scale default lr
    else:
        lr_scaling = 1.0

    # getting model parameters
    model_key = params.pop("_model", None)
    if model_key is None:
        assert isinstance(
            model, nn.Module
        ), "model is key-value, but optimizer has no specified model"
        model_params = process_model_params(
            model, layerwise_params, no_bias_weight_decay, lr_scaling
        )
    elif isinstance(model_key, str):
        model_params = process_model_params(
            model[model_key], layerwise_params, no_bias_weight_decay, lr_scaling,
        )
    elif isinstance(model_key, (list, tuple)):
        model_params = []
        for model_key_el in model_key:
            model_params_el = process_model_params(
                model[model_key_el], layerwise_params, no_bias_weight_decay, lr_scaling,
            )
            model_params.extend(model_params_el)
    else:
        raise ValueError("unknown type of model_params")

    load_from_previous_stage = params.pop("load_from_previous_stage", False)
    optimizer_key = params.pop("optimizer_key", None)
    optimizer = OPTIMIZERS.get_from_params(**params, params=model_params)

    if load_from_previous_stage and self.stages.index(stage) != 0:
        checkpoint_path = f"{self.logdir}/checkpoints/best_full.pth"
        checkpoint = load_checkpoint(checkpoint_path)

        dict2load = optimizer
        if optimizer_key is not None:
            dict2load = {optimizer_key: optimizer}
        unpack_checkpoint(checkpoint, optimizer=dict2load)

        # move optimizer to device
        device = get_device()
        for param in model_params:
            param = param["params"][0]
            optimizer_state = optimizer.state[param]
            for state_key, state_value in optimizer_state.items():
                optimizer_state[state_key] = any2device(state_value, device)

        # update optimizer params
        for key, value in params.items():
            for optimizer_param_group in optimizer.param_groups:
                optimizer_param_group[key] = value

    return optimizer
def reset(self, num_batches, num_samples) -> None:
    """Resets all fields."""
    self._is_ddp = get_rank() > -1
    self.scores = []
    self.targets = []
def get_loaders_from_params(
    batch_size: int = 1,
    num_workers: int = 0,
    drop_last: bool = False,
    per_gpu_scaling: bool = False,
    loaders_params: Dict[str, Any] = None,
    samplers: "OrderedDict[str, Sampler]" = None,
    datasets: "OrderedDict[str, Union[Dataset, dict]]" = None,
    initial_seed: int = 42,
) -> "OrderedDict[str, DataLoader]":
    """
    Creates pytorch dataloaders from datasets and additional parameters.

    Args:
        batch_size: ``batch_size`` parameter from ``torch.utils.data.DataLoader``
        num_workers: ``num_workers`` parameter from ``torch.utils.data.DataLoader``
        drop_last: ``drop_last`` parameter from ``torch.utils.data.DataLoader``
        per_gpu_scaling: boolean flag, if ``True``, scales batch_size
            in proportion to the number of GPUs
        loaders_params: additional loaders parameters
        samplers: additional sampler parameters
        initial_seed: initial seed for ``torch.utils.data.DataLoader`` workers
        datasets: ordered dictionary with ``torch.utils.data.Dataset``

    Returns:
        OrderedDict[str, DataLoader]: dictionary with ``torch.utils.data.DataLoader``

    Raises:
        NotImplementedError: if datasource is neither a ``Dataset`` nor a dict
        ValueError: if the ``batch_sampler`` option is used together with distributed training
    """
    from catalyst.data.sampler import DistributedSamplerWrapper

    default_batch_size = batch_size
    default_num_workers = num_workers
    loaders_params = copy.deepcopy(loaders_params) or {}
    assert isinstance(
        loaders_params, dict
    ), f"`loaders_params` should be a Dict. Got: {loaders_params}"

    samplers = copy.deepcopy(samplers) or {}
    assert isinstance(samplers, dict), f"`samplers` should be a Dict. Got: {samplers}"

    datasets = datasets if datasets is not None else {}

    distributed_rank = get_rank()
    distributed = distributed_rank > -1

    loaders = OrderedDict()
    for name, datasource in datasets.items():  # noqa: WPS426
        assert isinstance(
            datasource, (Dataset, dict)
        ), f"{datasource} should be Dataset or Dict. Got: {datasource}"

        loader_params = loaders_params.pop(name, {})
        assert isinstance(loader_params, dict), f"{loader_params} should be Dict"

        sampler: Sampler = None
        if isinstance(datasource, dict) and "sampler" in datasource:
            sampler = datasource.pop("sampler", None)
        sampler = samplers.pop(name, sampler)

        batch_size = loader_params.pop("batch_size", default_batch_size)
        num_workers = loader_params.pop("num_workers", default_num_workers)

        if per_gpu_scaling and not distributed:
            num_gpus = max(1, torch.cuda.device_count())
            batch_size *= num_gpus
            num_workers *= num_gpus
        elif not per_gpu_scaling and distributed:
            world_size = get_distributed_params().pop("world_size", 1)
            if batch_size % world_size == 0:
                batch_size = int(batch_size / world_size)
            else:
                raise ValueError(
                    "For this distributed mode with per_gpu_scaling = False "
                    "you need to have batch_size divisible by number of GPUs"
                )

        loader_params = {
            "batch_size": batch_size,
            "num_workers": num_workers,
            "pin_memory": torch.cuda.is_available(),
            "drop_last": drop_last,
            **loader_params,
        }

        if isinstance(datasource, Dataset):
            loader_params["dataset"] = datasource
        elif isinstance(datasource, dict):
            assert "dataset" in datasource, "You need to specify dataset for dataloader"
            loader_params = merge_dicts(datasource, loader_params)
        else:
            raise NotImplementedError

        if distributed:
            if sampler is not None:
                if not isinstance(sampler, DistributedSampler):
                    sampler = DistributedSamplerWrapper(sampler=sampler)
            else:
                sampler = DistributedSampler(dataset=loader_params["dataset"])

        loader_params["shuffle"] = name.startswith("train") and sampler is None
        loader_params["sampler"] = sampler

        if "batch_sampler" in loader_params:
            if distributed:
                raise ValueError(
                    "batch_sampler option is mutually exclusive with distributed"
                )
            for k in ("batch_size", "shuffle", "sampler", "drop_last"):
                loader_params.pop(k, None)

        if "worker_init_fn" not in loader_params:
            loader_params["worker_init_fn"] = partial(
                _worker_init_fn, initial_seed=initial_seed
            )

        loaders[name] = DataLoader(**loader_params)

    return loaders
def reset(self) -> None:
    """Reset all the statistics."""
    self.statistics = defaultdict(self._mp_hack)
    self._is_ddp = get_rank() > -1
def process_components(
    model: RunnerModel,
    criterion: Criterion = None,
    optimizer: Optimizer = None,
    scheduler: Scheduler = None,
    distributed_params: Dict = None,
    device: Device = None,
) -> Tuple[RunnerModel, Criterion, Optimizer, Scheduler, Device]:
    """
    Returns the processed model, criterion, optimizer, scheduler and device.

    Args:
        model: torch model
        criterion: criterion function
        optimizer: optimizer
        scheduler: scheduler
        distributed_params (dict, optional): dict with the parameters
            for distributed and FP16 method
        device (Device, optional): device

    Returns:
        tuple with processed model, criterion, optimizer, scheduler and device.

    Raises:
        ValueError: if device is None and TPU is available; to use a TPU you
            need to manually move the model/optimizer/scheduler to the TPU
            device and pass that device to this function.
        NotImplementedError: if model is not nn.Module or dict for multi-gpu,
            nn.ModuleDict for DataParallel not implemented yet
    """
    distributed_params = distributed_params or {}
    distributed_params = copy.deepcopy(distributed_params)
    distributed_params.update(get_distributed_params())

    if device is None and IS_XLA_AVAILABLE:
        raise ValueError(
            "TPU device is available. "
            "Please move model, optimizer and scheduler (if present) "
            "to TPU device manually and specify a device or "
            "use CPU device."
        )

    if device is None:
        device = get_device()
    elif isinstance(device, str):
        device = torch.device(device)

    is_apex_enabled = distributed_params.get("apex", False) and check_apex_available()
    is_amp_enabled = distributed_params.get("amp", False) and check_amp_available()

    if is_apex_enabled and is_amp_enabled:
        raise ValueError(
            "Both NVidia Apex and Torch.Amp are enabled. "
            "You must choose only one mixed precision backend"
        )

    model: Model = maybe_recursive_call(model, "to", device=device)

    if check_ddp_wrapped(model):
        pass
    # distributed data parallel run (ddp) (with apex support)
    elif get_rank() >= 0:
        assert isinstance(
            model, nn.Module
        ), "Distributed training is not available for KV model"

        local_rank = distributed_params.pop("local_rank", 0) or 0
        device = f"cuda:{local_rank}"
        model = maybe_recursive_call(model, "to", device=device)

        syncbn = distributed_params.pop("syncbn", False)

        if is_apex_enabled:
            import apex

            if syncbn:
                model = apex.parallel.convert_syncbn_model(model)

            model, optimizer = initialize_apex(model, optimizer, **distributed_params)
            model = apex.parallel.DistributedDataParallel(model)
        else:
            if syncbn:
                model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

            model = nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], output_device=local_rank
            )
    # data parallel run (dp) (with apex support)
    else:
        is_data_parallel = (
            torch.cuda.device_count() > 1
            and device.type != "cpu"
            and device.index is None
        )

        if is_apex_enabled and not is_data_parallel:
            model, optimizer = initialize_apex(model, optimizer, **distributed_params)
        elif not is_apex_enabled and is_data_parallel:
            if isinstance(model, nn.Module):
                model = nn.DataParallel(model)
            elif isinstance(model, dict):
                model = {k: nn.DataParallel(v) for k, v in model.items()}
            else:
                raise NotImplementedError()
        elif is_apex_enabled and is_data_parallel:
            model, optimizer = _wrap_into_data_parallel_with_apex(
                model, optimizer, distributed_params
            )

    model: Model = maybe_recursive_call(model, "to", device=device)

    return model, criterion, optimizer, scheduler, device
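# A minimal usage sketch (single-GPU or CPU run, no Apex/AMP; the model and
# hyperparameters are illustrative stand-ins).
import torch
import torch.nn as nn

model = nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

model, criterion, optimizer, scheduler, device = process_components(
    model=model,
    criterion=nn.CrossEntropyLoss(),
    optimizer=optimizer,
    distributed_params={"apex": False, "amp": False},
)
# On a multi-GPU machine the model comes back wrapped in nn.DataParallel;
# under torch.distributed (rank >= 0) it is wrapped in DistributedDataParallel.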
def _prepare_inner_state(
    self,
    stage: str = SETTINGS.stage_infer_prefix,
    device: Device = None,
    model: RunnerModel = None,
    criterion: RunnerCriterion = None,
    optimizer: RunnerOptimizer = None,
    scheduler: RunnerScheduler = None,
    callbacks: Dict[str, "Callback"] = None,
    loaders: Dict[str, "DataLoader"] = None,
    logdir: str = None,
    num_epochs: int = 1,
    main_metric: str = "loss",
    minimize_metric: bool = True,
    valid_loader: str = SETTINGS.loader_valid_prefix,
    checkpoint_data: Dict = None,
    is_check_run: bool = False,
    verbose: bool = False,
    **kwargs,
):
    # @TODO: move/split this method to callbacks group
    # here should be only a small part of it

    # main runner components: model and device to run
    self.device: Device = device
    self.model: RunnerModel = model

    # experiment components,
    # use `catalyst.core.IExperiment` to setup them
    self.criterion: RunnerCriterion = criterion
    self.optimizer: RunnerOptimizer = optimizer
    self.scheduler: RunnerScheduler = scheduler
    # and callbacks
    self.callbacks: Dict[str, "Callback"] = callbacks or {}

    # the data
    self.loader = None
    self.loaders: OrderedDict[str, DataLoader] = loaders
    # and the dataflow - model input, model output
    self.input = None
    self.output = None

    # metrics flow - batch, loader, epoch metrics
    # let's use flatten storage for batch metrics
    # batch_metrics = {'loss': ..., 'accuracy': ..., 'iou': ...}
    self.batch_metrics: Dict = defaultdict(None)
    # just aggregated (aka mean over all batches)
    # batch statistics for loader
    # and global loader metrics, like AUC
    # loader_metrics = {'loss': ..., 'accuracy': ..., 'auc': ...}
    self.loader_metrics: Dict = defaultdict(None)
    # summarized metrics for different loaders
    # and global epoch metrics, like lr, momentum
    # epoch_metrics = {
    #     'train_loss': ..., 'train_auc': ..., 'valid_loss': ...,
    #     'lr': ..., 'momentum': ...,
    # }
    self.epoch_metrics: Dict = defaultdict(None)

    # metrics & validation
    self.main_metric: str = main_metric
    self.minimize_metric: bool = minimize_metric

    # validation
    self.valid_loader: str = valid_loader
    self.valid_metrics: Dict = defaultdict(None)
    self.is_best_valid: bool = False
    self.best_valid_metrics: Dict = defaultdict(None)

    # distributed info (@TODO: move to Engine?)
    self.distributed_rank: int = get_rank()
    # use logical negation here: bitwise ``~`` on a Python bool yields -1/-2,
    # both of which are truthy, so the flag would never evaluate to False
    self.is_distributed_master: bool = not (self.distributed_rank > 0)
    self.is_distributed_worker: bool = self.distributed_rank > 0

    # experiment info
    self.global_sample_step: int = 0
    self.global_batch_step: int = 0
    self.global_epoch: int = 1
    self.verbose: bool = verbose
    self.is_check_run: bool = is_check_run
    self.need_early_stop: bool = False
    self.need_exception_reraise: bool = True

    # stage info
    self.num_epochs: int = num_epochs
    self.stage: str = stage
    self.is_infer_stage: bool = self.stage.startswith(SETTINGS.stage_infer_prefix)

    # epoch info
    self.epoch: int = 1

    # loader info
    self.loader_sample_step: int = 0
    self.loader_batch_step: int = 0
    self.loader_key: str = None
    self.loader_len: int = 0
    self.loader_batch_size = 0
    self.is_train_loader: bool = False
    self.is_valid_loader: bool = False
    self.is_infer_loader: bool = True

    # batch info
    self.batch_size: int = 0

    # logging
    self.expdir: Path = None
    self.logdir: Path = Path(logdir) if logdir is not None else None

    # extra checkpoint data for saving in checkpoint files
    self.checkpoint_data: Dict = checkpoint_data or {}

    # extra
    self.exception: Optional[Exception] = None

    # kwargs
    for key, value in kwargs.items():
        setattr(self, key, value)
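# Flag semantics for the distributed fields above (illustrative check):
# rank == -1 (no DDP) -> master=True,  worker=False
# rank == 0  (master) -> master=True,  worker=False
# rank >= 1  (worker) -> master=False, worker=True
for rank in (-1, 0, 2):
    print(rank, not (rank > 0), rank > 0)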
def reset(self):
    """Reset all statistics."""
    self.statistics = {}
    self._is_ddp = get_rank() > -1
def reset(self) -> None:
    """Reset confusion matrix, filling it with zeros."""
    self.conf.fill(0)
    self._is_ddp = get_rank() > -1