Example #1
    def _get_callbacks(self, stage: str):
        callbacks = self.experiment.get_callbacks(stage)

        # distributed run setting
        rank = utils.get_rank()
        if rank == 0:  # master node
            # remove worker-only callbacks on master node
            for k in list(
                filter(
                    lambda c: callbacks[c].node == CallbackNode.Worker,
                    callbacks
                )
            ):
                del callbacks[k]
        elif rank > 0:  # worker node
            # remove master-only callbacks on worker nodes
            for k in list(
                filter(
                    lambda c: callbacks[c].node == CallbackNode.Master,
                    callbacks
                )
            ):
                del callbacks[k]

        callbacks = utils.process_callbacks(callbacks)

        return callbacks
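
For reference, the same rank-based filtering can be reproduced outside Catalyst. The sketch below uses a simplified ``CallbackNode`` enum and dummy callback objects (the names are illustrative, not Catalyst's API); rank ``-1`` denotes a non-distributed run.

from collections import OrderedDict
from enum import Enum


class CallbackNode(Enum):
    All = 0
    Master = 1
    Worker = 2


class DummyCallback:
    def __init__(self, node):
        self.node = node


def filter_callbacks_by_rank(callbacks, rank):
    # rank 0 is the master node: drop worker-only callbacks;
    # rank > 0 is a worker node: drop master-only callbacks
    if rank == 0:
        drop = CallbackNode.Worker
    elif rank > 0:
        drop = CallbackNode.Master
    else:  # rank == -1: non-distributed run, keep everything
        return callbacks
    return OrderedDict(
        (k, v) for k, v in callbacks.items() if v.node != drop
    )


callbacks = OrderedDict(
    tensorboard=DummyCallback(CallbackNode.Master),
    metrics=DummyCallback(CallbackNode.All),
)
assert list(filter_callbacks_by_rank(callbacks, rank=1)) == ["metrics"]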
Example #2
def validate_loaders(loaders: Dict[str, DataLoader]) -> Dict[str, DataLoader]:
    """
    Check pytorch dataloaders for distributed setup.
    Transfers them to distributed mode if necessary.
    (Experimental feature)

    Args:
        loaders (Dict[str, DataLoader]): dictionary with pytorch dataloaders

    Returns:
        Dict[str, DataLoader]: dictionary
            with pytorch dataloaders (with distributed samplers if necessary)
    """
    rank = get_rank()
    if rank >= 0:
        for key, value in loaders.items():
            if not isinstance(
                value.sampler, (DistributedSampler, DistributedSamplerWrapper)
            ):
                warnings.warn(
                    "With distributed training setup, "
                    "you need ``DistributedSampler`` for your ``DataLoader``."
                    "Transferring to distributed mode. (Experimental feature)"
                )
                loaders[key] = _force_make_distributed_loader(value)
    return loaders
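
``_force_make_distributed_loader`` is referenced but not shown above. A plausible sketch of such a conversion, assuming ``torch.distributed`` is already initialized so that ``DistributedSampler`` can infer the world size and rank:

from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler


def _force_make_distributed_loader(loader: DataLoader) -> DataLoader:
    # rebuild the loader around a DistributedSampler,
    # keeping the original batching parameters
    sampler = DistributedSampler(dataset=loader.dataset)
    return DataLoader(
        dataset=loader.dataset,
        batch_size=loader.batch_size,
        num_workers=loader.num_workers,
        drop_last=loader.drop_last,
        sampler=sampler,  # shuffling is delegated to the sampler
    )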
Example #3
    def _get_logdir(self, config: Dict) -> str:
        timestamp = utils.get_utcnow_time()
        config_hash = utils.get_short_hash(config)
        logdir = f"{timestamp}.{config_hash}"
        distributed_rank = get_rank()
        if distributed_rank > -1:
            logdir = f"{logdir}.rank{distributed_rank:02d}"
        return logdir
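
The resulting name is ``<utc-timestamp>.<config-hash>``, with a ``.rankNN`` suffix appended on distributed runs. A standalone illustration with stdlib stand-ins for ``utils.get_utcnow_time`` and ``utils.get_short_hash`` (whose exact formats are an assumption here):

import hashlib
import json
from datetime import datetime, timezone

config = {"model_params": {"model": "resnet18"}}
timestamp = datetime.now(timezone.utc).strftime("%y%m%d-%H%M%S")
config_hash = hashlib.sha256(
    json.dumps(config, sort_keys=True).encode()
).hexdigest()[:6]
logdir = f"{timestamp}.{config_hash}"
distributed_rank = 0  # pretend we are rank 0 of a distributed run
if distributed_rank > -1:
    logdir = f"{logdir}.rank{distributed_rank:02d}"
print(logdir)  # e.g. "210101-120000.1f9a3c.rank00"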
Example #4
    def get_callbacks(self, stage: str) -> "OrderedDict[str, Callback]":
        """Returns the callbacks for a given stage"""
        callbacks_params = (self.stages_config[stage].get(
            "callbacks_params", {}))

        callbacks = OrderedDict()
        for key, callback_params in callbacks_params.items():
            callback = self._get_callback(**callback_params)
            callbacks[key] = callback

        # ! For compatibility with previous versions.
        default_callbacks = []
        if self._verbose:
            default_callbacks.append(("verbose", VerboseLogger))
        if not stage.startswith("infer"):
            default_callbacks.append(("_criterion", CriterionCallback))
            default_callbacks.append(("_optimizer", OptimizerCallback))
            if self.stages_config[stage].get("scheduler_params", {}):
                default_callbacks.append(("_scheduler", SchedulerCallback))
            default_callbacks.append(("_saver", CheckpointCallback))
            default_callbacks.append(("console", ConsoleLogger))
            default_callbacks.append(("tensorboard", TensorboardLogger))

        default_callbacks.append(("exception", RaiseExceptionCallback))

        for callback_name, callback_fn in default_callbacks:
            is_already_present = False
            for x in callbacks.values():
                if isinstance(x, PhaseWrapperCallback):
                    x = x.callback
                if isinstance(x, callback_fn):
                    is_already_present = True
                    break
            if not is_already_present:
                callbacks[callback_name] = callback_fn()

        # Remove master-only logging callbacks on worker nodes
        if get_rank() > 0:
            to_del = (LoggerCallback, ConfusionMatrixCallback)
            for k in list(
                    filter(lambda c: isinstance(callbacks[c], to_del),
                           callbacks)):
                del callbacks[k]

        return callbacks
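
Note how the dedup loop unwraps ``PhaseWrapperCallback`` before the ``isinstance`` check, so a default callback is not added twice when the user registered a wrapped instance of it. A minimal sketch of that idea with stand-in classes:

class ConsoleLogger:
    pass


class PhaseWrapperCallback:
    def __init__(self, callback):
        self.callback = callback


def contains_callback(callbacks, callback_cls):
    for x in callbacks:
        if isinstance(x, PhaseWrapperCallback):
            x = x.callback  # look through the wrapper
        if isinstance(x, callback_cls):
            return True
    return False


wrapped = PhaseWrapperCallback(ConsoleLogger())
assert contains_callback([wrapped], ConsoleLogger)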
Example #5
def main_worker(args, unknown_args):
    args, config = utils.parse_args_uargs(args, unknown_args)
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    config.setdefault("distributed_params", {})["apex"] = args.apex

    Experiment, Runner = utils.import_experiment_and_runner(Path(args.expdir))

    runner_params = config.get("runner_params", {})
    experiment = Experiment(config)
    runner = Runner(**runner_params)

    if experiment.logdir is not None and get_rank() <= 0:
        utils.dump_environment(config, experiment.logdir, args.configs)
        utils.dump_code(args.expdir, experiment.logdir)

    runner.run_experiment(experiment)
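
The ``setdefault`` one-liner works whether or not ``distributed_params`` is already present: it inserts an empty dict if the key is missing and returns the inner dict either way. A quick check:

config = {}
config.setdefault("distributed_params", {})["apex"] = True
assert config == {"distributed_params": {"apex": True}}

config = {"distributed_params": {"rank": 0}}
config.setdefault("distributed_params", {})["apex"] = False
assert config == {"distributed_params": {"rank": 0, "apex": False}}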
Example #6
    def _get_callbacks(self, stage: str):
        callbacks = self.experiment.get_callbacks(stage)

        # Remove master-only callbacks on worker nodes
        if utils.get_rank() > 0:
            for k in list(
                    filter(
                        lambda c: issubclass(callbacks[c].__class__,
                                             MasterOnlyCallback), callbacks)):
                del callbacks[k]

        loggers = utils.process_callbacks(
            OrderedDict([(k, v) for k, v in callbacks.items()
                         if issubclass(v.__class__, LoggerCallback)]))
        callbacks = utils.process_callbacks(
            OrderedDict([(k, v) for k, v in callbacks.items()
                         if not issubclass(v.__class__, LoggerCallback)]))

        return callbacks, loggers
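
``issubclass(v.__class__, LoggerCallback)`` is equivalent to ``isinstance(v, LoggerCallback)`` for ordinary classes; the code above partitions one ``OrderedDict`` into loggers and everything else. The same pattern with stand-in classes:

from collections import OrderedDict


class Callback:
    pass


class LoggerCallback(Callback):
    pass


callbacks = OrderedDict(tensorboard=LoggerCallback(), optimizer=Callback())
loggers = OrderedDict(
    (k, v) for k, v in callbacks.items() if isinstance(v, LoggerCallback)
)
rest = OrderedDict(
    (k, v) for k, v in callbacks.items() if not isinstance(v, LoggerCallback)
)
assert list(loggers) == ["tensorboard"] and list(rest) == ["optimizer"]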
Example #7
    def on_epoch_end(self, state: _State):
        if state.stage.startswith("infer") or get_rank() > 0:
            return

        valid_metrics = dict(state.metric_manager.valid_values)
        epoch_metrics = dict(state.metric_manager.epoch_values)

        checkpoint = utils.pack_checkpoint(
            model=state.model,
            criterion=state.criterion,
            optimizer=state.optimizer,
            scheduler=state.scheduler,
            epoch_metrics=epoch_metrics,
            valid_metrics=valid_metrics,
            stage=state.stage,
            stage_epoch=state.stage_epoch_log,
            epoch=state.epoch_log,
            checkpoint_data=state.checkpoint_data)
        self.process_checkpoint(logdir=state.logdir,
                                checkpoint=checkpoint,
                                is_best=state.metric_manager.is_best,
                                main_metric=state.main_metric,
                                minimize_metric=state.minimize_metric)
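
The early return means checkpoints are written by exactly one process: rank ``-1`` (non-distributed) or rank ``0`` (master), and never during inference stages. The guard condition, restated as a toy predicate:

def should_process_checkpoint(stage, rank):
    return not stage.startswith("infer") and rank <= 0


assert should_process_checkpoint("train", rank=-1)      # single-process run
assert should_process_checkpoint("train", rank=0)       # distributed master
assert not should_process_checkpoint("train", rank=3)   # worker: skip
assert not should_process_checkpoint("infer", rank=0)   # inference: skip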
Example #8
    def __init__(
        self,
        save_n_best: int = 1,
        resume: str = None,
        resume_dir: str = None,
        metric_filename: str = "_metrics.json"
    ):
        """
        Args:
            save_n_best (int): number of best checkpoints to keep
            resume (str): path to checkpoint to load
                and initialize runner state
            resume_dir (str): directory with the checkpoint to load
            metric_filename (str): filename to save metrics
                in checkpoint folder. Must end with ``.json`` or ``.yml``
        """
        super().__init__(metric_filename)
        self.save_n_best = save_n_best
        self.resume = resume
        self.resume_dir = resume_dir
        self.is_distributed_worker = utils.get_rank() > 0
        self.top_best_metrics = []
        self.epochs_metrics = []
        self._keys_from_state = ["resume", "resume_dir"]
Example #9
def main_worker(args, unknown_args):
    """@TODO: Docs. Contribution is welcome."""
    args, config = utils.parse_args_uargs(args, unknown_args)
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    config.setdefault("distributed_params", {})["apex"] = args.apex

    experiment_fn, runner_fn = utils.import_experiment_and_runner(
        Path(args.expdir))
    if experiment_fn is None:
        experiment_params = config.get("experiment_params", {})
        experiment = experiment_params.get("experiment", "Experiment")
        experiment_fn = EXPERIMENTS.get(experiment)

    runner_params = config.get("runner_params", {})
    experiment = experiment_fn(config)
    runner = runner_fn(**runner_params)

    if experiment.logdir is not None and get_rank() <= 0:
        utils.dump_environment(config, experiment.logdir, args.configs)
        utils.dump_code(args.expdir, experiment.logdir)

    runner.run_experiment(experiment)
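
Compared to Example #5, this variant falls back to a name-based registry when nothing is importable from ``expdir``. The same lookup pattern with a plain dict standing in for the ``EXPERIMENTS`` registry (the classes here are illustrative):

class Experiment:
    def __init__(self, config):
        self.config = config


EXPERIMENTS = {"Experiment": Experiment}  # name -> class registry

experiment_fn = None  # pretend import_experiment_and_runner found nothing
if experiment_fn is None:
    experiment_params = {"experiment": "Experiment"}
    name = experiment_params.get("experiment", "Experiment")
    experiment_fn = EXPERIMENTS.get(name)

experiment = experiment_fn({"stages": {}})
assert isinstance(experiment, Experiment)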
Example #10
    def __init__(
        self,
        *,
        device: Device = None,
        model: StateModel = None,
        criterion: StateCriterion = None,
        optimizer: StateOptimizer = None,
        scheduler: StateScheduler = None,
        callbacks: Dict[str, "Callback"] = None,
        logdir: str = None,
        stage: str = STAGE_INFER_PREFIX,
        num_epochs: int = None,
        main_metric: str = STATE_MAIN_METRIC,
        minimize_metric: bool = True,
        valid_loader: str = LOADER_VALID_PREFIX,
        checkpoint_data: Dict = None,
        is_check_run: bool = False,
        **kwargs,
    ):
        # main part
        # data
        self.loaders: OrderedDict[str, DataLoader] = None
        # components
        self.model: StateModel = model
        self.criterion: StateCriterion = criterion
        self.optimizer: StateOptimizer = optimizer
        self.scheduler: StateScheduler = scheduler
        # extra components - PyTorch device
        self.device: Device = device
        # extra components - Catalyst callbacks
        self.callbacks: Dict[str, "Callback"] = callbacks

        # dataflow - model input, model output, metrics
        self.batch_in = None
        self.batch_out = None
        # let's use flatten storage for batch metrics
        # batch_metrics = {'loss': ..., 'accuracy': ..., 'iou': ...}
        self.batch_metrics = defaultdict(None)
        # just aggregated (aka mean over all batches)
        # batch statistics for loader
        # and global loader metrics, like AUC
        # loader_metrics = {'loss': ..., 'accuracy': ..., `auc`: ...}
        self.loader_metrics = defaultdict(None)
        # summarized metrics for different loaders
        # and global epoch metrics, like lr, momentum
        # epoch_metrics = {
        # 'train_loss': ..., 'train_auc': ..., 'valid_loss': ...,
        # 'lr': ..., 'momentum': ...,
        # }
        self.epoch_metrics = defaultdict(None)

        # validation
        self.is_best_valid = False
        self.valid_metrics = defaultdict(None)
        self.best_valid_metrics = defaultdict(None)

        # pipeline info
        self.distributed_rank = utils.get_rank()
        self.is_distributed_worker = self.distributed_rank > 0

        self.stage_name: str = stage
        self.epoch: int = 1
        self.num_epochs: int = num_epochs or np.iinfo(np.int32).max

        self.loader_name: str = None
        self.loader_step: int = 0
        self.loader_len: int = 0

        self.batch_size: int = 0

        self.global_step: int = 0
        self.global_epoch: int = 1

        # metrics & validation
        self.main_metric: str = main_metric
        self.minimize_metric: bool = minimize_metric
        self.valid_loader: str = valid_loader

        # logging
        self.logdir: Path = Path(logdir) if logdir is not None else None
        # extra checkpoint data for saving in checkpoint files
        self.checkpoint_data: Dict = checkpoint_data or {}

        # other
        self.is_check_run: bool = is_check_run
        self.is_train_loader: bool = False
        self.is_infer_stage: bool = \
            self.stage_name.startswith(STAGE_INFER_PREFIX)
        self.need_early_stop: bool = False
        self.need_exception_reraise: bool = True
        self.exception: Optional[Exception] = None

        # kwargs
        for k, v in kwargs.items():
            setattr(self, k, v)

        self._freeze()
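
A side note on the metric storages: ``defaultdict(None)`` sets ``default_factory`` to ``None``, so it behaves exactly like a plain ``dict`` (missing keys still raise ``KeyError``); presumably it is used for interface uniformity rather than for default values. A quick check:

from collections import defaultdict

metrics = defaultdict(None)
metrics["loss"] = 0.5
assert dict(metrics) == {"loss": 0.5}
try:
    metrics["accuracy"]  # no default_factory -> KeyError, like a plain dict
except KeyError:
    pass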
Example #11
def get_loaders_from_params(
    batch_size: int = 1,
    num_workers: int = 0,
    drop_last: bool = False,
    per_gpu_scaling: bool = False,
    loaders_params: Dict[str, Any] = None,
    samplers_params: Dict[str, Any] = None,
    initial_seed: int = 42,
    get_datasets_fn: Callable = None,
    **data_params,
) -> "OrderedDict[str, DataLoader]":
    """
    Creates pytorch dataloaders from datasets and additional parameters.

    Args:
        batch_size (int): ``batch_size`` parameter
            from ``torch.utils.data.DataLoader``
        num_workers (int): ``num_workers`` parameter
            from ``torch.utils.data.DataLoader``
        drop_last (bool): ``drop_last`` parameter
            from ``torch.utils.data.DataLoader``
        per_gpu_scaling (bool): boolean flag,
            if ``True``, uses ``batch_size=batch_size*num_available_gpus``
        loaders_params (Dict[str, Any]): additional loaders parameters
        samplers_params (Dict[str, Any]): additional sampler parameters
        initial_seed (int): initial seed for ``torch.utils.data.DataLoader``
            workers
        get_datasets_fn(Callable): callable function to get dictionary with
            ``torch.utils.data.Datasets``
        **data_params: additional data parameters
            or dictionary with ``torch.utils.data.Datasets`` to use for
            pytorch dataloaders creation

    Returns:
        OrderedDict[str, DataLoader]: dictionary with
            ``torch.utils.data.DataLoader``

    Raises:
        NotImplementedError: if datasource is out of `Dataset` or dict
        ValueError: if batch_sampler option is mutually
            exclusive with distributed
    """
    default_batch_size = batch_size
    default_num_workers = num_workers
    loaders_params = loaders_params or {}
    assert isinstance(loaders_params,
                      dict), (f"`loaders_params` should be a Dict. "
                              f"Got: {loaders_params}")
    samplers_params = samplers_params or {}
    assert isinstance(
        samplers_params,
        dict), f"`samplers_params` should be a Dict. Got: {samplers_params}"

    distributed_rank = get_rank()
    distributed = distributed_rank > -1

    if get_datasets_fn is not None:
        datasets = get_datasets_fn(**data_params)
    else:
        datasets = dict(**data_params)

    loaders = OrderedDict()
    for name, datasource in datasets.items():  # noqa: WPS426
        assert isinstance(
            datasource,
            (Dataset, dict
             )), f"{datasource} should be Dataset or Dict. Got: {datasource}"

        loader_params = loaders_params.pop(name, {})
        assert isinstance(loader_params,
                          dict), f"{loader_params} should be Dict"

        sampler_params = samplers_params.pop(name, None)
        if sampler_params is None:
            if isinstance(datasource, dict) and "sampler" in datasource:
                sampler = datasource.pop("sampler", None)
            else:
                sampler = None
        else:
            sampler = SAMPLER.get_from_params(**sampler_params)
            if isinstance(datasource, dict) and "sampler" in datasource:
                datasource.pop("sampler", None)

        batch_size = loader_params.pop("batch_size", default_batch_size)
        num_workers = loader_params.pop("num_workers", default_num_workers)

        if per_gpu_scaling and not distributed:
            num_gpus = max(1, torch.cuda.device_count())
            batch_size *= num_gpus
            num_workers *= num_gpus

        loader_params = {
            "batch_size": batch_size,
            "num_workers": num_workers,
            "pin_memory": torch.cuda.is_available(),
            "drop_last": drop_last,
            **loader_params,
        }

        if isinstance(datasource, Dataset):
            loader_params["dataset"] = datasource
        elif isinstance(datasource, dict):
            assert (
                "dataset"
                in datasource), "You need to specify dataset for dataloader"
            loader_params = merge_dicts(datasource, loader_params)
        else:
            raise NotImplementedError

        if distributed:
            if sampler is not None:
                if not isinstance(sampler, DistributedSampler):
                    sampler = DistributedSamplerWrapper(sampler=sampler)
            else:
                sampler = DistributedSampler(dataset=loader_params["dataset"])

        loader_params["shuffle"] = name.startswith("train") and sampler is None

        loader_params["sampler"] = sampler

        if "batch_sampler" in loader_params:
            if distributed:
                raise ValueError("batch_sampler option is mutually "
                                 "exclusive with distributed")

            for k in ("batch_size", "shuffle", "sampler", "drop_last"):
                loader_params.pop(k, None)

        if "worker_init_fn" not in loader_params:
            loader_params["worker_init_fn"] = lambda x: set_global_seed(
                initial_seed + x)

        loaders[name] = DataLoader(**loader_params)

    return loaders
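
A minimal usage sketch for a non-distributed run, assuming a toy in-memory dataset (each extra keyword argument in ``**data_params`` becomes one loader):

import torch
from torch.utils.data import TensorDataset

train_ds = TensorDataset(torch.randn(64, 3), torch.randint(0, 2, (64,)))
valid_ds = TensorDataset(torch.randn(16, 3), torch.randint(0, 2, (16,)))

loaders = get_loaders_from_params(
    batch_size=8,
    num_workers=0,
    train=train_ds,  # "train*" loaders are shuffled when no sampler is set
    valid=valid_ds,
)
assert list(loaders) == ["train", "valid"]
assert loaders["train"].batch_size == 8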
Example #12
    def get_loaders(
        self,
        stage: str,
        epoch: int = None,
    ) -> "OrderedDict[str, DataLoader]":
        """Returns the loaders for a given stage"""
        data_params = dict(self.stages_config[stage]["data_params"])

        default_batch_size = data_params.pop("batch_size", 1)
        default_num_workers = data_params.pop("num_workers")
        drop_last = data_params.pop("drop_last", False)
        per_gpu_scaling = data_params.pop("per_gpu_scaling", False)
        distributed_rank = get_rank()
        distributed = distributed_rank > -1

        datasets = self.get_datasets(stage=stage, **data_params)

        overridden_loaders_params = data_params.pop("loaders_params", {})
        assert isinstance(
            overridden_loaders_params,
            dict), (f"`overridden_loaders_params` should be a Dict. "
                    f"Got: {overridden_loaders_params}")

        samplers_params = data_params.pop("samplers_params", {})
        assert isinstance(samplers_params, dict), \
            f"`samplers_params` should be a Dict. Got: {samplers_params}"

        loaders = OrderedDict()
        for name, ds_ in datasets.items():
            assert isinstance(ds_, (Dataset, dict)), \
                f"{ds_} should be Dataset or Dict"

            overridden_loader_params = overridden_loaders_params.pop(name, {})
            assert isinstance(overridden_loader_params, dict), \
                f"{overridden_loader_params} should be Dict"

            sampler_params = samplers_params.pop(name, None)
            if sampler_params is None:
                if isinstance(ds_, dict) and "sampler" in ds_:
                    sampler = ds_.pop("sampler", None)
                else:
                    sampler = None
            else:
                sampler = SAMPLERS.get_from_params(**sampler_params)
                if isinstance(ds_, dict) and "sampler" in ds_:
                    ds_.pop("sampler", None)

            batch_size = overridden_loader_params.pop(
                "batch_size", default_batch_size)
            num_workers = overridden_loader_params.pop(
                "num_workers", default_num_workers)

            if per_gpu_scaling and not distributed:
                num_gpus = max(1, torch.cuda.device_count())
                batch_size *= num_gpus
                num_workers *= num_gpus

            loader_params = {
                "batch_size": batch_size,
                "num_workers": num_workers,
                "pin_memory": torch.cuda.is_available(),
                "drop_last": drop_last,
                **overridden_loader_params
            }

            if isinstance(ds_, Dataset):
                loader_params["dataset"] = ds_
            elif isinstance(ds_, dict):
                assert "dataset" in ds_, \
                    "You need to specify dataset for dataloader"
                loader_params = utils.merge_dicts(ds_, loader_params)
            else:
                raise NotImplementedError

            if distributed:
                if sampler is not None:
                    if not isinstance(sampler, DistributedSampler):
                        # wrap the custom sampler so each process
                        # sees only its own shard of the data
                        sampler = DistributedSamplerWrapper(sampler=sampler)
                else:
                    sampler = DistributedSampler(
                        dataset=loader_params["dataset"])

            loader_params["shuffle"] = (name.startswith("train")
                                        and sampler is None)

            loader_params["sampler"] = sampler

            if "batch_sampler" in loader_params:
                if distributed:
                    raise ValueError("batch_sampler option is mutually "
                                     "exclusive with distributed")

                for k in ("batch_size", "shuffle", "sampler", "drop_last"):
                    loader_params.pop(k, None)

            if "worker_init_fn" not in loader_params:
                loader_params["worker_init_fn"] = \
                    lambda x: utils.set_global_seed(self.initial_seed + x)

            loaders[name] = DataLoader(**loader_params)

        return loaders
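
The reason ``shuffle`` is only enabled when ``sampler is None``: PyTorch's ``DataLoader`` rejects an explicit sampler combined with ``shuffle=True``, so shuffling responsibility moves to the sampler. A quick demonstration:

import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

dataset = TensorDataset(torch.arange(10))
try:
    DataLoader(dataset, sampler=SequentialSampler(dataset), shuffle=True)
except ValueError:
    pass  # "sampler option is mutually exclusive with shuffle"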
Example #13
    def _get_optimizer(self, stage: str, model: Union[Model, Dict[str, Model]],
                       **params) -> Optimizer:
        # @TODO 1: refactoring; this method is too long
        # @TODO 2: load state dicts for schedulers & criterion
        layerwise_params = \
            params.pop("layerwise_params", OrderedDict())
        no_bias_weight_decay = \
            params.pop("no_bias_weight_decay", True)

        # linear scaling rule from https://arxiv.org/pdf/1706.02677.pdf
        lr_scaling_params = params.pop("lr_linear_scaling", None)
        if lr_scaling_params:
            data_params = dict(self.stages_config[stage]["data_params"])
            batch_size = data_params.get("batch_size")
            per_gpu_scaling = data_params.get("per_gpu_scaling", False)
            distributed_rank = get_rank()
            distributed = distributed_rank > -1
            if per_gpu_scaling and not distributed:
                num_gpus = max(1, torch.cuda.device_count())
                batch_size *= num_gpus

            base_lr = lr_scaling_params.get("lr")
            base_batch_size = lr_scaling_params.get("base_batch_size", 256)
            lr_scaling = batch_size / base_batch_size
            params["lr"] = base_lr * lr_scaling  # scale default lr
        else:
            lr_scaling = 1.0

        # getting model parameters
        model_key = params.pop("_model", None)
        if model_key is None:
            assert isinstance(model, nn.Module), \
                "model is a key-value dict, " \
                "but no model key is specified for the optimizer"
            model_params = utils.process_model_params(model, layerwise_params,
                                                      no_bias_weight_decay,
                                                      lr_scaling)
        elif isinstance(model_key, str):
            model_params = utils.process_model_params(model[model_key],
                                                      layerwise_params,
                                                      no_bias_weight_decay,
                                                      lr_scaling)
        elif isinstance(model_key, (list, tuple)):
            model_params = []
            for model_key_ in model_key:
                model_params_ = utils.process_model_params(
                    model[model_key_], layerwise_params, no_bias_weight_decay,
                    lr_scaling)
                model_params.extend(model_params_)
        else:
            raise ValueError("unknown type of model key")

        load_from_previous_stage = \
            params.pop("load_from_previous_stage", False)
        optimizer_key = params.pop("optimizer_key", None)
        optimizer = OPTIMIZERS.get_from_params(**params, params=model_params)

        if load_from_previous_stage and self.stages.index(stage) != 0:
            checkpoint_path = f"{self.logdir}/checkpoints/best_full.pth"
            checkpoint = utils.load_checkpoint(checkpoint_path)

            dict2load = optimizer
            if optimizer_key is not None:
                dict2load = {optimizer_key: optimizer}
            utils.unpack_checkpoint(checkpoint, optimizer=dict2load)

            # move optimizer to device
            device = utils.get_device()
            for param in model_params:
                param = param["params"][0]
                state = optimizer.state[param]
                for key, value in state.items():
                    state[key] = utils.any2device(value, device)

            # update optimizer params
            for key, value in params.items():
                for pg in optimizer.param_groups:
                    pg[key] = value

        return optimizer
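
The linear scaling rule in numbers: with a base lr of 0.1 calibrated at batch size 256, training at an effective batch size of 1024 (e.g. a per-GPU batch of 512 on 2 GPUs with ``per_gpu_scaling``) multiplies the lr by 4:

base_lr = 0.1
base_batch_size = 256
batch_size = 512 * 2  # per-GPU batch times number of GPUs
lr_scaling = batch_size / base_batch_size
lr = base_lr * lr_scaling
assert abs(lr - 0.4) < 1e-9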