Example #1
def filter_callbacks_by_node(
        callbacks: Union[Dict, OrderedDict]) -> Union[Dict, OrderedDict]:
    """
    Filters callbacks based on running node.
    Deletes worker-only callbacks when running on the master node
    and master-only callbacks when running on worker nodes.

    Args:
        callbacks (Union[Dict, OrderedDict]): callbacks

    Returns:
        Union[Dict, OrderedDict]: filtered callbacks dictionary.
    """
    # distributed run setting
    output = callbacks.copy()
    rank = get_rank()
    if rank == 0:  # master node
        # remove worker-only callbacks on master node
        for k in list(
                filter(lambda c: output[c].node == CallbackNode.worker,
                       output)):
            del output[k]
    elif rank > 0:  # worker node
        # remove master-only callbacks on worker nodes
        for k in list(
                filter(lambda c: output[c].node == CallbackNode.master,
                       output)):
            del output[k]
    return output
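A minimal sketch of what this filtering does, using a toy stand-in for ``CallbackNode`` and the callbacks dictionary (the enum values and callback names below are assumptions for illustration, not part of the source):

from collections import OrderedDict
from enum import IntEnum


class Node(IntEnum):
    # stand-in for catalyst's CallbackNode; the numeric values are assumed
    all = 0
    master = 1
    worker = 2


class Cb:
    def __init__(self, node):
        self.node = node


callbacks = OrderedDict(logger=Cb(Node.master), optimizer=Cb(Node.all), saver=Cb(Node.worker))

rank = 0  # pretend we are the master node
filtered = OrderedDict(
    (k, v) for k, v in callbacks.items()
    if not (rank == 0 and v.node == Node.worker)  # drop worker-only on master
    and not (rank > 0 and v.node == Node.master)  # drop master-only on workers
)
print(list(filtered))  # ['logger', 'optimizer']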
Example #2
def do_lr_linear_scaling(lr_scaling_params, batch_size: int,
                         per_gpu_scaling: bool) -> Tuple[float, float]:
    """
    Linear scaling rule from https://arxiv.org/pdf/1706.02677.pdf

    Args:
        lr_scaling_params: config parameters of lr linear scaling
        batch_size: batch size
        per_gpu_scaling: per-gpu-scaling flag

    Returns:
        lr, lr_scaling

    """
    distributed_rank = get_rank()
    distributed = distributed_rank > -1
    if per_gpu_scaling and not distributed:
        num_gpus = max(1, torch.cuda.device_count())
        batch_size *= num_gpus

    base_lr = lr_scaling_params.get("lr")
    base_batch_size = lr_scaling_params.get("base_batch_size", 256)
    lr_scaling = batch_size / base_batch_size
    lr = base_lr * lr_scaling  # scale default lr
    return lr, lr_scaling
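A quick arithmetic check of the scaling rule (the numbers are illustrative only):

base_lr = 0.1
base_batch_size = 256
batch_size = 1024  # e.g. 256 per GPU on 4 GPUs with per_gpu_scaling=True

lr_scaling = batch_size / base_batch_size  # 4.0
lr = base_lr * lr_scaling                  # 0.4
print(lr, lr_scaling)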
Example #3
def validate_loaders(loaders: Dict[str, DataLoader]) -> Dict[str, DataLoader]:
    """
    Checks pytorch dataloaders for the distributed setup.
    Transfers them to distributed mode if necessary.
    (Experimental feature)

    Args:
        loaders (Dict[str, DataLoader]): dictionary with pytorch dataloaders

    Returns:
        Dict[str, DataLoader]: dictionary
            with pytorch dataloaders (with distributed samplers if necessary)
    """
    from catalyst.data.sampler import DistributedSamplerWrapper

    rank = get_rank()
    if rank >= 0:
        for key, value in loaders.items():
            if not isinstance(value.sampler,
                              (DistributedSampler, DistributedSamplerWrapper)):
                warnings.warn(
                    "With distributed training setup, "
                    "you need ``DistributedSampler`` for your ``DataLoader``.")
                # loaders[key] = _force_make_distributed_loader(value)
    return loaders
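A small self-contained illustration of the check performed above; no distributed process group is started, and the toy dataset exists only for the example:

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

loader = DataLoader(TensorDataset(torch.arange(10)), batch_size=2)

# Outside a distributed run the default sampler is not a DistributedSampler,
# which is exactly the situation the warning above complains about.
print(isinstance(loader.sampler, DistributedSampler))  # False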
Example #4
    def reset(self, num_batches: int, num_samples: int) -> None:
        """
        Reset metrics fields

        Args:
            num_batches: expected number of batches
            num_samples: expected number of samples to accumulate
        """
        super().reset(num_batches, num_samples)
        assert get_rank() < 0, "No DDP support implemented yet"
Example #5
    def __init__(self,
                 compute_on_call: bool = True,
                 prefix: str = None,
                 suffix: str = None):
        """Init."""
        super().__init__(compute_on_call=compute_on_call,
                         prefix=prefix,
                         suffix=suffix)
        self.metric_name = f"{self.prefix}auc{self.suffix}"
        self.scores = []
        self.targets = []
        self._is_ddp = get_rank() > -1
Example #6
def config_main(args, unknown_args):
    """Yaml config catalyst-dl run entry point."""
    args, config = parse_args_uargs(args, unknown_args)
    set_global_seed(args.seed)
    prepare_cudnn(args.deterministic, args.benchmark)

    runner: ConfigRunner = get_config_runner(expdir=args.expdir, config=config)

    if get_rank() <= 0:
        dump_environment(logdir=runner.logdir, config=config, configs_path=args.configs)
        dump_code(expdir=args.expdir, logdir=runner.logdir)

    runner.run()
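The ``get_rank() <= 0`` guard that recurs in these entry points simply means "single-process run or master process"; a one-liner to see which ranks pass (ranks chosen for illustration):

for rank in (-1, 0, 1, 2):
    print(rank, rank <= 0)  # True only for -1 (no DDP) and 0 (master)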
Example #7
    def objective(trial: optuna.trial):
        trial, trial_config = _process_trial_config(trial, config.copy())
        runner: ConfigRunner = get_config_runner(expdir=Path(args.expdir), config=trial_config)
        # @TODO: here we need better solution.
        runner._trial = trial  # noqa: WPS437

        if get_rank() <= 0:
            dump_environment(logdir=runner.logdir, config=config, configs_path=args.configs)
            dump_code(expdir=args.expdir, logdir=runner.logdir)

        runner.run()

        return trial.best_score
Example #8
    def objective(trial: optuna.trial):
        trial, trial_config = _process_trial_config(trial, config.copy())
        experiment, runner, trial_config = prepare_config_api_components(
            expdir=expdir, config=trial_config)
        # @TODO: here we need better solution.
        experiment._trial = trial  # noqa: WPS437

        if experiment.logdir is not None and get_rank() <= 0:
            dump_environment(trial_config, experiment.logdir, args.configs)
            dump_code(args.expdir, experiment.logdir)

        runner.run_experiment(experiment)

        return runner.best_valid_metrics[runner.main_metric]
Example #9
def main_worker(cfg: DictConfig):
    set_global_seed(cfg.args.seed)
    prepare_cudnn(cfg.args.deterministic, cfg.args.benchmark)

    import_module(hydra.utils.to_absolute_path(cfg.args.expdir))

    experiment = hydra.utils.instantiate(cfg.experiment, cfg=cfg)
    runner = hydra.utils.instantiate(cfg.runner)

    if experiment.logdir is not None and get_rank() <= 0:
        dump_environment(cfg, experiment.logdir)
        dump_code(
            hydra.utils.to_absolute_path(cfg.args.expdir), experiment.logdir
        )

    runner.run_experiment(experiment)
Example #10
def main_worker(args, unknown_args):
    """Runs main worker thread from model training."""
    args, config = parse_args_uargs(args, unknown_args)
    set_global_seed(args.seed)
    prepare_cudnn(args.deterministic, args.benchmark)

    config.setdefault("distributed_params", {})["apex"] = args.apex
    config.setdefault("distributed_params", {})["amp"] = args.amp

    experiment, runner, config = prepare_config_api_components(
        expdir=Path(args.expdir), config=config)

    if experiment.logdir is not None and get_rank() <= 0:
        dump_environment(config, experiment.logdir, args.configs)
        dump_code(args.expdir, experiment.logdir)

    runner.run_experiment(experiment)
Example #11
def main(cfg: DictConfig):
    """
    Hydra config catalyst-dl run entry point

    Args:
        cfg: (DictConfig) configuration

    """
    cfg = prepare_hydra_config(cfg)
    set_global_seed(cfg.args.seed)
    prepare_cudnn(cfg.args.deterministic, cfg.args.benchmark)

    import_module(hydra.utils.to_absolute_path(cfg.args.expdir))
    runner = hydra.utils.instantiate(cfg.runner, cfg=cfg)

    if get_rank() <= 0:
        dump_environment(logdir=runner.logdir, config=cfg)
        dump_code(expdir=hydra.utils.to_absolute_path(cfg.args.expdir),
                  logdir=runner.logdir)

    runner.run()
Example #12
    def _get_loader(
        dataset: Dataset,
        sampler: Sampler,
        initial_seed: int,
        params: DictConfig,
    ) -> DataLoader:
        params = OmegaConf.to_container(params, resolve=True)
        per_gpu_scaling = params.pop("per_gpu_scaling", False)
        params["dataset"] = dataset
        distributed_rank = get_rank()
        distributed = distributed_rank > -1
        if per_gpu_scaling and not distributed:
            num_gpus = max(1, torch.cuda.device_count())
            assert ("batch_size"
                    in params), "loader config must contain 'batch_size' key"
            assert ("num_workers"
                    in params), "loader config must contain 'num_workers' key"
            params["batch_size"] *= num_gpus
            params["num_workers"] *= num_gpus
        if distributed:
            if sampler is not None:
                if not isinstance(sampler, DistributedSampler):
                    sampler = DistributedSamplerWrapper(sampler=sampler)
            else:
                sampler = DistributedSampler(dataset=params["dataset"])
        params["shuffle"] = params.get("shuffle", False) and sampler is None
        params["sampler"] = sampler
        worker_init_fn = params.pop("worker_init_fn", None)
        if worker_init_fn is None:
            params["worker_init_fn"] = lambda x: set_global_seed(
                initial_seed + x)
        else:
            params["worker_init_fn"] = hydra.utils.get_method(worker_init_fn)
        collate_fn = params.pop("collate_fn", None)
        if collate_fn is None:
            params["collate_fn"] = None
        else:
            params["collate_fn"] = hydra.utils.get_method(collate_fn)
        loader: DataLoader = DataLoader(**params)
        return loader
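A usage sketch for the loader factory above, assuming ``_get_loader`` and the catalyst helpers it calls (``get_rank``, ``set_global_seed``) are in scope and the script runs outside a distributed setup; the dataset and parameters are invented for the example:

import torch
from omegaconf import OmegaConf
from torch.utils.data import TensorDataset

params = OmegaConf.create({"batch_size": 4, "num_workers": 0})
dataset = TensorDataset(torch.randn(12, 3))

loader = _get_loader(dataset=dataset, sampler=None, initial_seed=42, params=params)
print(len(loader))  # 3 batches of 4 samples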
Example #13
    def on_loader_end(self, runner: IRunner):
        eps = 1e-7
        ious_per_image = []

        # Gather statistics from all nodes
        all_gathered_scores_per_image = all_gather(self.scores_per_image)

        n = len(self.thresholds)
        all_scores_per_image = defaultdict(lambda: {
            "intersection": np.zeros(n),
            "union": np.zeros(n)
        })
        for scores_per_image in all_gathered_scores_per_image:
            for image_id, values in scores_per_image.items():
                all_scores_per_image[image_id]["intersection"] += values[
                    "intersection"]
                all_scores_per_image[image_id]["union"] += values["union"]

        for image_id, values in all_scores_per_image.items():
            intersection = values["intersection"]
            union = values["union"]
            metric = intersection / (union + eps)
            ious_per_image.append(metric)

        thresholds = to_numpy(self.thresholds)
        iou = np.mean(ious_per_image, axis=0)
        assert len(iou) == len(thresholds)

        threshold_index = np.argmax(iou)
        iou_at_threshold = iou[threshold_index]
        threshold_value = thresholds[threshold_index]

        runner.loader_metrics[f"{self.prefix}/threshold"] = float(threshold_value)
        runner.loader_metrics[self.prefix] = float(iou_at_threshold)

        if get_rank() in {-1, 0}:
            logger = get_tensorboard_logger(runner)
            logger.add_histogram(self.prefix, iou, global_step=runner.epoch)
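The threshold selection at the end is just an argmax over the per-threshold mean IoU; a toy numpy example with invented values:

import numpy as np

thresholds = np.array([0.3, 0.4, 0.5, 0.6])
iou = np.array([0.61, 0.65, 0.64, 0.58])  # mean IoU for each threshold

best = np.argmax(iou)
print(thresholds[best], iou[best])  # 0.4 0.65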
Example #14
def process_components(
    model: Model,
    criterion: Criterion = None,
    optimizer: Optimizer = None,
    scheduler: Scheduler = None,
    distributed_params: Dict = None,
    device: Device = None,
) -> Tuple[Model, Criterion, Optimizer, Scheduler, Device]:
    """
    Returns the processed model, criterion, optimizer, scheduler and device.

    Args:
        model (Model): torch model
        criterion (Criterion): criterion function
        optimizer (Optimizer): optimizer
        scheduler (Scheduler): scheduler
        distributed_params (dict, optional): dict with the parameters
            for distributed and FP16 method
        device (Device, optional): device

    Returns:
        tuple with processed model, criterion, optimizer, scheduler and device.

    Raises:
        NotImplementedError: if the model is neither an ``nn.Module`` nor a dict
            for multi-GPU training; ``nn.ModuleDict`` is not supported
            by ``DataParallel`` yet
    """
    distributed_params = distributed_params or {}
    distributed_params = copy.deepcopy(distributed_params)
    distributed_params.update(get_distributed_params())

    if device is None:
        device = get_device()
    elif isinstance(device, str):
        device = torch.device(device)

    is_apex_available = (distributed_params.pop("apex", True)
                         and check_apex_available())

    model: Model = maybe_recursive_call(model, "to", device=device)

    if check_ddp_wrapped(model):
        pass
    # distributed data parallel run (ddp) (with apex support)
    elif get_rank() >= 0:
        assert isinstance(
            model,
            nn.Module), "Distributed training is not available for KV model"

        local_rank = distributed_params.pop("local_rank", 0) or 0
        device = f"cuda:{local_rank}"
        model = maybe_recursive_call(model, "to", device=device)

        syncbn = distributed_params.pop("syncbn", False)

        if is_apex_available:
            import apex

            model, optimizer = initialize_apex(model, optimizer,
                                               **distributed_params)
            model = apex.parallel.DistributedDataParallel(model)

            if syncbn:
                model = apex.parallel.convert_syncbn_model(model)
        else:
            model = nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], output_device=local_rank)
    # data parallel run (dp) (with apex support)
    else:
        # apex issue https://github.com/deepset-ai/FARM/issues/210
        use_apex = (is_apex_available and torch.cuda.device_count() == 1) or (
            is_apex_available and torch.cuda.device_count() > 1
            and distributed_params.get("opt_level", "O0") == "O1")

        if use_apex:
            assert isinstance(
                model,
                nn.Module), "Apex training is not available for KV model"

            model, optimizer = initialize_apex(model, optimizer,
                                               **distributed_params)

        if (torch.cuda.device_count() > 1 and device.type != "cpu"
                and device.index is None):
            if isinstance(model, nn.Module):
                model = nn.DataParallel(model)
            elif isinstance(model, dict):
                model = {k: nn.DataParallel(v) for k, v in model.items()}
            else:
                raise NotImplementedError()

    model: Model = maybe_recursive_call(model, "to", device=device)

    return model, criterion, optimizer, scheduler, device
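A minimal single-process usage sketch, assuming ``process_components`` above and the catalyst helpers it relies on are importable; outside a distributed run ``get_rank()`` returns -1, so the data-parallel/CPU branch is taken:

import torch.nn as nn
import torch.optim as optim

model = nn.Linear(10, 2)
optimizer = optim.SGD(model.parameters(), lr=1e-2)

model, criterion, optimizer, scheduler, device = process_components(
    model=model, criterion=nn.CrossEntropyLoss(), optimizer=optimizer)
print(device)  # e.g. cpu on a machine without GPUs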
Example #15
def get_loaders_from_params(
    batch_size: int = 1,
    num_workers: int = 0,
    drop_last: bool = False,
    per_gpu_scaling: bool = False,
    loaders_params: Dict[str, Any] = None,
    samplers_params: Dict[str, Any] = None,
    initial_seed: int = 42,
    get_datasets_fn: Callable = None,
    **data_params,
) -> "OrderedDict[str, DataLoader]":
    """
    Creates pytorch dataloaders from datasets and additional parameters.

    Args:
        batch_size: ``batch_size`` parameter
            from ``torch.utils.data.DataLoader``
        num_workers: ``num_workers`` parameter
            from ``torch.utils.data.DataLoader``
        drop_last: ``drop_last`` parameter
            from ``torch.utils.data.DataLoader``
        per_gpu_scaling: boolean flag,
            if ``True``, uses ``batch_size=batch_size*num_available_gpus``
        loaders_params (Dict[str, Any]): additional loaders parameters
        samplers_params (Dict[str, Any]): additional sampler parameters
        initial_seed: initial seed for ``torch.utils.data.DataLoader``
            workers
        get_datasets_fn(Callable): callable function to get dictionary with
            ``torch.utils.data.Datasets``
        **data_params: additional data parameters
            or dictionary with ``torch.utils.data.Datasets`` to use for
            pytorch dataloaders creation

    Returns:
        OrderedDict[str, DataLoader]: dictionary with
            ``torch.utils.data.DataLoader``

    Raises:
        NotImplementedError: if a datasource is neither a ``Dataset`` nor a dict
        ValueError: if the ``batch_sampler`` option is used together with
            distributed training (the two are mutually exclusive)
    """
    from catalyst.data.sampler import DistributedSamplerWrapper

    default_batch_size = batch_size
    default_num_workers = num_workers
    loaders_params = loaders_params or {}
    assert isinstance(
        loaders_params,
        dict), f"`loaders_params` should be a Dict. " f"Got: {loaders_params}"
    samplers_params = samplers_params or {}
    assert isinstance(
        samplers_params,
        dict), f"`samplers_params` should be a Dict. Got: {samplers_params}"

    distributed_rank = get_rank()
    distributed = distributed_rank > -1

    if get_datasets_fn is not None:
        datasets = get_datasets_fn(**data_params)
    else:
        datasets = dict(**data_params)

    loaders = OrderedDict()
    for name, datasource in datasets.items():  # noqa: WPS426
        assert isinstance(
            datasource,
            (Dataset, dict
             )), f"{datasource} should be Dataset or Dict. Got: {datasource}"

        loader_params = loaders_params.pop(name, {})
        assert isinstance(loader_params,
                          dict), f"{loader_params} should be Dict"

        sampler_params = samplers_params.pop(name, None)
        if sampler_params is None:
            if isinstance(datasource, dict) and "sampler" in datasource:
                sampler = datasource.pop("sampler", None)
            else:
                sampler = None
        else:
            sampler = SAMPLER.get_from_params(**sampler_params)
            if isinstance(datasource, dict) and "sampler" in datasource:
                datasource.pop("sampler", None)

        batch_size = loader_params.pop("batch_size", default_batch_size)
        num_workers = loader_params.pop("num_workers", default_num_workers)

        if per_gpu_scaling and not distributed:
            num_gpus = max(1, torch.cuda.device_count())
            batch_size *= num_gpus
            num_workers *= num_gpus

        loader_params = {
            "batch_size": batch_size,
            "num_workers": num_workers,
            "pin_memory": torch.cuda.is_available(),
            "drop_last": drop_last,
            **loader_params,
        }

        if isinstance(datasource, Dataset):
            loader_params["dataset"] = datasource
        elif isinstance(datasource, dict):
            assert "dataset" in datasource, "You need to specify dataset for dataloader"
            loader_params = merge_dicts(datasource, loader_params)
        else:
            raise NotImplementedError

        if distributed:
            if sampler is not None:
                if not isinstance(sampler, DistributedSampler):
                    sampler = DistributedSamplerWrapper(sampler=sampler)
            else:
                sampler = DistributedSampler(dataset=loader_params["dataset"])

        loader_params["shuffle"] = name.startswith("train") and sampler is None

        loader_params["sampler"] = sampler

        if "batch_sampler" in loader_params:
            if distributed:
                raise ValueError("batch_sampler option is mutually "
                                 "exclusive with distributed")

            for k in ("batch_size", "shuffle", "sampler", "drop_last"):
                loader_params.pop(k, None)

        if "worker_init_fn" not in loader_params:
            loader_params["worker_init_fn"] = lambda x: set_global_seed(
                initial_seed + x)

        loaders[name] = DataLoader(**loader_params)

    return loaders
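A sketch of calling this function with in-memory datasets, assuming it is importable together with its dependencies and the script runs outside a distributed setup; the datasets are toy ``TensorDataset`` objects:

import torch
from torch.utils.data import TensorDataset

train_ds = TensorDataset(torch.randn(64, 3), torch.randint(0, 2, (64,)))
valid_ds = TensorDataset(torch.randn(16, 3), torch.randint(0, 2, (16,)))

loaders = get_loaders_from_params(
    batch_size=8,
    num_workers=0,
    train=train_ds,  # datasets are passed through **data_params
    valid=valid_ds,
)
print({k: len(v) for k, v in loaders.items()})  # {'train': 8, 'valid': 2}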
Example #16
    def _get_optimizer(self, stage: str, model: Union[Model, Dict[str, Model]],
                       **params) -> Optimizer:
        # @TODO 1: refactoring; this method is too long
        # @TODO 2: load state dicts for schedulers & criterion
        layerwise_params = params.pop("layerwise_params", OrderedDict())
        no_bias_weight_decay = params.pop("no_bias_weight_decay", True)

        # linear scaling rule from https://arxiv.org/pdf/1706.02677.pdf
        lr_scaling_params = params.pop("lr_linear_scaling", None)
        if lr_scaling_params:
            data_params = dict(self.stages_config[stage]["data_params"])
            batch_size = data_params.get("batch_size")
            per_gpu_scaling = data_params.get("per_gpu_scaling", False)
            distributed_rank = get_rank()
            distributed = distributed_rank > -1
            if per_gpu_scaling and not distributed:
                num_gpus = max(1, torch.cuda.device_count())
                batch_size *= num_gpus

            base_lr = lr_scaling_params.get("lr")
            base_batch_size = lr_scaling_params.get("base_batch_size", 256)
            lr_scaling = batch_size / base_batch_size
            params["lr"] = base_lr * lr_scaling  # scale default lr
        else:
            lr_scaling = 1.0

        # getting model parameters
        model_key = params.pop("_model", None)
        if model_key is None:
            assert isinstance(
                model, nn.Module
            ), "model is key-value, but optimizer has no specified model"
            model_params = process_model_params(model, layerwise_params,
                                                no_bias_weight_decay,
                                                lr_scaling)
        elif isinstance(model_key, str):
            model_params = process_model_params(
                model[model_key],
                layerwise_params,
                no_bias_weight_decay,
                lr_scaling,
            )
        elif isinstance(model_key, (list, tuple)):
            model_params = []
            for model_key_el in model_key:
                model_params_el = process_model_params(
                    model[model_key_el],
                    layerwise_params,
                    no_bias_weight_decay,
                    lr_scaling,
                )
                model_params.extend(model_params_el)
        else:
            raise ValueError("unknown type of model_params")

        load_from_previous_stage = params.pop("load_from_previous_stage",
                                              False)
        optimizer_key = params.pop("optimizer_key", None)
        optimizer = OPTIMIZERS.get_from_params(**params, params=model_params)

        if load_from_previous_stage and self.stages.index(stage) != 0:
            checkpoint_path = f"{self.logdir}/checkpoints/best_full.pth"
            checkpoint = load_checkpoint(checkpoint_path)

            dict2load = optimizer
            if optimizer_key is not None:
                dict2load = {optimizer_key: optimizer}
            unpack_checkpoint(checkpoint, optimizer=dict2load)

            # move optimizer to device
            device = get_device()
            for param in model_params:
                param = param["params"][0]
                optimizer_state = optimizer.state[param]
                for state_key, state_value in optimizer_state.items():
                    optimizer_state[state_key] = any2device(
                        state_value, device)

            # update optimizer params
            for key, value in params.items():
                for optimizer_param_group in optimizer.param_groups:
                    optimizer_param_group[key] = value

        return optimizer
Example #17
    def reset(self, num_batches, num_samples) -> None:
        """Resets all fields"""
        self._is_ddp = get_rank() > -1
        self.scores = []
        self.targets = []
Example #18
def get_loaders_from_params(
    batch_size: int = 1,
    num_workers: int = 0,
    drop_last: bool = False,
    per_gpu_scaling: bool = False,
    loaders_params: Dict[str, Any] = None,
    samplers: "OrderedDict[str, Sampler]" = None,
    datasets: "OrderedDict[str, Union[Dataset, dict]]" = None,
    initial_seed: int = 42,
) -> "OrderedDict[str, DataLoader]":
    """
    Creates pytorch dataloaders from datasets and additional parameters.

    Args:
        batch_size: ``batch_size`` parameter
            from ``torch.utils.data.DataLoader``
        num_workers: ``num_workers`` parameter
            from ``torch.utils.data.DataLoader``
        drop_last: ``drop_last`` parameter
            from ``torch.utils.data.DataLoader``
        per_gpu_scaling: boolean flag,
            if ``True``, scales batch_size in proportion to the number of GPUs
        loaders_params: additional loaders parameters
        samplers: additional sampler parameters
        initial_seed: initial seed for ``torch.utils.data.DataLoader``
            workers
        datasets: ordered dictionary with ``torch.utils.data.Dataset``

    Returns:
        OrderedDict[str, DataLoader]: dictionary with
            ``torch.utils.data.DataLoader``

    Raises:
        NotImplementedError: if a datasource is neither a ``Dataset`` nor a dict
        ValueError: if the ``batch_sampler`` option is used together with
            distributed training (the two are mutually exclusive)
    """
    from catalyst.data.sampler import DistributedSamplerWrapper

    default_batch_size = batch_size
    default_num_workers = num_workers
    loaders_params = copy.deepcopy(loaders_params) or {}
    assert isinstance(loaders_params,
                      dict), (f"`loaders_params` should be a Dict. "
                              f"Got: {loaders_params}")
    samplers = copy.deepcopy(samplers) or {}
    assert isinstance(samplers,
                      dict), f"`samplers` should be a Dict. Got: {samplers}"
    datasets = datasets if datasets is not None else {}

    distributed_rank = get_rank()
    distributed = distributed_rank > -1

    loaders = OrderedDict()
    for name, datasource in datasets.items():  # noqa: WPS426
        assert isinstance(
            datasource,
            (Dataset, dict
             )), f"{datasource} should be Dataset or Dict. Got: {datasource}"

        loader_params = loaders_params.pop(name, {})
        assert isinstance(loader_params,
                          dict), f"{loader_params} should be Dict"

        sampler: Sampler = None
        if isinstance(datasource, dict) and "sampler" in datasource:
            sampler = datasource.pop("sampler", None)
        sampler = samplers.pop(name, sampler)

        batch_size = loader_params.pop("batch_size", default_batch_size)
        num_workers = loader_params.pop("num_workers", default_num_workers)

        if per_gpu_scaling and not distributed:
            num_gpus = max(1, torch.cuda.device_count())
            batch_size *= num_gpus
            num_workers *= num_gpus
        elif not per_gpu_scaling and distributed:
            world_size = get_distributed_params().pop("world_size", 1)
            if batch_size % world_size == 0:
                batch_size = int(batch_size / world_size)
            else:
                raise ValueError(
                    "For this distributed mode with per_gpu_scaling = False "
                    "you need to have batch_size divisible by number of GPUs")

        loader_params = {
            "batch_size": batch_size,
            "num_workers": num_workers,
            "pin_memory": torch.cuda.is_available(),
            "drop_last": drop_last,
            **loader_params,
        }

        if isinstance(datasource, Dataset):
            loader_params["dataset"] = datasource
        elif isinstance(datasource, dict):
            assert "dataset" in datasource, "You need to specify dataset for dataloader"
            loader_params = merge_dicts(datasource, loader_params)
        else:
            raise NotImplementedError

        if distributed:
            if sampler is not None:
                if not isinstance(sampler, DistributedSampler):
                    sampler = DistributedSamplerWrapper(sampler=sampler)
            else:
                sampler = DistributedSampler(dataset=loader_params["dataset"])

        loader_params["shuffle"] = name.startswith("train") and sampler is None

        loader_params["sampler"] = sampler

        if "batch_sampler" in loader_params:
            if distributed:
                raise ValueError("batch_sampler option is mutually "
                                 "exclusive with distributed")

            for k in ("batch_size", "shuffle", "sampler", "drop_last"):
                loader_params.pop(k, None)

        if "worker_init_fn" not in loader_params:
            loader_params["worker_init_fn"] = partial(
                _worker_init_fn, initial_seed=initial_seed)

        loaders[name] = DataLoader(**loader_params)

    return loaders
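With ``per_gpu_scaling=False`` the distributed branch splits the global batch size evenly across processes; a small arithmetic sketch (world size invented for illustration):

batch_size = 256
world_size = 4

if batch_size % world_size != 0:
    raise ValueError("batch_size must be divisible by the number of GPUs")
per_process_batch_size = batch_size // world_size
print(per_process_batch_size)  # 64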
Example #19
    def reset(self) -> None:
        """Reset all the statistics."""
        self.statistics = defaultdict(self._mp_hack)
        self._is_ddp = get_rank() > -1
Example #20
def process_components(
    model: RunnerModel,
    criterion: Criterion = None,
    optimizer: Optimizer = None,
    scheduler: Scheduler = None,
    distributed_params: Dict = None,
    device: Device = None,
) -> Tuple[RunnerModel, Criterion, Optimizer, Scheduler, Device]:
    """
    Returns the processed model, criterion, optimizer, scheduler and device.

    Args:
        model: torch model
        criterion: criterion function
        optimizer: optimizer
        scheduler: scheduler
        distributed_params (dict, optional): dict with the parameters
            for distributed and FP16 method
        device (Device, optional): device

    Returns:
        tuple with processed model, criterion, optimizer, scheduler and device.

    Raises:
        ValueError: if device is None and a TPU is available;
            to use a TPU you need to manually move the model/optimizer/scheduler
            to the TPU device and pass that device to this function.
        NotImplementedError: if the model is neither an ``nn.Module`` nor a dict
            for multi-GPU training; ``nn.ModuleDict`` is not supported
            by ``DataParallel`` yet
    """
    distributed_params = distributed_params or {}
    distributed_params = copy.deepcopy(distributed_params)
    distributed_params.update(get_distributed_params())

    if device is None and IS_XLA_AVAILABLE:
        raise ValueError(
            "TPU device is available. "
            "Please move model, optimizer and scheduler (if present) "
            "to TPU device manualy and specify a device or "
            "use CPU device.")

    if device is None:
        device = get_device()
    elif isinstance(device, str):
        device = torch.device(device)

    is_apex_enabled = (distributed_params.get("apex", False)
                       and check_apex_available())

    is_amp_enabled = (distributed_params.get("amp", False)
                      and check_amp_available())

    if is_apex_enabled and is_amp_enabled:
        raise ValueError("Both NVidia Apex and Torch.Amp are enabled. "
                         "You must choose only one mixed precision backend")
    model: Model = maybe_recursive_call(model, "to", device=device)

    if check_ddp_wrapped(model):
        pass
    # distributed data parallel run (ddp) (with apex support)
    elif get_rank() >= 0:
        assert isinstance(
            model,
            nn.Module), "Distributed training is not available for KV model"

        local_rank = distributed_params.pop("local_rank", 0) or 0
        device = f"cuda:{local_rank}"
        model = maybe_recursive_call(model, "to", device=device)

        syncbn = distributed_params.pop("syncbn", False)

        if is_apex_enabled:
            import apex

            if syncbn:
                model = apex.parallel.convert_syncbn_model(model)

            model, optimizer = initialize_apex(model, optimizer,
                                               **distributed_params)
            model = apex.parallel.DistributedDataParallel(model)
        else:
            if syncbn:
                model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

            model = nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], output_device=local_rank)
    # data parallel run (dp) (with apex support)
    else:
        is_data_parallel = (torch.cuda.device_count() > 1
                            and device.type != "cpu" and device.index is None)

        if is_apex_enabled and not is_data_parallel:
            model, optimizer = initialize_apex(model, optimizer,
                                               **distributed_params)

        elif not is_apex_enabled and is_data_parallel:
            if isinstance(model, nn.Module):
                model = nn.DataParallel(model)
            elif isinstance(model, dict):
                model = {k: nn.DataParallel(v) for k, v in model.items()}
            else:
                raise NotImplementedError()

        elif is_apex_enabled and is_data_parallel:
            model, optimizer = _wrap_into_data_parallel_with_apex(
                model, optimizer, distributed_params)

    model: Model = maybe_recursive_call(model, "to", device=device)

    return model, criterion, optimizer, scheduler, device
Example #21
    def _prepare_inner_state(
        self,
        stage: str = SETTINGS.stage_infer_prefix,
        device: Device = None,
        model: RunnerModel = None,
        criterion: RunnerCriterion = None,
        optimizer: RunnerOptimizer = None,
        scheduler: RunnerScheduler = None,
        callbacks: Dict[str, "Callback"] = None,
        loaders: Dict[str, "DataLoader"] = None,
        logdir: str = None,
        num_epochs: int = 1,
        main_metric: str = "loss",
        minimize_metric: bool = True,
        valid_loader: str = SETTINGS.loader_valid_prefix,
        checkpoint_data: Dict = None,
        is_check_run: bool = False,
        verbose: bool = False,
        **kwargs,
    ):
        # @TODO: move/split this method to callbacks group
        # here should be only a small part of it
        # main runner components: model and device to run
        self.device: Device = device
        self.model: RunnerModel = model

        # experiment components,
        # use `catalyst.core.IExperiment` to setup them
        self.criterion: RunnerCriterion = criterion
        self.optimizer: RunnerOptimizer = optimizer
        self.scheduler: RunnerScheduler = scheduler
        # and callbacks
        self.callbacks: Dict[str, "Callback"] = callbacks or {}

        # the data
        self.loader = None
        self.loaders: OrderedDict[str, DataLoader] = loaders
        # and the dataflow - model input, model output
        self.input = None
        self.output = None

        # metrics flow - batch, loader, epoch metrics
        # let's use flatten storage for batch metrics
        # batch_metrics = {'loss': ..., 'accuracy': ..., 'iou': ...}
        self.batch_metrics: Dict = defaultdict(None)
        # just aggregated (aka mean over all batches)
        # batch statistics for loader
        # and global loader metrics, like AUC
        # loader_metrics = {'loss': ..., 'accuracy': ..., `auc`: ...}
        self.loader_metrics: Dict = defaultdict(None)
        # summarized metrics for different loaders
        # and global epoch metrics, like lr, momentum
        # epoch_metrics = {
        # 'train_loss': ..., 'train_auc': ..., 'valid_loss': ...,
        # 'lr': ..., 'momentum': ...,
        # }
        self.epoch_metrics: Dict = defaultdict(None)

        # metrics & validation
        self.main_metric: str = main_metric
        self.minimize_metric: bool = minimize_metric

        # validation
        self.valid_loader: str = valid_loader
        self.valid_metrics: Dict = defaultdict(None)
        self.is_best_valid: bool = False
        self.best_valid_metrics: Dict = defaultdict(None)

        # distributed info (@TODO: move to Engine?)
        self.distributed_rank: int = get_rank()
        self.is_distributed_master: bool = self.distributed_rank <= 0
        self.is_distributed_worker: bool = self.distributed_rank > 0
        # experiment info
        self.global_sample_step: int = 0
        self.global_batch_step: int = 0
        self.global_epoch: int = 1
        self.verbose: bool = verbose
        self.is_check_run: bool = is_check_run
        self.need_early_stop: bool = False
        self.need_exception_reraise: bool = True
        # stage info
        self.num_epochs: int = num_epochs
        self.stage: str = stage
        self.is_infer_stage: bool = self.stage.startswith(
            SETTINGS.stage_infer_prefix)
        # epoch info
        self.epoch: int = 1
        # loader info
        self.loader_sample_step: int = 0
        self.loader_batch_step: int = 0
        self.loader_key: str = None
        self.loader_len: int = 0
        self.loader_batch_size = 0
        self.is_train_loader: bool = False
        self.is_valid_loader: bool = False
        self.is_infer_loader: bool = True
        # batch info
        self.batch_size: int = 0

        # logging
        self.expdir: Path = None
        self.logdir: Path = Path(logdir) if logdir is not None else None
        # extra checkpoint data for saving in checkpoint files
        self.checkpoint_data: Dict = checkpoint_data or {}

        # extra
        self.exception: Optional[Exception] = None

        # kwargs
        for key, value in kwargs.items():
            setattr(self, key, value)
Example #22
    def reset(self):
        """Reset all statistics"""
        self.statistics = {}
        self._is_ddp = get_rank() > -1
Example #23
    def reset(self) -> None:
        """Reset confusion matrix, filling it with zeros."""
        self.conf.fill(0)
        self._is_ddp = get_rank() > -1