Example #1
    def _get_experiment_callbacks(
        experiment: IExperiment,
        stage: str,
    ) -> Dict[str, Callback]:
        """Inner method for `Callbacks` preparation.

        Takes callbacks from the Experiment
        and filters them for distributed master/worker cases.

        Args:
            experiment: experiment instance to take the callbacks from
            stage: stage name of interest,
                like "pretrain" / "train" / "finetune" / etc

        Returns:
            OrderedDict[str, Callback]: Ordered dictionary
                with callbacks for current experiment stage.
        """
        callbacks = experiment.get_callbacks(stage)
        callbacks = filter_callbacks_by_node(callbacks)
        callbacks = sort_callbacks_by_order(callbacks)
        return callbacks
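
The two helpers used above are imported from the library and not shown here. As a rough, library-free sketch of what the ordering step does, the snippet below sorts a callbacks dictionary by an integer ``order`` attribute; that attribute, and the ``DummyCallback`` class, are assumptions made for illustration, not the real ``sort_callbacks_by_order`` implementation.

from collections import OrderedDict


class DummyCallback:
    """Stand-in for a Catalyst Callback with an explicit execution order."""

    def __init__(self, order: int):
        self.order = order


def sort_callbacks_by_order_sketch(callbacks):
    """Return an OrderedDict with callbacks sorted by their `order` value."""
    if isinstance(callbacks, list):
        # mirror the flexible list-or-dict signature seen in the examples
        callbacks = OrderedDict(
            (str(i), callback) for i, callback in enumerate(callbacks)
        )
    return OrderedDict(sorted(callbacks.items(), key=lambda kv: kv[1].order))


callbacks = {"checkpoint": DummyCallback(order=80), "criterion": DummyCallback(order=20)}
print(list(sort_callbacks_by_order_sketch(callbacks)))  # ['criterion', 'checkpoint']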
Example #2
    def on_stage_start(self, runner: "IRunner") -> None:
        """Event handler for stage start.

        For the `IStageBasedRunner` case:

        - prepares loaders - our datasources
        - prepares model components - model, criterion, optimizer, scheduler
        - prepares callbacks for the current stage

        Args:
            runner: IRunner instance.
        """
        super().on_stage_start(runner)

        set_global_seed(self.experiment.initial_seed)
        loaders = self.experiment.get_loaders(stage=self.stage)
        loaders = validate_loaders(loaders)
        # self.loaders = loaders

        set_global_seed(self.experiment.initial_seed)
        model = self.experiment.get_model(self.stage)
        criterion = self.experiment.get_criterion(self.stage)
        optimizer = self.experiment.get_optimizer(self.stage, model)
        scheduler = self.experiment.get_scheduler(self.stage, optimizer)
        model, criterion, optimizer, scheduler, device = process_components(
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            distributed_params=self.experiment.distributed_params,
            device=self.device,
        )

        set_global_seed(self.experiment.initial_seed)
        callbacks = self.experiment.get_callbacks(self.stage)
        callbacks = filter_callbacks_by_node(callbacks)
        callbacks = sort_callbacks_by_order(callbacks)

        migrating_params = dict(**self.experiment.get_stage_params(self.stage))
        migrate_from_previous_stage = migrating_params.get(
            "migrate_from_previous_stage", True)
        if (migrate_from_previous_stage
                and getattr(self, "callbacks", None) is not None):
            for key, value in self.callbacks.items():
                if value.scope == CallbackScope.experiment:
                    callbacks[key] = value

        callbacks = sort_callbacks_by_order(callbacks)

        if migrate_from_previous_stage:
            migrating_params.update({
                "global_epoch": getattr(self, "global_epoch", 1),
                "global_batch_step": getattr(self, "global_batch_step", 0),
                "global_sample_step": getattr(self, "global_sample_step", 0),
                "resume": getattr(self, "resume", None),
            })

        self._prepare_inner_state(
            stage=self.stage,
            model=model,
            device=device,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            callbacks=callbacks,
            loaders=loaders,
            **migrating_params,
        )
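
The callback-migration block above keeps only experiment-scoped callbacks when moving from one stage to the next. Below is a minimal, self-contained sketch of that filter; the ``Scope`` enum and ``DummyCallback`` class are hypothetical stand-ins for Catalyst's ``CallbackScope`` and ``Callback``.

from dataclasses import dataclass
from enum import Enum


class Scope(Enum):
    # stand-in for catalyst's CallbackScope
    stage = "stage"
    experiment = "experiment"


@dataclass
class DummyCallback:
    scope: Scope


previous_stage_callbacks = {
    "loss": DummyCallback(scope=Scope.stage),
    "early_stopping": DummyCallback(scope=Scope.stage),
    "wandb_logger": DummyCallback(scope=Scope.experiment),
}
new_stage_callbacks = {}

# only experiment-scoped callbacks survive the transition to the next stage
for key, value in previous_stage_callbacks.items():
    if value.scope == Scope.experiment:
        new_stage_callbacks[key] = value

print(list(new_stage_callbacks))  # ['wandb_logger']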
Example #3
    def train(
        self,
        *,
        model: Model,
        criterion: Criterion = None,
        optimizer: Optimizer = None,
        scheduler: Scheduler = None,
        datasets: "OrderedDict[str, Union[Dataset, Dict, Any]]" = None,
        loaders: "OrderedDict[str, DataLoader]" = None,
        callbacks: "Union[List[Callback], OrderedDict[str, Callback]]" = None,
        logdir: str = None,
        resume: str = None,
        num_epochs: int = 1,
        valid_loader: str = "valid",
        main_metric: str = "loss",
        minimize_metric: bool = True,
        verbose: bool = False,
        stage_kwargs: Dict = None,
        checkpoint_data: Dict = None,
        fp16: Union[Dict, bool] = None,
        distributed: bool = False,
        check: bool = False,
        overfit: bool = False,
        timeit: bool = False,
        load_best_on_end: bool = False,
        initial_seed: int = 42,
        state_kwargs: Dict = None,
    ) -> None:
        """
        Starts the train stage of the model.

        Args:
            model: model to train
            criterion: criterion function for training
            optimizer: optimizer for training
            scheduler: scheduler for training
            datasets (OrderedDict[str, Union[Dataset, Dict, Any]]): dictionary
                with one or several ``torch.utils.data.Dataset``
                for training, validation or inference,
                used for automatic loaders creation;
                the preferred way to set up distributed training
            loaders (OrderedDict[str, DataLoader]): dictionary
                with one or several ``torch.utils.data.DataLoader``
                for training, validation or inference
            callbacks (Union[List[Callback], OrderedDict[str, Callback]]):
                list or dictionary with Catalyst callbacks
            logdir: path to output directory
            resume: path to checkpoint for model
            num_epochs: number of training epochs
            valid_loader: loader name used to calculate
                the metrics and save the checkpoints. For example,
                you can pass `train` and then
                the metrics will be taken from the `train` loader.
            main_metric: the key to the name of the metric
                by which the checkpoints will be selected.
            minimize_metric: flag to indicate whether
                the ``main_metric`` should be minimized.
            verbose: if `True`, it displays the status of the training
                to the console.
            stage_kwargs: additional params for stage
            checkpoint_data: additional data to save in checkpoint,
                for example: ``class_names``, ``date_of_training``, etc
            fp16 (Union[Dict, bool]): If not None, then sets training to FP16.
                To use pytorch native amp: ``{"amp": True}``
                To use apex: ``{"apex": True, "opt_level": "O1", ...}``
                    See https://nvidia.github.io/apex/amp.html#properties
                    for more params

                If fp16=True, params by default will be:
                    * ``{"amp": True}`` if torch>=1.6.0
                    * ``{"apex": True, "opt_level": "O1", ...}`` if torch<1.6.0
            distributed: if `True` will start training
                in distributed mode.
                Note: Works only with python scripts. No jupyter support.
            check: if True, then only checks that pipeline is working
                (3 epochs only with 3 batches per loader)
            overfit: if True, then takes only one batch per loader
                for model overfitting; for advanced usage please check
                ``BatchOverfitCallback``
            timeit: if True, computes the execution time
                of training process and displays it to the console.
            load_best_on_end: if True, Runner will load
                best checkpoint state (model, optimizer, etc)
                according to validation metrics. Requires specified ``logdir``.
            initial_seed: experiment's initial seed value
            state_kwargs: deprecated, use `stage_kwargs` instead

        Raises:
            NotImplementedError: if a ``CheckpointCallback`` is already
                specified together with ``resume`` or ``load_best_on_end``
        """
        assert state_kwargs is None or stage_kwargs is None

        fp16 = _resolve_bool_fp16(fp16)

        if resume is not None or load_best_on_end:
            load_on_stage_end = None
            if load_best_on_end:
                load_on_stage_end = "best_full"
                assert logdir is not None, ("For ``load_best_on_end`` feature "
                                            "you need to specify ``logdir``")
            callbacks = sort_callbacks_by_order(callbacks)
            checkpoint_callback_flag = any(
                isinstance(x, CheckpointCallback) for x in callbacks.values())
            if not checkpoint_callback_flag:
                callbacks["_loader"] = CheckpointCallback(
                    resume=resume,
                    load_on_stage_end=load_on_stage_end,
                )
            else:
                raise NotImplementedError("CheckpointCallback already exists")

        experiment = self._experiment_fn(
            stage="train",
            model=model,
            datasets=datasets,
            loaders=loaders,
            callbacks=callbacks,
            logdir=logdir,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            num_epochs=num_epochs,
            valid_loader=valid_loader,
            main_metric=main_metric,
            minimize_metric=minimize_metric,
            verbose=verbose,
            check_time=timeit,
            check_run=check,
            overfit=overfit,
            stage_kwargs=stage_kwargs or state_kwargs,
            checkpoint_data=checkpoint_data,
            distributed_params=fp16,
            initial_seed=initial_seed,
        )
        self.experiment = experiment
        distributed_cmd_run(self.run_experiment, distributed)
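
A hedged usage sketch of the ``train`` signature above on toy data; ``SupervisedRunner`` is assumed here as the concrete runner class, and the log directory is hypothetical. Details may differ depending on the installed Catalyst version.

import torch
from torch.utils.data import DataLoader, TensorDataset
from catalyst.dl import SupervisedRunner  # assumed import path

# toy dataset and model, just to exercise the signature shown above
features = torch.randn(64, 10)
targets = torch.randint(0, 2, (64,))
loaders = {
    "train": DataLoader(TensorDataset(features, targets), batch_size=16),
    "valid": DataLoader(TensorDataset(features, targets), batch_size=16),
}
model = torch.nn.Linear(10, 2)

runner = SupervisedRunner()
runner.train(
    model=model,
    criterion=torch.nn.CrossEntropyLoss(),
    optimizer=torch.optim.Adam(model.parameters(), lr=1e-3),
    loaders=loaders,
    logdir="./logs",  # hypothetical output directory
    num_epochs=1,
    valid_loader="valid",
    main_metric="loss",
    minimize_metric=True,
    check=True,  # only a few batches/epochs, to verify the pipeline works
    verbose=False,
)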
Example #4
    def infer(
        self,
        *,
        model: Model,
        datasets: "OrderedDict[str, Union[Dataset, Dict, Any]]" = None,
        loaders: "OrderedDict[str, DataLoader]" = None,
        callbacks: "Union[List[Callback], OrderedDict[str, Callback]]" = None,
        logdir: str = None,
        resume: str = None,
        verbose: bool = False,
        stage_kwargs: Dict = None,
        fp16: Union[Dict, bool] = None,
        check: bool = False,
        timeit: bool = False,
        initial_seed: int = 42,
        state_kwargs: Dict = None,
    ) -> None:
        """
        Starts the inference stage of the model.

        Args:
            model: model for inference
            datasets (OrderedDict[str, Union[Dataset, Dict, Any]]): dictionary
                with one or several ``torch.utils.data.Dataset``
                for training, validation or inference,
                used for automatic loaders creation;
                the preferred way to set up distributed training
            loaders (OrderedDict[str, DataLoader]): dictionary
                with one or several ``torch.utils.data.DataLoader``
                for training, validation or inference
            callbacks (Union[List[Callback], OrderedDict[str, Callback]]):
                list or dictionary with Catalyst callbacks
            logdir: path to output directory
            resume: path to checkpoint to use for resume
            verbose: if `True`, it displays the status of the inference
                to the console.
            stage_kwargs: additional stage params
            fp16 (Union[Dict, bool]): fp16 settings (same as in `train`)
            check: if True, then only checks that pipeline is working
                (3 epochs only)
            timeit: if True, computes the execution time
                of the inference process and displays it to the console.
            initial_seed: experiment's initial seed value
            state_kwargs: deprecated, use `stage_kwargs` instead

        Raises:
            NotImplementedError: if a ``CheckpointCallback`` is already
                specified together with ``resume``
        """
        assert state_kwargs is None or stage_kwargs is None

        fp16 = _resolve_bool_fp16(fp16)

        if resume is not None:
            callbacks = sort_callbacks_by_order(callbacks)
            checkpoint_callback_flag = any(
                isinstance(x, CheckpointCallback) for x in callbacks.values())
            if not checkpoint_callback_flag:
                callbacks["loader"] = CheckpointCallback(resume=resume)
            else:
                raise NotImplementedError("CheckpointCallback already exists")

        experiment = self._experiment_fn(
            stage="infer",
            model=model,
            datasets=datasets,
            loaders=loaders,
            callbacks=callbacks,
            logdir=logdir,
            verbose=verbose,
            check_time=timeit,
            check_run=check,
            stage_kwargs=stage_kwargs or state_kwargs,
            distributed_params=fp16,
            initial_seed=initial_seed,
        )
        self.run_experiment(experiment)
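
Continuing the sketch from the previous example, ``infer`` can be called with the same toy model and loaders; the checkpoint path below is hypothetical and only follows the usual ``logdir/checkpoints`` layout.

# reuses `runner`, `model` and `loaders` from the `train` sketch above
runner.infer(
    model=model,
    loaders={"infer": loaders["valid"]},
    resume="./logs/checkpoints/best.pth",  # hypothetical checkpoint path
    verbose=True,
    check=True,
)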
Example #5
    def _prepare_for_stage(self, stage: str):
        """Inner method to prepare `Runner` for the specified stage.

        Sets `Experiment` initial seed.
        Prepares experiment components with `self._get_experiment_components`.
        Prepares callbacks with `self._get_experiment_callbacks`.
        Prepares inner state with `self._prepare_inner_state`.
        Additionally sets `Experiment` datasources for specified stage.

        Args:
            stage: stage name of interest,
                like "pretrain" / "train" / "finetune" / etc
        """
        set_global_seed(self.experiment.initial_seed)
        loaders = self.experiment.get_loaders(stage=stage)
        loaders = validate_loaders(loaders)
        self.loaders = loaders

        set_global_seed(self.experiment.initial_seed)
        (
            model,
            criterion,
            optimizer,
            scheduler,
            device,
        ) = self._get_experiment_components(experiment=self.experiment,
                                            stage=stage,
                                            device=self.device)

        set_global_seed(self.experiment.initial_seed)
        callbacks = self._get_experiment_callbacks(experiment=self.experiment,
                                                   stage=stage)

        migrating_params = dict(**self.experiment.get_stage_params(stage))
        migrate_from_previous_stage = migrating_params.get(
            "migrate_from_previous_stage", True)
        if (migrate_from_previous_stage
                and getattr(self, "callbacks", None) is not None):
            for key, value in self.callbacks.items():
                if value.scope == CallbackScope.experiment:
                    callbacks[key] = value

        callbacks = sort_callbacks_by_order(callbacks)

        if migrate_from_previous_stage:
            migrating_params.update({
                "global_epoch": getattr(self, "global_epoch", 1),
                "global_batch_step": getattr(self, "global_batch_step", 0),
                "global_sample_step": getattr(self, "global_sample_step", 0),
                "resume": getattr(self, "resume", None),
            })

        self._prepare_inner_state(
            stage=stage,
            model=model,
            device=device,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            callbacks=callbacks,
            loaders=getattr(self, "loaders", None),
            **migrating_params,
        )
Example #6
    def __init__(
        self,
        model: Model,
        datasets: "OrderedDict[str, Union[Dataset, Dict, Any]]" = None,
        loaders: "OrderedDict[str, DataLoader]" = None,
        callbacks: "Union[OrderedDict[str, Callback], List[Callback]]" = None,
        logdir: str = None,
        stage: str = "train",
        criterion: Criterion = None,
        optimizer: Optimizer = None,
        scheduler: Scheduler = None,
        trial: Any = None,
        num_epochs: int = 1,
        valid_loader: str = "valid",
        main_metric: str = "loss",
        minimize_metric: bool = True,
        verbose: bool = False,
        check_time: bool = False,
        check_run: bool = False,
        overfit: bool = False,
        stage_kwargs: Dict = None,
        checkpoint_data: Dict = None,
        distributed_params: Dict = None,
        initial_seed: int = 42,
    ):
        """
        Args:
            model: model
            datasets (OrderedDict[str, Union[Dataset, Dict, Any]]): dictionary
                with one or several ``torch.utils.data.Dataset``
                for training, validation or inference,
                used for automatic loaders creation;
                the preferred way to set up distributed training
            loaders (OrderedDict[str, DataLoader]): dictionary
                with one or several ``torch.utils.data.DataLoader``
                for training, validation or inference
            callbacks (Union[List[Callback], OrderedDict[str, Callback]]):
                list or dictionary with Catalyst callbacks
            logdir: path to output directory
            stage: current stage
            criterion: criterion function
            optimizer: optimizer
            scheduler: scheduler
            trial: hyperparameters optimization trial.
                Used for integrations with Optuna/HyperOpt/Ray.tune.
            num_epochs: number of experiment's epochs
            valid_loader: loader name used to calculate
                the metrics and save the checkpoints. For example,
                you can pass `train` and then
                the metrics will be taken from the `train` loader.
            main_metric: the key to the name of the metric
                by which the checkpoints will be selected.
            minimize_metric: flag to indicate whether
                the ``main_metric`` should be minimized.
            verbose: if True, it displays the status of the training
                to the console.
            check_time: if True, computes the execution time
                of training process and displays it to the console.
            check_run: if True, we run only 3 batches per loader
                and 3 epochs per stage to check pipeline correctness
            overfit: if True, then takes only one batch per loader
                for model overfitting; for advanced usage please check
                ``BatchOverfitCallback``
            stage_kwargs: additional stage params
            checkpoint_data: additional data to save in checkpoint,
                for example: ``class_names``, ``date_of_training``, etc
            distributed_params: dictionary with the parameters
                for distributed and FP16 method
            initial_seed: experiment's initial seed value
        """
        assert (
            datasets is not None or loaders is not None
        ), "Please specify the data sources"

        self._model = model
        self._loaders, self._valid_loader = self._get_loaders(
            loaders=loaders,
            datasets=datasets,
            stage=stage,
            valid_loader=valid_loader,
            initial_seed=initial_seed,
        )
        self._callbacks = sort_callbacks_by_order(callbacks)

        self._criterion = criterion
        self._optimizer = optimizer
        self._scheduler = scheduler

        self._trial = trial

        self._initial_seed = initial_seed
        self._logdir = logdir
        self._stage = stage
        self._num_epochs = num_epochs
        self._main_metric = main_metric
        self._minimize_metric = minimize_metric
        self._verbose = verbose
        self._check_time = check_time
        self._check_run = check_run
        self._overfit = overfit
        self._stage_kwargs = stage_kwargs or {}
        self._checkpoint_data = checkpoint_data or {}
        self._distributed_params = distributed_params or {}
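
The constructor above requires either ``datasets`` or ``loaders`` as the data source. Below is a hedged sketch of the ``datasets`` form as it would be passed to ``train``; the ``batch_size`` key sitting next to the dataset entries is an assumption about how the automatic loader creation reads its parameters, not something confirmed by the snippet above.

import torch
from torch.utils.data import TensorDataset

# dataset-based data sources; loaders are then built automatically
datasets = {
    "batch_size": 16,  # assumed to be read alongside the dataset entries
    "train": TensorDataset(torch.randn(64, 10), torch.randint(0, 2, (64,))),
    "valid": TensorDataset(torch.randn(32, 10), torch.randint(0, 2, (32,))),
}

# with `runner` as in the `train` sketch above:
# runner.train(model=torch.nn.Linear(10, 2), datasets=datasets, num_epochs=1)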