Example #1
    def _get_logdir(self, config: Dict) -> str:
        timestamp = utils.get_utcnow_time()
        config_hash = utils.get_short_hash(config)
        logdir = f"{timestamp}.{config_hash}"
        distributed_rank = utils.get_rank()
        if distributed_rank > -1:  # rank is -1 for non-distributed runs
            logdir = f"{logdir}.rank{distributed_rank:02d}"
        return logdir
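
For context, here is a minimal standalone sketch of the same naming scheme. hashlib and datetime stand in for catalyst's utils.get_utcnow_time / utils.get_short_hash, and the exact timestamp and hash formats are assumptions for illustration.

import hashlib
import json
from datetime import datetime, timezone

def make_logdir(config: dict, rank: int = -1) -> str:
    # hypothetical stand-ins for utils.get_utcnow_time / utils.get_short_hash
    timestamp = datetime.now(timezone.utc).strftime("%y%m%d-%H%M%S")
    config_hash = hashlib.sha256(
        json.dumps(config, sort_keys=True, default=str).encode()
    ).hexdigest()[:7]
    logdir = f"{timestamp}.{config_hash}"
    if rank > -1:  # -1 means a non-distributed run
        logdir = f"{logdir}.rank{rank:02d}"
    return logdir

print(make_logdir({"model": "resnet18"}, rank=1))  # e.g. "210101-120000.a1b2c3d.rank01"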
Example #2
    def objective(trial: optuna.Trial):
        trial, trial_config = _process_trial_config(trial, config.copy())
        experiment, runner, trial_config = utils.prepare_config_api_components(
            expdir=expdir, config=trial_config)
        # @TODO: we need a better solution here.
        experiment._trial = trial  # noqa: WPS437

        if experiment.logdir is not None and utils.get_rank() <= 0:
            utils.dump_environment(trial_config, experiment.logdir,
                                   args.configs)
            utils.dump_code(args.expdir, experiment.logdir)

        runner.run_experiment(experiment)

        return runner.best_valid_metrics[runner.main_metric]
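
A hedged usage sketch for the objective above: it would typically be handed to an Optuna study roughly like this. The study direction and trial count are illustrative assumptions, not values taken from the original script.

import optuna

study = optuna.create_study(direction="minimize")  # or "maximize", matching main_metric
study.optimize(objective, n_trials=20)
print(study.best_value, study.best_trial.params)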
Example #3
    def _init(
        self,
        log_on_batch_end: bool = False,
        log_on_epoch_end: bool = True,
    ):
        self.log_on_batch_end = log_on_batch_end
        self.log_on_epoch_end = log_on_epoch_end
        # rank > 0 means this process is a non-master distributed worker
        self.is_distributed_worker = utils.get_rank() > 0

        # if exactly one of the two flags is enabled, no suffix is needed
        if (self.log_on_batch_end and not self.log_on_epoch_end) \
                or (not self.log_on_batch_end and self.log_on_epoch_end):
            self.batch_log_suffix = ""
            self.epoch_log_suffix = ""
        else:
            self.batch_log_suffix = "_batch"
            self.epoch_log_suffix = "_epoch"
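
The branching above is an exclusive-or on the two flags. A quick standalone check of that rule (the helper name suffixes is purely illustrative):

def suffixes(log_on_batch_end: bool, log_on_epoch_end: bool):
    # exactly one flag enabled -> a single log stream, no suffix needed
    if log_on_batch_end != log_on_epoch_end:
        return "", ""
    # both (or neither) -> disambiguate the two streams
    return "_batch", "_epoch"

assert suffixes(True, False) == ("", "")
assert suffixes(False, True) == ("", "")
assert suffixes(True, True) == ("_batch", "_epoch")
assert suffixes(False, False) == ("_batch", "_epoch")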
Example #4
def main_worker(args, unknown_args):
    """Runs main worker thread from model training."""
    args, config = utils.parse_args_uargs(args, unknown_args)
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    config.setdefault("distributed_params", {})["apex"] = args.apex
    config.setdefault("distributed_params", {})["amp"] = args.amp

    experiment, runner, config = utils.prepare_config_api_components(
        expdir=Path(args.expdir), config=config
    )

    # only the master process dumps configs/code (rank <= 0 also covers non-distributed runs)
    if experiment.logdir is not None and utils.get_rank() <= 0:
        utils.dump_environment(config, experiment.logdir, args.configs)
        utils.dump_code(args.expdir, experiment.logdir)

    runner.run_experiment(experiment)
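
A small illustration of the setdefault pattern used for distributed_params above; the flag values are made up:

config = {}
config.setdefault("distributed_params", {})["apex"] = True   # creates the nested dict
config.setdefault("distributed_params", {})["amp"] = False   # reuses the same dict
assert config == {"distributed_params": {"apex": True, "amp": False}}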
Example #5
    def _get_optimizer(self, stage: str, model: Union[Model, Dict[str, Model]],
                       **params) -> Optimizer:
        # @TODO 1: refactoring; this method is too long
        # @TODO 2: load state dicts for schedulers & criterion
        layerwise_params = params.pop("layerwise_params", OrderedDict())
        no_bias_weight_decay = params.pop("no_bias_weight_decay", True)

        # linear scaling rule from https://arxiv.org/pdf/1706.02677.pdf
        lr_scaling_params = params.pop("lr_linear_scaling", None)
        if lr_scaling_params:
            data_params = dict(self.stages_config[stage]["data_params"])
            batch_size = data_params.get("batch_size")
            per_gpu_scaling = data_params.get("per_gpu_scaling", False)
            distributed_rank = utils.get_rank()
            distributed = distributed_rank > -1
            if per_gpu_scaling and not distributed:
                num_gpus = max(1, torch.cuda.device_count())
                batch_size *= num_gpus

            base_lr = lr_scaling_params.get("lr")
            base_batch_size = lr_scaling_params.get("base_batch_size", 256)
            lr_scaling = batch_size / base_batch_size
            params["lr"] = base_lr * lr_scaling  # scale default lr
        else:
            lr_scaling = 1.0

        # getting model parameters
        model_key = params.pop("_model", None)
        if model_key is None:
            assert isinstance(model, nn.Module), (
                "model is a key-value mapping, "
                "but no `_model` key is specified for the optimizer"
            )
            model_params = utils.process_model_params(
                model, layerwise_params, no_bias_weight_decay, lr_scaling
            )
        elif isinstance(model_key, str):
            model_params = utils.process_model_params(
                model[model_key],
                layerwise_params,
                no_bias_weight_decay,
                lr_scaling,
            )
        elif isinstance(model_key, (list, tuple)):
            model_params = []
            for model_key_ in model_key:
                model_params_ = utils.process_model_params(
                    model[model_key_],
                    layerwise_params,
                    no_bias_weight_decay,
                    lr_scaling,
                )
                model_params.extend(model_params_)
        else:
            raise ValueError(f"unknown type of `_model` key: {type(model_key)}")

        load_from_previous_stage = params.pop("load_from_previous_stage",
                                              False)
        optimizer_key = params.pop("optimizer_key", None)
        optimizer = OPTIMIZERS.get_from_params(**params, params=model_params)

        if load_from_previous_stage and self.stages.index(stage) != 0:
            checkpoint_path = f"{self.logdir}/checkpoints/best_full.pth"
            checkpoint = utils.load_checkpoint(checkpoint_path)

            dict2load = optimizer
            if optimizer_key is not None:
                dict2load = {optimizer_key: optimizer}
            utils.unpack_checkpoint(checkpoint, optimizer=dict2load)

            # move optimizer to device
            device = utils.get_device()
            for param in model_params:
                param = param["params"][0]
                state = optimizer.state[param]
                for key, value in state.items():
                    state[key] = utils.any2device(value, device)

            # update optimizer params
            for key, value in params.items():
                for pg in optimizer.param_groups:
                    pg[key] = value

        return optimizer
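
A worked sketch of the linear scaling rule applied above (https://arxiv.org/pdf/1706.02677.pdf): the learning rate grows in proportion to the effective batch size. The function name and numbers are illustrative, and the per-GPU branch simplifies the `per_gpu_scaling and not distributed` case from the method.

import torch

def scale_lr(base_lr: float, batch_size: int, base_batch_size: int = 256,
             per_gpu_scaling: bool = False) -> float:
    if per_gpu_scaling:
        # the configured batch_size is per GPU, so the effective batch covers all GPUs
        batch_size *= max(1, torch.cuda.device_count())
    return base_lr * batch_size / base_batch_size

assert scale_lr(0.1, 256) == 0.1  # baseline setup keeps the base lr
assert scale_lr(0.1, 512) == 0.2  # doubling the batch doubles the lr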
Example #6
    def get_loaders(
        self,
        stage: str,
        epoch: int = None,
    ) -> "OrderedDict[str, DataLoader]":
        """Returns the loaders for a given stage."""
        data_params = dict(self.stages_config[stage]["data_params"])

        default_batch_size = data_params.pop("batch_size", 1)
        default_num_workers = data_params.pop("num_workers")
        drop_last = data_params.pop("drop_last", False)
        per_gpu_scaling = data_params.pop("per_gpu_scaling", False)
        distributed_rank = utils.get_rank()
        distributed = distributed_rank > -1

        # pop loader/sampler configs first so that only dataset kwargs
        # are forwarded to `get_datasets`
        overridden_loaders_params = data_params.pop("loaders_params", {})
        assert isinstance(overridden_loaders_params, dict), (
            f"`loaders_params` should be a Dict. "
            f"Got: {overridden_loaders_params}"
        )

        samplers_params = data_params.pop("samplers_params", {})
        assert isinstance(
            samplers_params, dict
        ), f"`samplers_params` should be a Dict. Got: {samplers_params}"

        datasets = self.get_datasets(stage=stage, **data_params)

        loaders = OrderedDict()
        for name, ds_ in datasets.items():
            assert isinstance(
                ds_, (Dataset, dict)
            ), f"{ds_} should be Dataset or Dict"

            overridden_loader_params = overridden_loaders_params.pop(name, {})
            assert isinstance(
                overridden_loader_params, dict
            ), f"{overridden_loader_params} should be Dict"

            sampler_params = samplers_params.pop(name, None)
            if sampler_params is None:
                if isinstance(ds_, dict) and "sampler" in ds_:
                    sampler = ds_.pop("sampler", None)
                else:
                    sampler = None
            else:
                sampler = SAMPLERS.get_from_params(**sampler_params)
                if isinstance(ds_, dict) and "sampler" in ds_:
                    ds_.pop("sampler", None)

            batch_size = overridden_loader_params.pop("batch_size",
                                                      default_batch_size)
            num_workers = overridden_loader_params.pop("num_workers",
                                                       default_num_workers)

            if per_gpu_scaling and not distributed:
                num_gpus = max(1, torch.cuda.device_count())
                batch_size *= num_gpus
                num_workers *= num_gpus

            loader_params = {
                "batch_size": batch_size,
                "num_workers": num_workers,
                "pin_memory": torch.cuda.is_available(),
                "drop_last": drop_last,
                **overridden_loader_params,
            }

            if isinstance(ds_, Dataset):
                loader_params["dataset"] = ds_
            elif isinstance(ds_, dict):
                assert ("dataset"
                        in ds_), "You need to specify dataset for dataloader"
                loader_params = utils.merge_dicts(ds_, loader_params)
            else:
                raise NotImplementedError

            if distributed:
                if sampler is not None:
                    if not isinstance(sampler, DistributedSampler):
                        sampler = DistributedSamplerWrapper(sampler=sampler)
                else:
                    sampler = DistributedSampler(
                        dataset=loader_params["dataset"])

            loader_params["shuffle"] = (name.startswith("train")
                                        and sampler is None)

            loader_params["sampler"] = sampler

            if "batch_sampler" in loader_params:
                if distributed:
                    raise ValueError("batch_sampler option is mutually "
                                     "exclusive with distributed")

                for k in ("batch_size", "shuffle", "sampler", "drop_last"):
                    loader_params.pop(k, None)

            if "worker_init_fn" not in loader_params:
                loader_params[
                    "worker_init_fn"] = lambda x: utils.set_global_seed(
                        self.initial_seed + x)

            loaders[name] = DataLoader(**loader_params)

        return loaders
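
For reference, a hedged example of the data_params dict this method consumes; the loader names and values are assumptions for illustration, not taken from a real config:

data_params = {
    "batch_size": 64,          # default for every loader
    "num_workers": 4,
    "drop_last": False,
    "per_gpu_scaling": True,   # scale batch_size/num_workers by the GPU count
    # per-loader overrides, keyed by loader name
    "loaders_params": {"valid": {"batch_size": 128}},
    # sampler configs, keyed by loader name (forwarded to SAMPLERS.get_from_params)
    "samplers_params": {},
    # anything else is forwarded to self.get_datasets(stage=..., **data_params)
}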