示例#1
0
    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Initialize the context and give every tracked attribute a default.

        All positional and keyword arguments are forwarded unchanged to
        ``det.TrialContext.__init__``.
        """
        # Both base initializers are called explicitly; the reducer context is
        # handed this context's distributed allgather.
        det.TrialContext.__init__(self, *args, **kwargs)
        pytorch._PyTorchReducerContext.__init__(self, self.distributed.allgather)

        # Batch sizes are derived from the hyperparameters and the number of
        # slots per trial declared in the experiment config.
        hparams = self.get_hparams()
        slots_per_trial = self.env.experiment_config.slots_per_trial()
        per_slot_batch_size, global_batch_size = util.calculate_batch_sizes(
            hparams, slots_per_trial, "PyTorchTrial"
        )
        self._per_slot_batch_size = per_slot_batch_size
        self._global_batch_size = global_batch_size

        self._distributed_backend = det._DistributedBackend()
        self.device = self._init_device()

        # Types for which to_device() has already emitted a warning.
        self._to_device_warned_types = set()  # type: Set[Type]

        # State below starts empty and is filled in over the lifetime of a
        # PyTorchTrialContext.
        self.models = []  # type: List[nn.Module]
        self.optimizers = []  # type: List[torch.optim.Optimizer]
        self.profiler = None  # type: Any
        self.lr_schedulers = []  # type: List[pytorch.LRScheduler]
        self._epoch_len = None  # type: Optional[int]

        # Wrapped model -> original model. torch DDP and apex need the original
        # input forms to initialize in the correct order.
        self._wrapped_models = {}  # type: Dict[nn.Module, nn.Module]

        # A single container module holds every model as a distinctly named
        # sub-module (attached via __setattr__). Broadcasting model state with
        # horovod then goes through the container's state_dict, which avoids
        # name conflicts between models. Note that broadcast_parameters only
        # accepts state_dict(), even though its doc says named_parameters() is
        # accepted as well.
        self._main_model = nn.Module()
        self._scaler = None
        self._use_apex = False
        self._loss_ids = {}  # type: Dict[torch.Tensor, int]
        self._last_backward_batch_idx = None  # type: Optional[int]
        self._current_batch_idx = None  # type: Optional[int]

        self.experimental = pytorch.PyTorchExperimentalContext(self)
        self._reducers = pytorch._PyTorchReducerContext()
        self._determined_profiler = None  # type: Optional[profiler.ProfilerAgent]

        # Cache the optimization settings from the experiment config.
        opt_cfg = self.env.experiment_config.get_optimizations_config()
        self._aggregation_frequency = cast(int, opt_cfg.get("aggregation_frequency"))
        self._fp16_compression = cast(bool, opt_cfg.get("gradient_compression"))
        self._average_aggregated_gradients = cast(
            bool, opt_cfg.get("average_aggregated_gradients")
        )
        self._average_training_metrics = cast(
            bool, opt_cfg.get("average_training_metrics")
        )
示例#2
0
def make_trial_controller_from_trial_implementation(
    trial_class: Type[det.Trial],
    hparams: Dict,
    workloads: workload.Stream,
    scheduling_unit: int = 1,
    trial_seed: int = 0,
    exp_config: Optional[Dict] = None,
    checkpoint_dir: Optional[str] = None,
    latest_checkpoint: Optional[str] = None,
    steps_completed: int = 0,
    expose_gpus: bool = False,
) -> det.TrialController:
    """Build a trial controller for ``trial_class`` driven by ``workloads``.

    When ``exp_config`` is falsy, a default experiment config is generated from
    ``hparams``, ``scheduling_unit``, ``checkpoint_dir`` and the
    ``_searcher_metric`` attribute of ``trial_class`` (which must then exist).
    Checkpoints go to a shared-filesystem storage manager rooted at
    ``checkpoint_dir``, or at ``/tmp`` when no directory is given.

    Returns:
        The controller produced by the trial class's controller class via
        ``from_trial``.

    Raises:
        AssertionError: if a default config is needed but ``trial_class`` has
            no ``_searcher_metric``, or if its ``trial_controller_class`` is
            ``None``.
    """
    if not exp_config:
        assert hasattr(
            trial_class, "_searcher_metric"
        ), "Trial classes for unit tests should be annotated with a _searcher_metric attribute"
        exp_config = make_default_exp_config(
            hparams,
            scheduling_unit,
            trial_class._searcher_metric,  # type: ignore
            checkpoint_dir=checkpoint_dir,
        )

    env = make_default_env_context(
        hparams=hparams,
        experiment_config=exp_config,
        trial_seed=trial_seed,
        latest_checkpoint=latest_checkpoint,
        steps_completed=steps_completed,
        expose_gpus=expose_gpus,
    )

    # Dummy core context backed by shared-filesystem checkpoint storage.
    core_context = core._dummy_init(
        storage_manager=det.common.storage.SharedFSStorageManager(checkpoint_dir or "/tmp")
    )

    # The controller class gets its pre-execution hook before the trial context
    # and the trial instance are constructed.
    backend = det._DistributedBackend()
    controller_cls = trial_class.trial_controller_class
    assert controller_cls is not None
    controller_cls.pre_execute_hook(env, backend)

    context = trial_class.trial_context_class(core_context, env)
    trial = trial_class(context)

    return controller_cls.from_trial(
        trial_inst=trial,
        context=context,
        env=env,
        workloads=workloads,
    )
def test_one_batch(
    trial_class: Type[det.Trial],
    config: Optional[Dict[str, Any]] = None,
) -> Any:
    """Run a minimal local test experiment for ``trial_class``.

    The supplied ``config`` (or an empty one) is copied and its
    ``scheduling_unit`` is forced to 1. A local execution environment is then
    created in test mode inside a temporary checkpoint directory, the trial is
    instantiated, and its controller is run on the generated test workloads.
    Nothing is returned explicitly.

    Raises:
        AssertionError: if ``trial_class.trial_controller_class`` is ``None``.
    """
    # Copy the caller's config so the scheduling_unit override never mutates it.
    config = dict(config or {})
    config["scheduling_unit"] = 1

    logging.info("Running a minimal test experiment locally")
    with tempfile.TemporaryDirectory() as tmp_checkpoint_dir:
        core_context, env = det._make_local_execution_env(
            managed_training=True,
            test_mode=True,
            config=config,
            checkpoint_dir=tmp_checkpoint_dir,
            limit_gpus=1,
        )

        test_workloads = _make_test_workloads(env.experiment_config)
        logging.info(f"Using hyperparameters: {env.hparams}.")
        logging.debug(f"Using a test experiment config: {env.experiment_config}.")

        # Run the controller class's pre-execution hook before building the
        # trial context and the trial itself.
        backend = det._DistributedBackend()
        controller_cls = trial_class.trial_controller_class
        assert controller_cls is not None
        controller_cls.pre_execute_hook(env, backend)

        context = trial_class.trial_context_class(core_context, env)
        logging.info(f"Creating {trial_class.__name__}.")
        trial = trial_class(context)

        controller_cls.from_trial(
            trial_inst=trial,
            context=context,
            env=env,
            workloads=test_workloads,
        ).run()

        logging.info("The test experiment passed.")
        logging.info(
            "Note: to submit an experiment to the cluster, change local parameter to False"
        )
示例#4
0
def main(train_entrypoint: str) -> int:
    """Load the user's Trial from ``train_entrypoint`` and run it on-cluster.

    The cluster info is turned into an ``EnvContext``, the trial class and its
    controller class are loaded, the controller's pre-execution hook is run, a
    ``DistributedContext`` matching the selected distributed backend is built,
    and the controller is finally created and run inside a core context.

    Args:
        train_entrypoint: entrypoint specification passed to
            ``load.trial_class_from_entrypoint``.

    Returns:
        0 once ``controller.run()`` has completed.

    Raises:
        AssertionError: if no cluster info is available or the task type is not
            ``"TRIAL"``.
        ValueError: if no distributed backend is selected even though the task
            has more than one container address or more than one slot.
    """
    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # TODO: refactor data_layer, and profiling to not use the cli_cert.
    certs.cli_cert = certs.default_load(info.master_url)

    # TODO: Don't include EnvContext object in the future high-level APIs for PyTorch or Keras.
    # It was natural to create this big-blob-of-config object, but it was a mistake to pass it into
    # the lowest layers of the harness code; it's too large of an object to be easily mockable,
    # which is part of why building local training mode has always been a challenge.
    #
    # A better pattern is to pass in exactly the information that is necessary at each layer.  We
    # will use that pattern for the future high-level APIs, but it's not worth refactoring e.g. the
    # TFKerasTrialController or EstimatorTrialController to add that functionality, so for now we
    # continue with the legacy strategy.

    env = det.EnvContext(
        master_url=info.master_url,
        master_cert_file=info.master_cert_file,
        master_cert_name=info.master_cert_name,
        experiment_config=info.trial._config,
        hparams=info.trial.hparams,
        latest_checkpoint=info.latest_checkpoint,
        steps_completed=info.trial._steps_completed,
        use_gpu=bool(info.gpu_uuids),
        container_gpus=info.gpu_uuids,
        slot_ids=info.slot_ids,
        debug=info.trial._debug,
        det_trial_unique_port_offset=info.trial._unique_port_offset,
        det_trial_id=str(info.trial.trial_id),
        det_experiment_id=str(info.trial.experiment_id),
        det_agent_id=info.agent_id,
        det_cluster_id=info.cluster_id,
        trial_seed=info.trial.trial_seed,
        trial_run_id=info.trial._trial_run_id,
        allocation_id=info.allocation_id,
        managed_training=True,
        test_mode=False,
        on_cluster=True,
    )

    det.common.set_logger(env.debug)
    logging.debug("Starting harness.")

    with maybe_periodic_stacktraces(env.debug):
        # Step 1: Load user code.
        # We can't build a core.Context without rank information, and we can't gather rank
        # information until the distributed backend is initialized, and we can't initialize the
        # correct distributed backend until we know which Trial class the user implemented.
        trial_class = load.trial_class_from_entrypoint(train_entrypoint)
        controller_class = load.get_trial_controller_class(trial_class)
        if info.container_rank == 0:
            # Analytics are best-effort: a failure here must never abort training.
            try:
                analytics.send_analytics("trial_loaded", analytics.get_trial_analytics(trial_class))
            except Exception as e:
                logging.debug(f"Cannot send analytics: {e}")

        # Step 2: Initialize framework-specific details (dtrain framework, random seeds, etc).
        distributed_backend = det._DistributedBackend()
        controller_class.pre_execute_hook(env, distributed_backend)

        # Step 3: Now that the dtrain framework is initialized, build the DistributedContext object.
        # For harness.py, we only support a fixed set of Determined-provided launch layers, since
        # the TrialControllers only support a fixed set of launch layers.
        distributed = None
        if distributed_backend.use_horovod():
            distributed = core.DistributedContext.from_horovod(horovod.hvd)
        elif distributed_backend.use_deepspeed():
            distributed = core.DistributedContext.from_deepspeed()
        elif distributed_backend.use_torch():
            distributed = core.DistributedContext.from_torch_distributed()
        elif len(info.container_addrs) > 1 or len(info.slot_ids) > 1:
            # The message lists every launch layer accepted by the branches above.
            raise ValueError(
                "In multi-slot tasks, the determined.exec.harness module must not be invoked "
                "directly.  Instead, it must be wrapped in one of the following launch layers: "
                "determined.launch.horovod, determined.launch.deepspeed, "
                "determined.launch.torch_distributed"
            )

        # Step 4: Let core.init() create the core.Context.
        with core.init(
            distributed=distributed,
            preempt_mode=core.PreemptMode.ChiefOnly,
            tensorboard_mode=core.TensorboardMode.MANUAL,
        ) as core_context:
            trial_context = trial_class.trial_context_class(core_context, env)

            # Step 5: Instantiate the user's Trial.
            trial_inst = trial_class(trial_context)

            # Step 6: Create a TrialController and execute training
            logging.info(f"Creating {controller_class.__name__} with {trial_class.__name__}.")
            controller = controller_class.from_trial(
                trial_inst=trial_inst,
                context=trial_context,
                env=env,
            )

            controller.run()

    return 0