Example #1
    def _configure_amp(self) -> None:
        if self.use_amp():
            if self.hvd_config.use:
                check.eq(
                    self.hvd_config.aggregation_frequency,
                    1,
                    "Mixed precision training (AMP) is not supported with "
                    "aggregation frequency > 1.",
                )

            check.true(
                torch.cuda.is_available(),
                "Mixed precision training (AMP) is supported only on GPU slots.",
            )
            check.false(
                not self.hvd_config.use and self.n_gpus > 1,
                "To enable mixed precision training (AMP) for parallel training, "
                'please set `resources["optimized_parallel"] = True`.',
            )

            logging.info(
                f"Enabling mixed precision training with opt_level: {self._get_amp_setting()}."
            )
            self.context.model, self.context.optimizer = apex.amp.initialize(
                self.context.model,
                self.context.optimizer,
                opt_level=self._get_amp_setting(),
                verbosity=1 if self.is_chief or self.env.experiment_config.debug_enabled() else 0,
            )
Example #2
    def _prepare_metrics_reducers(self, keys: Any) -> Dict[str, Reducer]:
        metrics_reducers = {}  # type: Dict[str, Reducer]
        if isinstance(self.trial.evaluation_reducer(), Dict):
            metrics_reducers = cast(Dict[str, Any],
                                    self.trial.evaluation_reducer())
            check.eq(
                metrics_reducers.keys(),
                keys,
                "Please provide a single evaluation reducer or "
                "provide a reducer for every validation metric. "
                f"Expected keys: {keys}, provided keys: {metrics_reducers.keys()}.",
            )
        elif isinstance(self.trial.evaluation_reducer(), Reducer):
            for key in keys:
                metrics_reducers[key] = cast(Reducer,
                                             self.trial.evaluation_reducer())

        for key in keys:
            check.true(
                isinstance(metrics_reducers[key], Reducer),
                "Please select `det.pytorch.Reducer` "
                "for reducing validation metrics.",
            )

        return metrics_reducers
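For reference, a trial's `evaluation_reducer()` may return either a single reducer applied to every validation metric or a per-metric mapping whose keys must match the reported metrics. A minimal sketch, assuming `det.pytorch.Reducer` and illustrative metric names:

    from typing import Dict, Union
    from determined.pytorch import Reducer

    # Inside a hypothetical PyTorchTrial subclass:
    def evaluation_reducer(self) -> Union[Reducer, Dict[str, Reducer]]:
        # Either a single reducer for every validation metric:
        #     return Reducer.AVG
        # or one reducer per metric; the keys must match the metrics reported
        # during validation:
        return {"validation_loss": Reducer.AVG, "accuracy": Reducer.AVG}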
Example #3
    def __init__(
        self,
        context: Union[keras.TFKerasTrialContext, keras.TFKerasNativeContext],
        train_config: keras.TFKerasTrainConfig,
    ) -> None:
        super().__init__(context=context)

        self._training_cacheable = self._context.experimental.get_train_cacheable()
        self._training_dataset = train_config.training_data

        check.true(
            self._training_cacheable.is_decorator_used(),
            "Please use `@context.experimental.cache_train_dataset(dataset_name, dataset_version)`"
            " for the training dataset.",
        )
        check.false(
            self._context.dataset_initialized,
            "Please do not use: `context.wrap_dataset(dataset)` if using "
            "`@context.experimental.cache_train_dataset()` and "
            "`@context.experimental.cache_validation_dataset()`.",
        )
        check.is_instance(
            train_config.training_data,
            tf.data.Dataset,
            "Pass in a `tf.data.Dataset` object if using "
            "`@context.experimental.cache_train_dataset()`.",
        )
Example #4
    def _combine_metrics_across_processes(
        self, metrics: Dict[str, Any], num_batches: int
    ) -> Tuple[Optional[Dict[str, Any]], Optional[List[int]]]:
        # The chief receives the metric from every other training process.
        check.true(self.hvd_config.use)

        metrics_lists = {}  # type: Dict[str, Any]
        batches_per_process = []  # type: List[int]
        if self.is_chief:
            self.train_process_comm_chief = cast(ipc.ZMQServer,
                                                 self.train_process_comm_chief)
            worker_metrics = self.train_process_comm_chief.barrier(
                num_connections=hvd.size() - 1)
            worker_metrics = cast(List[ipc.MetricsInfo], worker_metrics)

            for metric_name in metrics.keys():
                metrics_lists[metric_name] = [metrics[metric_name]]
                for worker_metric in worker_metrics:
                    metrics_lists[metric_name].append(
                        worker_metric.metrics[metric_name])

            batches_per_process.append(num_batches)
            for worker_metric in worker_metrics:
                batches_per_process.append(worker_metric.num_batches)

            return metrics_lists, batches_per_process
        else:
            self.train_process_comm_worker = cast(
                ipc.ZMQClient, self.train_process_comm_worker)
            self.train_process_comm_worker.barrier(message=ipc.MetricsInfo(
                metrics=metrics, num_batches=num_batches))
            return None, None
Example #5
    def _initialize_train_process_comm(self) -> None:
        check.true(self.hvd_config.use)

        srv_pub_port = (constants.INTER_TRAIN_PROCESS_COMM_PORT_1 +
                        self.env.det_trial_unique_port_offset)
        srv_pull_port = (constants.INTER_TRAIN_PROCESS_COMM_PORT_2 +
                         self.env.det_trial_unique_port_offset)

        if self.is_chief:
            logging.debug(
                f"Chief setting up server with ports {srv_pub_port}/{srv_pull_port}."
            )
            self.train_process_comm_chief = ipc.ZMQBroadcastServer(
                num_connections=self.env.experiment_config.slots_per_trial() - 1,
                pub_port=srv_pub_port,
                pull_port=srv_pull_port,
            )
        else:
            chief_ip_address = self.rendezvous_info.get_ip_addresses()[0]
            logging.debug(f"Non-Chief {hvd.rank()} setting up comm to "
                          f"{chief_ip_address} w/ ports "
                          f"{srv_pub_port}/{srv_pull_port}.")
            self.train_process_comm_worker = ipc.ZMQBroadcastClient(
                srv_pub_url=f"tcp://{chief_ip_address}:{srv_pub_port}",
                srv_pull_url=f"tcp://{chief_ip_address}:{srv_pull_port}",
            )
Example #6
 def set_runpy_trial_result(
     cls, trial_cls: Type[det.Trial], controller_cls: Type[det.TrialController]
 ) -> None:
     check.true(cls.get_instance().controller_cls is None, "Please don't load twice.")
     cls.get_instance().trial_cls = trial_cls
     cls.get_instance().controller_cls = controller_cls
     raise det.errors.StopLoadingImplementation()
Example #7
    def _init_run_config(
            self, config: tf.estimator.RunConfig) -> tf.estimator.RunConfig:
        logging.debug(f"Initializing RunConfig. Got RunConfig: {config} .")

        session_config = config.session_config
        train_distribute = None
        eval_distribute = None

        # The default session should already be defined, here we also set the session
        # for the estimator itself.
        self._init_session_config(session_config, self.env, self.hvd_config)

        if not self.hvd_config.use and len(self.env.container_gpus) > 1:
            check.true(len(self.rendezvous_info.get_addrs()) == 1)
            train_distribute = tf.distribute.MirroredStrategy()
            eval_distribute = tf.distribute.MirroredStrategy()

        config = config.replace(
            model_dir=str(self.estimator_dir),
            tf_random_seed=self.env.trial_seed,
            save_checkpoints_steps=None,
            # `train_and_evaluate()` requires that either
            # `save_checkpoints_steps` or `save_checkpoints_secs` is
            # set to greater than 0.
            save_checkpoints_secs=VERY_LARGE_NUMBER,
            session_config=session_config,
            train_distribute=train_distribute,
            eval_distribute=eval_distribute,
            experimental_distribute=None,
        )
        logging.debug(f"Initialized RunConfig with args: {config}.")
        return config
Example #8
def decode_bytes(s: str) -> str:
    r"""
    Hasura sends over any bytea value as the two-character string '\x' followed by the hex encoding
    of the bytes. This function turns such a value into the corresponding string.
    """
    check.true(s.startswith(r"\x"), "Invalid log value received")
    return bytes.fromhex(s[2:]).decode()
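A quick check of the decoding with a hypothetical input value:

    # '\x' followed by the hex encoding of b"hello":
    assert decode_bytes(r"\x68656c6c6f") == "hello"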
Example #9
 def set_runpy_native_result(
         cls, context: det.NativeContext,
         controller_cls: Type[det.TrialController]) -> None:
     check.true(cls.get_instance().controller_cls is None,
                "Please don't load twice.")
     cls.get_instance().context = context
     cls.get_instance().controller_cls = controller_cls
Example #10
 def average_metrics(self, metrics: Dict[str, Any]) -> Optional[Dict[str, Any]]:
     check.true(self.hvd_config.use)
     if self.is_chief:
         self.train_process_comm_chief = cast(ipc.ZMQBroadcastServer,
                                              self.train_process_comm_chief)
         logging.debug(
             f"Chief {hvd.rank()} beginning receiving validation metrics.")
         worker_metrics, _ = self.train_process_comm_chief.gather_with_polling(
             lambda: None)
         self.train_process_comm_chief.broadcast(None)
         logging.debug(
             f"Chief {hvd.rank()} done receiving validation metrics.")
         for metric_name in metrics:
             if isinstance(metrics[metric_name], numbers.Number):
                 metrics[metric_name] /= hvd.size()
             else:
                 logging.warning(
                     f"Skipping averaging metric: {metric_name}.")
         for metric_name in metrics.keys():
             for worker_metric in worker_metrics:
                 if isinstance(worker_metric[metric_name], numbers.Number):
                     metrics[metric_name] += worker_metric[
                         metric_name] / hvd.size()
         return metrics
     else:
         self.train_process_comm_worker = cast(
             ipc.ZMQBroadcastClient, self.train_process_comm_worker)
         logging.debug(f"Worker {hvd.rank()} sending metrics.")
         self.train_process_comm_worker.send(metrics)
         # Synchronize with the chief so that there is no risk of accidentally calling send()
         # for a future gather before all workers have called send() on this gather.
         _ = self.train_process_comm_worker.recv()
         return None
Example #11
 def average_metrics(self, metrics: Dict[str, Any]) -> Optional[Dict[str, Any]]:
     # The chief receives the metric from every worker and computes
     # the average.
     check.true(self.hvd_config.use)
     if self.is_chief:
         self.train_process_comm_chief = cast(ipc.ZMQServer,
                                              self.train_process_comm_chief)
         logging.debug(
             f"Chief {hvd.rank()} beginning receiving validation metrics.")
         worker_metrics = self.train_process_comm_chief.barrier(
             num_connections=hvd.size() - 1)
         logging.debug(
             f"Chief {hvd.rank()} done receiving validation metrics.")
         for metric_name in metrics:
             if isinstance(metrics[metric_name], numbers.Number):
                 metrics[metric_name] /= hvd.size()
             else:
                 logging.warning(
                     f"Skipping averaging metric: {metric_name}.")
         for metric_name in metrics.keys():
             for worker_metric in worker_metrics:
                 if isinstance(worker_metric[metric_name], numbers.Number):
                     metrics[metric_name] += worker_metric[
                         metric_name] / hvd.size()
         return metrics
     else:
         self.train_process_comm_worker = cast(
             ipc.ZMQClient, self.train_process_comm_worker)
         logging.debug(f"Worker {hvd.rank()} sending metrics.")
         self.train_process_comm_worker.barrier(message=metrics)
         return None
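The arithmetic above amounts to a running mean over all processes. A standalone sketch with hypothetical values, assuming three processes (`hvd.size() == 3`):

    chief_metrics = {"accuracy": 0.9}
    worker_metrics = [{"accuracy": 0.6}, {"accuracy": 0.3}]

    averaged = {name: value / 3 for name, value in chief_metrics.items()}
    for worker in worker_metrics:
        for name in averaged:
            averaged[name] += worker[name] / 3
    # averaged["accuracy"] is approximately 0.6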
Example #12
    def _init_run_config(
            self, config: tf.estimator.RunConfig) -> tf.estimator.RunConfig:
        logging.debug(f"Initializing RunConfig. Got RunConfig: {config} .")

        session_config = config.session_config
        train_distribute = None
        eval_distribute = None
        if self.hvd_config.use:
            if session_config is None:
                session_config = tf.compat.v1.ConfigProto()
            session_config.gpu_options.allow_growth = True
            session_config.gpu_options.visible_device_list = self.env.slot_ids[
                horovod.hvd.local_rank()]
        elif len(self.env.container_gpus) > 1:
            check.true(len(self.rendezvous_info.get_addrs()) == 1)
            train_distribute = tf.distribute.MirroredStrategy()
            eval_distribute = tf.distribute.MirroredStrategy()

        config = config.replace(
            model_dir=str(self.estimator_dir),
            tf_random_seed=self.env.trial_seed,
            save_checkpoints_steps=None,
            # `train_and_evaluate()` requires that either
            # `save_checkpoints_steps` or `save_checkpoints_secs` is
            # set to greater than 0.
            save_checkpoints_secs=VERY_LARGE_NUMBER,
            session_config=session_config,
            train_distribute=train_distribute,
            eval_distribute=eval_distribute,
            experimental_distribute=None,
        )
        logging.debug(f"Initialized RunConfig with args: {config}.")
        return config
Example #13
    def after_run(self, run_context: tf.estimator.SessionRunContext,
                  run_values: tf.estimator.SessionRunValues) -> None:
        # Check for optimizer creation here because when model_fn is passed in as a closure,
        # the optimizer is not initialized until the first training step.
        check.true(
            self.estimator_trial_controller.context.optimizer_initialized,
            "Please pass your optimizer into "
            "`det.estimator.wrap_optimizer(optimizer)` "
            "right after creating it.",
        )
        self._session = run_context.session
        self._current_global_step = run_values.results["global_step"]

        self.num_batches = cast(int, self.num_batches)
        self._collect_batch_metrics(run_values)
        self.batches_processed_in_step += 1
        if self.batches_processed_in_step < self.num_batches:
            return

        # TODO: Average training results across GPUs. This might
        # degrade performance due to an increase in communication.

        # Loss training metric is sometimes called `loss_1` instead of `loss`.
        for step_metrics in self.step_metrics:
            if "loss" not in step_metrics and "loss_1" in step_metrics:
                step_metrics["loss"] = step_metrics["loss_1"]

        # Send the result of the training step back to the main process.
        check.is_not_none(self.train_response_func,
                          "no response_func at end of train_for_step")
        self.train_response_func = cast(workload.ResponseFunc,
                                        self.train_response_func)
        if self.estimator_trial_controller.is_chief:
            response = {
                "metrics": det.util.make_metrics(
                    self.batches_processed_in_step, self.step_metrics
                ),
                "stop_requested": self.estimator_trial_controller.context.get_stop_requested(),
                "invalid_hp": False,
            }
            self.train_response_func(response)
        else:
            self.train_response_func(workload.Skipped())

        # Reset step counter and clear the step metrics from memory.
        self.train_response_func = None
        self.batches_processed_in_step = 0
        self.step_metrics = []

        estimator._cleanup_after_train_step(
            self.estimator_trial_controller.estimator_dir)

        # Re-enter the control loop (block on receiving the next instruction)
        self.control_loop()
Example #14
    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)

        check_startup_hook_ran = self.env.hparams.get("check_startup_hook_ran",
                                                      False)
        if check_startup_hook_ran:
            check.true(os.path.isfile("startup-hook-ran"),
                       "File should exists.")

        self.chaos = random.SystemRandom()
        self._batch_size = self.context.get_per_slot_batch_size()
        self.chaos_probability = self.env.hparams.get("chaos_probability", 0)
        self.chaos_probability_train = self.env.hparams.get(
            "chaos_probability_train")
        self.chaos_probability_validate = self.env.hparams.get(
            "chaos_probability_validate")
        self.chaos_probability_checkpoint = self.env.hparams.get(
            "chaos_probability_checkpoint")
        self.fail_on_first_validation = self.env.hparams.get(
            "fail_on_first_validation", "")
        self.fail_on_chechpoint_save = self.env.hparams.get(
            "fail_on_chechpoint_save", "")
        self.validation_set_size = self.env.hparams.get(
            "validation_set_size", 32 * 32)
        self.train_batch_secs = self.env.hparams.get("training_batch_seconds",
                                                     0)
        self.validation_secs = self.env.hparams.get(
            "validation_seconds",
            self.validation_set_size * self.train_batch_secs /
            self._batch_size,
        )
        self.num_training_metrics = self.env.hparams.get(
            "num_training_metrics", 1)
        assert self.num_training_metrics > 0
        self.num_validation_metrics = self.env.hparams.get(
            "num_validation_metrics", 1)
        assert self.num_validation_metrics > 0
        self.save_secs = self.env.hparams.get("save_checkpoint_seconds", 0)
        self.load_secs = self.env.hparams.get("load_checkpoint_secs", 0)
        self.metrics_progression = self.env.hparams.get(
            "metrics_progression", "decreasing")
        assert self.metrics_progression in ("increasing", "decreasing",
                                            "constant")
        self.metrics_base = self.env.hparams.get("metrics_base", 0.9)
        assert 0 < self.metrics_base < 1
        self.metrics_sigma = self.env.hparams.get("metrics_sigma", 0.0)
        assert 0 <= self.metrics_sigma
        self.write_null = self.env.hparams.get("write_null", False)

        self.request_stop = self.env.hparams.get("request_stop", False)

        if self.load_path is None:
            self.trained_steps = collections.Counter()
        else:
            self.load(self.load_path)
Example #15
 def get_runpy_result(
     cls,
 ) -> Tuple[Optional[det.NativeContext], Optional[Type[det.Trial]], Type[det.TrialController]]:
     check.true(
         cls.get_instance().controller_cls is not None, "Please load native implementation."
     )
     return (
         cls.get_instance().context,
         cls.get_instance().trial_cls,
         cast(Type[det.TrialController], cls.get_instance().controller_cls),
     )
Example #16
    def _check_if_trial_supports_configurations(self,
                                                env: det.EnvContext) -> None:
        if self.env.experiment_config.mixed_precision_enabled():
            check.true(
                self.supports_mixed_precision(),
                "Mixed precision training is not supported for this framework interface. "
                'Please set `mixed_precision = "O0"`.',
            )

        if env.experiment_config.averaging_training_metrics_enabled():
            check.true(self.supports_averaging_training_metrics())
Example #17
    def __init__(
        self,
        context: Union[keras.TFKerasTrialContext, keras.TFKerasNativeContext],
        train_config: keras.TFKerasTrainConfig,
    ) -> None:
        super().__init__(context=context)

        check.true(
            self._context.dataset_initialized,
            "Please use: `context.wrap_dataset(dataset)` if using `tf.data.Dataset`.",
        )

        self._validation_dataset = train_config.validation_data
Example #18
def load_native_implementation_controller(
    env: det.EnvContext,
    workloads: workload.Stream,
    load_path: Optional[pathlib.Path],
    rendezvous_info: det.RendezvousInfo,
    hvd_config: horovod.HorovodContext,
) -> det.TrialController:
    check.true(
        env.experiment_config.native_enabled(),
        "Experiment configuration does not have an internal.native "
        f"configuration: {env.experiment_config}",
    )

    context, trial_class, controller_class = load.load_native_implementation(
        env, hvd_config)

    if trial_class is not None:
        return load_controller_from_trial(
            trial_class=trial_class,
            env=env,
            workloads=workloads,
            load_path=load_path,
            rendezvous_info=rendezvous_info,
            hvd_config=hvd_config,
        )

    else:
        # Framework-specific native implementation.
        check.is_not_none(
            controller_class,
            "The class attribute `trial_controller_class` is "
            "None; please set it the correct subclass of `det.TrialController`",
        )
        check.is_subclass(
            controller_class,
            det.TrialController,
            "The class attribute `trial_controller_class` is "
            "not a valid subclass of `det.TrialController`",
        )
        logging.info(
            f"Creating {controller_class.__name__} with {type(context).__name__}."
        )
        return cast(det.TrialController, controller_class).from_native(
            context=cast(det.NativeContext, context),
            env=env,
            workloads=workloads,
            load_path=load_path,
            rendezvous_info=rendezvous_info,
            hvd_config=hvd_config,
        )
Example #19
def convert_notebook_to_python_script(notebook_path: str) -> str:
    check.check_true(
        notebook_path.endswith(".ipynb"), f"Notebook file {notebook_path} must has a suffix .ipynb"
    )
    processed_cells_path = f"{notebook_path[:-6]}__det__.py"

    with open(notebook_path, "r") as f1, open(processed_cells_path, "w") as f2:
        obj = json.load(f1)
        check.true("cells" in obj, f"Invalid notebook file {notebook_path}")
        for cell in obj["cells"]:
            if cell["cell_type"] == "code":
                lines = [line for line in cell["source"] if not line.lstrip().startswith("!")]
                f2.writelines(lines)
                f2.write("\n")
    return processed_cells_path
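A self-contained illustration of the conversion with a hypothetical one-cell notebook; markdown cells are ignored and lines starting with "!" are dropped:

    import json

    notebook = {
        "cells": [
            {"cell_type": "markdown", "source": ["# Title\n"]},
            {"cell_type": "code", "source": ["!pip install pandas\n", "import pandas as pd\n"]},
        ]
    }
    with open("example.ipynb", "w") as f:
        json.dump(notebook, f)

    # Writes the surviving code lines to example__det__.py and returns that path.
    print(convert_notebook_to_python_script("example.ipynb"))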
Example #20
 def restore_path(self, metadata: StorageMetadata) -> Iterator[str]:
     """
     Prepare a local directory exposing the checkpoint. Do some simple checks to make sure the
     configuration seems reasonable.
     """
     storage_dir = os.path.join(self._base_path, metadata.storage_id)
     check.true(
         os.path.exists(storage_dir),
         "Storage directory does not exist: {}. Please verify "
         "that you are using the correct configuration value for "
         "checkpoint_storage.host_path".format(storage_dir),
     )
     check.true(
         os.path.isdir(storage_dir),
         "Checkpoint path is not a directory: {}".format(storage_dir))
     yield storage_dir
Example #21
    def _average_training_metrics(
        self, per_batch_metrics: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Average training metrics across GPUs"""
        check.true(self.hvd_config.use,
                   "Can only average training metrics in multi-GPU training.")
        metrics_timeseries = util._list_to_dict(per_batch_metrics)

        # combined_timeseries is: dict[metric_name] -> 2d-array.
        # A measurement is accessed via combined_timeseries[metric_name][process_idx][batch_idx].
        combined_timeseries, _ = self._combine_metrics_across_processes(
            metrics_timeseries, num_batches=len(per_batch_metrics))

        # If the value for a metric is a single-element array, the averaging process will
        # change that into just the element. We record what metrics are single-element arrays
        # so we can wrap them in an array later (for perfect compatibility with non-averaging
        # codepath).
        array_metrics = []
        for metric_name in per_batch_metrics[0].keys():
            if isinstance(per_batch_metrics[0][metric_name], np.ndarray):
                array_metrics.append(metric_name)

        if self.is_chief:
            combined_timeseries_type = Dict[str, List[List[Any]]]
            combined_timeseries = cast(combined_timeseries_type,
                                       combined_timeseries)
            num_batches = len(per_batch_metrics)
            num_processes = hvd.size()
            averaged_metrics_timeseries = {}  # type: Dict[str, List]

            for metric_name in combined_timeseries.keys():
                averaged_metrics_timeseries[metric_name] = []
                for batch_idx in range(num_batches):
                    batch = [
                        combined_timeseries[metric_name][process_idx][batch_idx]
                        for process_idx in range(num_processes)
                    ]

                    np_batch = np.array(batch)
                    batch_avg = np.mean(
                        np_batch[np_batch != None])  # noqa: E711
                    if metric_name in array_metrics:
                        batch_avg = np.array(batch_avg)
                    averaged_metrics_timeseries[metric_name].append(batch_avg)
            per_batch_metrics = util._dict_to_list(averaged_metrics_timeseries)
        return per_batch_metrics
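A standalone numpy sketch of the per-batch averaging above, with hypothetical values and without the None masking or array-metric handling: two processes, three batches, one metric.

    import numpy as np

    combined_timeseries = {"loss": [[0.8, 0.6, 0.4], [1.0, 0.8, 0.6]]}
    averaged = {
        name: [float(np.mean(np.array(column))) for column in zip(*per_process)]
        for name, per_process in combined_timeseries.items()
    }
    # averaged is approximately {"loss": [0.9, 0.7, 0.5]}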
Example #22
 def _initialize_train_process_comm(self) -> None:
     check.true(self.hvd_config.use)
     if self.is_chief:
         logging.debug(f"Chief {hvd.rank()} setting up server with "
                       f"port {constants.INTER_TRAIN_PROCESS_COMM_PORT}.")
         self.train_process_comm_chief = ipc.ZMQServer(
             ports=[constants.INTER_TRAIN_PROCESS_COMM_PORT],
             num_connections=1)
     else:
         chief_ip_address = self.rendezvous_info.get_ip_addresses()[0]
         logging.debug(
             f"Non-Chief {hvd.rank()} setting up comm to "
             f"{chief_ip_address} w/ port {constants.INTER_TRAIN_PROCESS_COMM_PORT}."
         )
         self.train_process_comm_worker = ipc.ZMQClient(
             ip_address=chief_ip_address,
             port=constants.INTER_TRAIN_PROCESS_COMM_PORT)
Example #23
def validate_batch_metrics(batch_metrics: List[Dict[str, Any]]) -> None:
    metric_dict = _list_to_dict(batch_metrics)

    # We expect that every batch has a metric named "loss".
    check.true(
        any(v for v in metric_dict if v.startswith("loss")),
        "model did not compute 'loss' training metric",
    )

    # We expect that all batches have the same set of metrics.
    metric_dict_keys = metric_dict.keys()
    for idx, batch_metric_dict in enumerate(batch_metrics):
        keys = batch_metric_dict.keys()
        if metric_dict_keys == keys:
            continue

        check.eq(metric_dict_keys, keys, "inconsistent training metrics: index: {}".format(idx))
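A hypothetical call that satisfies both checks (every batch reports "loss" and all batches share the same keys), assuming the module's `_list_to_dict` helper is available:

    validate_batch_metrics([
        {"loss": 0.9, "accuracy": 0.50},
        {"loss": 0.7, "accuracy": 0.55},
    ])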
Example #24
    def _launch_horovodrun(self) -> subprocess.Popen:
        check.true(self.hvd_config.use)
        logging.debug(f"Starting training process on: {self.rendezvous_info.get_rank()}.")

        horovod_process_cmd = horovod.create_run_command(
            num_gpus_per_machine=self.num_gpus,
            ip_addresses=self.rendezvous_info.get_ip_addresses(),
            env=self.env,
            debug=self.env.experiment_config.debug_enabled(),
            optional_args=self.env.experiment_config.horovod_optional_args(),
            worker_process_env_path=self._worker_process_env_path,
        )
        subprocess_env = {
            **os.environ,
            "NCCL_DEBUG": "INFO",
            "DET_HOROVOD_GLOO_RENDEZVOUS_PORT": str(constants.HOROVOD_GLOO_RENDEZVOUS_PORT),
        }
        return subprocess.Popen(horovod_process_cmd, env=subprocess_env)
Example #25
def test_noop_pause() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment.
    """
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(experiment_id, "ACTIVE")

    # Wait for the only trial to get scheduled.
    workload_active = False
    for _ in range(conf.MAX_TASK_SCHEDULED_SECS):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        workload_active,
        f"The only trial cannot be scheduled within {conf.MAX_TASK_SCHEDULED_SECS} seconds.",
    )

    # Wait for the only trial to show progress, indicating the image is built and running.
    num_steps = 0
    for _ in range(conf.MAX_TRIAL_BUILD_SECS):
        trials = exp.experiment_trials(experiment_id)
        if len(trials) > 0:
            only_trial = trials[0]
            num_steps = len(only_trial["steps"])
            if num_steps > 1:
                break
        time.sleep(1)
    check.true(
        num_steps > 1,
        f"The only trial cannot start training within {conf.MAX_TRIAL_BUILD_SECS} seconds.",
    )

    # Pause the experiment. Note that Determined does not currently differentiate
    # between a "stopping paused" and a "paused" state, so we follow this check
    # up by ensuring the experiment cleared all scheduled workloads.
    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "PAUSED")

    # Wait at most 20 seconds for the experiment to clear all workloads (each
    # train step should take 5 seconds).
    for _ in range(20):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if not workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        not workload_active, "The experiment cannot be paused within 20 seconds.",
    )

    # Resume the experiment and wait for completion.
    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "COMPLETED")
Example #26
    def _init_run_config(
            self, config: tf.estimator.RunConfig) -> tf.estimator.RunConfig:
        logging.debug(f"Initializing RunConfig. Got RunConfig: {config} .")

        session_config = config.session_config
        train_distribute = None
        eval_distribute = None
        if self.hvd_config.use:
            if session_config is None:
                session_config = tf.compat.v1.ConfigProto()
            session_config.gpu_options.allow_growth = True

            # If using CUDA_VISIBLE_DEVICES there is only one visible GPU
            # so there is no need to set visible devices for TF.
            # TODO (DET-3762): Remove this once it's no longer necessary.
            if not self.env.experiment_config.get("data", {}).get(
                    "set_cuda_visible_devices", False):
                session_config.gpu_options.visible_device_list = str(
                    self.env.slot_ids[horovod.hvd.local_rank()])
        elif len(self.env.container_gpus) > 1:
            check.true(len(self.rendezvous_info.get_addrs()) == 1)
            train_distribute = tf.distribute.MirroredStrategy()
            eval_distribute = tf.distribute.MirroredStrategy()

        config = config.replace(
            model_dir=str(self.estimator_dir),
            tf_random_seed=self.env.trial_seed,
            save_checkpoints_steps=None,
            # `train_and_evaluate()` requires that either
            # `save_checkpoints_steps` or `save_checkpoints_secs` is
            # set to greater than 0.
            save_checkpoints_secs=VERY_LARGE_NUMBER,
            session_config=session_config,
            train_distribute=train_distribute,
            eval_distribute=eval_distribute,
            experimental_distribute=None,
        )
        logging.debug(f"Initialized RunConfig with args: {config}.")
        return config
Example #27
        def wrapper(*args: Any, **kwargs: Any) -> tf.data.Dataset:
            ds = f(*args, **kwargs)

            if self.context.experimental.get_train_cacheable().is_decorator_used():
                check.false(
                    self.context.dataset_initialized,
                    "Please do not use: `context.wrap_dataset(dataset)` if using "
                    "`@context.experimental.cache_train_dataset(dataset_name, dataset_version)` "
                    "and `@context.experimental.cache_validation_dataset(dataset_name, "
                    "dataset_version)`.",
                )
            else:
                check.true(
                    self.context.dataset_initialized,
                    "Please pass your datasets (train and test) into "
                    "`context.wrap_dataset(dataset)` right after creating them.",
                )

            if isinstance(ds, tf.data.Dataset):
                ds = ds.repeat()

            return ds
Example #28
    def wrap_scaler(self, scaler: Any) -> Any:
        """
        Prepares to use automatic mixed precision through PyTorch’s native AMP API. The returned
        scaler should be passed to ``step_optimizer``, but usage does not otherwise differ from
        vanilla PyTorch APIs. Loss should be scaled before calling ``backward``, ``unscale_`` should
        be called before clipping gradients, ``update`` should be called after stepping all
        optimizers, etc.

        PyTorch 1.6 or greater is required for this feature.

        Arguments:
            scaler (``torch.cuda.amp.GradScaler``):  Scaler to wrap and track.

        Returns:
            The scaler. It may be wrapped to add additional functionality for use in Determined.
        """

        check.false(
            amp_import_error,
            "Failed to import torch.cuda.amp. PyTorch >= 1.6 required.")

        check.false(self._use_apex, "Do not mix APEX with PyTorch AMP.")

        check.is_none(self._scaler,
                      "Please only call wrap_scaler or use_amp once.")

        check.true(
            len(self.models) == 0,
            "Please call wrap_scaler before wrap_model.")

        check.true(
            torch.cuda.is_available(),
            "Mixed precision training (AMP) is supported only on GPU slots.",
        )

        self._scaler = scaler

        return scaler
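A minimal sketch of the usage the docstring describes, inside a hypothetical and abbreviated `PyTorchTrial` (the model, optimizer, and loss are illustrative, and the data-loader methods are omitted):

    import torch
    from determined import pytorch as det_pytorch

    class AmpTrial(det_pytorch.PyTorchTrial):
        def __init__(self, context: det_pytorch.PyTorchTrialContext) -> None:
            self.context = context
            # wrap_scaler must come before wrap_model (see the checks above).
            self.scaler = context.wrap_scaler(torch.cuda.amp.GradScaler())
            self.model = context.wrap_model(torch.nn.Linear(8, 1))
            self.optimizer = context.wrap_optimizer(
                torch.optim.SGD(self.model.parameters(), lr=0.1))

        def train_batch(self, batch, epoch_idx, batch_idx):
            with torch.cuda.amp.autocast():
                loss = self.model(batch).sum()
            # Scale the loss before backward, pass the scaler to step_optimizer,
            # and call update() after all optimizers have stepped.
            self.context.backward(self.scaler.scale(loss))
            self.context.step_optimizer(self.optimizer, scaler=self.scaler)
            self.scaler.update()
            return {"loss": loss}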
Example #29
def _full_storage_path(
    host_path: str,
    storage_path: Optional[str] = None,
    container_path: Optional[str] = None,
) -> str:
    """
    Return the full path to the storage_path, either as a subdirectory of the host_path in the
    host environment, where container_path must be None, or as a subdirectory of the container_path
    when in the container environment, where container_path must not be None.
    """
    check.true(os.path.isabs(host_path),
               "`host_path` must be an absolute path.")

    if storage_path is None:
        return host_path if container_path is None else container_path

    abs_path = os.path.normpath(os.path.join(host_path, storage_path))
    check.true(abs_path.startswith(host_path),
               "storage path must be a subdirectory of host path.")
    storage_path = os.path.relpath(abs_path, host_path)

    return os.path.join(
        host_path if container_path is None else container_path, storage_path)
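For orientation, the three cases the helper handles, with hypothetical paths:

    _full_storage_path("/mnt/checkpoints")                         # "/mnt/checkpoints"
    _full_storage_path("/mnt/checkpoints", storage_path="exp42")   # "/mnt/checkpoints/exp42"
    _full_storage_path("/mnt/checkpoints", "exp42", container_path="/determined_shared_fs")
    # -> "/determined_shared_fs/exp42"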
Example #30
    def _combine_metrics_across_processes(
        self, metrics: Dict[str, Any], num_batches: int
    ) -> Tuple[Optional[Dict[str, Any]], Optional[List[int]]]:
        # The chief receives the metric from every other training process.
        check.true(self.hvd_config.use)

        metrics_lists = {}  # type: Dict[str, Any]
        batches_per_process = []  # type: List[int]
        if self.is_chief:
            self.train_process_comm_chief = cast(
                ipc.ZMQBroadcastServer, self.train_process_comm_chief
            )
            worker_metrics, _ = self.train_process_comm_chief.gather_with_polling(lambda: None)
            self.train_process_comm_chief.broadcast(None)
            worker_metrics = cast(List[ipc.MetricsInfo], worker_metrics)

            for metric_name in metrics.keys():
                metrics_lists[metric_name] = [metrics[metric_name]]
                for worker_metric in worker_metrics:
                    metrics_lists[metric_name].append(worker_metric.metrics[metric_name])

            batches_per_process.append(num_batches)
            for worker_metric in worker_metrics:
                batches_per_process.append(worker_metric.num_batches)

            return metrics_lists, batches_per_process
        else:
            self.train_process_comm_worker = cast(
                ipc.ZMQBroadcastClient, self.train_process_comm_worker
            )
            self.train_process_comm_worker.send(
                ipc.MetricsInfo(metrics=metrics, num_batches=num_batches)
            )
            # Synchronize with the chief so that there is no risk of accidentally calling send()
            # for a future gather before all workers have called send() on this gather.
            _ = self.train_process_comm_worker.recv()
            return None, None