Example No. 1
    def wrap_optimizer(self, optimizer: Any) -> Any:
        """
        This should be used to wrap optimizer objects immediately after they have
        been created. Users should use the output of this wrapper as the new instance
        of their optimizer. For example, if users create their optimizer within
        ``build_estimator()``, they should call ``optimizer = wrap_optimizer(optimizer)``
        prior to passing the optimizer into their Estimator.
        """
        if not self.env.training:
            return optimizer

        self.optimizer_initialized = True
        if not self.hvd_config.use:
            return optimizer

        check.check_false(
            isinstance(optimizer, str),
            "Please specify an optimizer object instead of using a string name.",
        )

        hvd.require_horovod_type(
            "tensorflow", "EstimatorContext.wrap_optimizer was called.")
        use_compression = self.hvd_config.fp16_compression
        optimizer = hvd.DistributedOptimizer(
            optimizer,
            compression=(
                hvd.compression.Compression.fp16
                if use_compression
                else hvd.compression.Compression.none
            ),
            aggregation_frequency=self.hvd_config.aggregation_frequency,
            average_aggregated_gradients=self.hvd_config.average_aggregated_gradients,
        )
        logging.debug(
            "Initialized optimizer for distributed and optimized parallel training."
        )
        return optimizer
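For context, a minimal usage sketch (not part of the snippet above): calling wrap_optimizer from inside an EstimatorTrial-style build_estimator() before the Estimator ever sees the optimizer. The trial class, model_fn, and hyperparameters below are illustrative assumptions.

import tensorflow as tf

class MyEstimatorTrial:  # stands in for an EstimatorTrial-style class (assumed)
    def __init__(self, context) -> None:
        self.context = context

    def build_estimator(self) -> tf.estimator.Estimator:
        # Create the optimizer, then wrap it before passing it to the Estimator.
        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)
        optimizer = self.context.wrap_optimizer(optimizer)

        def model_fn(features, labels, mode):
            logits = tf.compat.v1.layers.dense(features["x"], 10)
            loss = tf.compat.v1.losses.sparse_softmax_cross_entropy(labels, logits)
            train_op = optimizer.minimize(
                loss, global_step=tf.compat.v1.train.get_global_step()
            )
            return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

        return tf.estimator.Estimator(model_fn=model_fn)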
Example No. 2
    def wrap_dataset(self, dataset: Any, shard_dataset: bool = True) -> Any:
        """
        This should be used to wrap ``tf.data.Dataset`` objects immediately after
        they have been created. Users should use the output of this wrapper as the
        new instance of their dataset. If users create multiple datasets (e.g., one
        for training and one for testing), users should wrap each dataset
        independently. For example, if users instantiate their training dataset within
        ``build_train_spec()``, they should call ``dataset = wrap_dataset(dataset)``
        prior to passing it into ``tf.estimator.TrainSpec``.

        Args:
            dataset: tf.data.Dataset
            shard_dataset:
                When performing multi-slot (distributed) training, this
                controls whether the dataset is sharded so that each training process
                (one per slot) sees unique data. If set to False, users must manually
                configure each process to use unique data.

        """
        if not self.env.training:
            return dataset

        hvd.require_horovod_type("tensorflow",
                                 "EstimatorContext.wrap_dataset was called.")

        self.dataset_initialized = True
        if not self.hvd_config.use or self.input_from_dataflow or not shard_dataset:
            if self.hvd_config and not shard_dataset:
                logging.info("Dataset sharding skipped.")
            return dataset

        dataset = dataset.shard(hvd.size(), hvd.rank())
        logging.debug(
            f"Sharded dataset to index {hvd.rank()} of {hvd.size()}.")
        return dataset
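A usage sketch for the Estimator path (illustrative, not code from the snippet above): wrap the raw dataset inside an input_fn before shuffling and batching, so each worker receives a unique shard. The data and the build_train_spec() signature are assumptions.

import numpy as np
import tensorflow as tf

def build_train_spec(self) -> tf.estimator.TrainSpec:
    def input_fn():
        xs = np.random.rand(1024, 8).astype(np.float32)
        ys = np.random.randint(0, 10, size=(1024,)).astype(np.int32)
        dataset = tf.data.Dataset.from_tensor_slices(({"x": xs}, ys))
        # Wrap (and therefore shard) before shuffling and batching.
        dataset = self.context.wrap_dataset(dataset)
        return dataset.shuffle(1024).batch(32).repeat()

    return tf.estimator.TrainSpec(input_fn=input_fn)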
Example No. 3
    def wrap_dataset(self, dataset: Any, shard_dataset: bool = True) -> Any:
        """
        This should be used to wrap ``tf.data.Dataset`` objects immediately after
        they have been created. Users should use the output of this wrapper as the
        new instance of their dataset. If users create multiple datasets (e.g.,
        one for training and one for validation), users should wrap each dataset
        independently.

        Args:
            dataset: tf.data.Dataset
            shard_dataset:
                When performing multi-slot (distributed) training, this
                controls whether the dataset is sharded so that each training process
                (one per slot) sees unique data. If set to False, users must manually
                configure each process to use unique data.
        """
        self.dataset_initialized = True
        if not self.hvd_config.use or not isinstance(dataset, tf.data.Dataset) or not shard_dataset:
            if self.hvd_config and not shard_dataset:
                logging.info("Dataset sharding skipped.")
            return dataset

        hvd.require_horovod_type("tensorflow.keras", "TFKerasContext.wrap_dataset was called.")
        dataset = dataset.shard(hvd.size(), hvd.rank())
        logging.debug(f"Sharded dataset to index {hvd.rank()} of {hvd.size()}.")
        return dataset
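Sketch of the shard_dataset=False path (an assumption about usage, not code from the snippet above): shard manually by worker rank and tell the wrapper to skip its own sharding. The context.distributed rank/size accessors shown here are assumed helpers, not part of the snippet.

import tensorflow as tf

def make_training_dataset(context, file_paths):
    dataset = tf.data.Dataset.from_tensor_slices(file_paths)
    # Hand each worker a unique slice of the file list ourselves.
    rank = context.distributed.get_rank()
    size = context.distributed.get_size()
    dataset = dataset.shard(size, rank)
    # shard_dataset=False so wrap_dataset does not shard a second time.
    return context.wrap_dataset(dataset, shard_dataset=False)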
Example No. 4
    def pre_execute_hook(env: det.EnvContext, hvd_config: horovod.HorovodContext) -> None:
        # Initialize the correct horovod.
        if hvd_config.use:
            hvd.require_horovod_type("torch", "PyTorchTrial is in use.")
            hvd.init()

        PyTorchTrialController._set_random_seeds(env.trial_seed)
Example No. 5
    def wrap_optimizer(
        self, optimizer: tf.keras.optimizers.Optimizer
    ) -> tf.keras.optimizers.Optimizer:
        """
        This should be used to wrap ``tf.keras.optimizers.Optimizer`` objects. Users
        should use the output of this wrapper as the new instance of their optimizer.
        If users create multiple optimizers, users should wrap each optimizer
        independently.

        Args:
            optimizer: tf.keras.optimizers.Optimizer
        """
        if not self.env.managed_training:
            return optimizer

        logging.debug(f"Processing wrapped optimizer {optimizer}.")
        if not self.hvd_config.use:
            self._wrapped_optimizers.append(optimizer)
            return optimizer

        hvd.require_horovod_type("tensorflow.keras",
                                 "TFKerasContext.wrap_optimizer was called.")
        if optimizer == self._compiled_optimizer:
            logging.debug(
                "Skipping wrapping optimizer as it was already wrapped during the compile call."
            )
            wrapped_optimizer = optimizer
        else:
            wrapped_optimizer = self._get_horovod_optimizer_if_using_horovod(
                optimizer=optimizer,
            )
        self._wrapped_optimizers.append(wrapped_optimizer)

        return wrapped_optimizer
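A compile-time usage sketch (illustrative): wrapping a Keras optimizer before model.compile(). The build_model() structure and the wrap_model call are assumptions about the surrounding trial, not part of the snippet above.

import tensorflow as tf

def build_model(self) -> tf.keras.Model:
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(10),
    ])
    model = self.context.wrap_model(model)  # assumed context helper
    optimizer = self.context.wrap_optimizer(tf.keras.optimizers.Adam(1e-3))
    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=["accuracy"],
    )
    return model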
Example No. 6
    def pre_execute_hook(env: det.EnvContext, hvd_config: horovod.HorovodContext) -> None:
        # Initialize the correct horovod.
        if hvd_config.use:
            hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
            hvd.init()

        # Initialize random seeds.
        # Set identical random seeds on all training processes.
        # When using horovod, each worker will receive a unique
        # shard of the dataset.
        EstimatorTrialController.set_random_seed(env.trial_seed)

        if version.parse(tf.__version__) >= version.parse("2.0.0"):
            tf.compat.v1.disable_v2_behavior()

        # Set the default session before importing any user code. If the default session isn't
        # set and users call TF code that detects GPUs, it would map the processes to all of
        # the GPUs. We set the default session before importing any user code to prevent
        # this problem. This default session does not have any effect within the Estimator itself.
        EstimatorTrialController._set_default_tensorflow_session(
            env=env, hvd_config=hvd_config, session_config=None
        )

        logging.debug("Applying tf.estimator patches.")

        @monkey_patch.monkey_patch_decorator(_NewCheckpointListenerForEvaluate, "_evaluate")
        def patch_estimator_eval_on_checkpoint(original, *args, **kwargs):  # type: ignore
            # With a single worker and multiple devices,
            # `tf.estimator.train_and_evaluate` attempts to execute `eval_spec` even if
            # `input_fn` or `steps` is None, which causes an error when evaluating the
            # model function. Apply a monkey-patch to skip the internal function that
            # ultimately runs the evaluation.
            logging.info("Skipping %s(*%s, **%s)", original.__name__, args, kwargs)
Example No. 7
    def pre_execute_hook(env: det.EnvContext,
                         hvd_config: horovod.HorovodContext) -> None:
        # Initialize the correct horovod.
        if hvd_config.use:
            hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
            hvd.init()

            # This option is available for when TF ignores `gpu_options.visible_device_list`.
            # TODO (DET-3762): Remove this once it's no longer necessary.
            if env.experiment_config.get("data",
                                         {}).get("set_cuda_visible_devices",
                                                 False):
                logging.info(
                    "Setting the `CUDA_VISIBLE_DEVICES` and `NCCL_P2P_DISABLE` "
                    "environment variables."
                )
                os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())
                os.environ["NCCL_P2P_DISABLE"] = "1"

        # Initialize random seeds.
        # Set identical random seeds on all training processes.
        # When using horovod, each worker will receive a unique
        # shard of the dataset.
        EstimatorTrialController.set_random_seed(env.trial_seed)

        if version.parse(tf.__version__) >= version.parse("2.0.0"):
            tf.compat.v1.disable_v2_behavior()

        # Set the default session before importing any user code. If the default session isn't
        # set and users call TF code that detects GPUs, it would map the processes to all of
        # the GPUs. We set the default session before importing any user code to prevent
        # this problem.
        EstimatorTrialController._set_default_tensorflow_session(
            env=env, hvd_config=hvd_config)
Example No. 8
    def pre_execute_hook(env: det.EnvContext,
                         hvd_config: horovod.HorovodContext) -> None:
        # Initialize the correct horovod.
        if hvd_config.use:
            hvd.require_horovod_type("tensorflow",
                                     "TensorpackTrial is in use.")
            hvd.init()

        TensorpackTrialController._set_random_seeds()
Example No. 9
    def pre_execute_hook(
        cls: Type["TFKerasTrialController"],
        env: det.EnvContext,
        distributed_backend: det._DistributedBackend,
    ) -> None:
        # Initialize the correct horovod.
        if distributed_backend.use_horovod():
            hvd.require_horovod_type("tensorflow.keras", "TFKerasTrial is in use.")
            hvd.init()

        # Start with a clean graph.
        tf.compat.v1.reset_default_graph()

        cls._set_random_seeds(env.trial_seed)
Example No. 10
    def pre_execute_hook(env: det.EnvContext, hvd_config: horovod.HorovodContext) -> None:
        # Initialize the correct horovod.
        if hvd_config.use:
            hvd.require_horovod_type("tensorflow.keras", "TFKerasTrial is in use.")
            hvd.init()

        # Start with a clean graph.
        tf.compat.v1.reset_default_graph()

        TFKerasTrialController._set_random_seeds(env.trial_seed)

        # For the Native API we must configure the Session before running user code.
        if env.experiment_config.native_enabled():
            session_config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
            TFKerasTrialController._configure_session(env, hvd_config, session_config)
Example No. 11
    def pre_execute_hook(
        cls: Type["PyTorchTrialController"],
        env: det.EnvContext,
        distributed_backend: det._DistributedBackend,
    ) -> None:
        # Initialize the correct horovod.
        if distributed_backend.use_horovod():
            hvd.require_horovod_type("torch", "PyTorchTrial is in use.")
            hvd.init()
        if distributed_backend.use_torch():
            if torch.cuda.is_available():
                dist.init_process_group(backend="nccl")  # type: ignore
            else:
                dist.init_process_group(backend="gloo")  # type: ignore

        cls._set_random_seeds(env.trial_seed)
Example No. 12
    def pre_execute_hook(env: det.EnvContext,
                         hvd_config: horovod.HorovodContext) -> None:
        # Initialize the correct horovod.
        if hvd_config.use:
            hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
            hvd.init()

            # This option is available for when TF ignores `gpu_options.visible_device_list`.
            # TODO (DET-3762): Remove this once it's no longer necessary.
            if env.experiment_config.get("data",
                                         {}).get("set_cuda_visible_devices",
                                                 False):
                logging.info(
                    "Setting the `CUDA_VISIBLE_DEVICES` and `NCCL_P2P_DISABLE` "
                    "environment variables."
                )
                os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())
                os.environ["NCCL_P2P_DISABLE"] = "1"

        # Initialize random seeds.
        # Set identical random seeds on all training processes.
        # When using horovod, each worker will receive a unique
        # shard of the dataset.
        EstimatorTrialController.set_random_seed(env.trial_seed)

        if version.parse(tf.__version__) >= version.parse("2.0.0"):
            tf.compat.v1.disable_v2_behavior()

        # Set the default session before importing any user code. If the default session isn't
        # set and users call TF code that detects GPUs, it would map the processes to all of
        # the GPUs. We set the default session before importing any user code to prevent
        # this problem. This default session does not have any effect within the Estimator itself.
        EstimatorTrialController._set_default_tensorflow_session(
            env=env, hvd_config=hvd_config, session_config=None)

        logging.debug("Applying tf.estimator patches.")

        @monkey_patch.monkey_patch_decorator(_NewCheckpointListenerForEvaluate,
                                             "_evaluate")
        def patch_estimator_eval_on_checkpoint(original, *args,
                                               **kwargs):  # type: ignore
            # With a single worker and multiple devices,
            # `tf.estimator.train_and_evaluate` attempts to execute `eval_spec` even if
            # `input_fn` or `steps` is None, which causes an error when evaluating the
            # model function. Apply a monkey-patch to skip the internal function that
            # ultimately runs the evaluation.
            logging.info("Skipping %s(*%s, **%s)", original.__name__, args,
                         kwargs)
Example No. 13
    def wrap_optimizer(self, optimizer: Any) -> Any:
        """
        This should be used to wrap optimizer objects immediately after they have
        been created. Users should use the output of this wrapper as the new instance
        of their optimizer. For example, if users create their optimizer within
        ``build_estimator()``, they should call ``optimizer = wrap_optimizer(optimizer)``
        prior to passing the optimizer into their Estimator.
        """
        if not self.env.managed_training:
            return optimizer

        self.optimizer_initialized = True
        if not self.hvd_config.use:
            return optimizer

        check.check_false(
            isinstance(optimizer, str),
            "Please specify an optimizer object instead of using a string name.",
        )

        hvd.require_horovod_type(
            "tensorflow", "EstimatorContext.wrap_optimizer was called.")
        use_compression = self.hvd_config.fp16_compression

        # The signature of our horovod optimizer changed after we rebased onto 0.21.
        hvd_sig = inspect.signature(hvd.DistributedOptimizer)
        horovod_kwargs = {
            "compression": (
                hvd.compression.Compression.fp16
                if use_compression
                else hvd.compression.Compression.none
            ),
            "average_aggregated_gradients": self.hvd_config.average_aggregated_gradients,
        }
        if "aggregation_frequency" in hvd_sig.parameters:
            horovod_kwargs["aggregation_frequency"] = self.hvd_config.aggregation_frequency
        else:
            horovod_kwargs["backward_passes_per_step"] = self.hvd_config.aggregation_frequency

        optimizer = hvd.DistributedOptimizer(optimizer, **horovod_kwargs)
        logging.debug(
            "Initialized optimizer for distributed and optimized parallel training."
        )
        return optimizer
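The signature check above generalizes to any renamed keyword argument; a small standalone sketch of the same pattern (generic names, not Determined code):

import inspect

def versioned_kwarg(fn, preferred_name, fallback_name, value):
    """Return {name: value} using whichever keyword the callable actually accepts."""
    params = inspect.signature(fn).parameters
    name = preferred_name if preferred_name in params else fallback_name
    return {name: value}

# e.g. hvd.DistributedOptimizer(optimizer, **versioned_kwarg(
#     hvd.DistributedOptimizer, "aggregation_frequency", "backward_passes_per_step", 1))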
Example No. 14
    def pre_execute_hook(env: det.EnvContext, hvd_config: horovod.HorovodContext) -> None:
        # Initialize the correct horovod.
        if hvd_config.use:
            hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
            hvd.init()

        # Initialize random seeds.
        if env.experiment_config.input_from_dataflow():
            logging.debug("Using tensorpack dataflows as input.")
            process_rank = 0 if not hvd_config.use else hvd.rank()
            EstimatorTrialController.set_random_seed(env.trial_seed + process_rank)
        else:
            # Set identical random seeds on all training processes.
            # When using horovod, each worker will receive a unique
            # shard of the dataset.
            EstimatorTrialController.set_random_seed(env.trial_seed)

        if version.parse(tf.__version__) >= version.parse("2.0.0"):
            tf.compat.v1.disable_v2_behavior()
Example No. 15
    def wrap_dataset(self, dataset: Any) -> Any:
        """
        This should be used to wrap ``tf.data.Dataset`` objects immediately after
        they have been created. Users should use the output of this wrapper as the
        new instance of their dataset. If users create multiple datasets (e.g., one
        for training and one for testing), users should wrap each dataset
        independently. For example, if users instantiate their training dataset within
        ``build_train_spec()``, they should call ``dataset = wrap_dataset(dataset)``
        prior to passing it into ``tf.estimator.TrainSpec``.
        """
        hvd.require_horovod_type("tensorflow",
                                 "EstimatorContext.wrap_dataset was called.")

        self.dataset_initialized = True
        if not self.hvd_config.use or self.input_from_dataflow:
            return dataset
        dataset = dataset.shard(hvd.size(), hvd.rank())
        logging.debug(
            f"Sharded dataset to index {hvd.rank()} of {hvd.size()}.")
        return dataset
Example No. 16
    def wrap_dataset(self, dataset: Any) -> Any:
        """
        This should be used to wrap ``tf.data.Dataset`` objects immediately after
        they have been created. Users should use the output of this wrapper as the
        new instance of their dataset. If users create multiple datasets (e.g.,
        one for training and one for testing), users should wrap each dataset
        independently.

        Args:
            dataset: tf.data.Dataset
        """
        self.dataset_initialized = True
        if not self.hvd_config.use or not isinstance(dataset, tf.data.Dataset):
            return dataset

        hvd.require_horovod_type("tensorflow.keras",
                                 "TFKerasContext.wrap_dataset was called.")
        dataset = dataset.shard(hvd.size(), hvd.rank())
        logging.debug(
            f"Sharded dataset to index {hvd.rank()} of {hvd.size()}.")
        return dataset