Example #1
    def wrap_optimizer(self, optimizer: Any) -> Any:
        """
        This should be used to wrap optimizer objects immediately after they have
        been created. Users should use the output of this wrapper as the new instance
        of their optimizer. For example, if users create their optimizer within
        ``build_estimator()``, they should call ``optimizer = wrap_optimizer(optimizer)``
        prior to passing the optimizer into their Estimator.
        """
        if not self.env.training:
            return optimizer

        self.optimizer_initialized = True
        if not self.hvd_config.use:
            return optimizer

        check.check_false(
            isinstance(optimizer, str),
            "Please specify an optimizer object instead of using a string name.",
        )

        hvd.require_horovod_type(
            "tensorflow", "EstimatorContext.wrap_optimizer was called.")
        use_compression = self.hvd_config.fp16_compression
        optimizer = hvd.DistributedOptimizer(
            optimizer,
            compression=hvd.compression.Compression.fp16
            if use_compression
            else hvd.compression.Compression.none,
            aggregation_frequency=self.hvd_config.aggregation_frequency,
            average_aggregated_gradients=self.hvd_config.average_aggregated_gradients,
        )
        logging.debug(
            "Initialized optimizer for distributed and optimized parallel training."
        )
        return optimizer
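
A minimal usage sketch of the calling convention this docstring describes, assuming
an EstimatorTrial-style ``build_estimator`` with a hypothetical ``_make_model_fn``
helper; none of these names come from the snippet above:

import tensorflow as tf

def build_estimator(self) -> tf.estimator.Estimator:
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=0.001)
    # Wrap immediately after creation and use only the returned object below.
    optimizer = self.context.wrap_optimizer(optimizer)
    # `_make_model_fn` is a hypothetical helper that builds a model_fn whose
    # train op minimizes the loss with the wrapped `optimizer`.
    return tf.estimator.Estimator(model_fn=self._make_model_fn(optimizer))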
Example #2
def _get_multi_gpu_model_and_optimizer(
    pre_compiled_model: tf.keras.Model,
    optimizer: tf.keras.optimizers.Optimizer,
    env: det.EnvContext,
    hvd_config: horovod.HorovodContext,
    profile_frequency: Optional[int],
    profile_filename: str,
) -> Tuple[tf.keras.Model, tf.keras.optimizers.Optimizer]:
    num_gpus = len(env.container_gpus)
    new_model = pre_compiled_model
    new_optimizer = optimizer
    if num_gpus > 1 and not hvd_config.use:
        new_model = tf.keras.utils.multi_gpu_model(pre_compiled_model, num_gpus)
    # If using horovod, wrap the optimizer and check for an aggregation_frequency.
    elif hvd_config.use:
        # Horovod doesn't know how to handle string-based optimizers.
        if isinstance(optimizer, str):
            raise det.errors.InvalidExperimentException(
                "string optimizers are not supported")

        new_optimizer = hvd.DistributedOptimizer(
            optimizer,
            **get_horovod_config(
                exp_config=env.experiment_config,
                hvd_config=hvd_config,
                profile_frequency=profile_frequency,
                profile_filename=profile_filename,
            ),
        )
    return new_model, new_optimizer
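
Because Horovod cannot wrap a string-based optimizer, callers that accept names such
as "adam" can resolve them to objects before reaching this code path; a small sketch
using the standard Keras lookup (this resolution step is an assumption, not part of
the snippet above):

import tensorflow as tf

def resolve_optimizer(optimizer):
    # tf.keras.optimizers.get accepts a string name, a config dict, or an
    # existing optimizer instance, and returns an optimizer object in each case.
    return tf.keras.optimizers.get(optimizer)

opt = resolve_optimizer("adam")                          # -> Adam instance
opt = resolve_optimizer(tf.keras.optimizers.SGD(0.01))   # passes through unchanged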
Example #3
    def _init_model_and_optimizer(self) -> None:
        self.context.model = self.trial.build_model()

        # TODO: Check that optimizer is not an amp optimizer.
        self.context.optimizer = self.trial.optimizer(self.context.model)

        self._init_device()
        self.context.model = self.context.model.to(self.device)

        if self.hvd_config.use:
            use_compression = self.hvd_config.fp16_compression
            self.context.optimizer = hvd.DistributedOptimizer(
                self.context.optimizer,
                named_parameters=self.context.model.named_parameters(),
                backward_passes_per_step=self.hvd_config.aggregation_frequency,
                compression=hvd.Compression.fp16 if use_compression else hvd.Compression.none,
            )
            logging.debug("Initialized optimizer for distributed and optimized parallel training.")
        elif self.n_gpus > 1:
            check.eq(
                self.hvd_config.aggregation_frequency,
                1,
                "Please enable `optimized_parallel` to use aggregation "
                "frequency greater than 1 for single machine multi-GPU "
                "training.",
            )
            self.context.model = nn.DataParallel(self.context.model)
            logging.debug("Initialized mode for native parallel training.")
Example #4
    def _Optimizer(
        self, optimizer: torch.optim.Optimizer
    ) -> torch.optim.Optimizer:  # type: ignore
        """Wraps an optimizer. It returns a wrapped optimizer.

        The optimizer must use the models wrapped by :meth:`Model`. This function
        creates a ``horovod.DistributedOptimizer`` if using parallel/distributed training.
        """

        check.false(self._use_amp,
                    "Must call Optimizer() before _configure_apex_amp.")

        if self.hvd_config.use:
            use_compression = self.hvd_config.fp16_compression
            optimizer = hvd.DistributedOptimizer(
                optimizer,
                named_parameters=self._filter_named_parameters(optimizer),
                backward_passes_per_step=self.hvd_config.aggregation_frequency,
                compression=hvd.Compression.fp16
                if use_compression else hvd.Compression.none,
            )
            logging.debug(
                "Initialized optimizer for distributed and optimized parallel training."
            )

        self.optimizers.append(optimizer)
        return optimizer
Example #5
    def wrap_optimizer(
        self,
        optimizer: torch.optim.Optimizer,
        backward_passes_per_step: int = 1,
    ) -> torch.optim.Optimizer:
        """Returns a wrapped optimizer.

        The optimizer must use the models wrapped by :meth:`wrap_model`. This function
        creates a ``horovod.DistributedOptimizer`` if using parallel/distributed training.

        ``backward_passes_per_step`` specifies how many backward passes are aggregated
        in a single ``train_batch`` call before each optimizer step. In most cases this
        will just be the default value of 1. However, this advanced functionality can be
        used to support training loops like the one shown below:

        .. code-block:: python

            def train_batch(
                self, batch: TorchData, epoch_idx: int, batch_idx: int
            ) -> Dict[str, torch.Tensor]:
                data, labels = batch
                output = self.model(data)
                loss1 = output['loss1']
                loss2 = output['loss2']
                self.context.backward(loss1)
                self.context.backward(loss2)
                self.context.step_optimizer(self.optimizer, backward_passes_per_step=2)
                return {"loss1": loss1, "loss2": loss2}

        """
        if self.env.managed_training:
            check.false(
                self._use_apex,
                "Must call wrap_optimizer() before configure_apex_amp.")
            check.gt_eq(
                backward_passes_per_step,
                1,
                "backward_passes_per_step for local gradient aggregation must be >= 1",
            )

            if self.distributed.size > 1 and self._distributed_backend.use_horovod():
                optimizer = hvd.DistributedOptimizer(
                    optimizer,
                    named_parameters=self._filter_named_parameters(optimizer),
                    backward_passes_per_step=(
                        backward_passes_per_step * self._aggregation_frequency
                    ),
                    compression=hvd.Compression.fp16
                    if self._fp16_compression
                    else hvd.Compression.none,
                )
                logging.debug(
                    "Initialized optimizer for distributed and optimized parallel training."
                )

        self.optimizers.append(optimizer)
        return optimizer
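
A minimal sketch of the ordering the docstring assumes inside a trial's ``__init__``:
the model is wrapped first, the optimizer is built from the wrapped model's parameters,
and only then is it wrapped itself (the tiny model, learning rate, and the usual
``torch``/``torch.nn`` imports are illustrative assumptions):

def __init__(self, context) -> None:
    self.context = context
    self.model = self.context.wrap_model(nn.Linear(32, 4))
    # The optimizer must be constructed from the wrapped model's parameters.
    self.optimizer = self.context.wrap_optimizer(
        torch.optim.SGD(self.model.parameters(), lr=0.01),
        backward_passes_per_step=2,  # aggregate two backward passes per optimizer step
    )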
Example #6
    def _get_horovod_optimizer_if_using_horovod(
        self, optimizer: tf.keras.optimizers.Optimizer
    ) -> tf.keras.optimizers.Optimizer:
        if not self.hvd_config.use:
            return optimizer

        # Horovod doesn't know how to handle string-based optimizers.
        if isinstance(optimizer, str):
            raise det.errors.InvalidExperimentException("string optimizers are not supported")

        return hvd.DistributedOptimizer(
            optimizer,
            aggregation_frequency=self.hvd_config.aggregation_frequency,
            average_aggregated_gradients=self.hvd_config.average_aggregated_gradients,
        )
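
The returned optimizer is consumed like any other Keras optimizer, typically via
``model.compile``; a brief self-contained sketch (the plain Adam optimizer, the tiny
model, and the loss stand in for whatever the wrapped object would be):

import tensorflow as tf

# `wrapped` stands in for the object returned by
# _get_horovod_optimizer_if_using_horovod; the compile call is identical either way.
wrapped = tf.keras.optimizers.Adam(learning_rate=0.001)
model = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(32,))])
model.compile(optimizer=wrapped, loss="mse")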
Example #7
    def wrap_optimizer(self, optimizer: Any) -> Any:
        """
        This should be used to wrap optimizer objects immediately after they have
        been created. Users should use the output of this wrapper as the new instance
        of their optimizer. For example, if users create their optimizer within
        ``build_estimator()``, they should call ``optimizer = wrap_optimizer(optimizer)``
        prior to passing the optimizer into their Estimator.
        """
        if not self.env.managed_training:
            return optimizer

        self.optimizer_initialized = True
        if not self.hvd_config.use:
            return optimizer

        check.check_false(
            isinstance(optimizer, str),
            "Please specify an optimizer object instead of using a string name.",
        )

        hvd.require_horovod_type(
            "tensorflow", "EstimatorContext.wrap_optimizer was called.")
        use_compression = self.hvd_config.fp16_compression

        # The signature of our horovod optimizer changed after we rebased onto 0.21.
        hvd_sig = inspect.signature(hvd.DistributedOptimizer)
        horovod_kwargs = {
            "compression": hvd.compression.Compression.fp16
            if use_compression
            else hvd.compression.Compression.none,
            "average_aggregated_gradients": self.hvd_config.average_aggregated_gradients,
        }
        if "aggregation_frequency" in hvd_sig.parameters:
            horovod_kwargs["aggregation_frequency"] = self.hvd_config.aggregation_frequency
        else:
            horovod_kwargs["backward_passes_per_step"] = self.hvd_config.aggregation_frequency

        optimizer = hvd.DistributedOptimizer(optimizer, **horovod_kwargs)
        logging.debug(
            "Initialized optimizer for distributed and optimized parallel training."
        )
        return optimizer
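
The ``inspect.signature`` probe above is a general way to stay compatible with a
keyword argument that was renamed between library versions; a self-contained sketch of
the same pattern against a stand-in function (all names here are hypothetical):

import inspect

def new_api(optimizer, backward_passes_per_step=1):
    """Stand-in for a library call whose keyword argument was renamed."""
    return optimizer, backward_passes_per_step

sig = inspect.signature(new_api)
kwargs = {}
if "aggregation_frequency" in sig.parameters:
    kwargs["aggregation_frequency"] = 4       # old keyword name
else:
    kwargs["backward_passes_per_step"] = 4    # renamed keyword
print(new_api("sgd", **kwargs))               # -> ('sgd', 4)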
Example #8
    def _init_model(self) -> None:
        self.optimizer = self.trial.optimizer(self.model)
        # TODO: Check that optimizer is not an amp optimizer.

        self._init_device()
        self.model = self.model.to(self.device)

        if self.hvd_config.use:
            use_compression = self.hvd_config.fp16_compression
            self.optimizer = hvd.DistributedOptimizer(
                self.optimizer,
                named_parameters=self.model.named_parameters(),
                backward_passes_per_step=self.hvd_config.aggregation_frequency,
                compression=hvd.Compression.fp16
                if use_compression else hvd.Compression.none,
            )
            logging.debug(
                "Initialized optimizer for distributed and optimized parallel training."
            )
        elif self.n_gpus > 1:
            check.eq(
                self.hvd_config.aggregation_frequency,
                1,
                "Please enable `optimized_parallel` to use aggregation "
                "frequency greater than 1 for single machine multi-GPU "
                "training.",
            )
            self.model = nn.DataParallel(self.model)
            logging.debug("Initialized mode for native parallel training.")

        self.lr_helper = _LRHelper(
            self.trial.create_lr_scheduler(self.optimizer))

        # If a load path is provided load weights and restore the data location.
        self._load()

        self._configure_amp()

        if self.hvd_config.use:
            hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
            hvd.broadcast_optimizer_state(self.optimizer, root_rank=0)

        # Initialize training and validation iterators.
        self.training_iterator = iter(self.training_loader)
Example #9
    def _get_horovod_optimizer_if_using_horovod(
        self, optimizer: tf.keras.optimizers.Optimizer
    ) -> tf.keras.optimizers.Optimizer:
        if not self.hvd_config.use:
            return optimizer

        # Horovod doesn't know how to handle string-based optimizers.
        if isinstance(optimizer, str):
            raise det.errors.InvalidExperimentException("string optimizers are not supported")

        # The signature of our horovod optimizer changed after we rebased onto 0.21.
        hvd_sig = inspect.signature(hvd.DistributedOptimizer)
        horovod_kwargs = {
            "average_aggregated_gradients": self.hvd_config.average_aggregated_gradients,
        }  # type: Dict[str, Any]
        if "aggregation_frequency" in hvd_sig.parameters:
            horovod_kwargs["aggregation_frequency"] = self.hvd_config.aggregation_frequency
        else:
            horovod_kwargs["backward_passes_per_step"] = self.hvd_config.aggregation_frequency

        return hvd.DistributedOptimizer(optimizer, **horovod_kwargs)