def wrap_optimizer(
    self, optimizer: torch.optim.Optimizer  # type: ignore
) -> torch.optim.Optimizer:  # type: ignore
    """Register ``optimizer`` with this context and return it, wrapped if needed.

    The optimizer must use the models wrapped by :meth:`wrap_model`. When
    parallel/distributed training is enabled (``self.hvd_config.use``), the
    optimizer is replaced by a ``horovod.DistributedOptimizer`` built around it.

    Arguments:
        optimizer: The optimizer to register.

    Returns:
        The object appended to ``self.optimizers``: the Horovod wrapper when
        one was created, otherwise ``optimizer`` itself.
    """
    if self.env.training:
        # AMP patches the optimizers it is given, so wrapping has to come first.
        check.false(self._use_amp, "Must call wrap_optimizer() before configure_apex_amp.")

        if self.hvd_config.use:
            if self.hvd_config.fp16_compression:
                compression = hvd.Compression.fp16
            else:
                compression = hvd.Compression.none

            optimizer = hvd.DistributedOptimizer(
                optimizer,
                named_parameters=self._filter_named_parameters(optimizer),
                backward_passes_per_step=self.hvd_config.aggregation_frequency,
                compression=compression,
            )
            logging.debug(
                "Initialized optimizer for distributed and optimized parallel training."
            )

    self.optimizers.append(optimizer)
    return optimizer
def _launch_fit(self) -> None:
    """Install Determined's Keras callbacks and start ``model.fit``.

    May only run once per instance: ``fit_loop_started`` is checked to be
    false on entry and then set. The callbacks are appended to
    ``self.tf_keras_callbacks`` and that same list is handed to ``fit``.
    """
    check.false(self.fit_loop_started)
    self.fit_loop_started = True

    callbacks = self.tf_keras_callbacks
    callbacks.append(DeterminedEarlyStoppingCallback(self))
    callbacks.append(WaitForInstructionsCallback(self))

    # A falsy profile frequency means profiling is not configured.
    profile_frequency = self.env.experiment_config.profile_frequency()
    if profile_frequency:
        profiler = DeterminedProfiler(profile_frequency, DeterminedProfiler.OUTPUT_FILENAME)
        callbacks.append(profiler)

    if self.hvd_config.use:
        # When using horovod broadcast initial variable states from rank 0 to
        # all other processes.
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))

    input_manager = self._train_input_manager
    training_input, batches_per_epoch = (
        input_manager.get_training_input_and_batches_per_epoch()
    )

    # NOTE(review): with epochs=IMPOSSIBLY_LARGE_EPOCHS the loop is presumably
    # ended by the callbacks rather than by Keras itself -- confirm.
    fit_result = self.model.fit(
        training_input,
        callbacks=self.tf_keras_callbacks,
        shuffle=False,
        steps_per_epoch=batches_per_epoch,
        initial_epoch=self._train_input_manager.get_initial_epoch(),
        epochs=IMPOSSIBLY_LARGE_EPOCHS,
        validation_split=0,
        verbose=0,
    )
    _ = fit_result.history
def wrap_model(self, model: torch.nn.Module) -> torch.nn.Module:
    """Register ``model`` with this context and return it, wrapped if needed.

    Under managed training the model is moved to ``self.device``; when more
    than one GPU is used without Horovod it is additionally wrapped in
    ``nn.DataParallel``. The resulting model is attached to
    ``self._main_model`` as attribute ``model_<index>`` and appended to
    ``self.models``.

    Arguments:
        model: The model to register.

    Returns:
        The (possibly moved and wrapped) model.
    """
    if self.env.managed_training:
        check.false(self._use_amp, "Must call wrap_model() before configure_apex_amp.")

        model = model.to(self.device)

        if not self.hvd_config.use and self.n_gpus > 1:
            # Native multi-GPU training cannot aggregate gradients over
            # several batches, so only a frequency of 1 is accepted here.
            check.eq(
                self.hvd_config.aggregation_frequency,
                1,
                "Please enable `optimized_parallel` to use aggregation "
                "frequency greater than 1 for single machine multi-GPU "
                "training.",
            )
            model = nn.DataParallel(model)
            logging.debug("Initialized model for native parallel training.")

    # The attribute name carries the model's position in ``self.models``.
    setattr(self._main_model, f"model_{len(self.models)}", model)
    self.models.append(model)
    return model
def _configure_amp(self) -> None: if self.use_amp(): if self.hvd_config.use: check.eq( self.hvd_config.aggregation_frequency, 1, "Mixed precision training (AMP) is not supported with " "aggregation frequency > 1.", ) check.true( torch.cuda.is_available(), "Mixed precision training (AMP) is supported only on GPU slots.", ) check.false( not self.hvd_config.use and self.n_gpus > 1, "To enable mixed precision training (AMP) for parallel training, " 'please set `resources["optimized_parallel"] = True`.', ) logging.info( f"Enabling mixed precision training with opt_level: {self._get_amp_setting()}." ) self.context.model, self.context.optimizer = apex.amp.initialize( self.context.model, self.context.optimizer, opt_level=self._get_amp_setting(), verbosity=1 if self.is_chief or self.env.experiment_config.debug_enabled() else 0, )
def __init__(
    self,
    context: Union[keras.TFKerasTrialContext, keras.TFKerasNativeContext],
    train_config: keras.TFKerasTrainConfig,
) -> None:
    """Set up training-input handling for a cached ``tf.data.Dataset``.

    Stores the train cacheable and the training data, then validates that the
    caching decorator was used, that ``context.wrap_dataset`` was not, and
    that the training data is a ``tf.data.Dataset``.

    Arguments:
        context: Keras context, forwarded to the parent initializer.
        train_config: Training configuration; its ``training_data`` is used.
    """
    super().__init__(context=context)

    # NOTE(review): ``self._context`` is presumably set by the parent
    # initializer above -- confirm.
    self._training_cacheable = self._context.experimental.get_train_cacheable()
    training_data = train_config.training_data
    self._training_dataset = training_data

    decorator_used = self._training_cacheable.is_decorator_used()
    check.true(
        decorator_used,
        "Please use `@context.experimental.cache_train_dataset(dataset_name, dataset_version)`"
        " for the training dataset.",
    )

    # Dataset caching and `wrap_dataset` are mutually exclusive.
    check.false(
        self._context.dataset_initialized,
        "Please do not use: `context.wrap_dataset(dataset)` if using "
        "`@context.experimental.cache_train_dataset()` and "
        "`@context.experimental.cache_validation_dataset()`.",
    )

    check.is_instance(
        training_data,
        tf.data.Dataset,
        "Pass in a `tf.data.Dataset` object if using "
        "`@context.experimental.cache_train_dataset()`.",
    )
def wrap_optimizer(
    self,
    optimizer: torch.optim.Optimizer,  # type: ignore
    backward_passes_per_step: int = 1,
) -> torch.optim.Optimizer:  # type: ignore
    """Returns a wrapped optimizer.

    The optimizer must use the models wrapped by :meth:`wrap_model`. This function
    creates a ``horovod.DistributedOptimizer`` if using parallel/distributed training.

    `backward_passes_per_step` can be used to specify how many gradient aggregation
    steps will be performed in a single `train_batch` call per optimizer step.
    In most cases, this will just be the default value 1. However, this advanced
    functionality can be used to support training loops like the one shown below:

    .. code-block:: python

        def train_batch(
            self, batch: TorchData, epoch_idx: int, batch_idx: int
        ) -> Dict[str, torch.Tensor]:
            data, labels = batch
            output = self.model(data)
            loss1 = output['loss1']
            loss2 = output['loss2']
            self.context.backward(loss1)
            self.context.backward(loss2)
            self.context.step_optimizer(self.optimizer, backward_passes_per_step=2)
            return {"loss1": loss1, "loss2": loss2}

    Arguments:
        optimizer: The optimizer to wrap.
        backward_passes_per_step: Number of backward passes per optimizer step;
            must be >= 1. Under Horovod it is multiplied by the configured
            aggregation frequency.

    Returns:
        The optimizer appended to ``self.optimizers``: the Horovod wrapper when
        one was created, otherwise ``optimizer`` itself.
    """
    if self.env.managed_training:
        check.false(self._use_amp, "Must call wrap_optimizer() before configure_apex_amp.")
        check.gt_eq(
            backward_passes_per_step,
            1,
            "backward_passes_per_step for local gradient aggregation must be >= 1",
        )

        if self.hvd_config.use:
            use_compression = self.hvd_config.fp16_compression
            optimizer = hvd.DistributedOptimizer(
                optimizer,
                named_parameters=self._filter_named_parameters(optimizer),
                # Horovod only communicates after this many backward passes, so
                # the user's count is scaled by the aggregation frequency.
                backward_passes_per_step=backward_passes_per_step
                * self.hvd_config.aggregation_frequency,
                compression=hvd.Compression.fp16 if use_compression else hvd.Compression.none,
            )
            logging.debug(
                "Initialized optimizer for distributed and optimized parallel training."
            )

    self.optimizers.append(optimizer)
    return optimizer
def _clip_grads(self, parameters: Any) -> None: # TODO: Support clip by norm other than L2. clip_grad_l2_norm = self.env.hparams.get("clip_grad_l2_norm", None) clip_by_val = self.env.hparams.get("clip_grad_val", None) check.false( clip_grad_l2_norm is not None and clip_by_val is not None, "Please specify either `clip_grad_l2_norm` or `clip_by_val` " "in your hparams, not both.", ) if clip_grad_l2_norm is not None: logging.debug( f"Clipping gradients by L2 norm of: {clip_grad_l2_norm}.") torch.nn.utils.clip_grad_norm_(parameters, clip_grad_l2_norm) # type: ignore elif clip_by_val is not None: logging.debug(f"Clipping gradients by value of: {clip_by_val}.") torch.nn.utils.clip_grad_value_(parameters, clip_by_val) # type: ignore else: logging.debug("No gradient clipping enabled.")
def wrapper(*args: Any, **kwargs: Any) -> tf.data.Dataset:
    """Call the wrapped ``f``, validate dataset setup, and repeat the dataset.

    After calling ``f``, checks that ``context.wrap_dataset`` was not used when
    the train-dataset caching decorator is in use, and that it was used
    otherwise. A ``tf.data.Dataset`` result is returned as ``ds.repeat()``;
    any other result is returned unchanged.
    """
    dataset = f(*args, **kwargs)

    caching_in_use = self.context.experimental.get_train_cacheable().is_decorator_used()
    if not caching_in_use:
        check.true(
            self.context.dataset_initialized,
            "Please pass your datasets (train and test) into "
            "`context.wrap_dataset(dataset)` right after creating them.",
        )
    else:
        check.false(
            self.context.dataset_initialized,
            "Please do not use: `context.wrap_dataset(dataset)` if using "
            "`@context.experimental.cache_train_dataset(dataset_name, dataset_version)` "
            "and `@context.experimental.cache_validation_dataset(dataset_name, "
            "dataset_version)`.",
        )

    return dataset.repeat() if isinstance(dataset, tf.data.Dataset) else dataset
def wrap_scaler(self, scaler: Any) -> Any:
    """
    Prepares to use automatic mixed precision through PyTorch's native AMP API.

    Pass the returned scaler to ``step_optimizer``; apart from that, usage is
    the same as with vanilla PyTorch APIs. Scale the loss before calling
    ``backward``, call ``unscale_`` before clipping gradients, call ``update``
    after stepping all optimizers, etc.

    PyTorch 1.6 or greater is required for this feature.

    Arguments:
        scaler (``torch.cuda.amp.GradScaler``): Scaler to wrap and track.

    Returns:
        The scaler. It may be wrapped to add additional functionality for use
        in Determined.
    """
    check.false(amp_import_error, "Failed to import torch.cuda.amp. PyTorch >= 1.6 required.")
    check.false(self._use_apex, "Do not mix APEX with PyTorch AMP.")
    check.is_none(self._scaler, "Please only call wrap_scaler or use_amp once.")

    no_models_wrapped = len(self.models) == 0
    check.true(no_models_wrapped, "Please call wrap_scaler before wrap_model.")

    cuda_available = torch.cuda.is_available()
    check.true(
        cuda_available,
        "Mixed precision training (AMP) is supported only on GPU slots.",
    )

    # Remember the scaler on the context; a second call fails the
    # ``is_none`` check above.
    self._scaler = scaler
    return scaler
def configure_apex_amp(
    self,
    models: Union[torch.nn.Module, List[torch.nn.Module]],
    optimizers: Union[torch.optim.Optimizer, List[torch.optim.Optimizer]],
    enabled: Optional[bool] = True,
    opt_level: Optional[str] = "O1",
    cast_model_type: Optional[torch.dtype] = None,
    patch_torch_functions: Optional[bool] = None,
    keep_batchnorm_fp32: Optional[Union[bool, str]] = None,
    master_weights: Optional[bool] = None,
    loss_scale: Optional[Union[float, str]] = None,
    cast_model_outputs: Optional[torch.dtype] = None,
    num_losses: Optional[int] = 1,
    verbosity: Optional[int] = 1,
    min_loss_scale: Optional[float] = None,
    max_loss_scale: Optional[float] = 2.0 ** 24,
) -> Tuple:
    """
    Configure automatic mixed precision for your models and optimizers. Note that details
    for apex.amp are handled automatically within Determined after this call.

    This function must be called **after** you have finished constructing your models and
    optimizers with :meth:`wrap_model` and :meth:`wrap_optimizer`.

    This function has the same arguments as
    `apex.amp.initialize <https://nvidia.github.io/apex/amp.html#apex.amp.initialize>`_;
    see that page for the meaning of the optimization levels and property overrides.

    Outside of managed training this is a no-op that returns its ``models`` and
    ``optimizers`` arguments unchanged.

    .. warning::
        When using distributed training and automatic mixed precision,
        we only support ``num_losses=1`` and calling backward on the loss once.

    Arguments:
        models (``torch.nn.Module`` or list of ``torch.nn.Module`` s):
            Model(s) to modify/cast.
        optimizers (``torch.optim.Optimizer`` or list of ``torch.optim.Optimizer`` s):
            Optimizers to modify/cast. REQUIRED for training.
        enabled (bool, optional, default=True):
            If False, renders all Amp calls no-ops, so your script should run as if Amp
            were not present.
        opt_level (str, optional, default="O1"):
            Pure or mixed precision optimization level. Accepted values are "O0", "O1",
            "O2", and "O3".
        cast_model_type (``torch.dtype``, optional, default=None):
            Optional property override.
        patch_torch_functions (bool, optional, default=None):
            Optional property override.
        keep_batchnorm_fp32 (bool or str, optional, default=None):
            Optional property override. If passed as a string, must be the string
            "True" or "False".
        master_weights (bool, optional, default=None):
            Optional property override.
        loss_scale (float or str, optional, default=None):
            Optional property override. If passed as a string, must be a string
            representing a number, e.g., "128.0", or the string "dynamic".
        cast_model_outputs (torch.dtype, optional, default=None):
            Option to ensure that the outputs of your model is always cast to a
            particular type regardless of ``opt_level``.
        num_losses (int, optional, default=1):
            Option to tell Amp in advance how many losses/backward passes you plan to
            use. When used in conjunction with the ``loss_id`` argument to
            ``amp.scale_loss``, enables Amp to use a different loss scale per
            loss/backward pass, which can improve stability. If ``num_losses`` is left
            to 1, Amp will still support multiple losses/backward passes, but use a
            single global loss scale for all of them.
        verbosity (int, default=1):
            Set to 0 to suppress Amp-related output.
        min_loss_scale (float, default=None):
            Sets a floor for the loss scale values that can be chosen by dynamic loss
            scaling. The default value of None means that no floor is imposed. If
            dynamic loss scaling is not used, `min_loss_scale` is ignored.
        max_loss_scale (float, default=2.**24):
            Sets a ceiling for the loss scale values that can be chosen by dynamic loss
            scaling. If dynamic loss scaling is not used, `max_loss_scale` is ignored.

    Returns:
        Model(s) and optimizer(s) modified according to the ``opt_level``.
        If ``optimizers`` args were lists, the corresponding return value will
        also be a list.
    """
    if not self.env.managed_training:
        return models, optimizers

    check.false(self._use_amp, "Please only call configure_apex_amp once.")

    using_horovod = self.hvd_config.use
    if using_horovod:
        check.eq(
            num_losses,
            1,
            "When using parallel/distributed training, "
            "Determined only supports configure_apex_amp with num_losses = 1",
        )

    # Marked before the remaining validation, so a failure below still leaves
    # the context flagged as AMP-configured.
    self._use_amp = True

    if using_horovod:
        check.eq(
            self.hvd_config.aggregation_frequency,
            1,
            "Mixed precision training (AMP) is not supported with "
            "aggregation frequency > 1.",
        )

    check.true(
        torch.cuda.is_available(),
        "Mixed precision training (AMP) is supported only on GPU slots.",
    )

    logging.info(f"Enabling mixed precision training with opt_level: {opt_level}.")

    amp_kwargs = dict(
        models=models,
        optimizers=optimizers,
        enabled=enabled,
        opt_level=opt_level,
        cast_model_type=cast_model_type,
        patch_torch_functions=patch_torch_functions,
        keep_batchnorm_fp32=keep_batchnorm_fp32,
        master_weights=master_weights,
        loss_scale=loss_scale,
        cast_model_outputs=cast_model_outputs,
        num_losses=num_losses,
        min_loss_scale=min_loss_scale,
        max_loss_scale=max_loss_scale,
    )
    # Apex output is kept on rank 0 only, unless debugging is enabled.
    if self.distributed.get_rank() == 0 or self.env.experiment_config.debug_enabled():
        amp_kwargs["verbosity"] = verbosity
    else:
        amp_kwargs["verbosity"] = 0

    models, optimizers = apex.amp.initialize(**amp_kwargs)

    # Only single (non-list) results replace the tracked lists.
    if not isinstance(models, list):
        self.models = [models]
    if not isinstance(optimizers, list):
        self.optimizers = [optimizers]
    return models, optimizers
def _train_for_step(self, step_id: int, batches_per_step: int) -> workload.Response:
    """Train on ``batches_per_step`` batches and build the metrics for the step.

    For every batch this runs ``trial.train_batch``, back-propagates the
    returned ``"loss"`` (through apex loss scaling when AMP is enabled) and,
    once every ``hvd_config.aggregation_frequency`` batches, synchronizes
    Horovod, runs the ``on_before_optimizer_step`` callbacks, steps and zeroes
    the optimizer and steps the learning-rate scheduler if there is one.

    Arguments:
        step_id: 1-based id of the step; must be greater than 0.
        batches_per_step: Number of batches to train on in this step.

    Returns:
        ``workload.Skipped()`` on non-chief workers, otherwise the metrics
        built by ``det.util.make_metrics``.
    """
    check.gt(step_id, 0)

    # Set the behavior of certain layers (e.g., dropout) that are different
    # between training and inference.
    self.context.model.train()

    for callback in self.callbacks.values():
        callback.on_train_step_start(step_id)

    # Batch indices are global across steps: step N covers
    # [(N - 1) * batches_per_step, N * batches_per_step).
    step_idx = step_id - 1
    start = step_idx * batches_per_step
    end = start + batches_per_step

    per_batch_metrics = []  # type: List[Dict]
    num_inputs = 0

    for batch_idx in range(start, end):
        batch = next(self.training_iterator)
        num_inputs += data_length(batch)

        batch = self._to_device(batch)
        # Forward pass.
        tr_metrics = self.trial.train_batch(
            batch=batch,
            model=self.context.model,
            epoch_idx=self.get_epoch_idx(batch_idx),
            batch_idx=batch_idx,
        )
        # A bare tensor is accepted as shorthand for {"loss": tensor}.
        if isinstance(tr_metrics, torch.Tensor):
            tr_metrics = {"loss": tr_metrics}
        check.is_instance(
            tr_metrics,
            dict,
            "train_batch() must return a dictionary "
            f"mapping string names to Tensor metrics, got {type(tr_metrics)}",
        )
        check.is_in("loss", tr_metrics.keys(), 'Please include "loss" in your training metrics.')

        # Backwards pass.
        loss = tr_metrics["loss"]
        communicate_and_update = (batch_idx + 1) % self.hvd_config.aggregation_frequency == 0
        if self.use_amp():
            with apex.amp.scale_loss(loss, self.context.optimizer) as scaled_loss:
                scaled_loss.backward()
                if self.hvd_config.use and communicate_and_update:
                    # When using horovod, we need to finish communicating gradient
                    # updates before they are unscaled which happens when we exit
                    # of this context manager.
                    self.context.optimizer.synchronize()
        else:
            loss.backward()

            # Communication needs to be synchronized so that is completed
            # before we apply gradient clipping and `step()`.
            if communicate_and_update and self.hvd_config.use:
                self.context.optimizer.synchronize()

        if communicate_and_update:
            # `parameters()` / `master_params()` return iterators that can be
            # consumed only once. Materialize them so that gradient averaging
            # and every callback below each see the full set of parameters.
            parameters = list(
                self.context.model.parameters()
                if not self.use_amp()
                else apex.amp.master_params(self.context.optimizer)
            )

            if self.hvd_config.average_aggregated_gradients:
                self._average_gradients(
                    parameters=iter(parameters),
                    divisor=self.hvd_config.aggregation_frequency,
                )

            # TODO: Remove this check in v0.12.8.
            check.false(
                self.env.hparams.get("clip_grad_l2_norm", None)
                or self.env.hparams.get("clip_grad_val", None),
                "Please specify gradient clipping via callbacks.",
            )

            for callback in self.callbacks.values():
                # A fresh iterator per callback keeps the iterator-style
                # argument while preventing one callback from exhausting it
                # for the next.
                callback.on_before_optimizer_step(iter(parameters))

            if self.hvd_config.use:
                # Gradients were already synchronized above; skip the implicit
                # synchronization inside `step()`.
                with self.context.optimizer.skip_synchronize():
                    self.context.optimizer.step()
            else:
                self.context.optimizer.step()
            self.context.optimizer.zero_grad()

            # Step learning rate of a LRScheduler.
            if self.context.lr_scheduler is not None:
                self._auto_step_lr_scheduler_per_batch(batch_idx, self.context.lr_scheduler)

        for name, metric in tr_metrics.items():
            # Convert PyTorch metric values to NumPy, so that
            # `det.util.encode_json` handles them properly without
            # needing a dependency on PyTorch.
            if isinstance(metric, torch.Tensor):
                metric = metric.cpu().detach().numpy()
            tr_metrics[name] = metric

        per_batch_metrics.append(tr_metrics)

    if self.hvd_config.use and self.hvd_config.average_training_metrics:
        per_batch_metrics = self._average_training_metrics(per_batch_metrics)

    if self.hvd_config.use:
        # Every Horovod worker processed `num_inputs` records of its own.
        num_inputs *= hvd.size()

    metrics = det.util.make_metrics(num_inputs, per_batch_metrics)

    for callback in self.callbacks.values():
        callback.on_train_step_end(step_id, metrics)

    if not self.is_chief:
        return workload.Skipped()

    logging.debug(f"Done training step: {num_inputs} records in {batches_per_step} batches.")

    return metrics