def require_horovod_type(self, horovod_type: str, reason: str) -> None:
    """
    Declare the required type of horovod and give a unique reason why it is required.

    The reason makes for clear error reporting if require_horovod_type() is called a
    second time but with a different type.
    """

    known_types = {"tensorflow", "tensorflow.keras", "torch"}
    check.is_in(horovod_type, known_types, "Unknown horovod type requested.")

    if self._poly_hvd_type is not None:
        check.eq(
            horovod_type,
            self._poly_hvd_type,
            f"require_horovod_type() called with type {horovod_type} after a previous "
            f"call with type {self._poly_hvd_type} in the same process. The reason for the "
            f"first call was '{self._poly_hvd_first_reason}'; the reason for this call is "
            f"'{reason}'.",
        )
    else:
        self._poly_hvd_type = horovod_type
        self._poly_hvd_first_reason = reason
        # If horovod has not been imported yet, do it now.
        try:
            self._poly_hvd_module = importlib.import_module(f"horovod.{horovod_type}")
        except ImportError:
            pass
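# Hedged usage sketch (not from the source): `context` stands for whatever object exposes
# require_horovod_type(); the reason strings are illustrative.
from typing import Any


def _example_require_horovod_type(context: Any) -> None:
    context.require_horovod_type("torch", "PyTorchTrial is being used")
    # A repeated call with the same type is fine; a call with a different type would fail
    # the check.eq() above with a message naming both reasons.
    context.require_horovod_type("torch", "wrap_optimizer() was called")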
@classmethod
def from_config(cls, config: Dict[str, Any], container_path: Optional[str]) -> "StorageManager":
    allowed_keys = {"host_path", "storage_path", "container_path", "propagation"}
    for key in config.keys():
        check.is_in(key, allowed_keys, "extra key in shared_fs config")

    check.is_in("host_path", config, "shared_fs config is missing host_path")

    # Ignore legacy configuration values propagation and container_path.
    base_path = _full_storage_path(
        config["host_path"], config.get("storage_path"), container_path
    )

    return cls(base_path)
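# Hedged usage sketch (not from the source): a minimal shared_fs config. host_path is
# required, storage_path is optional, and container_path is supplied by the harness;
# the paths shown here and the concrete StorageManager subclass are hypothetical.
manager = StorageManager.from_config(
    {"host_path": "/mnt/checkpoints", "storage_path": "determined-checkpoints"},
    container_path="/determined_shared_fs",
)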
def binary_error_rate(predictions: torch.Tensor, labels: torch.Tensor) -> float:
    """Return the classification error rate for binary classification."""
    check.eq(predictions.shape[0], labels.shape[0])
    check.is_in(len(predictions.shape), [1, 2])
    if len(predictions.shape) == 2:
        check.eq(predictions.shape[1], 1)
    check.len_eq(labels.shape, 1, "Labels must be a column vector")

    if len(predictions.shape) > 1:
        predictions = torch.squeeze(predictions)

    errors = torch.sum(labels.to(torch.long) != torch.round(predictions).to(torch.long))
    result = float(errors) / predictions.shape[0]  # type: float
    return result
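# Hedged usage sketch (not from the source): predictions may be a 1-D vector of
# probabilities or an (N, 1) column; they are rounded to 0/1 before comparison.
import torch

_preds = torch.tensor([0.9, 0.2, 0.4, 0.8])  # rounds to [1, 0, 0, 1]
_labels = torch.tensor([1, 0, 1, 1])
assert binary_error_rate(_preds, _labels) == 0.25  # one mismatch out of four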
def wrap_lr_scheduler(
    self,
    lr_scheduler: torch.optim.lr_scheduler._LRScheduler,
    step_mode: pytorch.LRScheduler.StepMode,
    frequency: int = 1,
) -> torch.optim.lr_scheduler._LRScheduler:
    """
    Returns a wrapped LR scheduler.

    The LR scheduler must use an optimizer wrapped by :meth:`wrap_optimizer`. If
    ``apex.amp`` is in use, the optimizer must also have been configured with
    :meth:`configure_apex_amp`.
    """
    if isinstance(lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
        if step_mode != pytorch.LRScheduler.StepMode.MANUAL_STEP:
            raise det.errors.InvalidExperimentException(
                "detected that context.wrap_lr_scheduler() was called with an instance of "
                "torch.optim.lr_scheduler.ReduceLROnPlateau as the lr_scheduler. This lr "
                "scheduler class does not have the usual step() parameters, and so it can "
                "only be used with step_mode=MANUAL_STEP.\n"
                "\n"
                "For example, if you wanted to step it on every validation step, you might "
                "wrap your lr_scheduler and pass it to a callback like this:\n"
                "\n"
                "class MyLRStepper(PyTorchCallback):\n"
                "    def __init__(self, wrapped_lr_scheduler):\n"
                "        self.wrapped_lr_scheduler = wrapped_lr_scheduler\n"
                "\n"
                "    def on_validation_end(self, metrics):\n"
                '        self.wrapped_lr_scheduler.step(metrics["validation_error"])\n'
            )

    opt = getattr(lr_scheduler, "optimizer", None)
    if opt is not None:
        check.is_in(
            opt,
            self.optimizers,
            "Must use an optimizer that is returned by wrap_optimizer()",
        )

    wrapped = pytorch.LRScheduler(lr_scheduler, step_mode, frequency)
    self.lr_schedulers.append(wrapped)

    # Return the original LR scheduler to the user in case they have customizations that we
    # don't care about.
    return lr_scheduler
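# Hedged usage sketch (not from the source): how wrap_optimizer() and wrap_lr_scheduler()
# are typically paired inside a trial's __init__, assuming the context is Determined's
# PyTorchTrialContext; the model and hyperparameter values are hypothetical.
import torch
from determined import pytorch


def _example_wrap_lr_scheduler(
    context: pytorch.PyTorchTrialContext, model: torch.nn.Module
) -> None:
    optimizer = context.wrap_optimizer(torch.optim.SGD(model.parameters(), lr=0.1))
    # Schedulers other than ReduceLROnPlateau can be stepped automatically;
    # ReduceLROnPlateau requires StepMode.MANUAL_STEP, as enforced above.
    context.wrap_lr_scheduler(
        torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1),
        step_mode=pytorch.LRScheduler.StepMode.STEP_EVERY_EPOCH,
    )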
def from_configs(
    experiment_config: ExperimentConfig,
    rendezvous_info: RendezvousInfo,
    hparams: Dict[str, Any],
) -> "HorovodContext":
    """
    Create the HorovodContext according to experiment config and rendezvous info for this trial.
    """

    # Horovod is always used for multi-machine distributed training. For
    # single-machine multi-GPU training, Horovod is used when native_parallel is
    # disabled.
    multi_machine_trial = rendezvous_info.get_size() > 1
    multi_slot_trial = experiment_config["resources"]["slots_per_trial"] > 1
    use_horovod = multi_machine_trial or (
        multi_slot_trial and not experiment_config.native_parallel_enabled()
    )

    check.is_in("optimizations", experiment_config)
    optimizations_config = cast(Dict[str, Any], experiment_config.get("optimizations"))

    check.is_in("aggregation_frequency", optimizations_config)
    check.is_in("gradient_compression", optimizations_config)
    check.is_in("average_training_metrics", optimizations_config)

    # Help users migrate from the old locations for these settings, in hparams.
    def error_message_removed_from_hparams(removed_hparam: str) -> str:
        return (
            f"Please move `{removed_hparam}` from the `hyperparameters` section of the "
            f"experiment config to the `optimizations` section."
        )

    check.not_in(
        "aggregation_frequency",
        hparams,
        error_message_removed_from_hparams("aggregation_frequency"),
    )
    check.not_in(
        "gradient_compression",
        hparams,
        error_message_removed_from_hparams("gradient_compression"),
    )
    check.not_in(
        "grad_updates_size_file",
        hparams,
        error_message_removed_from_hparams("grad_updates_size_file"),
    )

    hvd_config = HorovodContext(
        use=use_horovod,
        aggregation_frequency=cast(int, optimizations_config.get("aggregation_frequency")),
        fp16_compression=cast(bool, optimizations_config.get("gradient_compression")),
        grad_updates_size_file=optimizations_config.get("grad_updates_size_file", None),
        average_aggregated_gradients=cast(
            bool, optimizations_config.get("average_aggregated_gradients")
        ),
        average_training_metrics=cast(
            bool, optimizations_config.get("average_training_metrics")
        ),
    )

    if hvd_config.use and hvd_config.aggregation_frequency > 1:
        logging.info(
            f"Setting `aggregation_frequency` to {hvd_config.aggregation_frequency} "
            "to optimize training."
        )

    if hvd_config.use and hvd_config.fp16_compression:
        logging.info("Enabling `gradient_compression` to optimize training.")

    return hvd_config
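# Hedged sketch (not from the source): the shape of the `optimizations` section this function
# reads, written as a plain dict; the values shown are illustrative, not authoritative defaults.
optimizations_config_example = {
    "aggregation_frequency": 1,
    "gradient_compression": False,
    "average_aggregated_gradients": True,
    "average_training_metrics": False,
    "grad_updates_size_file": None,
}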