def wrap_optimizer(self, optimizer: Any) -> Any:
    """
    Wrap an optimizer object right after it has been created.

    Callers should use the returned object in place of the optimizer they
    passed in. For example, when the optimizer is created within
    ``build_estimator()``, call ``optimizer = wrap_optimizer(optimizer)``
    before handing the optimizer to the Estimator.

    Args:
        optimizer: The optimizer object to wrap. A string name is rejected
            once Horovod is in use.

    Returns:
        ``optimizer`` itself when ``self.env.training`` is false or when
        ``self.hvd_config.use`` is false; otherwise the result of
        ``hvd.DistributedOptimizer`` built around it.
    """
    if not self.env.training:
        return optimizer

    # Record that the user wrapped an optimizer, even if Horovod is off.
    self.optimizer_initialized = True
    if not self.hvd_config.use:
        return optimizer

    check.check_false(
        isinstance(optimizer, str),
        "Please specify an optimizer object instead of using a string name.",
    )
    hvd.require_horovod_type("tensorflow", "EstimatorContext.wrap_optimizer was called.")

    if self.hvd_config.fp16_compression:
        compression = hvd.compression.Compression.fp16
    else:
        compression = hvd.compression.Compression.none

    distributed_optimizer = hvd.DistributedOptimizer(
        optimizer,
        compression=compression,
        aggregation_frequency=self.hvd_config.aggregation_frequency,
        average_aggregated_gradients=self.hvd_config.average_aggregated_gradients,
    )
    logging.debug("Initialized optimizer for distributed and optimized parallel training.")
    return distributed_optimizer
def wrap_dataset(self, dataset: Any, shard_dataset: bool = True) -> Any:
    """
    Wrap a ``tf.data.Dataset`` object right after it has been created.

    Callers should use the returned object in place of the dataset they
    passed in. If several datasets are created (e.g., one for training and
    one for testing), each one should be wrapped independently. E.g., when
    the training dataset is instantiated within ``build_train_spec()``, call
    ``dataset = wrap_dataset(dataset)`` before passing it into
    ``tf.estimator.TrainSpec``.

    Args:
        dataset: tf.data.Dataset
        shard_dataset: When performing multi-slot (distributed) training,
            this controls whether the dataset is sharded so that each
            training process (one per slot) sees unique data. If set to
            False, users must manually configure each process to use unique
            data.

    Returns:
        ``dataset`` unchanged when not training, when Horovod is not in use,
        when ``self.input_from_dataflow`` is set, or when ``shard_dataset``
        is False; otherwise ``dataset.shard(hvd.size(), hvd.rank())``.
    """
    if not self.env.training:
        return dataset

    hvd.require_horovod_type("tensorflow", "EstimatorContext.wrap_dataset was called.")

    self.dataset_initialized = True
    horovod_in_use = self.hvd_config.use
    if not horovod_in_use or self.input_from_dataflow or not shard_dataset:
        # Only report skipped sharding when distributed training is enabled.
        # Testing the config object itself (rather than its ``use`` flag)
        # passes for any ordinary object, so the message would also appear
        # for runs in which no sharding could have happened anyway.
        if horovod_in_use and not shard_dataset:
            logging.info("Dataset sharding skipped.")
        return dataset

    dataset = dataset.shard(hvd.size(), hvd.rank())
    logging.debug(f"Sharded dataset to index {hvd.rank()} of {hvd.size()}.")
    return dataset
def wrap_dataset(self, dataset: Any, shard_dataset: bool = True) -> Any:
    """
    Wrap a ``tf.data.Dataset`` object right after it has been created.

    Callers should use the returned object in place of the dataset they
    passed in. If several datasets are created (e.g., one for training and
    one for validation), each one should be wrapped independently.

    Args:
        dataset: tf.data.Dataset
        shard_dataset: When performing multi-slot (distributed) training,
            this controls whether the dataset is sharded so that each
            training process (one per slot) sees unique data. If set to
            False, users must manually configure each process to use unique
            data.

    Returns:
        ``dataset`` unchanged when Horovod is not in use, when ``dataset``
        is not a ``tf.data.Dataset``, or when ``shard_dataset`` is False;
        otherwise ``dataset.shard(hvd.size(), hvd.rank())``.
    """
    self.dataset_initialized = True

    horovod_in_use = self.hvd_config.use
    if not horovod_in_use or not isinstance(dataset, tf.data.Dataset) or not shard_dataset:
        # Only report skipped sharding when distributed training is enabled.
        # Testing the config object itself (rather than its ``use`` flag)
        # passes for any ordinary object, so the message would also appear
        # for runs in which no sharding could have happened anyway.
        if horovod_in_use and not shard_dataset:
            logging.info("Dataset sharding skipped.")
        return dataset

    hvd.require_horovod_type("tensorflow.keras", "TFKerasContext.wrap_dataset was called.")
    dataset = dataset.shard(hvd.size(), hvd.rank())
    logging.debug(f"Sharded dataset to index {hvd.rank()} of {hvd.size()}.")
    return dataset
def pre_execute_hook(env: det.EnvContext, hvd_config: horovod.HorovodContext) -> None:
    """
    Prepare the process before a PyTorchTrial runs.

    Args:
        env: Environment context; its ``trial_seed`` is forwarded to
            ``PyTorchTrialController._set_random_seeds``.
        hvd_config: Horovod settings; Horovod is only initialized when
            ``hvd_config.use`` is truthy.
    """
    horovod_enabled = hvd_config.use
    if horovod_enabled:
        # Make sure the torch flavor of horovod is the one being initialized.
        hvd.require_horovod_type("torch", "PyTorchTrial is in use.")
        hvd.init()

    seed = env.trial_seed
    PyTorchTrialController._set_random_seeds(seed)
def wrap_optimizer(
    self, optimizer: tf.keras.optimizers.Optimizer
) -> tf.keras.optimizers.Optimizer:
    """
    Wrap a ``tf.keras.optimizers.Optimizer`` object.

    Callers should use the returned object in place of the optimizer they
    passed in. If several optimizers are created, each one should be wrapped
    independently.

    Args:
        optimizer: tf.keras.optimizers.Optimizer

    Returns:
        ``optimizer`` itself when ``self.env.managed_training`` is false,
        when Horovod is not in use, or when it equals
        ``self._compiled_optimizer``; otherwise the object returned by
        ``self._get_horovod_optimizer_if_using_horovod``. Except in the
        first case, the returned object is also appended to
        ``self._wrapped_optimizers``.
    """
    if not self.env.managed_training:
        return optimizer

    logging.debug(f"Processing wrapped optimizer {optimizer}.")

    result = optimizer
    if self.hvd_config.use:
        hvd.require_horovod_type("tensorflow.keras", "TFKerasContext.wrap_optimizer was called.")
        if optimizer == self._compiled_optimizer:
            logging.debug(
                "Skipping wrapping optimizer as it was already wrapped during the compile call."
            )
        else:
            result = self._get_horovod_optimizer_if_using_horovod(optimizer=optimizer)

    self._wrapped_optimizers.append(result)
    return result
def pre_execute_hook(env: det.EnvContext, hvd_config: horovod.HorovodContext) -> None:
    """
    Prepare the process before an EstimatorTrial runs.

    In order: initializes Horovod when enabled, sets the random seed,
    disables TF v2 behavior on TensorFlow >= 2.0.0, sets the default
    TensorFlow session, and applies the tf.estimator monkey-patch.

    Args:
        env: Environment context; ``trial_seed`` is read from it and the
            object is passed on when setting the default session.
        hvd_config: Horovod settings; Horovod is only initialized when
            ``hvd_config.use`` is truthy.
    """
    horovod_enabled = hvd_config.use
    if horovod_enabled:
        # Make sure the tensorflow flavor of horovod is the one initialized.
        hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
        hvd.init()

    # Seed identically on every training process. When using horovod, each
    # worker will receive a unique shard of the dataset.
    EstimatorTrialController.set_random_seed(env.trial_seed)

    installed_tf = version.parse(tf.__version__)
    if installed_tf >= version.parse("2.0.0"):
        tf.compat.v1.disable_v2_behavior()

    # The default session has to exist before any user code is imported. If
    # it isn't set and users call TF code that detects GPUs, the processes
    # would be mapped to all of the GPUs. This default session does not have
    # any effect within the Estimator itself.
    EstimatorTrialController._set_default_tensorflow_session(
        env=env,
        hvd_config=hvd_config,
        session_config=None,
    )

    logging.debug("Applying tf.estimator patches.")

    @monkey_patch.monkey_patch_decorator(_NewCheckpointListenerForEvaluate, "_evaluate")
    def patch_estimator_eval_on_checkpoint(original, *args, **kwargs):  # type: ignore
        # With a single worker and multiple devices,
        # `tf.estimator.train_and_evaluate` attempts to execute `eval_spec`
        # even if `input_fn` or `steps` is None, which causes an error when
        # evaluating the model function. This replacement only logs, so the
        # internal function that ultimately runs the evaluation is skipped.
        logging.info("Skipping %s(*%s, **%s)", original.__name__, args, kwargs)
def pre_execute_hook(env: det.EnvContext, hvd_config: horovod.HorovodContext) -> None:
    """
    Prepare the process before an EstimatorTrial runs.

    In order: initializes Horovod when enabled (optionally pinning
    ``CUDA_VISIBLE_DEVICES``), sets the random seed, disables TF v2 behavior
    on TensorFlow >= 2.0.0, and sets the default TensorFlow session.

    Args:
        env: Environment context; ``experiment_config`` and ``trial_seed``
            are read from it and the object is passed on when setting the
            default session.
        hvd_config: Horovod settings; Horovod is only initialized when
            ``hvd_config.use`` is truthy.
    """
    if hvd_config.use:
        # Make sure the tensorflow flavor of horovod is the one initialized.
        hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
        hvd.init()

        # This option is available for when TF ignores
        # `gpu_options.visible_device_list`.
        # TODO (DET-3762): Remove this once it's no longer necessary.
        data_config = env.experiment_config.get("data", {})
        if data_config.get("set_cuda_visible_devices", False):
            # NOTE(review): the message says "disabling NCCL_P2P_DISABLE",
            # but the variable is set to "1" below; confirm the wording.
            logging.info(
                "Setting `CUDA_VISIBLE_DEVICES` environment variables "
                "and disabling NCCL_P2P_DISABLE"
            )
            os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())
            os.environ["NCCL_P2P_DISABLE"] = "1"

    # Seed identically on every training process. When using horovod, each
    # worker will receive a unique shard of the dataset.
    EstimatorTrialController.set_random_seed(env.trial_seed)

    installed_tf = version.parse(tf.__version__)
    if installed_tf >= version.parse("2.0.0"):
        tf.compat.v1.disable_v2_behavior()

    # The default session has to exist before any user code is imported. If
    # it isn't set and users call TF code that detects GPUs, the processes
    # would be mapped to all of the GPUs.
    EstimatorTrialController._set_default_tensorflow_session(
        env=env,
        hvd_config=hvd_config,
    )
def pre_execute_hook(env: det.EnvContext, hvd_config: horovod.HorovodContext) -> None:
    """
    Prepare the process before a TensorpackTrial runs.

    Args:
        env: Environment context; not read by this hook.
        hvd_config: Horovod settings; Horovod is only initialized when
            ``hvd_config.use`` is truthy.
    """
    horovod_enabled = hvd_config.use
    if horovod_enabled:
        # Make sure the tensorflow flavor of horovod is the one initialized.
        hvd.require_horovod_type("tensorflow", "TensorpackTrial is in use.")
        hvd.init()

    # Seeds are set without passing any arguments.
    TensorpackTrialController._set_random_seeds()
def pre_execute_hook(
    cls: Type["TFKerasTrialController"],
    env: det.EnvContext,
    distributed_backend: det._DistributedBackend,
) -> None:
    """
    Prepare the process before a TFKerasTrial runs.

    Args:
        cls: The controller class; its ``_set_random_seeds`` is called with
            the trial seed.
        env: Environment context; ``trial_seed`` is read from it.
        distributed_backend: Backend selector; Horovod is only initialized
            when ``distributed_backend.use_horovod()`` is truthy.
    """
    horovod_enabled = distributed_backend.use_horovod()
    if horovod_enabled:
        # Make sure the tensorflow.keras flavor of horovod is initialized.
        hvd.require_horovod_type("tensorflow.keras", "TFKerasTrial is in use.")
        hvd.init()

    # Begin from a clean graph before seeding.
    tf.compat.v1.reset_default_graph()

    seed = env.trial_seed
    cls._set_random_seeds(seed)
def pre_execute_hook(env: det.EnvContext, hvd_config: horovod.HorovodContext) -> None:
    """
    Prepare the process before a TFKerasTrial runs.

    In order: initializes Horovod when enabled, resets the default graph,
    sets the random seeds, and - for the Native API only - configures the
    session.

    Args:
        env: Environment context; ``trial_seed`` and ``experiment_config``
            are read from it.
        hvd_config: Horovod settings; Horovod is only initialized when
            ``hvd_config.use`` is truthy.
    """
    horovod_enabled = hvd_config.use
    if horovod_enabled:
        # Make sure the tensorflow.keras flavor of horovod is initialized.
        hvd.require_horovod_type("tensorflow.keras", "TFKerasTrial is in use.")
        hvd.init()

    # Begin from a clean graph before seeding.
    tf.compat.v1.reset_default_graph()
    TFKerasTrialController._set_random_seeds(env.trial_seed)

    if not env.experiment_config.native_enabled():
        return

    # For the Native API the Session must be configured before running user
    # code.
    native_session_config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
    TFKerasTrialController._configure_session(env, hvd_config, native_session_config)
def pre_execute_hook(
    cls: Type["PyTorchTrialController"],
    env: det.EnvContext,
    distributed_backend: det._DistributedBackend,
) -> None:
    """
    Prepare the process before a PyTorchTrial runs.

    Args:
        cls: The controller class; its ``_set_random_seeds`` is called with
            the trial seed.
        env: Environment context; ``trial_seed`` is read from it.
        distributed_backend: Backend selector; ``use_horovod()`` gates
            Horovod initialization and ``use_torch()`` gates
            ``dist.init_process_group``.
    """
    if distributed_backend.use_horovod():
        # Make sure the torch flavor of horovod is the one being initialized.
        hvd.require_horovod_type("torch", "PyTorchTrial is in use.")
        hvd.init()

    if distributed_backend.use_torch():
        # "nccl" when CUDA is available, "gloo" otherwise.
        backend_name = "nccl" if torch.cuda.is_available() else "gloo"
        dist.init_process_group(backend=backend_name)  # type: ignore

    seed = env.trial_seed
    cls._set_random_seeds(seed)
def pre_execute_hook(env: det.EnvContext, hvd_config: horovod.HorovodContext) -> None:
    """
    Prepare the process before an EstimatorTrial runs.

    In order: initializes Horovod when enabled (optionally pinning
    ``CUDA_VISIBLE_DEVICES``), sets the random seed, disables TF v2 behavior
    on TensorFlow >= 2.0.0, sets the default TensorFlow session, and applies
    the tf.estimator monkey-patch.

    Args:
        env: Environment context; ``experiment_config`` and ``trial_seed``
            are read from it and the object is passed on when setting the
            default session.
        hvd_config: Horovod settings; Horovod is only initialized when
            ``hvd_config.use`` is truthy.
    """
    if hvd_config.use:
        # Make sure the tensorflow flavor of horovod is the one initialized.
        hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
        hvd.init()

        # This option is available for when TF ignores
        # `gpu_options.visible_device_list`.
        # TODO (DET-3762): Remove this once it's no longer necessary.
        data_config = env.experiment_config.get("data", {})
        if data_config.get("set_cuda_visible_devices", False):
            # NOTE(review): the message says "disabling NCCL_P2P_DISABLE",
            # but the variable is set to "1" below; confirm the wording.
            logging.info(
                "Setting `CUDA_VISIBLE_DEVICES` environment variables "
                "and disabling NCCL_P2P_DISABLE"
            )
            os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())
            os.environ["NCCL_P2P_DISABLE"] = "1"

    # Seed identically on every training process. When using horovod, each
    # worker will receive a unique shard of the dataset.
    EstimatorTrialController.set_random_seed(env.trial_seed)

    installed_tf = version.parse(tf.__version__)
    if installed_tf >= version.parse("2.0.0"):
        tf.compat.v1.disable_v2_behavior()

    # The default session has to exist before any user code is imported. If
    # it isn't set and users call TF code that detects GPUs, the processes
    # would be mapped to all of the GPUs. This default session does not have
    # any effect within the Estimator itself.
    EstimatorTrialController._set_default_tensorflow_session(
        env=env,
        hvd_config=hvd_config,
        session_config=None,
    )

    logging.debug("Applying tf.estimator patches.")

    @monkey_patch.monkey_patch_decorator(_NewCheckpointListenerForEvaluate, "_evaluate")
    def patch_estimator_eval_on_checkpoint(original, *args, **kwargs):  # type: ignore
        # With a single worker and multiple devices,
        # `tf.estimator.train_and_evaluate` attempts to execute `eval_spec`
        # even if `input_fn` or `steps` is None, which causes an error when
        # evaluating the model function. This replacement only logs, so the
        # internal function that ultimately runs the evaluation is skipped.
        logging.info("Skipping %s(*%s, **%s)", original.__name__, args, kwargs)
def wrap_optimizer(self, optimizer: Any) -> Any:
    """
    Wrap an optimizer object right after it has been created.

    Callers should use the returned object in place of the optimizer they
    passed in. For example, when the optimizer is created within
    ``build_estimator()``, call ``optimizer = wrap_optimizer(optimizer)``
    before handing the optimizer to the Estimator.

    Args:
        optimizer: The optimizer object to wrap. A string name is rejected
            once Horovod is in use.

    Returns:
        ``optimizer`` itself when ``self.env.managed_training`` is false or
        when ``self.hvd_config.use`` is false; otherwise the result of
        ``hvd.DistributedOptimizer`` built around it.
    """
    if not self.env.managed_training:
        return optimizer

    # Record that the user wrapped an optimizer, even if Horovod is off.
    self.optimizer_initialized = True
    if not self.hvd_config.use:
        return optimizer

    check.check_false(
        isinstance(optimizer, str),
        "Please specify an optimizer object instead of using a string name.",
    )
    hvd.require_horovod_type("tensorflow", "EstimatorContext.wrap_optimizer was called.")

    fp16_requested = self.hvd_config.fp16_compression

    # The signature of our horovod optimizer changed after we rebased onto
    # 0.21, so pick whichever aggregation keyword this build accepts.
    accepted_params = inspect.signature(hvd.DistributedOptimizer).parameters
    if fp16_requested:
        compression = hvd.compression.Compression.fp16
    else:
        compression = hvd.compression.Compression.none

    optimizer_kwargs = {
        "compression": compression,
        "average_aggregated_gradients": self.hvd_config.average_aggregated_gradients,
    }
    if "aggregation_frequency" in accepted_params:
        frequency_keyword = "aggregation_frequency"
    else:
        frequency_keyword = "backward_passes_per_step"
    optimizer_kwargs[frequency_keyword] = self.hvd_config.aggregation_frequency

    distributed_optimizer = hvd.DistributedOptimizer(optimizer, **optimizer_kwargs)
    logging.debug("Initialized optimizer for distributed and optimized parallel training.")
    return distributed_optimizer
def pre_execute_hook(env: det.EnvContext, hvd_config: horovod.HorovodContext) -> None:
    """
    Prepare the process before an EstimatorTrial runs.

    In order: initializes Horovod when enabled, sets the random seed (offset
    by the process rank when tensorpack dataflows provide the input), and
    disables TF v2 behavior on TensorFlow >= 2.0.0.

    Args:
        env: Environment context; ``experiment_config`` and ``trial_seed``
            are read from it.
        hvd_config: Horovod settings; Horovod is only initialized when
            ``hvd_config.use`` is truthy.
    """
    if hvd_config.use:
        # Make sure the tensorflow flavor of horovod is the one initialized.
        hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
        hvd.init()

    if env.experiment_config.input_from_dataflow():
        logging.debug("Using tensorpack dataflows as input.")
        # Each process gets its own seed: the trial seed plus its rank
        # (rank 0 when horovod is not in use).
        rank_offset = hvd.rank() if hvd_config.use else 0
        seed = env.trial_seed + rank_offset
    else:
        # Set identical random seeds on all training processes. When using
        # horovod, each worker will receive a unique shard of the dataset.
        seed = env.trial_seed
    EstimatorTrialController.set_random_seed(seed)

    installed_tf = version.parse(tf.__version__)
    if installed_tf >= version.parse("2.0.0"):
        tf.compat.v1.disable_v2_behavior()
def wrap_dataset(self, dataset: Any) -> Any:
    """
    Wrap a ``tf.data.Dataset`` object right after it has been created.

    Callers should use the returned object in place of the dataset they
    passed in. If several datasets are created (e.g., one for training and
    one for testing), each one should be wrapped independently. E.g., when
    the training dataset is instantiated within ``build_train_spec()``, call
    ``dataset = wrap_dataset(dataset)`` before passing it into
    ``tf.estimator.TrainSpec``.

    Args:
        dataset: The dataset to wrap.

    Returns:
        ``dataset`` unchanged when Horovod is not in use or when
        ``self.input_from_dataflow`` is set; otherwise
        ``dataset.shard(hvd.size(), hvd.rank())``.
    """
    hvd.require_horovod_type("tensorflow", "EstimatorContext.wrap_dataset was called.")

    # Record that the user wrapped a dataset, even if no sharding happens.
    self.dataset_initialized = True

    sharding_applies = self.hvd_config.use and not self.input_from_dataflow
    if not sharding_applies:
        return dataset

    sharded = dataset.shard(hvd.size(), hvd.rank())
    logging.debug(f"Sharded dataset to index {hvd.rank()} of {hvd.size()}.")
    return sharded
def wrap_dataset(self, dataset: Any) -> Any:
    """
    Wrap a ``tf.data.Dataset`` object right after it has been created.

    Callers should use the returned object in place of the dataset they
    passed in. If several datasets are created (e.g., one for training and
    one for testing), each one should be wrapped independently.

    Args:
        dataset: tf.data.Dataset

    Returns:
        ``dataset`` unchanged when Horovod is not in use or when ``dataset``
        is not a ``tf.data.Dataset``; otherwise
        ``dataset.shard(hvd.size(), hvd.rank())``.
    """
    # Record that the user wrapped a dataset, even if no sharding happens.
    self.dataset_initialized = True

    sharding_applies = self.hvd_config.use and isinstance(dataset, tf.data.Dataset)
    if not sharding_applies:
        return dataset

    hvd.require_horovod_type("tensorflow.keras", "TFKerasContext.wrap_dataset was called.")
    sharded = dataset.shard(hvd.size(), hvd.rank())
    logging.debug(f"Sharded dataset to index {hvd.rank()} of {hvd.size()}.")
    return sharded