def DistributedOptimizer(optimizer, name=None,
                         device_dense='', device_sparse='',
                         compression=Compression.none,
                         sparse_as_dense=False,
                         gradient_predivide_factor=1.0,
                         op=Average,
                         num_groups=0):
    """
    An optimizer that wraps another keras.optimizers.Optimizer, using an
    allreduce to average gradient values before applying gradients to model
    weights.

    Args:
        optimizer: Optimizer to use for computing gradients and applying updates.
        name: Optional name prefix for the operations created when applying
            gradients. Defaults to "Distributed" followed by the provided
            optimizer type.
        device_dense: Device to be used for dense tensors. Uses GPU by default
            if Horovod was built with HOROVOD_GPU_OPERATIONS.
        device_sparse: Device to be used for sparse tensors. Uses GPU by default
            if Horovod was built with HOROVOD_GPU_OPERATIONS.
        compression: Compression algorithm used to reduce the amount of data
            sent and received by each worker node. Defaults to not using
            compression.
        sparse_as_dense: Treat all sparse gradients as dense tensors. This can
            help improve performance and memory utilization if the original
            sparse gradient has high density. Defaults to false.
        gradient_predivide_factor: gradient_predivide_factor splits the
            averaging before and after the sum. Gradients are scaled by
            1.0 / gradient_predivide_factor before the sum and
            gradient_predivide_factor / size after the sum.
        op: The reduction operation to use when combining gradients across
            different ranks. Defaults to Average.
        num_groups: Number of groups to assign gradient allreduce ops to for
            explicit grouping. Defaults to no explicit groups.
    """
    if gradient_predivide_factor != 1.0 and rocm_built():
        raise ValueError(
            'gradient_predivide_factor not supported yet with ROCm')

    if op != Average and op != Sum:
        raise ValueError('op currently only supports Average and Sum')

    return _impl.create_distributed_optimizer(
        keras=keras,
        optimizer=optimizer,
        name=name,
        device_dense=device_dense,
        device_sparse=device_sparse,
        compression=compression,
        sparse_as_dense=sparse_as_dense,
        gradient_predivide_factor=gradient_predivide_factor,
        op=op,
        num_groups=num_groups,
    )
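
# Illustrative usage sketch (not part of the source above; it assumes Horovod
# and TensorFlow are installed and that training is launched with horovodrun).
# It shows the typical way this wrapper is applied: initialize Horovod, scale
# the learning rate by the world size, and wrap the base Keras optimizer so
# gradients are allreduce-averaged across ranks before each weight update.
def _example_wrap_keras_optimizer():
    import tensorflow as tf
    import horovod.tensorflow.keras as hvd

    hvd.init()
    # Scale the learning rate by the number of workers, since the effective
    # global batch size grows with hvd.size().
    base_opt = tf.keras.optimizers.SGD(learning_rate=0.01 * hvd.size())
    # fp16 compression reduces the amount of data sent during allreduce.
    return hvd.DistributedOptimizer(base_opt, compression=hvd.Compression.fp16)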
def DistributedOptimizer(
    optimizer,
    name=None,
    use_locking=False,
    device_dense="",
    device_sparse="",
    compression=hvd.Compression.none,
    sparse_as_dense=False,
    backward_passes_per_step=1,
    op=hvd.Average,
    gradient_predivide_factor=1.0,
    average_aggregated_gradients=False,
    num_groups=0,
    fixed_global_batch_size=False,
    hvd_max_size=None,
):
    """Construct a new DistributedOptimizer, which uses another optimizer
    under the hood for computing single-process gradient values and applying
    gradient updates after the gradient values have been combined across all
    the Horovod ranks.

    Args:
        optimizer: Optimizer to use for computing gradients and applying
            updates.
        name: Optional name prefix for the operations created when applying
            gradients. Defaults to "Distributed" followed by the provided
            optimizer type.
        use_locking: Whether to use locking when updating variables.
            See Optimizer.__init__ for more info.
        device_dense: Device to be used for dense tensors. Uses GPU by default
            if Horovod was built with HOROVOD_GPU_OPERATIONS.
        device_sparse: Device to be used for sparse tensors. Uses GPU by
            default if Horovod was built with HOROVOD_GPU_OPERATIONS.
        compression: Compression algorithm used during allreduce to reduce the
            amount of data sent during each parameter update step. Defaults to
            not using compression.
        sparse_as_dense: Treat all sparse gradients as dense tensors. This can
            help improve performance and memory utilization if the original
            sparse gradient has high density. Defaults to false.
        backward_passes_per_step: Number of backward passes to perform before
            calling hvd.allreduce. This allows accumulating updates over
            multiple mini-batches before reducing and applying them.
        op: The reduction operation to use when combining gradients across
            different ranks.
        gradient_predivide_factor: If op == Average, gradient_predivide_factor
            splits the averaging before and after the sum. Gradients are
            scaled by 1.0 / gradient_predivide_factor before the sum and
            gradient_predivide_factor / size after the sum.
        average_aggregated_gradients: Whether to average the aggregated
            gradients that have been accumulated over multiple mini-batches.
            If true, divides gradient updates by backward_passes_per_step.
            Only applicable for backward_passes_per_step > 1.
        num_groups: Number of groups to assign gradient allreduce ops to for
            explicit grouping. Defaults to no explicit groups.
        fixed_global_batch_size: Whether to keep the global batch size fixed
            even though the number of workers changes during elastic
            execution.
        hvd_max_size: The maximum Horovod size for elastic training.
    """
    # *ElasticDL Update*: If `fixed_global_batch_size` == False,
    # just fall back to the native Horovod DistributedOptimizer.
    if not fixed_global_batch_size:
        return hvd.DistributedOptimizer(
            optimizer=optimizer,
            name=name,
            use_locking=use_locking,
            device_dense=device_dense,
            device_sparse=device_sparse,
            compression=compression,
            sparse_as_dense=sparse_as_dense,
            backward_passes_per_step=backward_passes_per_step,
            op=op,
            gradient_predivide_factor=gradient_predivide_factor,
            average_aggregated_gradients=average_aggregated_gradients,
            num_groups=num_groups,
        )

    if gradient_predivide_factor != 1.0:
        if hvd.rocm_built():
            raise ValueError(
                "gradient_predivide_factor not supported yet with ROCm")
        if op != hvd.Average:
            raise ValueError(
                "gradient_predivide_factor not supported with op != Average")

    if op == hvd.Adasum and average_aggregated_gradients:
        raise ValueError(
            "Adasum does not support average_aggregated_gradients == True")

    if isinstance(optimizer, _LegacyOptimizer):
        if op == hvd.Adasum:
            raise ValueError(
                "op == Adasum and fixed_global_batch_size == True "
                "is not yet supported")

        hvd_max_size = complement_value_from_env_if_none(
            hvd_max_size, "WORKER_NUM", int, 1)
        global_batch_count_per_step = hvd_max_size * backward_passes_per_step
        opt = _DistributedOptimizer(
            optimizer=optimizer,
            name=name,
            use_locking=use_locking,
            device_dense=device_dense,
            device_sparse=device_sparse,
            compression=compression,
            sparse_as_dense=sparse_as_dense,
            op=op,
            gradient_predivide_factor=gradient_predivide_factor,
            backward_passes_per_step=backward_passes_per_step,
            average_aggregated_gradients=average_aggregated_gradients,
            num_groups=num_groups,
            global_batch_count_per_step=global_batch_count_per_step,
        )
        optimizer_instances.append(opt)
        return opt
    elif isinstance(optimizer, tf.keras.optimizers.Optimizer):
        raise ValueError(
            "fixed_global_batch_size == True is not supported yet with Keras")
    else:
        raise ValueError(
            "Provided optimizer doesn't inherit from either legacy "
            "TensorFlow or Keras optimizer: %s" % optimizer)
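
# Illustrative usage sketch (not part of the source; the WORKER_NUM environment
# variable and the legacy-optimizer requirement follow from the code above,
# while the concrete values here are placeholders). With
# fixed_global_batch_size=True, the wrapper computes
# global_batch_count_per_step = hvd_max_size * backward_passes_per_step so the
# global batch size stays constant as workers join or leave during elastic
# execution.
def _example_fixed_global_batch_size():
    import tensorflow as tf

    # Only legacy (tf.compat.v1.train) optimizers are supported on this path;
    # Keras optimizers raise a ValueError when fixed_global_batch_size=True.
    base_opt = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)
    # hvd_max_size may be passed explicitly; if left as None, it is read from
    # the WORKER_NUM environment variable (defaulting to 1).
    return DistributedOptimizer(
        base_opt,
        backward_passes_per_step=2,
        fixed_global_batch_size=True,
        hvd_max_size=8,
    )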
def DistributedOptimizer(optimizer, name=None,
                         device_dense='', device_sparse='',
                         compression=Compression.none,
                         sparse_as_dense=False,
                         gradient_predivide_factor=1.0,
                         op=Average,
                         backward_passes_per_step=1,
                         average_aggregated_gradients=False):
    """
    An optimizer that wraps another keras.optimizers.Optimizer, using an
    allreduce to average gradient values before applying gradients to model
    weights.

    Args:
        optimizer: Optimizer to use for computing gradients and applying updates.
        name: Optional name prefix for the operations created when applying
            gradients. Defaults to "Distributed" followed by the provided
            optimizer type.
        device_dense: Device to be used for dense tensors. Uses GPU by default
            if Horovod was built with HOROVOD_GPU_OPERATIONS.
        device_sparse: Device to be used for sparse tensors. Uses GPU by default
            if Horovod was built with HOROVOD_GPU_OPERATIONS.
        compression: Compression algorithm used to reduce the amount of data
            sent and received by each worker node. Defaults to not using
            compression.
        sparse_as_dense: Treat all sparse gradients as dense tensors. This can
            help improve performance and memory utilization if the original
            sparse gradient has high density. Defaults to false.
        gradient_predivide_factor: gradient_predivide_factor splits the
            averaging before and after the sum. Gradients are scaled by
            1.0 / gradient_predivide_factor before the sum and
            gradient_predivide_factor / size after the sum.
        op: The reduction operation to use when combining gradients across
            different ranks. Defaults to Average.
        backward_passes_per_step: Number of backward passes to perform before
            calling hvd.allreduce. This allows accumulating updates over
            multiple mini-batches before reducing and applying them.
        average_aggregated_gradients: Whether to average the aggregated
            gradients that have been accumulated over multiple mini-batches.
            If true, divides gradient updates by backward_passes_per_step.
            Only applicable for backward_passes_per_step > 1.
    """
    if gradient_predivide_factor != 1.0 and rocm_built():
        raise ValueError(
            'gradient_predivide_factor not supported yet with ROCm')

    if op != Average and op != Sum:
        raise ValueError('op currently only supports Average and Sum')

    return _impl.create_distributed_optimizer(
        keras=keras,
        optimizer=optimizer,
        name=name,
        device_dense=device_dense,
        device_sparse=device_sparse,
        compression=compression,
        sparse_as_dense=sparse_as_dense,
        gradient_predivide_factor=gradient_predivide_factor,
        op=op,
        backward_passes_per_step=backward_passes_per_step,
        average_aggregated_gradients=average_aggregated_gradients,
    )
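
# Illustrative usage sketch (not part of the source above; optimizer choice and
# the accumulation factor are placeholders). With backward_passes_per_step=4,
# gradients from four mini-batches are accumulated locally before a single
# allreduce, and average_aggregated_gradients=True divides the accumulated
# update by 4 so its magnitude matches a single-batch step.
def _example_gradient_accumulation():
    import tensorflow as tf
    import horovod.tensorflow.keras as hvd

    hvd.init()
    base_opt = tf.keras.optimizers.Adam(learning_rate=1e-3)
    return hvd.DistributedOptimizer(
        base_opt,
        backward_passes_per_step=4,
        average_aggregated_gradients=True,
    )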
def DistributedOptimizer(optimizer, name=None,
                         device_dense='', device_sparse='',
                         compression=Compression.none,
                         sparse_as_dense=False,
                         gradient_predivide_factor=1.0,
                         op=Average,
                         backward_passes_per_step=1,
                         average_aggregated_gradients=False,
                         num_groups=0, groups=None,
                         process_set=global_process_set):
    """
    An optimizer that wraps another keras.optimizers.Optimizer, using an
    allreduce to average gradient values before applying gradients to model
    weights.

    Args:
        optimizer: Optimizer to use for computing gradients and applying updates.
        name: Optional name prefix for the operations created when applying
            gradients. Defaults to "Distributed" followed by the provided
            optimizer type.
        device_dense: Device to be used for dense tensors. Uses GPU by default
            if Horovod was built with HOROVOD_GPU_OPERATIONS.
        device_sparse: Device to be used for sparse tensors. Uses GPU by default
            if Horovod was built with HOROVOD_GPU_OPERATIONS.
        compression: Compression algorithm used to reduce the amount of data
            sent and received by each worker node. Defaults to not using
            compression.
        sparse_as_dense: Treat all sparse gradients as dense tensors. This can
            help improve performance and memory utilization if the original
            sparse gradient has high density. Defaults to false.
        gradient_predivide_factor: gradient_predivide_factor splits the
            averaging before and after the sum. Gradients are scaled by
            1.0 / gradient_predivide_factor before the sum and
            gradient_predivide_factor / size after the sum.
        op: The reduction operation to use when combining gradients across
            different ranks. Defaults to Average.
        backward_passes_per_step: Number of backward passes to perform before
            calling hvd.allreduce. This allows accumulating updates over
            multiple mini-batches before reducing and applying them.
        average_aggregated_gradients: Whether to average the aggregated
            gradients that have been accumulated over multiple mini-batches.
            If true, divides gradient updates by backward_passes_per_step.
            Only applicable for backward_passes_per_step > 1.
        num_groups: Number of groups to assign gradient allreduce ops to for
            explicit grouping. Defaults to no explicit groups.
        groups: The parameter to group the gradient allreduce ops. Accepted
            values are a non-negative integer or a list of lists of
            tf.Variable. If groups is a non-negative integer, it is the number
            of groups to assign gradient allreduce ops to for explicit
            grouping. If groups is a list of lists of tf.Variable, variables
            in the same inner list will be assigned to the same group, while
            any variable not appearing in any list will form a group by
            itself. Defaults to None, which means no explicit groups.
        process_set: Gradients will only be reduced over Horovod processes
            belonging to this process set. Defaults to the global process set.
""" if gradient_predivide_factor != 1.0 and rocm_built(): raise ValueError( 'gradient_predivide_factor not supported yet with ROCm') if op != Average and op != Sum: raise ValueError('op currently only supports Average and Sum') if num_groups != 0: warnings.warn( 'Parameter `num_groups` has been replaced by `groups` ' 'and will be removed in v0.23.0.', DeprecationWarning) if groups is None: groups = num_groups if groups is not None: if not (isinstance(groups, list) or groups > 0): raise ValueError('groups should be a non-negative integer or ' 'a list of list of tf.Variable.') return _impl.create_distributed_optimizer( keras=keras, optimizer=optimizer, name=name, device_dense=device_dense, device_sparse=device_sparse, compression=compression, sparse_as_dense=sparse_as_dense, gradient_predivide_factor=gradient_predivide_factor, op=op, backward_passes_per_step=backward_passes_per_step, average_aggregated_gradients=average_aggregated_gradients, groups=groups, process_set=process_set, )