def __init__(self, optimizer, name=None, use_locking=False, device_dense='',
             device_sparse='', compression=Compression.none,
             sparse_as_dense=False, op=Average, gradient_predivide_factor=1.0,
             backward_passes_per_step=1, average_aggregated_gradients=False):
    if name is None:
        name = "Distributed{}".format(type(optimizer).__name__)
    super(_DistributedOptimizer, self).__init__(name=name, use_locking=use_locking)

    self._optimizer = optimizer
    self._allreduce_grads = _make_allreduce_grads_fn(
        name, device_dense, device_sparse, compression, sparse_as_dense, op,
        gradient_predivide_factor)

    self._agg_helper = None
    if backward_passes_per_step > 1:
        if _executing_eagerly():
            raise ValueError(
                "backward_passes_per_step > 1 is not yet supported "
                "for _LegacyOptimizer with eager execution."
            )

        self._agg_helper = LocalGradientAggregationHelper(
            backward_passes_per_step=backward_passes_per_step,
            allreduce_func=self._allreduce_grads,
            sparse_as_dense=sparse_as_dense,
            average_aggregated_gradients=average_aggregated_gradients,
            rank=rank(),
            optimizer_type=LocalGradientAggregationHelper._OPTIMIZER_TYPE_LEGACY,
        )
class _DistributedOptimizer(keras.optimizers.Optimizer):
    _HAS_AGGREGATE_GRAD = True

    def __init__(self, **kwargs):
        self._name = name or "Distributed%s" % self.__class__.__base__.__name__
        self._aggregated_gradients = False
        self._allreduce_grads = hvd._make_allreduce_grads_fn(
            self._name, device_dense, device_sparse, compression,
            sparse_as_dense, op, gradient_predivide_factor)

        self._agg_helper = None
        if backward_passes_per_step > 1:
            if hvd._executing_eagerly():
                self._agg_helper = LocalGradientAggregationHelperEager(
                    backward_passes_per_step=backward_passes_per_step,
                    allreduce_func=self._allreduce_grads,
                    sparse_as_dense=sparse_as_dense,
                    average_aggregated_gradients=average_aggregated_gradients,
                )
            else:
                self._agg_helper = LocalGradientAggregationHelper(
                    backward_passes_per_step=backward_passes_per_step,
                    allreduce_func=self._allreduce_grads,
                    sparse_as_dense=sparse_as_dense,
                    average_aggregated_gradients=average_aggregated_gradients,
                    rank=rank(),
                    optimizer_type=LocalGradientAggregationHelper._OPTIMIZER_TYPE_KERAS,
                )

        super(self.__class__, self).__init__(**kwargs)

    def get_gradients(self, loss, params):
        """
        Compute gradients of all trainable variables.

        See Optimizer.get_gradients() for more info.

        In DistributedOptimizer, get_gradients() is overridden to also
        allreduce the gradients before returning them.
        """
        gradients = super(self.__class__, self).get_gradients(loss, params)
        return self._allreduce(gradients)

    def _aggregate_gradients(self, grads_and_vars):
        grads, vars = list(zip(*grads_and_vars))
        aggregated_grads = self._allreduce(grads)
        if _PRE_TF_2_4_0:
            # Prior to TF 2.4.0, this function was expected to return only a
            # list of grads, not a list of (grad, var) tuples.
            return aggregated_grads
        return list(zip(aggregated_grads, vars))

    def _allreduce(self, grads):
        self._aggregated_gradients = True
        if self._agg_helper:
            return self._agg_helper.compute_gradients(tuple(grads))
        else:
            return self._allreduce_grads(grads)

    def apply_gradients(self, *args, **kwargs):
        if self._agg_helper:
            results = self._agg_helper.apply_gradients(
                lambda: super(self.__class__, self).apply_gradients(*args, **kwargs),
                self,
                *args,
                **kwargs,
            )
        else:
            results = super(self.__class__, self).apply_gradients(*args, **kwargs)

        if not self._aggregated_gradients:
            raise Exception(
                '`apply_gradients()` was called without a call to '
                '`get_gradients()` or `_aggregate_gradients`. If you\'re '
                'using TensorFlow 2.0, please specify '
                '`experimental_run_tf_function=False` in `compile()`.')

        return results
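
# NOTE (added context, hedged): in the Keras wrapper above, `name`, `device_dense`,
# `compression`, `backward_passes_per_step`, etc. are not parameters of __init__;
# in the surrounding source they are closure variables of a factory function that
# defines this class once per wrapped optimizer. The sketch below illustrates that
# factory pattern only; the factory name `create_distributed_optimizer` and its
# reduced argument list are assumptions, not the exact upstream signature.

from tensorflow import keras

def create_distributed_optimizer(optimizer, name=None):
    class _DistributedOptimizer(keras.optimizers.Optimizer):
        def __init__(self, **kwargs):
            # `name` is captured from the enclosing factory call, which is why
            # it can appear here without being an __init__ parameter.
            self._name = name or "Distributed%s" % self.__class__.__base__.__name__
            super(self.__class__, self).__init__(**kwargs)

    # Dynamically subclass the wrapped optimizer's own class so the returned
    # object keeps that optimizer's hyperparameters and config behaviour;
    # self.__class__.__base__ above then names the wrapped optimizer class.
    cls = type(optimizer.__class__.__name__, (optimizer.__class__,),
               dict(_DistributedOptimizer.__dict__))
    return cls.from_config(optimizer.get_config())

# Hypothetical usage: returns an SGD subclass named "SGD" whose methods come
# from the distributed wrapper.
#   opt = create_distributed_optimizer(keras.optimizers.SGD(0.01))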
class _DistributedOptimizer(_LegacyOptimizer):
    """An optimizer that wraps another tf.Optimizer, using an allreduce to
    combine gradient values before applying gradients to model weights."""

    def __init__(self, optimizer, name=None, use_locking=False, device_dense='',
                 device_sparse='', compression=Compression.none,
                 sparse_as_dense=False, op=Average, gradient_predivide_factor=1.0,
                 backward_passes_per_step=1, average_aggregated_gradients=False,
                 num_groups=0):
        if name is None:
            name = "Distributed{}".format(type(optimizer).__name__)
        super(_DistributedOptimizer, self).__init__(name=name, use_locking=use_locking)

        self._optimizer = optimizer
        self._allreduce_grads = _make_allreduce_grads_fn(
            name, device_dense, device_sparse, compression, sparse_as_dense, op,
            gradient_predivide_factor, num_groups)

        self._agg_helper = None
        if backward_passes_per_step > 1:
            if _executing_eagerly():
                raise ValueError(
                    "backward_passes_per_step > 1 is not yet supported "
                    "for _LegacyOptimizer with eager execution.")

            self._agg_helper = LocalGradientAggregationHelper(
                backward_passes_per_step=backward_passes_per_step,
                allreduce_func=self._allreduce_grads,
                sparse_as_dense=sparse_as_dense,
                average_aggregated_gradients=average_aggregated_gradients,
                rank=rank(),
                optimizer_type=LocalGradientAggregationHelper._OPTIMIZER_TYPE_LEGACY,
            )

    def compute_gradients(self, *args, **kwargs):
        """Compute gradients of all trainable variables.

        See Optimizer.compute_gradients() for more info.

        In DistributedOptimizer, compute_gradients() is overridden to also
        allreduce the gradients before returning them.
        """
        gradients = self._optimizer.compute_gradients(*args, **kwargs)
        grads, vars = zip(*gradients)
        if self._agg_helper:
            avg_grads = self._agg_helper.compute_gradients(grads)
        else:
            avg_grads = self._allreduce_grads(grads)
        return list(zip(avg_grads, vars))

    def apply_gradients(self, *args, **kwargs):
        """Calls this same method on the underlying optimizer."""
        if self._agg_helper:
            return self._agg_helper.apply_gradients(
                lambda: self._optimizer.apply_gradients(*args, **kwargs),
                self._optimizer,
                *args,
                **kwargs,
            )

        return self._optimizer.apply_gradients(*args, **kwargs)

    def get_slot(self, *args, **kwargs):
        """Calls this same method on the underlying optimizer."""
        return self._optimizer.get_slot(*args, **kwargs)

    def get_slot_names(self, *args, **kwargs):
        """Calls this same method on the underlying optimizer."""
        return self._optimizer.get_slot_names(*args, **kwargs)

    def variables(self, *args, **kwargs):
        """Calls this same method on the underlying optimizer."""
        return self._optimizer.variables(*args, **kwargs)
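
# NOTE (added context, hedged): the legacy wrapper above is internal; users
# normally obtain it through horovod.tensorflow's DistributedOptimizer entry
# point when wrapping a tf.compat.v1 optimizer. A minimal graph-mode usage
# sketch follows; the toy variable, quadratic loss, and learning-rate scaling
# are illustrative placeholders, not part of the source above.

import tensorflow.compat.v1 as tf
import horovod.tensorflow as hvd

tf.disable_eager_execution()
hvd.init()

# Toy "model": a single variable with a quadratic loss (stand-in for a real graph).
w = tf.get_variable("w", initializer=1.0)
loss = tf.square(w - 3.0)

# Scale the learning rate by the number of workers, then wrap the optimizer.
opt = tf.train.MomentumOptimizer(0.01 * hvd.size(), momentum=0.9)
opt = hvd.DistributedOptimizer(opt)

# compute_gradients()/apply_gradients() (and therefore minimize()) now
# allreduce gradients across workers before they are applied.
train_op = opt.minimize(loss)

# Broadcast initial variable states from rank 0 so all workers start in sync.
hooks = [hvd.BroadcastGlobalVariablesHook(0)]
with tf.train.MonitoredTrainingSession(hooks=hooks) as sess:
    for _ in range(10):
        sess.run(train_op)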
class _DistributedOptimizer(keras.optimizers.Optimizer):
    _HAS_AGGREGATE_GRAD = True

    def __init__(self, **kwargs):
        self._name = name or "Distributed%s" % self.__class__.__base__.__name__
        self._aggregated_gradients = False
        self._allreduce_grads = hvd._make_allreduce_grads_fn(
            self._name, device_dense, device_sparse, compression,
            sparse_as_dense, op, gradient_predivide_factor, groups)

        self._agg_helper = None
        if backward_passes_per_step > 1:
            if hvd._executing_eagerly():
                self._agg_helper = LocalGradientAggregationHelperEager(
                    backward_passes_per_step=backward_passes_per_step,
                    allreduce_func=self._allreduce_grads,
                    sparse_as_dense=sparse_as_dense,
                    average_aggregated_gradients=average_aggregated_gradients,
                )
            else:
                self._agg_helper = LocalGradientAggregationHelper(
                    backward_passes_per_step=backward_passes_per_step,
                    allreduce_func=self._allreduce_grads,
                    sparse_as_dense=sparse_as_dense,
                    average_aggregated_gradients=average_aggregated_gradients,
                    rank=rank(),
                    optimizer_type=LocalGradientAggregationHelper._OPTIMIZER_TYPE_KERAS,
                )

        super(self.__class__, self).__init__(**kwargs)

    def _compute_gradients(self, loss, var_list, grad_loss=None, tape=None):
        """
        Compute gradients of all trainable variables.

        See Optimizer._compute_gradients() for more info.

        In DistributedOptimizer, _compute_gradients() is overridden to also
        allreduce the gradients before returning them.
        """
        if _PRE_TF_2_4_0:
            return super(self.__class__, self)._compute_gradients(
                loss, var_list, grad_loss, tape)

        tape = backprop.GradientTape() if tape is None else tape
        grads_and_vars = super(self.__class__, self)._compute_gradients(  # pylint: disable=protected-access
            loss, var_list, grad_loss, tape=tape)
        grads, weights = list(zip(*grads_and_vars))

        allreduced_grads = self._allreduce(grads, weights)
        return list(zip(allreduced_grads, weights))

    def get_gradients(self, loss, params):
        """
        Compute gradients of all trainable variables.

        See Optimizer.get_gradients() for more info.

        In DistributedOptimizer, get_gradients() is overridden to also
        allreduce the gradients before returning them.
        """
        gradients = super(self.__class__, self).get_gradients(loss, params)
        return self._allreduce(gradients, params)

    def _aggregate_gradients(self, grads_and_vars):
        if _PRE_TF_2_4_0:
            grads, vars = list(zip(*grads_and_vars))
            aggregated_grads = self._allreduce(grads, vars)
            return aggregated_grads
        else:
            return super(self.__class__, self)._aggregate_gradients(grads_and_vars)

    def _allreduce(self, grads, vars):
        self._aggregated_gradients = True
        if self._agg_helper:
            return self._agg_helper.compute_gradients(tuple(grads), tuple(vars))
        else:
            return self._allreduce_grads(grads, vars)

    def apply_gradients(self, *args, **kwargs):
        if self._agg_helper:
            if isinstance(args[0], zip):
                # If grads_and_vars are passed in as a zip object, convert them
                # to a list. This is necessary for TF 2.4+ because args[0] is
                # used in both conditional branches inside
                # _agg_helper.apply_gradients().
                args = list(args)
                args[0] = list(args[0])
                args = tuple(args)

            results = self._agg_helper.apply_gradients(
                lambda: super(self.__class__, self).apply_gradients(*args, **kwargs),
                self,
                *args,
                **kwargs,
            )
        else:
            results = super(self.__class__, self).apply_gradients(*args, **kwargs)

        if _PRE_TF_2_4_0 and not self._aggregated_gradients:
            raise Exception(
                '`apply_gradients()` was called without a call to '
                '`get_gradients()` or `_aggregate_gradients`. If you\'re '
                'using TensorFlow 2.0, please specify '
                '`experimental_run_tf_function=False` in `compile()`.')

        return results
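
# NOTE (added context, hedged): users reach the tf.keras wrapper above through
# horovod.tensorflow.keras's DistributedOptimizer entry point. A minimal usage
# sketch follows; the tiny model, random data, and hyperparameter values are
# illustrative assumptions. On TF 2.0-2.3, compile() may also need
# experimental_run_tf_function=False, as the exception in apply_gradients()
# above indicates.

import numpy as np
import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])

# Wrap the Keras optimizer. backward_passes_per_step > 1 activates the local
# gradient aggregation helpers referenced above, so gradients are accumulated
# locally and only allreduced every N steps.
opt = hvd.DistributedOptimizer(
    tf.keras.optimizers.SGD(0.01 * hvd.size()),
    backward_passes_per_step=4,
    average_aggregated_gradients=True,
)

model.compile(loss="mse", optimizer=opt)

x = np.random.rand(32, 4).astype(np.float32)
y = np.random.rand(32, 1).astype(np.float32)

# Broadcast initial weights from rank 0 so every worker starts identically.
callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
model.fit(x, y, batch_size=8, epochs=1, callbacks=callbacks, verbose=0)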