Example #1
File: __init__.py Project: zuston/horovod
    class _DistributedOptimizer(_LegacyOptimizer):
        """An optimizer that wraps another tf.Optimizer, using an allreduce to
        combine gradient values before applying gradients to model weights."""
        def __init__(self,
                     optimizer,
                     name=None,
                     use_locking=False,
                     device_dense='',
                     device_sparse='',
                     compression=Compression.none,
                     sparse_as_dense=False,
                     op=Average,
                     gradient_predivide_factor=1.0,
                     backward_passes_per_step=1,
                     average_aggregated_gradients=False,
                     num_groups=0):
            if name is None:
                name = "Distributed{}".format(type(optimizer).__name__)
            super(_DistributedOptimizer,
                  self).__init__(name=name, use_locking=use_locking)

            self._optimizer = optimizer
            self._allreduce_grads = _make_allreduce_grads_fn(
                name, device_dense, device_sparse, compression,
                sparse_as_dense, op, gradient_predivide_factor, num_groups)

            self._agg_helper = None
            if backward_passes_per_step > 1:
                if _executing_eagerly():
                    raise ValueError(
                        "backward_passes_per_step > 1 is not yet supported "
                        "for _LegacyOptimizer with eager execution.")

                self._agg_helper = LocalGradientAggregationHelper(
                    backward_passes_per_step=backward_passes_per_step,
                    allreduce_func=self._allreduce_grads,
                    sparse_as_dense=sparse_as_dense,
                    average_aggregated_gradients=average_aggregated_gradients,
                    rank=rank(),
                    optimizer_type=LocalGradientAggregationHelper._OPTIMIZER_TYPE_LEGACY,
                )

        def compute_gradients(self, *args, **kwargs):
            """Compute gradients of all trainable variables.

            See Optimizer.compute_gradients() for more info.

            In DistributedOptimizer, compute_gradients() is overridden to also
            allreduce the gradients before returning them.
            """
            gradients = self._optimizer.compute_gradients(*args, **kwargs)
            grads, vars = zip(*gradients)
            if self._agg_helper:
                avg_grads = self._agg_helper.compute_gradients(grads)
            else:
                avg_grads = self._allreduce_grads(grads)
            return list(zip(avg_grads, vars))

        def apply_gradients(self, *args, **kwargs):
            """Calls this same method on the underlying optimizer."""
            if self._agg_helper:
                return self._agg_helper.apply_gradients(
                    lambda: self._optimizer.apply_gradients(*args, **kwargs),
                    self._optimizer,
                    *args,
                    **kwargs,
                )

            return self._optimizer.apply_gradients(*args, **kwargs)

        def get_slot(self, *args, **kwargs):
            """Calls this same method on the underlying optimizer."""
            return self._optimizer.get_slot(*args, **kwargs)

        def get_slot_names(self, *args, **kwargs):
            """Calls this same method on the underlying optimizer."""
            return self._optimizer.get_slot_names(*args, **kwargs)

        def variables(self, *args, **kwargs):
            """Calls this same method on the underlying optimizer."""
            return self._optimizer.variables(*args, **kwargs)
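
The legacy wrapper above is not meant to be constructed directly; Horovod's hvd.DistributedOptimizer() factory returns it when a tf.compat.v1 optimizer is passed in. A minimal usage sketch follows; the optimizer choice and learning rate are illustrative assumptions, not taken from the example above.

    import tensorflow as tf
    import horovod.tensorflow as hvd

    hvd.init()

    # A common convention is to scale the learning rate by the number of
    # workers; 0.01 is an arbitrary placeholder value.
    opt = tf.compat.v1.train.AdamOptimizer(0.01 * hvd.size())

    # The returned object is a _DistributedOptimizer: compute_gradients()
    # allreduces the gradients across workers before they reach
    # apply_gradients().
    opt = hvd.DistributedOptimizer(opt, backward_passes_per_step=1)
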
Example #2
File: __init__.py Project: yikkin/horovod
    class _DistributedOptimizer(keras.optimizers.Optimizer):
        _HAS_AGGREGATE_GRAD = True

        def __init__(self, **kwargs):
            # name, device_dense, compression, and the other configuration
            # values referenced below are free variables captured from the
            # enclosing factory function that builds this class dynamically.
            self._name = name or "Distributed%s" % self.__class__.__base__.__name__
            self._aggregated_gradients = False

            self._allreduce_grads = hvd._make_allreduce_grads_fn(
                self._name, device_dense, device_sparse, compression,
                sparse_as_dense, op, gradient_predivide_factor)

            self._agg_helper = None
            if backward_passes_per_step > 1:
                if hvd._executing_eagerly():
                    self._agg_helper = LocalGradientAggregationHelperEager(
                        backward_passes_per_step=backward_passes_per_step,
                        allreduce_func=self._allreduce_grads,
                        sparse_as_dense=sparse_as_dense,
                        average_aggregated_gradients=average_aggregated_gradients,
                    )
                else:
                    self._agg_helper = LocalGradientAggregationHelper(
                        backward_passes_per_step=backward_passes_per_step,
                        allreduce_func=self._allreduce_grads,
                        sparse_as_dense=sparse_as_dense,
                        average_aggregated_gradients=average_aggregated_gradients,
                        rank=rank(),
                        optimizer_type=LocalGradientAggregationHelper._OPTIMIZER_TYPE_KERAS,
                    )

            super(self.__class__, self).__init__(**kwargs)

        def get_gradients(self, loss, params):
            """
            Compute gradients of all trainable variables.

            See Optimizer.get_gradients() for more info.

            In DistributedOptimizer, get_gradients() is overridden to also
            allreduce the gradients before returning them.
            """
            gradients = super(self.__class__, self).get_gradients(loss, params)
            return self._allreduce(gradients)

        def _aggregate_gradients(self, grads_and_vars):
            grads, vars = list(zip(*grads_and_vars))
            aggregated_grads = self._allreduce(grads)
            if _PRE_TF_2_4_0:
                # Prior to TF 2.4.0, this function was expected to return only a list of
                # grads, not a list of (grad, var) tuples.
                return aggregated_grads
            return list(zip(aggregated_grads, vars))

        def _allreduce(self, grads):
            self._aggregated_gradients = True

            if self._agg_helper:
                return self._agg_helper.compute_gradients(tuple(grads))
            else:
                return self._allreduce_grads(grads)

        def apply_gradients(self, *args, **kwargs):
            if self._agg_helper:
                results = self._agg_helper.apply_gradients(
                    lambda: super(self.__class__, self).apply_gradients(
                        *args, **kwargs),
                    self,
                    *args,
                    **kwargs,
                )
            else:
                results = super(self.__class__,
                                self).apply_gradients(*args, **kwargs)

            if not self._aggregated_gradients:
                raise Exception(
                    '`apply_gradients()` was called without a call to '
                    '`get_gradients()` or `_aggregate_gradients`. If you\'re '
                    'using TensorFlow 2.0, please specify '
                    '`experimental_run_tf_function=False` in `compile()`.')

            return results
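
This Keras variant is built dynamically by a factory function in horovod.tensorflow.keras / horovod.keras, which is why name, device_dense, compression, and the other configuration values appear as free variables: they are captured from the factory's arguments. A rough usage sketch with tf.keras follows; the model, loss, and learning rate are placeholder assumptions.

    import tensorflow as tf
    import horovod.tensorflow.keras as hvd

    hvd.init()

    model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
    opt = tf.keras.optimizers.SGD(0.01 * hvd.size())

    # The wrapper allreduces gradients in get_gradients() /
    # _aggregate_gradients(); with backward_passes_per_step > 1 the
    # aggregation helper accumulates locally and applies only every Nth step.
    opt = hvd.DistributedOptimizer(opt,
                                   backward_passes_per_step=2,
                                   average_aggregated_gradients=True)

    # On TF 2.0/2.1, the exception raised in apply_gradients() above suggests
    # passing experimental_run_tf_function=False to compile().
    model.compile(loss='mse', optimizer=opt)
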
Example #3
    class _DistributedOptimizer(keras.optimizers.Optimizer):
        _HAS_AGGREGATE_GRAD = True

        def __init__(self, **kwargs):
            # As in Example #2, name, device_dense, groups, and the other
            # configuration values below are free variables captured from the
            # enclosing factory function that builds this class dynamically.
            self._name = name or "Distributed%s" % self.__class__.__base__.__name__
            self._aggregated_gradients = False

            self._allreduce_grads = hvd._make_allreduce_grads_fn(
                self._name,
                device_dense,
                device_sparse,
                compression,
                sparse_as_dense,
                op,
                gradient_predivide_factor,
                groups)

            self._agg_helper = None
            if backward_passes_per_step > 1:
                if hvd._executing_eagerly():
                    self._agg_helper = LocalGradientAggregationHelperEager(
                        backward_passes_per_step=backward_passes_per_step,
                        allreduce_func=self._allreduce_grads,
                        sparse_as_dense=sparse_as_dense,
                        average_aggregated_gradients=average_aggregated_gradients,
                    )
                else:
                    self._agg_helper = LocalGradientAggregationHelper(
                        backward_passes_per_step=backward_passes_per_step,
                        allreduce_func=self._allreduce_grads,
                        sparse_as_dense=sparse_as_dense,
                        average_aggregated_gradients=average_aggregated_gradients,
                        rank=rank(),
                        optimizer_type=LocalGradientAggregationHelper._OPTIMIZER_TYPE_KERAS,
                    )

            super(self.__class__, self).__init__(**kwargs)

        def _compute_gradients(self, loss, var_list, grad_loss=None, tape=None):
            """
            Compute gradients of all trainable variables.

            See Optimizer._compute_gradients() for more info.

            In DistributedOptimizer, _compute_gradients() is overridden to also
            allreduce the gradients before returning them.
            """
            if _PRE_TF_2_4_0:
                return super(self.__class__, self)._compute_gradients(
                    loss, var_list, grad_loss, tape)

            tape = backprop.GradientTape() if tape is None else tape
            grads_and_vars = super(self.__class__, self)._compute_gradients(
                # pylint: disable=protected-access
                loss,
                var_list,
                grad_loss,
                tape=tape)
            grads, weights = list(zip(*grads_and_vars))

            allreduced_grads = self._allreduce(grads, weights)
            return list(zip(allreduced_grads, weights))

        def get_gradients(self, loss, params):
            """
            Compute gradients of all trainable variables.

            See Optimizer.get_gradients() for more info.

            In DistributedOptimizer, get_gradients() is overridden to also
            allreduce the gradients before returning them.
            """
            gradients = super(self.__class__, self).get_gradients(loss, params)
            return self._allreduce(gradients, params)

        def _aggregate_gradients(self, grads_and_vars):
            if _PRE_TF_2_4_0:
                grads, vars = list(zip(*grads_and_vars))
                aggregated_grads = self._allreduce(grads, vars)
                return aggregated_grads
            else:
                return super(self.__class__, self)._aggregate_gradients(
                    grads_and_vars)

        def _allreduce(self, grads, vars):
            self._aggregated_gradients = True

            if self._agg_helper:
                return self._agg_helper.compute_gradients(tuple(grads), tuple(vars))
            else:
                return self._allreduce_grads(grads, vars)

        def apply_gradients(self, *args, **kwargs):
            if self._agg_helper:
                if isinstance(args[0], zip):
                    # If grads_and_vars is passed in as a zip object,
                    # convert it to a list. This is necessary for TF 2.4+
                    # because args[0] is used in both conditional branches
                    # inside _agg_helper.apply_gradients().
                    args = list(args)
                    args[0] = list(args[0])
                    args = tuple(args)

                results = self._agg_helper.apply_gradients(
                    lambda: super(self.__class__, self).apply_gradients(*args, **kwargs),
                    self,
                    *args,
                    **kwargs,
                )
            else:
                results = super(self.__class__, self).apply_gradients(*args, **kwargs)

            if _PRE_TF_2_4_0 and not self._aggregated_gradients:
                raise Exception('`apply_gradients()` was called without a call to '
                                '`get_gradients()` or `_aggregate_gradients`. If you\'re '
                                'using TensorFlow 2.0, please specify '
                                '`experimental_run_tf_function=False` in `compile()`.')

            return results
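
Compared with Example #2, this version also passes the variables through _allreduce() (so they can reach the grouped allreduce selected by the groups argument) and follows the TF 2.4+ contract, where _compute_gradients() carries the tape and _aggregate_gradients() defers to the base class. Below is a hedged sketch of a custom training step that exercises the TF 2.4+ path; the model, data, and hyperparameters are illustrative.

    import tensorflow as tf
    import horovod.tensorflow.keras as hvd

    hvd.init()

    model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
    opt = hvd.DistributedOptimizer(tf.keras.optimizers.SGD(0.01),
                                   backward_passes_per_step=4)

    @tf.function
    def train_step(x, y):
        with tf.GradientTape() as tape:
            loss = tf.reduce_mean(tf.square(model(x) - y))
        # On TF 2.4+, minimize() routes through _compute_gradients(), which
        # allreduces the gradients; with backward_passes_per_step=4 the
        # aggregation helper applies them only every fourth call.
        opt.minimize(loss, model.trainable_variables, tape=tape)
        return loss
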