Example #1
    def DistributedGradientTape(gradtape,
                                device_dense='',
                                device_sparse='',
                                compression=Compression.none,
                                sparse_as_dense=False,
                                op=Average,
                                gradient_predivide_factor=1.0):
        """A tape that wraps another tf.GradientTape, using an allreduce to
        combine gradient values before applying gradients to model weights.

        Args:
          gradtape:
            GradientTape to use for computing gradients and applying updates.
          device_dense:
            Device to be used for dense tensors. Uses GPU by default
            if Horovod was built with HOROVOD_GPU_OPERATIONS.
          device_sparse:
            Device to be used for sparse tensors. Uses GPU by default
            if Horovod was built with HOROVOD_GPU_OPERATIONS.
          compression:
            Compression algorithm used during allreduce to reduce the amount
            of data sent during each parameter update step.  Defaults to
            not using compression.
          sparse_as_dense:
            Treat all sparse gradients as dense tensors.  This can help improve
            performance and memory utilization if the original sparse gradient
            has high density.  Defaults to false.
          op:
            The reduction operation to use when combining gradients across
            different ranks.
          gradient_predivide_factor:
            If op == Average, gradient_predivide_factor splits the averaging
            before and after the sum. Gradients are scaled by
            1.0 / gradient_predivide_factor before the sum and
            gradient_predivide_factor / size after the sum.
        """
        if gradient_predivide_factor != 1.0:
            if rocm_built():
                raise ValueError(
                    'gradient_predivide_factor not supported yet with ROCm')
            if op != Average:
                raise ValueError(
                    'gradient_predivide_factor not supported with op != Average'
                )

        cls = type(gradtape.__class__.__name__, (gradtape.__class__, ),
                   dict(_DistributedGradientTape.__dict__))
        if hasattr(gradtape, '_watch_accessed_variables'):
            return cls(gradtape._tape, device_dense, device_sparse,
                       compression, sparse_as_dense, op,
                       gradient_predivide_factor, gradtape._persistent,
                       gradtape._watch_accessed_variables)
        else:
            return cls(gradtape._tape, device_dense, device_sparse,
                       compression, sparse_as_dense, op,
                       gradient_predivide_factor, gradtape._persistent)
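
A minimal usage sketch for the tape wrapper above, assuming Horovod and TensorFlow are installed and the script is launched with horovodrun; the toy model, loss, and input data are illustrative assumptions, not part of the example:

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# Toy model and loss; the learning rate is scaled by the number of workers.
model = tf.keras.Sequential([tf.keras.Input(shape=(4,)),
                             tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.SGD(0.01 * hvd.size())
loss_fn = tf.keras.losses.MeanSquaredError()

@tf.function
def train_step(x, y, first_batch):
    with tf.GradientTape() as tape:
        loss = loss_fn(y, model(x, training=True))
    # Wrap the tape so gradients are allreduced across ranks before being applied.
    tape = hvd.DistributedGradientTape(tape)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    # After the first step, broadcast initial variable state from rank 0 so all
    # ranks start from the same weights (optimizer state can be broadcast the same way).
    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
    return loss

loss = train_step(tf.zeros([2, 4]), tf.zeros([2, 1]), first_batch=True)
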
Example #2
def DistributedOptimizer(optimizer,
                         name=None,
                         use_locking=False,
                         device_dense='',
                         device_sparse='',
                         compression=Compression.none,
                         sparse_as_dense=False,
                         backward_passes_per_step=1,
                         op=Average,
                         gradient_predivide_factor=1.0,
                         average_aggregated_gradients=False,
                         num_groups=0):
    """Construct a new DistributedOptimizer, which uses another optimizer
    under the hood for computing single-process gradient values and
    applying gradient updates after the gradient values have been combined
    across all the Horovod ranks.

    Args:
      optimizer:
        Optimizer to use for computing gradients and applying updates.
      name:
        Optional name prefix for the operations created when applying
        gradients. Defaults to "Distributed" followed by the provided
        optimizer type.
      use_locking:
        Whether to use locking when updating variables.
        See Optimizer.__init__ for more info.
      device_dense:
        Device to be used for dense tensors. Uses GPU by default
        if Horovod was built with HOROVOD_GPU_OPERATIONS.
      device_sparse:
        Device to be used for sparse tensors. Uses GPU by default
        if Horovod was built with HOROVOD_GPU_OPERATIONS.
      compression:
        Compression algorithm used during allreduce to reduce the amount
        of data sent during each parameter update step.  Defaults to
        not using compression.
      sparse_as_dense:
        Treat all sparse gradients as dense tensors.  This can help improve
        performance and memory utilization if the original sparse gradient
        has high density.  Defaults to false.
      backward_passes_per_step:
        Number of backward passes to perform before calling hvd.allreduce.
        This allows accumulating updates over multiple mini-batches before
        reducing and applying them.
      op:
        The reduction operation to use when combining gradients across
        different ranks.
      gradient_predivide_factor:
        If op == Average, gradient_predivide_factor splits the averaging
        before and after the sum. Gradients are scaled by
        1.0 / gradient_predivide_factor before the sum and
        gradient_predivide_factor / size after the sum.
      average_aggregated_gradients:
        Whether to average the aggregated gradients that have been accumulated
        over multiple mini-batches. If true, divides gradient updates by
        backward_passes_per_step. Only applicable for backward_passes_per_step > 1.
      num_groups:
        Number of groups to assign gradient allreduce ops to for explicit
        grouping. Defaults to no explicit groups.
    """
    if gradient_predivide_factor != 1.0:
        if rocm_built():
            raise ValueError(
                'gradient_predivide_factor not supported yet with ROCm')
        if op != Average:
            raise ValueError(
                'gradient_predivide_factor not supported with op != Average')

    if op == Adasum and average_aggregated_gradients:
        raise ValueError(
            'Adasum does not support average_aggregated_gradients == True')

    if isinstance(optimizer, _LegacyOptimizer):
        if op == Adasum:
            return _DistributedAdasumOptimizer(optimizer, name, use_locking,
                                               device_dense, device_sparse,
                                               compression,
                                               backward_passes_per_step)

        return _DistributedOptimizer(
            optimizer=optimizer,
            name=name,
            use_locking=use_locking,
            device_dense=device_dense,
            device_sparse=device_sparse,
            compression=compression,
            sparse_as_dense=sparse_as_dense,
            op=op,
            gradient_predivide_factor=gradient_predivide_factor,
            backward_passes_per_step=backward_passes_per_step,
            average_aggregated_gradients=average_aggregated_gradients,
            num_groups=num_groups)
    elif isinstance(optimizer, tf.keras.optimizers.Optimizer):
        if op == Adasum:
            raise ValueError('op == Adasum is not supported yet with Keras')

        import horovod.tensorflow.keras as hvd_k
        return hvd_k.DistributedOptimizer(
            optimizer=optimizer,
            name=name,
            device_dense=device_dense,
            device_sparse=device_sparse,
            compression=compression,
            sparse_as_dense=sparse_as_dense,
            gradient_predivide_factor=gradient_predivide_factor,
            backward_passes_per_step=backward_passes_per_step,
            average_aggregated_gradients=average_aggregated_gradients,
        )
    else:
        raise ValueError(
            'Provided optimizer doesn\'t inherit from either legacy '
            'TensorFlow or Keras optimizer: %s' % optimizer)
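
A usage sketch for DistributedOptimizer with a legacy (tf.compat.v1) optimizer, assuming Horovod is installed and the job is launched with horovodrun; the toy variable and quadratic loss are illustrative assumptions:

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
tf.compat.v1.disable_eager_execution()

# A toy TF1-style graph: a single scalar variable and a quadratic loss.
w = tf.compat.v1.get_variable('w', shape=[],
                              initializer=tf.compat.v1.zeros_initializer())
loss = (w - 3.0) ** 2

# Scale the learning rate by the number of workers, then wrap the optimizer so
# per-rank gradients are allreduced before they are applied.
opt = tf.compat.v1.train.GradientDescentOptimizer(0.01 * hvd.size())
opt = hvd.DistributedOptimizer(opt, op=hvd.Average, backward_passes_per_step=1)
train_op = opt.minimize(loss)

# Broadcast the initial variable state from rank 0 to the other ranks.
bcast_op = hvd.broadcast_global_variables(0)

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    sess.run(bcast_op)
    for _ in range(10):
        sess.run(train_op)
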
Example #3
def allreduce(tensor,
              average=None,
              device_dense='',
              device_sparse='',
              compression=Compression.none,
              op=None,
              prescale_factor=1.0,
              postscale_factor=1.0,
              name=None):
    """Perform an allreduce on a tf.Tensor or tf.IndexedSlices.

    This function performs a bandwidth-optimal ring allreduce on the input
    tensor. If the input is a tf.IndexedSlices, the function instead does an
    allgather on the values and the indices, effectively doing an allreduce on
    the represented tensor.

    Arguments:
        tensor: tf.Tensor, tf.Variable, or tf.IndexedSlices to reduce.
                The shape of the input must be identical across all ranks.
        average:
            .. warning:: .. deprecated:: 0.19.0

                Use `op` instead. Will be removed in v0.21.0.

        device_dense: Device to be used for dense tensors. Uses GPU by default
                      if Horovod was built with HOROVOD_GPU_OPERATIONS.
        device_sparse: Device to be used for sparse tensors. Uses GPU by default
                       if Horovod was built with HOROVOD_GPU_OPERATIONS.
        compression: Compression algorithm used to reduce the amount of data
                     sent and received by each worker node.  Defaults to not
                     using compression.
        op: The reduction operation to combine tensors across different ranks.
            Defaults to Average if None is given.
        prescale_factor: Multiplicative factor to scale tensor before allreduce.
        postscale_factor: Multiplicative factor to scale tensor after allreduce.
        name: A name of the allreduce operation.

    Returns:
        A tensor of the same shape and type as `tensor`, summed across all
        processes.
    """
    op = handle_average_backwards_compatibility(op, average)

    if isinstance(tensor, tf.IndexedSlices):
        # TODO: Need to fix this to actually call Adasum
        if op == Adasum:
            raise NotImplementedError(
                'The Adasum reduction does not currently support sparse tensors. As a '
                'workaround please pass sparse_as_dense=True to DistributedOptimizer'
            )
        with tf.device(device_sparse):
            # For IndexedSlices, do two allgathers instead of an allreduce.
            horovod_size = tf.cast(size_op() if int(
                os.environ.get("HOROVOD_ELASTIC", 0)) else size(),
                                   dtype=tensor.values.dtype)
            values = allgather(tensor.values)
            indices = allgather(tensor.indices)

            # To make this operation into an average, divide allgathered values by
            # the Horovod size.
            new_values = (values / horovod_size) if op == Average else values
        return tf.IndexedSlices(new_values,
                                indices,
                                dense_shape=tensor.dense_shape)
    else:
        average_in_framework = False
        if rocm_built():
            # For ROCm, perform averaging at framework level
            average_in_framework = op == Average or op == Adasum
            op = Sum if op == Average else op

        with tf.device(device_dense):
            horovod_size = tf.cast(size_op() if int(
                os.environ.get("HOROVOD_ELASTIC", 0)) else size(),
                                   dtype=tensor.dtype)
            tensor_compressed, ctx = compression.compress(tensor)
            summed_tensor_compressed = _allreduce(
                tensor_compressed,
                op=op,
                prescale_factor=prescale_factor,
                postscale_factor=postscale_factor,
                name=name)
            summed_tensor = compression.decompress(summed_tensor_compressed,
                                                   ctx)
            if op == Adasum:
                if 'CPU' not in tensor.device and gpu_available('tensorflow'):
                    if nccl_built():
                        if not is_homogeneous:
                            raise NotImplementedError(
                                'Running GPU Adasum on heterogeneous cluster is not supported yet.'
                            )
                        elif not check_num_rank_power_of_2(
                                int(size() / local_size())):
                            raise NotImplementedError(
                                'Running GPU Adasum with non-power of 2 nodes is not supported yet.'
                            )
                        if rocm_built():
                            horovod_local_size = tf.cast(
                                local_size_op() if int(
                                    os.environ.get("HOROVOD_ELASTIC",
                                                   0)) else local_size(),
                                dtype=tensor.dtype)
                            new_tensor = summed_tensor / horovod_local_size
                        else:
                            new_tensor = summed_tensor
                    else:
                        warnings.warn(
                            'Adasum reduction does not currently support GPU reduction using MPI. Tensors '
                            'are copied to CPU memory instead. To use Adasum for GPU reduction, please '
                            'compile Horovod with HOROVOD_GPU_OPERATIONS=NCCL.'
                        )
                        new_tensor = summed_tensor
                else:
                    if not check_num_rank_power_of_2(size()):
                        raise NotImplementedError(
                            'Running Adasum with non-power of 2 ranks is not supported yet.'
                        )
                    new_tensor = summed_tensor
            else:
                if rocm_built():
                    new_tensor = (summed_tensor / horovod_size
                                  ) if average_in_framework else summed_tensor
                else:
                    new_tensor = summed_tensor
        return new_tensor
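
A short usage sketch for allreduce, assuming hvd.init() has been called on every rank; the tensor values are illustrative:

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# Each rank contributes a different tensor; shapes must match across ranks.
local_value = tf.constant([1.0, 2.0, 3.0]) * float(hvd.rank() + 1)

# Average across ranks (the default op when none is given).
averaged = hvd.allreduce(local_value, op=hvd.Average, name='example_average')

# Sum across ranks, with optional scaling applied before and after the reduction.
scaled_sum = hvd.allreduce(local_value, op=hvd.Sum,
                           prescale_factor=0.5, postscale_factor=2.0)
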
Example #4
def grouped_allreduce(tensors,
                      average=None,
                      device_dense='',
                      device_sparse='',
                      compression=Compression.none,
                      op=None,
                      prescale_factor=1.0,
                      postscale_factor=1.0):
    if not tensors:
        return tensors

    op = handle_average_backwards_compatibility(op, average)

    average_in_framework = False
    if rocm_built():
        # For ROCm, perform averaging at framework level
        average_in_framework = op == Average or op == Adasum
        op = Sum if op == Average else op

    if any(isinstance(t, tf.IndexedSlices) for t in tensors):
        # TODO: Need to fix this to actually call Adasum
        if op == Adasum:
            raise NotImplementedError(
                'The Adasum reduction does not currently support sparse tensors. As a '
                'workaround please pass sparse_as_dense=True to DistributedOptimizer'
            )
        with tf.device(device_sparse):
            new_values = []
            new_indices = []
            for tensor in tensors:
                # For IndexedSlices, do two allgathers instead of an allreduce.
                horovod_size = tf.cast(size_op() if int(
                    os.environ.get("HOROVOD_ELASTIC", 0)) else size(),
                                       dtype=tensor.values.dtype)
                values = allgather(tensor.values)
                indices = allgather(tensor.indices)

                # To make this operation into an average, divide allgathered values by
                # the Horovod size. Append one entry per input tensor and keep the
                # matching allgathered indices alongside it.
                new_values.append(
                    (values / horovod_size) if op == Average else values)
                new_indices.append(indices)
        return [
            tf.IndexedSlices(x, idx, dense_shape=t.dense_shape)
            for x, idx, t in zip(new_values, new_indices, tensors)
        ]
    else:
        with tf.device(device_dense):
            tensors_compressed, ctxs = zip(
                *[compression.compress(tensor) for tensor in tensors])
            summed_tensors_compressed = _grouped_allreduce(
                tensors_compressed,
                op=op,
                prescale_factor=prescale_factor,
                postscale_factor=postscale_factor)
            summed_tensors = [
                compression.decompress(t, ctx)
                for t, ctx in zip(summed_tensors_compressed, ctxs)
            ]
            if op == Adasum:
                # Use the first input tensor's device to decide between the GPU
                # and CPU Adasum paths for the whole group.
                if 'CPU' not in tensors[0].device and gpu_available('tensorflow'):
                    if nccl_built():
                        if not is_homogeneous:
                            raise NotImplementedError(
                                'Running GPU Adasum on heterogeneous cluster is not supported yet.'
                            )
                        elif not check_num_rank_power_of_2(
                                int(size() / local_size())):
                            raise NotImplementedError(
                                'Running GPU Adasum with non-power of 2 nodes is not supported yet.'
                            )
                        if rocm_built():
                            new_tensors = []
                            for tensor in summed_tensors:
                                horovod_local_size = tf.cast(
                                    local_size_op() if int(
                                        os.environ.get("HOROVOD_ELASTIC",
                                                       0)) else local_size(),
                                    dtype=tensor.dtype)
                                new_tensors.append(tensor / horovod_local_size)
                        else:
                            new_tensors = summed_tensors
                    else:
                        warnings.warn(
                            'Adasum reduction does not currently support GPU reduction using MPI. Tensors '
                            'are copied to CPU memory instead. To use Adasum for GPU reduction, please '
                            'compile Horovod with HOROVOD_GPU_OPERATIONS=NCCL.'
                        )
                        new_tensors = summed_tensors
                else:
                    if not check_num_rank_power_of_2(size()):
                        raise NotImplementedError(
                            'Running Adasum with non-power of 2 ranks is not supported yet.'
                        )
                    new_tensors = summed_tensors
            else:
                if rocm_built():
                    new_tensors = []
                    for tensor in summed_tensors:
                        horovod_size = tf.cast(size_op() if int(
                            os.environ.get("HOROVOD_ELASTIC", 0)) else size(),
                                               dtype=tensor.dtype)
                        new_tensors.append(
                            (tensor / horovod_size)
                            if average_in_framework else tensor)
                else:
                    new_tensors = summed_tensors
        return new_tensors
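
A usage sketch for grouped_allreduce, assuming a Horovod build recent enough to expose it under horovod.tensorflow; the input tensors are illustrative:

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# Reduce several tensors in one grouped call; the result is a list of tensors
# in the same order as the inputs, each averaged across ranks.
tensors = [tf.constant([1.0, 2.0]) * float(hvd.rank() + 1),
           tf.constant([[3.0, 4.0]]) * float(hvd.rank() + 1)]
averaged = hvd.grouped_allreduce(tensors, op=hvd.Average)
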
Example #5
    def DistributedGradientTape(gradtape,
                                device_dense='',
                                device_sparse='',
                                compression=Compression.none,
                                sparse_as_dense=False,
                                op=Average,
                                gradient_predivide_factor=1.0,
                                num_groups=0,
                                groups=None):
        """A tape that wraps another tf.GradientTape, using an allreduce to
        combine gradient values before applying gradients to model weights.

        Args:
          gradtape:
            GradientTape to use for computing gradients and applying updates.
          device_dense:
            Device to be used for dense tensors. Uses GPU by default
            if Horovod was built with HOROVOD_GPU_OPERATIONS.
          device_sparse:
            Device to be used for sparse tensors. Uses GPU by default
            if Horovod was built with HOROVOD_GPU_OPERATIONS.
          compression:
            Compression algorithm used during allreduce to reduce the amount
            of data sent during each parameter update step.  Defaults to
            not using compression.
          sparse_as_dense:
            Treat all sparse gradients as dense tensors.  This can help improve
            performance and memory utilization if the original sparse gradient
            has high density.  Defaults to false.
          op:
            The reduction operation to use when combining gradients across
            different ranks.
          gradient_predivide_factor:
            If op == Average, gradient_predivide_factor splits the averaging
            before and after the sum. Gradients are scaled by
            1.0 / gradient_predivide_factor before the sum and
            gradient_predivide_factor / size after the sum.
          num_groups:
            Number of groups to assign gradient allreduce ops to for explicit
            grouping. Defaults to no explicit groups.
          groups:
            The parameter to group the gradient allreduce ops. Accepted values
            are a non-negative integer or a list of lists of tf.Variable.
            If groups is a non-negative integer, it is the number of groups to
            assign gradient allreduce ops to for explicit grouping.
            If groups is a list of lists of tf.Variable, variables in the same
            inner list will be assigned to the same group, while any parameter
            that does not appear in any list will form a group of its own.
            Defaults to None, which means no explicit groups.
        """
        if gradient_predivide_factor != 1.0:
            if rocm_built():
                raise ValueError(
                    'gradient_predivide_factor not supported yet with ROCm')
            if op != Average:
                raise ValueError(
                    'gradient_predivide_factor not supported with op != Average'
                )

        if num_groups != 0:
            warnings.warn(
                'Parameter `num_groups` has been replaced by `groups` '
                'and will be removed in v0.23.0.', DeprecationWarning)
            if groups is None:
                groups = num_groups

        if groups is not None:
            if not (isinstance(groups, list) or groups > 0):
                raise ValueError('groups should be a non-negative integer or '
                                 'a list of list of tf.Variable.')

        cls = type(gradtape.__class__.__name__, (gradtape.__class__, ),
                   dict(_DistributedGradientTape.__dict__))
        if hasattr(gradtape, '_watch_accessed_variables'):
            return cls(gradtape._tape, device_dense, device_sparse,
                       compression, sparse_as_dense, op,
                       gradient_predivide_factor, groups, gradtape._persistent,
                       gradtape._watch_accessed_variables)
        else:
            return cls(gradtape._tape, device_dense, device_sparse,
                       compression, sparse_as_dense, op,
                       gradient_predivide_factor, groups, gradtape._persistent)
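
A sketch of the groups parameter of the tape wrapper above, assuming a small Keras model defined for illustration; both forms described in the docstring (an integer group count and explicit lists of variables) are shown:

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

model = tf.keras.Sequential([tf.keras.Input(shape=(8,)),
                             tf.keras.layers.Dense(4, activation='relu'),
                             tf.keras.layers.Dense(1)])

with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(model(tf.zeros([2, 8]))))

# Option 1: fuse gradient allreduce ops into 2 explicit groups.
# tape = hvd.DistributedGradientTape(tape, groups=2)

# Option 2: group explicitly; variables in the same inner list share a group,
# and any variable not listed forms a group of its own.
tape = hvd.DistributedGradientTape(
    tape, groups=[model.layers[0].trainable_variables,
                  model.layers[1].trainable_variables])
grads = tape.gradient(loss, model.trainable_variables)
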
Example #6
def DistributedOptimizer(optimizer,
                         name=None,
                         use_locking=False,
                         device_dense='',
                         device_sparse='',
                         compression=Compression.none,
                         sparse_as_dense=False,
                         backward_passes_per_step=1,
                         op=Average,
                         gradient_predivide_factor=1.0,
                         average_aggregated_gradients=False,
                         num_groups=0,
                         groups=None,
                         process_set=global_process_set):
    """Construct a new DistributedOptimizer, which uses another optimizer
    under the hood for computing single-process gradient values and
    applying gradient updates after the gradient values have been combined
    across all the Horovod ranks.

    Args:
      optimizer:
        Optimizer to use for computing gradients and applying updates.
      name:
        Optional name prefix for the operations created when applying
        gradients. Defaults to "Distributed" followed by the provided
        optimizer type.
      use_locking:
        Whether to use locking when updating variables.
        See Optimizer.__init__ for more info.
      device_dense:
        Device to be used for dense tensors. Uses GPU by default
        if Horovod was built with HOROVOD_GPU_OPERATIONS.
      device_sparse:
        Device to be used for sparse tensors. Uses GPU by default
        if Horovod was built with HOROVOD_GPU_OPERATIONS.
      compression:
        Compression algorithm used during allreduce to reduce the amount
        of data sent during each parameter update step.  Defaults to
        not using compression.
      sparse_as_dense:
        Treat all sparse gradients as dense tensors.  This can help improve
        performance and memory utilization if the original sparse gradient
        has high density.  Defaults to false.
      backward_passes_per_step:
        Number of backward passes to perform before calling hvd.allreduce.
        This allows accumulating updates over multiple mini-batches before
        reducing and applying them.
      op:
        The reduction operation to use when combining gradients across
        different ranks.
      gradient_predivide_factor:
        If op == Average, gradient_predivide_factor splits the averaging
        before and after the sum. Gradients are scaled by
        1.0 / gradient_predivide_factor before the sum and
        gradient_predivide_factor / size after the sum.
      average_aggregated_gradients:
        Whether to average the aggregated gradients that have been accumulated
        over multiple mini-batches. If true, divides gradient updates by
        backward_passes_per_step. Only applicable for backward_passes_per_step > 1.
      num_groups:
        Number of groups to assign gradient allreduce ops to for explicit
        grouping. Defaults to no explicit groups.
      groups:
        The parameter to group the gradient allreduce ops. Accepted values
        are a non-negative integer or a list of lists of tf.Variable.
        If groups is a non-negative integer, it is the number of groups to
        assign gradient allreduce ops to for explicit grouping.
        If groups is a list of lists of tf.Variable, variables in the same
        inner list will be assigned to the same group, while any parameter
        that does not appear in any list will form a group of its own.
        Defaults to None, which means no explicit groups.
      process_set: Gradients will only be reduced over Horovod processes belonging
        to this process set. Defaults to the global process set.
    """
    if gradient_predivide_factor != 1.0:
        if rocm_built():
            raise ValueError(
                'gradient_predivide_factor not supported yet with ROCm')
        if op != Average:
            raise ValueError(
                'gradient_predivide_factor not supported with op != Average')

    if op == Adasum and average_aggregated_gradients:
        raise ValueError(
            'Adasum does not support average_aggregated_gradients == True')

    if num_groups != 0:
        warnings.warn(
            'Parameter `num_groups` has been replaced by `groups` '
            'and will be removed in v0.23.0.', DeprecationWarning)
        if groups is None:
            groups = num_groups

    if groups is not None:
        if not (isinstance(groups, list) or groups > 0):
            raise ValueError('groups should be a non-negative integer or '
                             'a list of list of tf.Variable.')

    if isinstance(optimizer, _LegacyOptimizer):
        if op == Adasum:
            if process_set.process_set_id != 0:
                raise NotImplementedError(
                    "Adasum does not support process sets yet")
            return _DistributedAdasumOptimizer(optimizer, name, use_locking,
                                               device_dense, device_sparse,
                                               compression,
                                               backward_passes_per_step)

        return _DistributedOptimizer(
            optimizer=optimizer,
            name=name,
            use_locking=use_locking,
            device_dense=device_dense,
            device_sparse=device_sparse,
            compression=compression,
            sparse_as_dense=sparse_as_dense,
            op=op,
            gradient_predivide_factor=gradient_predivide_factor,
            backward_passes_per_step=backward_passes_per_step,
            average_aggregated_gradients=average_aggregated_gradients,
            groups=groups,
            process_set=process_set,
        )
    elif isinstance(optimizer, tf.keras.optimizers.Optimizer):
        if op == Adasum:
            raise ValueError('op == Adasum is not supported yet with Keras')

        import horovod.tensorflow.keras as hvd_k
        return hvd_k.DistributedOptimizer(
            optimizer=optimizer,
            name=name,
            device_dense=device_dense,
            device_sparse=device_sparse,
            compression=compression,
            sparse_as_dense=sparse_as_dense,
            gradient_predivide_factor=gradient_predivide_factor,
            backward_passes_per_step=backward_passes_per_step,
            average_aggregated_gradients=average_aggregated_gradients,
            process_set=process_set,
        )
    else:
        raise ValueError(
            'Provided optimizer doesn\'t inherit from either legacy '
            'TensorFlow or Keras optimizer: %s' % optimizer)
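
A sketch of the process_set argument, assuming at least three ranks are launched (the set below contains ranks 0 and 2) and a Horovod release that ships this signature, where process sets are registered at init time:

import tensorflow as tf
import horovod.tensorflow as hvd

# Register a process set containing only the even ranks.
even_set = hvd.ProcessSet([0, 2])
hvd.init(process_sets=[even_set])

# Gradients are reduced only among the ranks belonging to even_set.
opt = tf.keras.optimizers.SGD(0.01)
opt = hvd.DistributedOptimizer(opt, process_set=even_set)
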