Example #1
def push_pull(tensor, scope='', average=True, device_dense='', device_sparse='',
              compression=Compression.none, enable_async=False):
    """Perform an push_pull on a tf.Tensor or tf.IndexedSlices.
    Arguments:
        tensor: tf.Tensor, tf.Variable, or tf.IndexedSlices to reduce.
                The shape of the input must be identical across all ranks.
        scope: the graph name scope
        average: If True, computes the average over all ranks.
                 Otherwise, computes the sum over all ranks.
        device_dense: Device to be used for dense tensors. Uses GPU by default.
        device_sparse: Device to be used for sparse tensors. Uses GPU by default.
        compression: Compression algorithm used to reduce the amount of data
                     sent and received by each worker node.  Defaults to not
                     using compression.
        enable_async: If True, skip the averaging step (asynchronous training).
    Returns:
        A tensor of the same shape and type as `tensor`, reduced (summed or
        averaged, depending on `average`) across all processes.
    """
    with tf.device(device_dense):
        byteps_size = tf.cast(size(), dtype=tensor.dtype)
        tensor_compressed, ctx = compression.compress(tensor)
        summed_tensor_compressed = _push_pull(tensor_compressed, scope)
        summed_tensor = compression.decompress(summed_tensor_compressed, ctx)
        if not enable_async:
            # tf.div was removed in TF 2.x; fall back to tf.math.divide when it is absent
            _div = tf.div if hasattr(tf, 'div') else tf.math.divide
            new_tensor = (_div(summed_tensor, byteps_size)
                          if average else summed_tensor)
        else: # no need to average for async training
            new_tensor = summed_tensor
    return new_tensor
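
A minimal usage sketch for the push_pull above, assuming the module is importable as byteps.tensorflow and that an init() call exists alongside the push_pull() and size() shown in the snippet; names not present in the snippet are assumptions, not a verified API surface.

# Hypothetical usage sketch; bps.init() is an assumed initializer.
import tensorflow as tf
import byteps.tensorflow as bps

bps.init()
local_value = tf.constant([1.0, 2.0, 3.0])   # per-worker tensor to reduce
# Average across all workers; average=False would return the plain sum instead.
avg_value = bps.push_pull(local_value, scope='demo', average=True)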
Example #2
def gradient(self, target, sources, output_gradients=None):
    gradients = super(self.__class__, self).gradient(target, sources, output_gradients)
    if size() > 1:
        avg_grads = self._push_pull_grads(gradients)
        return avg_grads
    else:
        return gradients
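
A short sketch of how a tape wrapper exposing this gradient() override might be used in eager mode. Only gradient(), size() and _push_pull_grads() appear in the snippet; the DistributedGradientTape name and bps.init() are assumptions.

# Hypothetical sketch; DistributedGradientTape is assumed to wrap tf.GradientTape.
import tensorflow as tf
import byteps.tensorflow as bps

bps.init()
w = tf.Variable([1.0, 2.0])
with tf.GradientTape() as tape:
    loss = tf.reduce_sum(w * w)
tape = bps.DistributedGradientTape(tape)   # assumed wrapper providing the gradient() above
grads = tape.gradient(loss, [w])           # averaged via push_pull when size() > 1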
Example #3
def compute_gradients(self, *args, **kwargs):
    """Compute gradients of all trainable variables.
    See Optimizer.compute_gradients() for more info.
    In DistributedOptimizer, compute_gradients() is overridden to also
    push_pull the gradients before returning them.
    """
    gradients = self._optimizer.compute_gradients(*args, **kwargs)
    if size() > 1 and not self._enable_async:
        grads, vars = zip(*gradients)
        avg_grads = self._push_pull_grads(grads)
        return list(zip(avg_grads, vars))
    else:
        return gradients
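
A sketch of the optimizer-wrapping pattern this method implies, using the TF 1.x graph-mode API. The DistributedOptimizer name and bps.init() are assumptions; compute_gradients() and apply_gradients() are the standard tf.compat.v1 optimizer methods.

# Hypothetical sketch of the wrapping pattern (TF 1.x graph mode).
import tensorflow as tf
import byteps.tensorflow as bps

tf.compat.v1.disable_eager_execution()
bps.init()
x = tf.compat.v1.get_variable('x', initializer=[1.0, 2.0])
loss = tf.reduce_sum(x * x)
opt = tf.compat.v1.train.GradientDescentOptimizer(0.01)
opt = bps.DistributedOptimizer(opt)            # assumed wrapper whose compute_gradients() is shown above
grads_and_vars = opt.compute_gradients(loss)   # each gradient is push_pull-averaged across workers
train_op = opt.apply_gradients(grads_and_vars)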
Example #4
def broadcast_variables(variables, root_rank, scope=''):
    """Broadcasts variables from root rank to all other processes.
    Arguments:
        variables: the variables to broadcast
        root_rank: rank of the process from which global variables will be broadcasted
                   to all other processes.
        scope: the graph name scope
    """
    if size() <= 1:
        return variables
    _assign = tf.assign if hasattr(tf, 'assign') else tf.compat.v1.assign
    return tf.group(
        *[_assign(var, broadcast(var, root_rank, scope)) for var in variables])
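
A sketch of the typical call site for broadcast_variables(): synchronizing freshly created variables from rank 0 so every worker starts from the same weights. Only broadcast_variables() and size() come from the snippet; bps.init() is an assumed companion.

# Hypothetical sketch; run once before training starts.
import tensorflow as tf
import byteps.tensorflow as bps

bps.init()
w = tf.Variable(tf.random.normal([10, 10]))
b = tf.Variable(tf.zeros([10]))
# Overwrite every worker's copy with rank 0's values so training starts in sync.
sync_op = bps.broadcast_variables([w, b], root_rank=0)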
Example #5
def push_pull(tensor,
              scope='',
              average=None,
              device_dense='',
              device_sparse='',
              compression=Compression.none,
              op=None,
              enable_async=False):
    """Perform an push_pull on a tf.Tensor or tf.IndexedSlices.
    Arguments:
        tensor: tf.Tensor, tf.Variable, or tf.IndexedSlices to reduce.
                The shape of the input must be identical across all ranks.
        average:
            .. warning:: .. deprecated

                Use `op` instead. Will be removed.

            If True, computes the average over all ranks.
            Otherwise, computes the sum over all ranks.
        scope: the graph name scope
        device_dense: Device to be used for dense tensors. Uses GPU by default.
        device_sparse: Device to be used for sparse tensors. Uses GPU by default.
        compression: Compression algorithm used to reduce the amount of data
                     sent and received by each worker node.  Defaults to not
                     using compression.
        op: The reduction operation to combine tensors across different ranks.
            Defaults to Average if None is given.
        enable_async: If True, skip the averaging step (asynchronous training).

    Returns:
        A tensor of the same shape and type as `tensor`, reduced (summed or
        averaged, depending on `op`) across all processes.
    """
    op = handle_average_backwards_compatibility(op, average).value
    # Averaging happens in framework code, so translate that to Sum for the actual call
    true_op = Sum if op == Average else op

    with tf.device(device_dense):
        byteps_size = tf.cast(size(), dtype=tensor.dtype)
        tensor_compressed, ctx = compression.compress(tensor)
        summed_tensor_compressed = _push_pull(tensor_compressed, scope)
        summed_tensor = compression.decompress(summed_tensor_compressed, ctx)
        if not enable_async:
            _div = tf.div if hasattr(tf, 'div') else tf.math.divide
            new_tensor = (_div(summed_tensor, byteps_size)
                          if op == Average else summed_tensor)
        else:  # no need to average for async training
            new_tensor = summed_tensor
    return new_tensor
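
A sketch contrasting the op argument with the deprecated average flag. The Sum and Average symbols are used inside the function body; exposing them as bps.Sum / bps.Average is an assumption, as is bps.init().

# Hypothetical sketch; bps.Sum / bps.Average export names are assumptions.
import tensorflow as tf
import byteps.tensorflow as bps

bps.init()
t = tf.constant([1.0, 2.0])
summed = bps.push_pull(t, op=bps.Sum)        # preferred: explicit reduction op
averaged = bps.push_pull(t, average=True)    # deprecated path, still accepted for compatibility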