def allgather(tensor, name=None):
    """Gather `tensor` from all Horovod processes, concatenated on dimension 0.

    The gather is emulated with an allreduce: each process places its tensor
    inside a zero-filled buffer at row offset `rank() * tensor.shape[0]`, and
    the buffers are then combined with `_allreduce`. Since every other process
    contributes zeros at that offset, the reduced buffer is the concatenation
    of all inputs.

    Because the padding sizes are derived from the *local* static shape, every
    process must pass a tensor with the same, fully defined static shape --
    including the first dimension. Differing first dimensions are NOT
    supported by this implementation.

    Arguments:
        tensor: The tensor to gather. Must have rank >= 1 and a fully defined
            static shape.
        name: Optional name for the concatenation op. Defaults to
            'HorovodAllgather_<normalized tensor name>'.

    Returns:
        A tensor of the same type as `tensor` whose first dimension is
        `size() * tensor.shape[0]` and whose remaining dimensions match the
        input.

    Raises:
        ValueError: If `tensor` is a scalar or any of its dimensions is not
            statically known.
    """
    if name is None:
        name = 'HorovodAllgather_%s' % _normalize_name(tensor.name)

    shape = tensor.shape.as_list()
    if not shape or any(dim is None for dim in shape):
        # Fail early with a readable message instead of a TypeError from
        # `k * None` or an opaque error inside tf.zeros.
        raise ValueError(
            'allgather requires a tensor of rank >= 1 with a fully defined '
            'static shape, got shape %s' % (shape,))

    # Original author's note: ring allreduce transfers the same amount of data
    # as allgather, so allreduce is used instead.
    n = size()
    k = rank()
    rows = shape[0]
    trailing = shape[1:]

    # Zero blocks standing in for the ranks before and after this one.
    left = tf.zeros(tuple([k * rows] + trailing), dtype=tensor.dtype)
    right = tf.zeros(tuple([(n - k - 1) * rows] + trailing),
                     dtype=tensor.dtype)
    concat = tf.concat([left, tensor, right], 0, name=name)

    # _allreduce performs a sum, not an average, which is what makes the
    # zero-padding trick reproduce a concatenation.
    return _allreduce(concat)
def allreduce_async(tensor, average=True, name=None):
    """Start an asynchronous average or sum of `tensor` over all processes.

    The input tensor itself is left untouched. The reduction is keyed by
    `name`; when no name is supplied, an incremented auto-generated name is
    used. For a given name the tensor type and shape must match on every
    Horovod process, and the reduction does not begin until all processes are
    ready to send and receive the tensor.

    Arguments:
        tensor: A tensor to average or sum.
        average: Whether to compute the average (True, the default) or the
            sum (False).
        name: A name of the reduction operation.

    Returns:
        A handle to the allreduce operation that can be used with `poll()` or
        `synchronize()`.
    """
    if not average:
        # Summation: hand the input over together with a separate,
        # newly created result buffer of the same shape.
        result = tensor.new(tensor.shape)
        return _allreduce_async(tensor, result, name)

    # Averaging: divide by the process count first (out of place, so the
    # caller's tensor is not modified) and pass the scaled copy as both the
    # input and the output argument.
    scaled = tensor.div(size())
    return _allreduce_async(scaled, scaled, name)
def allreduce(tensor, average=True, device_dense='', device_sparse='',
              compression=Compression.none):
    """Allreduce a tf.Tensor or tf.IndexedSlices across all ranks.

    Dense tensors go through a bandwidth-optimal ring allreduce. For
    tf.IndexedSlices the values and the indices are each allgathered instead,
    which amounts to an allreduce on the tensor they represent.

    Arguments:
        tensor: tf.Tensor, tf.Variable, or tf.IndexedSlices to reduce.
            The shape of the input must be identical across all ranks.
        average: If True, computes the average over all ranks.
            Otherwise, computes the sum over all ranks.
        device_dense: Device to be used for dense tensors. Uses GPU by default
            if Horovod was built with HOROVOD_GPU_ALLREDUCE.
        device_sparse: Device to be used for sparse tensors. Uses GPU by
            default if Horovod was built with HOROVOD_GPU_ALLGATHER.
        compression: Compression algorithm used to reduce the amount of data
            sent and received by each worker node. Defaults to not using
            compression.

    Returns:
        The reduced tensor: a tf.IndexedSlices for sparse input, otherwise the
        decompressed result of the dense allreduce.
    """
    if not isinstance(tensor, tf.IndexedSlices):
        with tf.device(device_dense):
            compressor = compression.get_compressor(tensor.dtype)
            world_size = tf.cast(size(), dtype=tensor.dtype)
            # When averaging, the division happens before compression and
            # before the reduction.
            if average:
                to_reduce = tf.div(tensor, world_size)
            else:
                to_reduce = tensor
            compressed = compressor.compress(to_reduce)
            reduced_compressed = _allreduce(compressed)
            return compressor.decompress(reduced_compressed)

    with tf.device(device_sparse):
        # Sparse path: two allgathers take the place of one allreduce.
        world_size = tf.cast(size(), tensor.values.dtype)
        gathered_values = allgather(tensor.values)
        gathered_indices = allgather(tensor.indices)

        # Dividing every gathered value by the number of ranks turns the
        # implied sum into an average.
        if average:
            gathered_values = tf.div(gathered_values, world_size)
        return tf.IndexedSlices(gathered_values, gathered_indices,
                                dense_shape=tensor.dense_shape)
def __init__(self, params, named_parameters=None, use_gpu=True, momentum=0.9,
             weight_decay=1e-4, use_allgather=True):
    """Set up per-parameter buffers for the distributed optimizer.

    Arguments:
        params: Parameters forwarded unchanged to the parent initializer.
        named_parameters: Optional iterable of (name, parameter) tuples,
            usually produced by `model.named_parameters()`.
        use_gpu: If True, the V/U/mask/message buffers are moved to CUDA via
            `.cuda()`; otherwise they stay on the CPU.
        momentum: Stored as `self._momentum`.
        weight_decay: Stored as `self._weight_decay`.
        use_allgather: Stored as `self._use_allgather`.

    Raises:
        ValueError: If any element of `named_parameters` is not a tuple.
    """
    # NOTE(review): super(self.__class__, self) recurses forever if this class
    # is subclassed in the ordinary way; presumably the class is assembled
    # dynamically -- confirm before changing.
    super(self.__class__, self).__init__(params)

    if named_parameters is not None:
        named_parameters = list(named_parameters)
    else:
        named_parameters = []

    # make sure that named_parameters are tuples
    if any(not isinstance(p, tuple) for p in named_parameters):
        raise ValueError('named_parameters should be a sequence of '
                         'tuples (name, parameter), usually produced by '
                         'model.named_parameters().')

    # Sort once and reuse the result for every dictionary built below.
    named_parameters = sorted(named_parameters)

    self._parameter_names = {v: k for k, v in named_parameters}
    self._use_gpu = use_gpu
    self._use_nesterov = True
    self._momentum = momentum
    self._weight_decay = weight_decay
    self._debug = True  # NOTE(review): debug left enabled -- confirm intent
    self._use_allgather = use_allgather

    def _zeros(shape):
        """Return a zero tensor of `shape` on the configured device."""
        buf = torch.zeros(shape)
        return buf.cuda() if use_gpu else buf

    # define U for residue, V for momentum
    self._V = {k: _zeros(v.size()) for k, v in named_parameters}
    self._U = {k: _zeros(v.size()) for k, v in named_parameters}
    self._masks = {k: _zeros(v.size()) for k, v in named_parameters}
    self._compressed_msg = {k: _zeros(0) for k, _ in named_parameters}

    # These bookkeeping containers are always created on the CPU.
    self._compressed_len = {k: torch.zeros(0, dtype=torch.long)
                            for k, _ in named_parameters}
    self._compressed_msg_size = {k: 0 for k, _ in named_parameters}
    self._v_ref = {k: [] for k, _ in named_parameters}

    self._handles = {}
    self._grad_accs = []

    # Hooks are only registered when more than one process participates.
    if size() > 1:
        self._register_hooks()
def backward(ctx, grad_output):
    """Backward pass: return this rank's slice of the allreduced gradient.

    The incoming gradient is allreduced (sum, `average=False`) across all
    processes. Each process's `ctx.dim` is then allgathered so that this
    rank's row offset into the reduced gradient can be computed as the sum of
    the `dim` values of all lower ranks.

    Arguments:
        ctx: Autograd context; `ctx.dim` is read as the number of rows that
            belong to this rank.
        grad_output: Gradient with respect to the gathered output.

    Returns:
        A tuple `(grad_input, None)`, where `grad_input` is the `ctx.dim`-row
        slice of the reduced gradient starting at this rank's offset.
    """
    grad_reduced = allreduce(grad_output, average=False)

    # Collect dim from every process, one entry per rank.
    dim_t = torch.IntTensor([ctx.dim])
    dim = allgather(dim_t).view(size())

    r = rank()
    # torch.sum returns a 0-dim tensor; `.data[0]` raises IndexError on such
    # tensors in current PyTorch, so extract the Python number with .item().
    offset = torch.sum(dim.narrow(0, 0, r)).item() if r != 0 else 0
    return grad_reduced.narrow(0, offset, ctx.dim), None
def allreduce(tensor, average=True, device_dense='', device_sparse=''):
    """Allreduce a tf.Tensor or tf.IndexedSlices across all ranks.

    Dense tensors go through a bandwidth-optimal ring allreduce. For
    tf.IndexedSlices the values and the indices are each allgathered instead,
    which amounts to an allreduce on the tensor they represent.

    Arguments:
        tensor: tf.Tensor, tf.Variable, or tf.IndexedSlices to reduce.
            The shape of the input must be identical across all ranks.
        average: If True, computes the average over all ranks.
            Otherwise, computes the sum over all ranks.
        device_dense: Device to be used for dense tensors. Uses GPU by default
            if Horovod was built with HOROVOD_GPU_ALLREDUCE.
        device_sparse: Device to be used for sparse tensors. Uses GPU by
            default if Horovod was built with HOROVOD_GPU_ALLGATHER.

    Returns:
        The reduced tensor: a tf.IndexedSlices for sparse input, a dense
        tensor otherwise.
    """
    if not isinstance(tensor, tf.IndexedSlices):
        with tf.device(device_dense):
            world_size = tf.cast(size(), tensor.dtype)
            total = _allreduce(tensor)
            # Averaging is applied after the reduction on this path.
            if average:
                return tf.div(total, world_size)
            return total

    with tf.device(device_sparse):
        # Sparse path: two allgathers take the place of one allreduce.
        world_size = tf.cast(size(), tensor.values.dtype)
        gathered_values = allgather(tensor.values)
        gathered_indices = allgather(tensor.indices)

        # Dividing every gathered value by the number of ranks turns the
        # implied sum into an average.
        if average:
            gathered_values = tf.div(gathered_values, world_size)
        return tf.IndexedSlices(gathered_values, gathered_indices,
                                dense_shape=tensor.dense_shape)
def reduce_gradients(grads_and_vars, on_horovod):
    """Allreduce every gradient in `grads_and_vars` when running on Horovod.

    Arguments:
        grads_and_vars: Iterable of (gradient, variable) pairs. A gradient of
            None is passed through unchanged.
        on_horovod: Must be truthy; tower-mode reduction is not implemented.

    Returns:
        `grads_and_vars` itself when only one Horovod process is running;
        otherwise a new list of (allreduced gradient, variable) pairs.

    Raises:
        NotImplementedError: If `on_horovod` is falsy.
    """
    if not on_horovod:
        raise NotImplementedError("Reduce in tower-mode is not implemented.")

    # Imported lazily so Horovod is only needed on this code path.
    from horovod.common import size
    from horovod.tensorflow import allreduce

    if size() <= 1:
        # Single process: there is nothing to reduce.
        return grads_and_vars

    with tf.name_scope("all_reduce"):
        return [
            (allreduce(grad), var) if grad is not None else (None, var)
            for grad, var in grads_and_vars
        ]
def __init__(self, params, named_parameters=None):
    """Initialize the optimizer wrapper and record parameter names.

    Arguments:
        params: Parameters forwarded unchanged to the parent initializer.
        named_parameters: Optional iterable of (name, parameter) tuples,
            usually produced by `model.named_parameters()`.

    Raises:
        ValueError: If any element of `named_parameters` is not a tuple.
    """
    super(self.__class__, self).__init__(params)

    named_parameters = (
        list(named_parameters) if named_parameters is not None else []
    )

    # Every entry has to be a (name, parameter) tuple.
    for entry in named_parameters:
        if not isinstance(entry, tuple):
            raise ValueError('named_parameters should be a sequence of '
                             'tuples (name, parameter), usually produced by '
                             'model.named_parameters().')

    # Reverse lookup: parameter object -> its name.
    name_by_param = {}
    for param_name, param in sorted(named_parameters):
        name_by_param[param] = param_name
    self._parameter_names = name_by_param

    self._handles = {}
    self._grad_accs = []

    # Hooks are only registered when more than one process participates.
    if size() > 1:
        self._register_hooks()
def _allgather_grad(op, grad):
    """Gradient for allgather op.

    The output gradient is allreduced across processes, then split along
    dimension 0 into one piece per rank, sized by each rank's input first
    dimension. The piece belonging to this rank is returned.

    The first dimension is read at run time with `tf.shape`, so inputs whose
    first dimension is not statically known are supported.

    Args:
      op: An operation.
      grad: `Tensor` gradient with respect to the output of the op.

    Returns:
      The gradient with respect to the input of the op.
    """
    grad = _allreduce(grad)

    x = op.inputs[0]
    # Runtime first dimension as a length-1 int32 tensor. The static shape
    # may be None here, which would make convert_to_tensor fail.
    d = tf.reshape(tf.shape(x)[0], [1])

    # Gather every rank's first dimension; the reshape fixes the static
    # length to size() so tf.split knows how many pieces to produce.
    s = size()
    d = tf.reshape(allgather(d), [s])

    splits = tf.split(grad, num_or_size_splits=d, axis=0)
    return splits[rank()]
def compute_gradients(self, *args, **kwargs):
    """Compute gradients of all trainable variables.

    See Optimizer.compute_gradients() for more info.

    In DistributedOptimizer, compute_gradients() is overridden so that the
    gradients are also allreduced before they are returned.
    """
    gradients = self._optimizer.compute_gradients(*args, **kwargs)

    if size() <= 1:
        # Single process: return the wrapped optimizer's result untouched.
        return gradients

    reduced = []
    with tf.name_scope(self._name + "_Allreduce"):
        for grad, var in gradients:
            if grad is None:
                # Variables without a gradient are passed through as-is.
                reduced.append((None, var))
                continue
            reduced_grad = allreduce(grad,
                                     device_dense=self._device_dense,
                                     device_sparse=self._device_sparse)
            reduced.append((reduced_grad, var))
    return reduced
def test_horovod_size():
    """Check that hvd.size() matches the size reported by the MPI environment."""
    _rank, expected_size = mpi_env_rank_and_size()
    hvd.init()
    reported_size = hvd.size()
    assert expected_size == reported_size