def push_pull(tensor, scope='', average=True, device_dense='', device_sparse='',
              compression=Compression.none, enable_async=False):
    """Perform a push_pull on a tf.Tensor or tf.IndexedSlices.

    Arguments:
        tensor: tf.Tensor, tf.Variable, or tf.IndexedSlices to reduce.
                The shape of the input must be identical across all ranks.
        scope: the graph name scope
        average: If True, computes the average over all ranks.
                 Otherwise, computes the sum over all ranks.
        device_dense: Device to be used for dense tensors. Uses GPU by default.
        device_sparse: Device to be used for sparse tensors. Uses GPU by default.
        compression: Compression algorithm used to reduce the amount of data
                     sent and received by each worker node.  Defaults to not
                     using compression.
        enable_async: If True, skip the divide-by-size averaging step
                      (asynchronous training averages elsewhere).

    Returns:
        A tensor of the same shape and type as `tensor`, summed across all
        processes.
    """
    with tf.device(device_dense):
        byteps_size = tf.cast(size(), dtype=tensor.dtype)
        tensor_compressed, ctx = compression.compress(tensor)
        summed_tensor_compressed = _push_pull(tensor_compressed, scope)
        summed_tensor = compression.decompress(summed_tensor_compressed, ctx)
        if not enable_async:
            # tf.div was removed in TF 2.x; fall back to tf.math.divide,
            # matching the guard used by the op-based push_pull overload.
            _div = tf.div if hasattr(tf, 'div') else tf.math.divide
            new_tensor = (_div(summed_tensor, byteps_size)
                          if average else summed_tensor)
        else:
            # no need to average for async training
            new_tensor = summed_tensor
    return new_tensor
def gradient(self, target, sources, output_gradients=None):
    """Compute gradients via the wrapped tape, averaging across ranks.

    Delegates to the parent implementation, then — when running with more
    than one process — pushes/pulls the resulting gradients so every rank
    receives the averaged values.
    """
    # NOTE(review): super(self.__class__, self) recurses infinitely if this
    # class is ever subclassed; naming the class explicitly would be safer —
    # confirm against the enclosing class definition.
    grads = super(self.__class__, self).gradient(target, sources,
                                                 output_gradients)
    if size() <= 1:
        return grads
    return self._push_pull_grads(grads)
def compute_gradients(self, *args, **kwargs):
    """Compute gradients of all trainable variables.

    See Optimizer.compute_gradients() for more info.

    In DistributedOptimizer, compute_gradients() is overriden to also
    push_pull the gradients before returning them.

    Returns:
        A list of (gradient, variable) pairs; gradients are averaged across
        ranks when running distributed synchronous training.
    """
    gradients = self._optimizer.compute_gradients(*args, **kwargs)
    if size() > 1 and not self._enable_async:
        # `variables` (not `vars`) so the builtin is not shadowed.
        grads, variables = zip(*gradients)
        avg_grads = self._push_pull_grads(grads)
        return list(zip(avg_grads, variables))
    else:
        # Single process or async training: no averaging needed.
        return gradients
def broadcast_variables(variables, root_rank, scope=''):
    """Broadcasts variables from root rank to all other processes.

    Arguments:
        variables: variables for broadcast
        root_rank: rank of the process from which global variables will be
                   broadcasted to all other processes.
        scope: the graph name scope
    """
    # Nothing to synchronize with a single process.
    if size() <= 1:
        return variables
    # tf.assign moved under tf.compat.v1 in TF 2.x.
    assign_fn = tf.assign if hasattr(tf, 'assign') else tf.compat.v1.assign
    assign_ops = [assign_fn(var, broadcast(var, root_rank, scope))
                  for var in variables]
    return tf.group(*assign_ops)
def push_pull(tensor, scope='', average=None, device_dense='', device_sparse='',
              compression=Compression.none, op=None, enable_async=False):
    """Perform a push_pull on a tf.Tensor or tf.IndexedSlices.

    Arguments:
        tensor: tf.Tensor, tf.Variable, or tf.IndexedSlices to reduce.
                The shape of the input must be identical across all ranks.
        scope: the graph name scope
        average: .. warning:: .. deprecated

                 Use `op` instead. Will be removed.
        device_dense: Device to be used for dense tensors. Uses GPU by default.
        device_sparse: Device to be used for sparse tensors. Uses GPU by default.
        compression: Compression algorithm used to reduce the amount of data
                     sent and received by each worker node.  Defaults to not
                     using compression.
        op: The reduction operation to combine tensors across different ranks.
            Defaults to Average if None is given.
        enable_async: If True, skip the divide-by-size averaging step
                      (asynchronous training averages elsewhere).

    Returns:
        A tensor of the same shape and type as `tensor`, summed across all
        processes.
    """
    op = handle_average_backwards_compatibility(op, average).value
    # Averaging happens in framework code (the tf.div below), so the
    # underlying _push_pull always performs a Sum.  The previous
    # `true_op = Sum if op == Average else op` translation was computed but
    # never passed anywhere, so it has been removed as dead code.
    with tf.device(device_dense):
        byteps_size = tf.cast(size(), dtype=tensor.dtype)
        tensor_compressed, ctx = compression.compress(tensor)
        summed_tensor_compressed = _push_pull(tensor_compressed, scope)
        summed_tensor = compression.decompress(summed_tensor_compressed, ctx)
        if not enable_async:
            # tf.div was removed in TF 2.x; fall back to tf.math.divide.
            _div = tf.div if hasattr(tf, 'div') else tf.math.divide
            new_tensor = (_div(summed_tensor, byteps_size)
                          if op == Average else summed_tensor)
        else:
            # no need to average for async training
            new_tensor = summed_tensor
    return new_tensor