def broadcast(tensor, root_rank, scope='', name=None, is_variable=True):
    """An op which broadcasts the input tensor on root rank to the same
    input tensor on all other BytePS processes.

    The broadcast operation is keyed by the name of the op. The tensor type
    and shape must be the same on all BytePS processes for a given name. The
    broadcast will not start until all processes are ready to send and
    receive the tensor.

    Returns:
      A tensor of the same shape and type as `tensor`, with the value
      broadcasted from root rank.
    """
    # Broadcast is implemented as push + pull after zero-ing non-root tensors.
    if name is None and not _executing_eagerly():
        name = 'BytePSBroadcast_%s' % _normalize_name(tensor.name)
    if scope == '' and not _executing_eagerly():
        if 'v1' in dir(tf.compat):
            scope = tf.compat.v1.get_default_graph().get_name_scope()
        else:
            scope = tf.get_default_graph().get_name_scope()
        if scope != '':
            scope += '/'
    full_name = scope + name
    full_name = full_name.encode("ascii")
    TF_LIB_CTYPES.byteps_tensorflow_declare_tensor(ctypes.c_char_p(full_name))
    if root_rank != rank():
        if is_variable:
            # Zero the variable in place first, so the sum across ranks
            # reduces to the root's value.
            with tf.control_dependencies([tf.assign_sub(tensor, tensor)]):
                return C_LIB.byteps_push_pull(tensor, name=name)
        else:
            # Non-variable tensors cannot be assigned; contribute zeros instead.
            with tf.device(tensor.device):
                input_tensor = tf.zeros_like(tensor)
                return C_LIB.byteps_push_pull(input_tensor, name=name)
    else:
        return C_LIB.byteps_push_pull(tensor, name=name)
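# A minimal sketch (plain NumPy, no BytePS required, all names hypothetical)
# of why the zero-out trick above works: every rank except the root
# contributes zeros, so the elementwise sum computed by push_pull equals
# the root's tensor on every rank.
import numpy as np

def _simulated_broadcast(tensors, root_rank):
    # Zero every rank's copy except the root's, then sum across ranks,
    # mirroring "broadcast = push + pull after zero-ing non-root tensors".
    zeroed = [t if r == root_rank else np.zeros_like(t)
              for r, t in enumerate(tensors)]
    reduced = np.sum(zeroed, axis=0)
    return [reduced.copy() for _ in tensors]

# Rank 0 holds [1, 2]; ranks 1 and 2 hold stale values. Afterwards every
# rank holds rank 0's values.
_out = _simulated_broadcast(
    [np.array([1., 2.]), np.array([9., 9.]), np.array([5., 5.])], root_rank=0)
assert all((t == np.array([1., 2.])).all() for t in _out)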
def _push_pull(tensor, scope='', name=None):
    """An op which sums an input tensor over all the BytePS processes.

    The reduction operation is keyed by the name of the op. The tensor type
    and shape must be the same on all BytePS processes for a given name. The
    reduction will not start until all processes are ready to send and
    receive the tensor.

    Returns:
      A tensor of the same shape and type as `tensor`, summed across all
      processes.
    """
    if name is None and not _executing_eagerly():
        name = 'BytePSPushPull_%s' % _normalize_name(tensor.name)
    if scope == '' and not _executing_eagerly():
        if 'v1' in dir(tf.compat):
            scope = tf.compat.v1.get_default_graph().get_name_scope()
        else:
            scope = tf.get_default_graph().get_name_scope()
        if scope != '':
            scope += '/'
    if not name:
        name = ''
    full_name = scope + name
    if not full_name:
        # Anonymous tensors (e.g. under eager execution) still need a
        # unique key for declaration.
        full_name = "empty_name_" + randomString()
    full_name_ascii = full_name.encode("ascii")
    TF_LIB_CTYPES.byteps_tensorflow_declare_tensor(
        ctypes.c_char_p(full_name_ascii))
    return C_LIB.byteps_push_pull(tensor, name=name, input_name=full_name)
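# `randomString` is called above but not defined in this snippet; a minimal
# sketch of what such a helper might look like (an assumption, not the
# actual BytePS implementation). It only needs to produce a key that is
# unique with high probability, so anonymous tensors can still be declared:
import random
import string

def randomString(length=16):
    # Random lowercase identifier; probabilistic uniqueness is ample for
    # naming a handful of anonymous tensors.
    return ''.join(random.choice(string.ascii_lowercase)
                   for _ in range(length))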
def __init__(self, optimizer, name=None, use_locking=False, device_dense='',
             device_sparse='', compression=Compression.none,
             sparse_as_dense=False):
    """Construct a new DistributedOptimizer, which uses another optimizer
    under the hood for computing single-process gradient values and applying
    gradient updates after the gradient values have been averaged across all
    the BytePS ranks.

    Args:
      optimizer: Optimizer to use for computing gradients and applying
        updates.
      name: Optional name prefix for the operations created when applying
        gradients. Defaults to "Distributed" followed by the provided
        optimizer type.
      use_locking: Whether to use locking when updating variables.
        See Optimizer.__init__ for more info.
      device_dense: Device to be used for dense tensors. Uses GPU by default.
      device_sparse: Device to be used for sparse tensors. Uses GPU by
        default.
      compression: Compression algorithm used during push_pull to reduce the
        amount of data sent during each parameter update step. Defaults to
        not using compression.
      sparse_as_dense: Treat all sparse gradients as dense tensors. This can
        help improve performance and memory utilization if the original
        sparse gradient has high density. Defaults to False.
    """
    if name is None:
        name = "Distributed{}".format(type(optimizer).__name__)

    self._optimizer = optimizer
    self._device_dense = device_dense
    self._device_sparse = device_sparse
    self._compression = compression
    self._sparse_as_dense = sparse_as_dense

    def push_pull_grads(grads):
        with tf.name_scope(self._name + "_Push_Pull") as scope:
            if self._sparse_as_dense:
                # Densify IndexedSlices gradients before the collective.
                grads = [tf.convert_to_tensor(grad)
                         if grad is not None and isinstance(grad, tf.IndexedSlices)
                         else grad for grad in grads]
            return [push_pull(grad, scope,
                              device_dense=self._device_dense,
                              device_sparse=self._device_sparse,
                              compression=self._compression)
                    if grad is not None else grad
                    for grad in grads]

    if _executing_eagerly():
        self._push_pull_grads = tf.contrib.eager.defun(push_pull_grads)
    else:
        self._push_pull_grads = push_pull_grads

    super(DistributedOptimizer, self).__init__(
        name=name, use_locking=use_locking)
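# A minimal usage sketch for the wrapper constructed above (assumes
# `byteps.tensorflow` is importable and a BytePS job has been launched; the
# toy model here is a placeholder): wrap a stock TF1 optimizer so gradients
# are push_pull'ed across ranks before being applied.
import tensorflow as tf
import byteps.tensorflow as bps

bps.init()
x = tf.compat.v1.placeholder(tf.float32, [None, 10])
w = tf.compat.v1.get_variable("w", [10, 1])
loss = tf.reduce_mean(tf.square(tf.matmul(x, w)))
# Scale the learning rate by the number of workers, as is conventional for
# synchronous data-parallel training.
opt = tf.compat.v1.train.AdamOptimizer(0.001 * bps.size())
opt = bps.DistributedOptimizer(opt, sparse_as_dense=True)
train_op = opt.minimize(loss)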
def __init__(self, optimizer, name=None, use_locking=False, device_dense='',
             device_sparse='', compression=Compression.none,
             sparse_as_dense=False, op=Average):
    if name is None:
        name = "Distributed{}".format(type(optimizer).__name__)
    super(_DistributedOptimizer, self).__init__(
        name=name, use_locking=use_locking)

    self._optimizer = optimizer
    self._device_dense = device_dense
    self._device_sparse = device_sparse
    self._compression = compression
    self._sparse_as_dense = sparse_as_dense
    self._enable_async = (int(os.getenv('BYTEPS_ENABLE_ASYNC', 0)) != 0)
    if self._enable_async:
        assert int(os.getenv('DMLC_NUM_WORKER')) > 1, \
            "Async is only valid for distributed training"
        print('BytePS: enable asynchronous training')

    def push_pull_grads(grads):
        with tf.name_scope(self._name + "_Push_Pull") as scope:
            if self._sparse_as_dense:
                grads = [tf.convert_to_tensor(grad)
                         if grad is not None and isinstance(grad, tf.IndexedSlices)
                         else grad for grad in grads]
            return [push_pull(grad, scope,
                              device_dense=self._device_dense,
                              device_sparse=self._device_sparse,
                              compression=self._compression,
                              enable_async=self._enable_async)
                    if grad is not None else grad
                    for grad in grads]

    if _executing_eagerly():
        self._push_pull_grads = tf.contrib.eager.defun(push_pull_grads)
    else:
        self._push_pull_grads = push_pull_grads
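# The async path above is gated entirely by the environment variables named
# in the code (BYTEPS_ENABLE_ASYNC and DMLC_NUM_WORKER). A launch sketch:
#
#   export DMLC_NUM_WORKER=2        # must be > 1, per the assertion above
#   export BYTEPS_ENABLE_ASYNC=1    # switch from synchronous averaging
#   python train.py
#
# Left unset (or set to 0), BYTEPS_ENABLE_ASYNC keeps training synchronous.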
def _push_pull(tensor, scope='', name=None):
    """An op which sums an input tensor over all the BytePS processes.

    The reduction operation is keyed by the name of the op. The tensor type
    and shape must be the same on all BytePS processes for a given name. The
    reduction will not start until all processes are ready to send and
    receive the tensor.

    Returns:
      A tensor of the same shape and type as `tensor`, summed across all
      processes.
    """
    if name is None and not _executing_eagerly():
        name = 'BytePSPushPull_%s' % _normalize_name(tensor.name)
    # ctypes.c_char_p expects bytes under Python 3, so encode the full name.
    TF_LIB_CTYPES.byteps_tensorflow_declare_tensor(
        ctypes.c_char_p((scope + name).encode("ascii")))
    return C_LIB.byteps_push_pull(tensor, name=name)
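# A minimal NumPy sketch (no BytePS required, names hypothetical) of the
# push_pull semantics documented above: every rank contributes its tensor
# and receives the elementwise sum across all ranks.
import numpy as np

def _simulated_push_pull(tensors):
    reduced = np.sum(tensors, axis=0)
    return [reduced.copy() for _ in tensors]

_out = _simulated_push_pull([np.array([1., 2.]), np.array([3., 4.])])
assert all((t == np.array([4., 6.])).all() for t in _out)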
def broadcast_global_variables(root_rank):
    """Broadcasts all global variables from root rank to all other processes.

    **NOTE:** deprecated in TensorFlow 2.0.

    Arguments:
        root_rank: rank of the process from which global variables will be
            broadcasted to all other processes.
    """
    if _executing_eagerly():
        raise RuntimeError(
            "bps.broadcast_global_variables() does not support eager execution. "
            "Please use `bps.broadcast_variables(<model/optimizer variables>)` instead."
        )
    return broadcast_variables(_global_variables(), root_rank)
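# Typical graph-mode usage sketch (assumes `byteps.tensorflow as bps` and a
# graph that has already been built): run the broadcast once right after
# initialization so every worker starts from rank 0's weights.
import tensorflow as tf
import byteps.tensorflow as bps

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    # One-time sync of all global variables from rank 0.
    sess.run(bps.broadcast_global_variables(root_rank=0))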
def broadcast(tensor, root_rank, name=None, is_variable=True):
    """An op which broadcasts the input tensor on root rank to the same
    input tensor on all other BytePS processes.

    The broadcast operation is keyed by the name of the op. The tensor type
    and shape must be the same on all BytePS processes for a given name. The
    broadcast will not start until all processes are ready to send and
    receive the tensor.

    Returns:
      A tensor of the same shape and type as `tensor`, with the value
      broadcasted from root rank.
    """
    # Broadcast is implemented as push + pull after zero-ing non-root tensors.
    if name is None and not _executing_eagerly():
        name = 'BytePSBroadcast_%s' % _normalize_name(tensor.name)
    # ctypes.c_char_p expects bytes under Python 3, so encode the name.
    TF_LIB_CTYPES.byteps_tensorflow_declare_tensor(
        ctypes.c_char_p(name.encode("ascii")))
    if is_variable and (root_rank != rank()):
        # Zero the variable first so the sum across ranks equals the root's
        # value.
        return C_LIB.byteps_push_pull(tensor.assign(tf.zeros_like(tensor)),
                                      name=name)
    else:
        # TODO: needs to zero-out non-variable tensors, too
        return C_LIB.byteps_push_pull(tensor, name=name)
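# A NumPy illustration (hypothetical, no BytePS required) of the TODO above:
# if a non-root, non-variable tensor is NOT zeroed before the sum, the
# "broadcast" result is corrupted by the non-root contribution.
import numpy as np

_root = np.array([1., 2.])    # root rank's value
_other = np.array([5., 5.])   # stale value on a non-root rank, not zeroed
assert not np.array_equal(_root + _other, _root)             # sum != root
assert np.array_equal(_root + np.zeros_like(_other), _root)  # zeroed: correct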