def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    A tensor is reduced as sparse only when it is sparse on every device.
    If it is sparse on some devices and dense on others, the sparse copies
    are converted to dense and the tensor is reduced together with the
    other dense tensors.

    Arguments:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device. An empty ``inputs``
        yields an empty tuple.
    """
    # Transpose once: one tuple per tensor position, holding that tensor's
    # copy from every GPU. Materializing here means ``inputs`` is walked a
    # single time and never indexed, so any iterable (including an empty
    # one or a one-shot iterator) is accepted.
    tensors_across_gpus = list(zip(*inputs))
    num_gpus = len(tensors_across_gpus[0]) if tensors_across_gpus else 0

    dense_tensors = [[] for _ in range(num_gpus)]  # shape (num_gpus, num_tensors)
    output = []
    # One representative tensor per result, in input order; handed to
    # `_reorder_tensors_as` as the ordering reference at the end.
    ref_order = []

    # process sparse ones first since they may have different sizes on
    # different gpus
    for tensor_at_gpus in tensors_across_gpus:
        if all(t.is_sparse for t in tensor_at_gpus):
            output.append(reduce_add(tensor_at_gpus, destination))
            ref_order.append(tensor_at_gpus[0])
        else:
            # Mixed sparse/dense (or all dense): densify the sparse copies
            # so every device contributes a dense tensor.
            for coll, t in zip(dense_tensors, tensor_at_gpus):
                coll.append(t.to_dense() if t.is_sparse else t)
            # Reference the tensor that is actually reduced, which may be
            # the densified copy rather than the original input.
            ref_order.append(dense_tensors[0][-1])

    # now the dense ones, coalesced into chunks of at most `buffer_size`
    itrs = [_take_tensors(tensors, buffer_size) for tensors in dense_tensors]
    for chunks in zip(*itrs):
        flat_tensors = [_flatten_dense_tensors(chunk) for chunk in chunks]  # (num_gpus,)
        flat_result = reduce_add(flat_tensors, destination)
        output.extend(_unflatten_dense_tensors(flat_result, chunks[0]))
    return tuple(_reorder_tensors_as(output, ref_order))
def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations. Tensors that are sparse on every device are
    reduced one by one; all others are treated as dense (sparse copies are
    converted with ``to_dense``) and reduced in coalesced chunks.

    Arguments:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
    """
    # TODO: When `len(inputs) == 1` and all inputs are on `destination`,
    # just return `inputs`.
    per_gpu_dense: List[List] = [[] for _ in inputs]  # shape (num_gpus, num_tensors)
    reduced = []
    order_reference = []

    # Sparse tensors go first: they may have different sizes on different
    # gpus, so they are reduced individually instead of being coalesced.
    for replicas in zip(*inputs):
        if any(not replica.is_sparse for replica in replicas):
            # At least one dense copy: collect a dense version per gpu.
            for bucket, replica in zip(per_gpu_dense, replicas):
                if replica.is_sparse:
                    replica = replica.to_dense()
                bucket.append(replica)
            order_reference.append(per_gpu_dense[0][-1])
            continue
        # Sparse everywhere, so this result will be sparse too.
        reduced.append(reduce_add(replicas, destination))
        order_reference.append(replicas[0])

    # Now the dense ones, which have consistent sizes across gpus.
    chunk_streams = [
        _take_tensors(bucket, buffer_size) for bucket in per_gpu_dense
    ]
    for chunk_per_gpu in zip(*chunk_streams):
        flattened = [_flatten_dense_tensors(chunk) for chunk in chunk_per_gpu]  # (num_gpus,)
        summed = reduce_add(flattened, destination)
        pieces = _unflatten_dense_tensors(summed, chunk_per_gpu[0])
        # The unflattened tensors do not share storage, and we don't expose
        # base flat tensor anyways, so give them different version counters.
        # See NOTE [ Version Counter in comm.*_coalesced ]
        reduced.extend(piece.data for piece in pieces)

    return tuple(_reorder_tensors_as(reduced, order_reference))
def broadcast_coalesced(tensors, devices, buffer_size=10485760):
    """Broadcasts a sequence of tensors to the specified GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        tensors (sequence): tensors to broadcast.
        devices (Iterable): an iterable of devices among which to broadcast.
            Note that it should be like (src, dst1, dst2, ...), the first
            element of which is the source device to broadcast from.
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple with one entry per device in ``devices``; each entry holds
        the copies of ``tensors`` for that device, in the order of
        ``tensors``. The entry for the source device holds the original
        tensors.

    Raises:
        RuntimeError: if any tensor is not on ``devices[0]``.
    """
    if any(t.get_device() != devices[0] for t in tensors):
        raise RuntimeError('all tensors must be on devices[0]')

    per_device = [[] for _ in devices]
    # The source device keeps the original tensors; nothing is copied.
    source_slot = per_device[0]
    source_slot.extend(tensors)

    for chunk in _take_tensors(tensors, buffer_size):
        if not chunk[0].is_sparse:
            flat_chunk = _flatten_dense_tensors(chunk)
            copies = broadcast(flat_chunk, devices)
            unflattened = tuple(
                _unflatten_dense_tensors(copy, chunk) for copy in copies
            )
        else:
            # Sparse chunks are flattened into an (indices, values) pair,
            # and each half is broadcast on its own.
            flat_indices, flat_values = _flatten_sparse_tensors(chunk)
            indices_copies = broadcast(flat_indices, devices)
            values_copies = broadcast(flat_values, devices)
            unflattened = tuple(
                _unflatten_sparse_tensors(pair, chunk)
                for pair in zip(indices_copies, values_copies)
            )

        # Only the non-source devices take the broadcast results.
        for slot, pieces in list(zip(per_device, unflattened))[1:]:
            slot.extend(pieces)

    return tuple(_reorder_tensors_as(slot, tensors) for slot in per_device)
def broadcast_coalesced(tensors, devices, buffer_size=10485760):
    """Broadcasts a sequence of tensors to the specified GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        tensors (sequence): tensors to broadcast.
        devices (Iterable): an iterable of devices among which to broadcast.
            Note that it should be like (src, dst1, dst2, ...), the first
            element of which is the source device to broadcast from.
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple containing copies of ``tensors``, placed on devices
        corresponding to indices from ``devices``. The first entry reuses
        the original tensors rather than copies.

    Raises:
        RuntimeError: if any tensor is not on ``devices[0]``.
    """

    def broadcast_chunk(chunk):
        """Broadcast one coalesced chunk; return its unflattened tensors per device."""
        if not chunk[0].is_sparse:
            flat = _flatten_dense_tensors(chunk)
            return tuple(
                _unflatten_dense_tensors(received, chunk)
                for received in broadcast(flat, devices)
            )
        flat_indices, flat_values = _flatten_sparse_tensors(chunk)
        indices_by_device = broadcast(flat_indices, devices)
        values_by_device = broadcast(flat_values, devices)
        return tuple(
            _unflatten_sparse_tensors(indices_and_values, chunk)
            for indices_and_values in zip(indices_by_device, values_by_device)
        )

    for candidate in tensors:
        if candidate.get_device() == devices[0]:
            continue
        raise RuntimeError('all tensors must be on devices[0]')

    collected = [[] for _ in devices]
    # use the original tensors for the first device
    collected[0].extend(tensors)

    for chunk in _take_tensors(tensors, buffer_size):
        by_device = broadcast_chunk(chunk)
        # Index 0 is the source device, which already holds the originals;
        # only the remaining devices take the broadcast results.
        for device_idx in range(1, min(len(collected), len(by_device))):
            collected[device_idx].extend(by_device[device_idx])

    return tuple(
        _reorder_tensors_as(device_tensors, tensors)
        for device_tensors in collected
    )
def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    A tensor is reduced as sparse only when it is sparse on every device;
    otherwise its sparse copies are converted to dense and it is reduced
    together with the other dense tensors.

    Arguments:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
    """
    dense_tensors = [[] for _ in inputs]  # shape (num_gpus, num_tensors)
    output = []
    # One representative tensor per result, in input order; handed to
    # `_reorder_tensors_as` as the ordering reference at the end.
    ref_order = []

    # process sparse ones first since they may have different sizes on
    # different gpus
    for tensor_at_gpus in zip(*inputs):
        if all(t.is_sparse for t in tensor_at_gpus):
            result = reduce_add(tensor_at_gpus, destination)  # this will be sparse too
            output.append(result)
            ref_order.append(tensor_at_gpus[0])
        else:
            for coll, t in zip(dense_tensors, tensor_at_gpus):
                coll.append(t.to_dense() if t.is_sparse else t)
            # Reference the tensor that is actually reduced, which may be
            # the densified copy rather than the original input.
            ref_order.append(dense_tensors[0][-1])

    itrs = [_take_tensors(tensors, buffer_size) for tensors in dense_tensors]
    # now the dense ones, which have consistent sizes
    for chunks in zip(*itrs):
        flat_tensors = [_flatten_dense_tensors(chunk) for chunk in chunks]  # (num_gpus,)
        flat_result = reduce_add(flat_tensors, destination)
        for t in _unflatten_dense_tensors(flat_result, chunks[0]):
            # The unflattened tensors do not share storage with each other,
            # and the flat base tensor is never exposed to the caller, so
            # hand out `.data` to give every output its own version counter
            # instead of one shared with its siblings.
            output.append(t.data)
    return tuple(_reorder_tensors_as(output, ref_order))