예제 #1
0
 def wrapper(state, *args, **kwargs):
     """Run ``func`` and convert Horovod collective failures.

     Arguments:
         state: Passed through to ``func`` as its first positional argument.
         *args: Extra positional arguments forwarded to ``func``.
         **kwargs: Extra keyword arguments forwarded to ``func``.

     Returns:
         Whatever ``func`` returns.

     Raises:
         HorovodInternalError: If ``func`` raises ``UnknownError`` whose
             message mentions a Horovod allreduce, allgather or broadcast op.
         UnknownError: Any other ``UnknownError`` is propagated unchanged.
     """
     try:
         return func(state, *args, **kwargs)
     except UnknownError as e:
         horovod_ops = ('HorovodAllreduce', 'HorovodAllgather', 'HorovodBroadcast')
         if any(op_name in e.message for op_name in horovod_ops):
             raise HorovodInternalError(e) from e
         # Not a Horovod collective failure: propagate it instead of
         # swallowing the error and implicitly returning None.
         raise
예제 #2
0
def _allgather_async(tensor, output, name):
    """Start an allgather through ``mpi_lib`` and return its handle.

    Arguments:
        tensor: Input passed to the native allgather function; also used by
                ``_check_function`` to pick the function name.
        output: Output object passed to the native allgather function.
        name: Optional operation name; encoded to bytes when given, otherwise
              ``_NULL`` is passed.

    Returns:
        The handle returned by the native call.

    Raises:
        HorovodInternalError: If the native call raises ``RuntimeError``.
    """
    op_name = _check_function(_allgather_function_factory, tensor)
    try:
        native_allgather = getattr(mpi_lib, op_name)
        encoded_name = _NULL if name is None else name.encode()
        op_handle = native_allgather(tensor, output, encoded_name)
    except RuntimeError as e:
        raise HorovodInternalError(e)

    # Track the tensors under the handle; the last element is the output.
    _handle_map[op_handle] = (tensor, output)
    return op_handle
예제 #3
0
파일: mpi_ops.py 프로젝트: rongou/horovod
def _broadcast_async(tensor, output, root_rank, name, process_set: ProcessSet):
    """Start a broadcast through ``mpi_lib`` and return its handle.

    Arguments:
        tensor: Input passed to the native broadcast function; also used by
                ``_check_function`` to pick the function name.
        output: Output object passed to the native broadcast function.
        root_rank: Root rank forwarded to the native broadcast function.
        name: Optional operation name; encoded to bytes when given, otherwise
              ``_NULL`` is passed.
        process_set: Process set whose ``process_set_id`` is forwarded to the
                     native call.

    Returns:
        The handle returned by the native call.

    Raises:
        HorovodInternalError: If the native call raises ``RuntimeError``.
    """
    op_name = _check_function(_broadcast_function_factory, tensor)
    try:
        native_broadcast = getattr(mpi_lib, op_name)
        encoded_name = _NULL if name is None else name.encode()
        op_handle = native_broadcast(
            tensor, output, root_rank, encoded_name,
            process_set.process_set_id)
    except RuntimeError as e:
        raise HorovodInternalError(e)

    # Track the tensors under the handle; the last element is the output.
    _handle_map[op_handle] = (tensor, output)
    return op_handle
예제 #4
0
파일: mpi_ops.py 프로젝트: raajay/horovod
def _allreduce_async(tensor, output, name, op, prescale_factor,
                     postscale_factor, process_set: ProcessSet):
    """Start an allreduce through ``mpi_lib`` and return its handle.

    The divisor passed to the native call is 1 unless Horovod was built with
    ROCm: then ``Average`` is sent as ``Sum`` with divisor ``size()``, and
    GPU Adasum with NCCL uses divisor ``local_size()``.

    Arguments:
        tensor: Input passed to the native allreduce function.
        output: Output object passed to the native allreduce function.
        name: Optional operation name; encoded to bytes when given, otherwise
              ``_NULL`` is passed.
        op: Reduction op (compared against ``Average`` and ``Adasum``).
        prescale_factor: Forwarded unchanged to the native call.
        postscale_factor: Forwarded unchanged to the native call.
        process_set: Process set whose ``process_set_id`` is forwarded to the
                     native call.

    Returns:
        The handle returned by the native call.

    Raises:
        NotImplementedError: For unsupported Adasum configurations
            (non-global process set, heterogeneous GPU cluster, or a
            non-power-of-2 node/rank count).
        HorovodInternalError: If the native call raises ``RuntimeError``.
    """
    # Every path that does not explicitly override it uses a divisor of 1.
    divisor = 1

    if op == Average:
        if rocm_built():
            # For ROCm, perform averaging at framework level
            divisor = size()
            op = Sum
    elif op == Adasum:
        if process_set != global_process_set:
            raise NotImplementedError(
                "Adasum does not support non-global process sets yet.")

        on_gpu = tensor.device.type != 'cpu' and gpu_available('torch')
        if not on_gpu:
            if not num_rank_is_power_2(size()):
                raise NotImplementedError(
                    'Running Adasum with non-power of 2 ranks is not supported yet.'
                )
        elif not nccl_built():
            warnings.warn(
                'Adasum reduction does not currently support GPU reduction using MPI. Tensors are '
                'copied to CPU memory instead. To use Adasum for GPU reduction, please compile Horovod '
                'with HOROVOD_GPU_OPERATIONS=NCCL.')
        else:
            if not is_homogeneous():
                raise NotImplementedError(
                    'Running GPU Adasum on heterogeneous cluster is not supported yet.'
                )
            if not num_rank_is_power_2(int(size() / local_size())):
                raise NotImplementedError(
                    'Running GPU Adasum with non-power of 2 nodes is not supported yet.'
                )
            if rocm_built():
                # For ROCm, perform averaging at framework level
                divisor = local_size()

    op_name = _check_function(_allreduce_function_factory, tensor)
    try:
        native_allreduce = getattr(mpi_lib, op_name)
        encoded_name = _NULL if name is None else name.encode()
        op_handle = native_allreduce(
            tensor, output, divisor, encoded_name, op,
            prescale_factor, postscale_factor, process_set.process_set_id)
    except RuntimeError as e:
        raise HorovodInternalError(e)

    # Track the tensors under the handle; the last element is the output.
    _handle_map[op_handle] = (tensor, output)
    return op_handle
예제 #5
0
def _alltoall_async(tensor, splits, output, output_received_splits, name):
    """Start an alltoall through ``mpi_lib`` and return its handle.

    Arguments:
        tensor: Input passed to the native alltoall function.
        splits: ``None``, a ``torch.Tensor``, or a value convertible with
                ``torch.tensor``. ``None`` becomes an empty int32 CPU tensor
                used as a placeholder; other non-tensor values are converted
                to an int32 CPU tensor.
        output: Output object passed to the native alltoall function.
        output_received_splits: Second output object passed to the native
                                alltoall function.
        name: Optional operation name; encoded to bytes when given, otherwise
              ``_NULL`` is passed.

    Returns:
        The handle returned by the native call.

    Raises:
        HorovodInternalError: If the native call raises ``RuntimeError``.
    """
    if not isinstance(splits, torch.Tensor):
        # A missing splits argument is represented by an empty placeholder.
        split_values = [] if splits is None else splits
        splits = torch.tensor(split_values, dtype=torch.int32, device='cpu')

    op_name = _check_function(_alltoall_function_factory, tensor)
    try:
        native_alltoall = getattr(mpi_lib, op_name)
        encoded_name = _NULL if name is None else name.encode()
        op_handle = native_alltoall(
            tensor, splits, output, output_received_splits, encoded_name)
    except RuntimeError as e:
        raise HorovodInternalError(e)

    # The last element of the entry is the (output, received splits) pair.
    _handle_map[op_handle] = (tensor, splits, (output, output_received_splits))
    return op_handle
예제 #6
0
def join(device=-1):
    """Signal that this rank has finished processing data.

    Ranks that have not called join() keep processing allreduce operations.
    The calling Python thread blocks until every rank has joined.

    Arguments:
        device: An id of the device to create temporary zero tensors
                (default -1, CPU)

    Returns:
        Id of the rank that joined last.

    Raises:
        HorovodInternalError: If the native call raises ``RuntimeError``.
    """
    try:
        last_joined_rank = mpi_lib.horovod_torch_join(device)
    except RuntimeError as e:
        raise HorovodInternalError(e)
    return last_joined_rank
예제 #7
0
def _allreduce_async(tensor, output, name, op):
    """Start an allreduce through ``mpi_lib`` and return its handle.

    ``Average`` is sent to the native call as ``Sum`` with divisor
    ``size()``; GPU Adasum with NCCL uses divisor ``local_size()``; every
    other case uses divisor 1.

    Arguments:
        tensor: Input passed to the native allreduce function.
        output: Output object passed to the native allreduce function.
        name: Optional operation name; encoded to bytes when given, otherwise
              ``_NULL`` is passed.
        op: Reduction op (compared against ``Average`` and ``Adasum``).

    Returns:
        The handle returned by the native call.

    Raises:
        NotImplementedError: If ``tensor`` is float16 and
            ``_fp16_supported`` is false, or for unsupported Adasum
            configurations (heterogeneous GPU cluster, or a non-power-of-2
            node/rank count).
        HorovodInternalError: If the native call raises ``RuntimeError``.
    """
    if tensor.dtype == torch.float16 and not _fp16_supported:
        raise NotImplementedError(
            'float16 allreduce is not supported for PyTorch version {} < 1.0.0'
            .format(torch.__version__))

    # Every path that does not explicitly override it uses a divisor of 1.
    divisor = 1

    if op == Average:
        divisor = size()
    elif op == Adasum:
        on_gpu = tensor.device.type != 'cpu' and gpu_available('torch')
        if not on_gpu:
            if not num_rank_is_power_2(size()):
                raise NotImplementedError(
                    'Running Adasum with non-power of 2 ranks is not supported yet.'
                )
        elif not nccl_built():
            warnings.warn(
                'Adasum reduction does not currently support GPU reduction using MPI. Tensors are '
                'copied to CPU memory instead. To use Adasum for GPU reduction, please compile Horovod '
                'with HOROVOD_GPU_OPERATIONS=NCCL.')
        else:
            if not is_homogeneous():
                raise NotImplementedError(
                    'Running GPU Adasum on heterogeneous cluster is not supported yet.'
                )
            if not num_rank_is_power_2(int(size() / local_size())):
                raise NotImplementedError(
                    'Running GPU Adasum with non-power of 2 nodes is not supported yet.'
                )
            divisor = local_size()

    # Averaging happens in framework code, so the native call receives Sum.
    if op == Average:
        true_op = Sum
    else:
        true_op = op

    op_name = _check_function(_allreduce_function_factory, tensor)
    try:
        native_allreduce = getattr(mpi_lib, op_name)
        encoded_name = _NULL if name is None else name.encode()
        op_handle = native_allreduce(
            tensor, output, divisor, encoded_name, true_op)
    except RuntimeError as e:
        raise HorovodInternalError(e)

    # Track the tensors under the handle; the last element is the output.
    _handle_map[op_handle] = (tensor, output)
    return op_handle
예제 #8
0
파일: mpi_ops.py 프로젝트: rongou/horovod
def barrier(process_set=global_process_set):
    """
    A simple synchronization point for the ranks in the given process set
    (the global process set by default). Ranks that reach this call stall
    until all other ranks have reached it.

    Arguments:
        process_set: Process set object to limit this operation to a subset of
                     Horovod processes. Default is the global process set.

    Raises:
        HorovodInternalError: If starting the native barrier raises
            ``RuntimeError``.
    """
    try:
        barrier_handle = mpi_lib.horovod_torch_barrier(process_set.process_set_id)
    except RuntimeError as e:
        raise HorovodInternalError(e)

    # A barrier has no tensors; register an empty entry for the handle.
    _handle_map[barrier_handle] = (None, None)

    synchronize(barrier_handle)
예제 #9
0
def synchronize(handle):
    """
    Synchronizes an asynchronous allreduce, allgather, alltoall or broadcast operation until
    it's completed. Returns the result of the operation.

    Arguments:
        handle: A handle returned by an allreduce, allgather, alltoall or broadcast asynchronous
                operation.

    Returns:
        A single output tensor of the operation or a tuple of multiple output tensors.
        Returns None if ``handle`` is not tracked in ``_handle_map``.

    Raises:
        HorovodInternalError: If waiting on the handle raises ``RuntimeError``.
    """
    if handle not in _handle_map:
        return

    try:
        mpi_lib.horovod_torch_wait_and_clear(handle)
    except RuntimeError as e:
        # Drop the bookkeeping entry on failure too; otherwise it would stay
        # in _handle_map and keep whatever it references alive.
        _handle_map.pop(handle, None)
        raise HorovodInternalError(e) from e

    # The output is stored as the last element of the handle's entry.
    return _handle_map.pop(handle)[-1]
예제 #10
0
파일: mpi_ops.py 프로젝트: rongou/horovod
def join(device=-1) -> int:
    """Signal that this rank has finished processing data.

    Ranks that have not called join() keep processing allreduce operations.
    The calling Python thread blocks until every rank has joined.

    Arguments:
        device: An id of the device to create temporary zero tensors
                (default -1, CPU)

    Returns:
        Id of the rank that joined last.

    Raises:
        HorovodInternalError: If starting the native join raises
            ``RuntimeError``.
    """
    # CPU scalar handed to the native call; its value is read after the wait.
    last_joined = torch.tensor(-1, dtype=torch.int, device=torch.device("cpu"))
    try:
        join_handle = mpi_lib.horovod_torch_join(last_joined, device)
    except RuntimeError as e:
        raise HorovodInternalError(e)

    # No input tensor for join; only the output slot is registered.
    _handle_map[join_handle] = (None, last_joined)

    result = synchronize(join_handle)
    return result.item()