Python HorovodInternalError 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: horovod.common.exceptions

클래스/타입: HorovodInternalError

hotexamples.com에 수록된 예제 수: 10

Python HorovodInternalError - 10개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 horovod.common.exceptions.HorovodInternalError의 실제 Python 사용 예제 중 평가가 가장 높은 것들입니다. 각 예제를 평가해 주시면 예제의 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

HorovodInternalError(10)

자주 사용되는 메소드들

HorovodInternalError (10)

예제 #1

파일 보기

 def wrapper(state, *args, **kwargs):
     """Call ``func`` and translate Horovod collective failures.

     Arguments:
         state: Object forwarded to ``func`` as its first argument.
         *args: Extra positional arguments forwarded to ``func``.
         **kwargs: Extra keyword arguments forwarded to ``func``.

     Returns:
         Whatever ``func(state, *args, **kwargs)`` returns.

     Raises:
         HorovodInternalError: If ``func`` raises an ``UnknownError`` whose
             message names a Horovod allreduce, allgather or broadcast op.
         UnknownError: Any other ``UnknownError`` raised by ``func``; it is
             propagated unchanged.
     """
     try:
         return func(state, *args, **kwargs)
     except UnknownError as e:
         horovod_ops = ('HorovodAllreduce', 'HorovodAllgather', 'HorovodBroadcast')
         if any(op_name in e.message for op_name in horovod_ops):
             raise HorovodInternalError(e)
         # Not a Horovod collective failure: re-raise instead of swallowing
         # the error and implicitly returning None to the caller.
         raise

예제 #2

파일 보기

def _allgather_async(tensor, output, name):
    """Launch an asynchronous allgather through the native library.

    The native entry point name is resolved by ``_check_function`` from
    ``_allgather_function_factory`` and ``tensor``. On success the pair
    ``(tensor, output)`` is recorded in ``_handle_map`` under the new handle.

    Arguments:
        tensor: Input tensor passed to the native op.
        output: Output tensor passed to the native op.
        name: Optional operation name; ``_NULL`` is passed when it is None.

    Returns:
        The handle returned by the native op.

    Raises:
        HorovodInternalError: If the native call raises ``RuntimeError``.
    """
    native_name = _check_function(_allgather_function_factory, tensor)
    try:
        native_op = getattr(mpi_lib, native_name)
        encoded_name = _NULL if name is None else name.encode()
        handle = native_op(tensor, output, encoded_name)
    except RuntimeError as err:
        raise HorovodInternalError(err)

    # Remember the tensors that belong to this in-flight operation.
    _handle_map[handle] = (tensor, output)
    return handle

예제 #3

파일 보기

파일: mpi_ops.py 프로젝트: rongou/horovod

def _broadcast_async(tensor, output, root_rank, name, process_set: ProcessSet):
    """Launch an asynchronous broadcast through the native library.

    The native entry point name is resolved by ``_check_function`` from
    ``_broadcast_function_factory`` and ``tensor``. On success the pair
    ``(tensor, output)`` is recorded in ``_handle_map`` under the new handle.

    Arguments:
        tensor: Input tensor passed to the native op.
        output: Output tensor passed to the native op.
        root_rank: Root rank value forwarded to the native op.
        name: Optional operation name; ``_NULL`` is passed when it is None.
        process_set: Process set whose ``process_set_id`` is forwarded to the
                     native op.

    Returns:
        The handle returned by the native op.

    Raises:
        HorovodInternalError: If the native call raises ``RuntimeError``.
    """
    native_name = _check_function(_broadcast_function_factory, tensor)
    try:
        native_op = getattr(mpi_lib, native_name)
        encoded_name = _NULL if name is None else name.encode()
        handle = native_op(tensor, output, root_rank, encoded_name,
                           process_set.process_set_id)
    except RuntimeError as err:
        raise HorovodInternalError(err)

    # Remember the tensors that belong to this in-flight operation.
    _handle_map[handle] = (tensor, output)
    return handle

예제 #4

파일 보기

파일: mpi_ops.py 프로젝트: raajay/horovod

def _allreduce_async(tensor, output, name, op, prescale_factor,
                     postscale_factor, process_set: ProcessSet):
    """Launch an asynchronous allreduce through the native library.

    Picks the divisor handed to the native op from the reduction ``op`` and
    the build/cluster configuration, then enqueues the operation and records
    ``(tensor, output)`` in ``_handle_map`` under the new handle.

    Arguments:
        tensor: Input tensor passed to the native op.
        output: Output tensor passed to the native op.
        name: Optional operation name; ``_NULL`` is passed when it is None.
        op: Reduction operation, compared against ``Average`` and ``Adasum``.
        prescale_factor: Forwarded unchanged to the native op.
        postscale_factor: Forwarded unchanged to the native op.
        process_set: Process set whose ``process_set_id`` is forwarded to the
                     native op.

    Returns:
        The handle returned by the native op.

    Raises:
        NotImplementedError: For Adasum on a non-global process set, on a
            heterogeneous GPU cluster, or with a non-power-of-2 rank/node
            count.
        HorovodInternalError: If the native call raises ``RuntimeError``.
    """
    # Every path that does not explicitly override it uses a divisor of 1.
    divisor = 1

    if op == Average:
        if rocm_built():
            # For ROCm, averaging is performed at framework level: reduce with
            # Sum and pass the size as the divisor.
            # NOTE(review): this is the global size() even when process_set is
            # not the global set - confirm that is intended.
            divisor = size()
            op = Sum
    elif op == Adasum:
        if process_set != global_process_set:
            raise NotImplementedError(
                "Adasum does not support non-global process sets yet.")

        on_gpu = tensor.device.type != 'cpu' and gpu_available('torch')
        if not on_gpu:
            if not num_rank_is_power_2(size()):
                raise NotImplementedError(
                    'Running Adasum with non-power of 2 ranks is not supported yet.'
                )
        elif nccl_built():
            if not is_homogeneous():
                raise NotImplementedError(
                    'Running GPU Adasum on heterogeneous cluster is not supported yet.'
                )
            if not num_rank_is_power_2(int(size() / local_size())):
                raise NotImplementedError(
                    'Running GPU Adasum with non-power of 2 nodes is not supported yet.'
                )
            if rocm_built():
                # For ROCm, averaging is performed at framework level.
                divisor = local_size()
        else:
            warnings.warn(
                'Adasum reduction does not currently support GPU reduction using MPI. Tensors are '
                'copied to CPU memory instead. To use Adasum for GPU reduction, please compile Horovod '
                'with HOROVOD_GPU_OPERATIONS=NCCL.')

    native_name = _check_function(_allreduce_function_factory, tensor)
    try:
        native_op = getattr(mpi_lib, native_name)
        encoded_name = _NULL if name is None else name.encode()
        handle = native_op(tensor, output, divisor, encoded_name, op,
                           prescale_factor, postscale_factor,
                           process_set.process_set_id)
    except RuntimeError as err:
        raise HorovodInternalError(err)

    # Remember the tensors that belong to this in-flight operation.
    _handle_map[handle] = (tensor, output)
    return handle

예제 #5

파일 보기

파일: mpi_ops.py 프로젝트: mohithashok/horovod

def _alltoall_async(tensor, splits, output, output_received_splits, name):
    """Launch an asynchronous alltoall through the native library.

    ``splits`` is normalised to an int32 CPU tensor before the call. On
    success ``(tensor, splits, (output, output_received_splits))`` is recorded
    in ``_handle_map`` under the new handle.

    Arguments:
        tensor: Input tensor passed to the native op.
        splits: None, a ``torch.Tensor``, or a value accepted by
                ``torch.tensor``; None becomes an empty placeholder tensor.
        output: Output tensor passed to the native op.
        output_received_splits: Tensor passed to the native op alongside
                                ``output``.
        name: Optional operation name; ``_NULL`` is passed when it is None.

    Returns:
        The handle returned by the native op.

    Raises:
        HorovodInternalError: If the native call raises ``RuntimeError``.
    """
    if not isinstance(splits, torch.Tensor):
        # A missing splits argument is represented by an empty tensor.
        split_values = [] if splits is None else splits
        splits = torch.tensor(split_values, dtype=torch.int32, device='cpu')

    native_name = _check_function(_alltoall_function_factory, tensor)
    try:
        native_op = getattr(mpi_lib, native_name)
        encoded_name = _NULL if name is None else name.encode()
        handle = native_op(tensor, splits, output, output_received_splits,
                           encoded_name)
    except RuntimeError as err:
        raise HorovodInternalError(err)

    # Remember every tensor that belongs to this in-flight operation.
    _handle_map[handle] = (tensor, splits, (output, output_received_splits))
    return handle

예제 #6

파일 보기

def join(device=-1):
    """Signal that this rank has finished processing data.

    Ranks that have not called join() keep processing allreduce operations.
    The calling Python thread blocks until all ranks have joined.

    Arguments:
        device: An id of the device to create temporary zero tensors
                (default -1, CPU)

    Returns:
        Id of the rank that joined last.

    Raises:
        HorovodInternalError: If the native call raises ``RuntimeError``.
    """
    try:
        last_joined_rank = mpi_lib.horovod_torch_join(device)
    except RuntimeError as err:
        raise HorovodInternalError(err)
    return last_joined_rank

예제 #7

파일 보기

파일: mpi_ops.py 프로젝트: fightseed/horovod-1

def _allreduce_async(tensor, output, name, op):
    """Launch an asynchronous allreduce through the native library.

    Picks the divisor handed to the native op from the reduction ``op`` and
    the build/cluster configuration, then enqueues the operation and records
    ``(tensor, output)`` in ``_handle_map`` under the new handle.

    Arguments:
        tensor: Input tensor passed to the native op.
        output: Output tensor passed to the native op.
        name: Optional operation name; ``_NULL`` is passed when it is None.
        op: Reduction operation, compared against ``Average`` and ``Adasum``.

    Returns:
        The handle returned by the native op.

    Raises:
        NotImplementedError: For float16 tensors when ``_fp16_supported`` is
            false, or for Adasum on a heterogeneous GPU cluster or with a
            non-power-of-2 rank/node count.
        HorovodInternalError: If the native call raises ``RuntimeError``.
    """
    if tensor.dtype == torch.float16 and not _fp16_supported:
        message = 'float16 allreduce is not supported for PyTorch version {} < 1.0.0'
        raise NotImplementedError(message.format(torch.__version__))

    # Defaults used by every path that does not override them below.
    divisor = 1
    true_op = op

    if op == Average:
        # Averaging happens in framework code, so the native call performs a
        # Sum and receives the size as the divisor.
        divisor = size()
        true_op = Sum
    elif op == Adasum:
        on_gpu = tensor.device.type != 'cpu' and gpu_available('torch')
        if not on_gpu:
            if not num_rank_is_power_2(size()):
                raise NotImplementedError(
                    'Running Adasum with non-power of 2 ranks is not supported yet.'
                )
        elif nccl_built():
            if not is_homogeneous():
                raise NotImplementedError(
                    'Running GPU Adasum on heterogeneous cluster is not supported yet.'
                )
            if not num_rank_is_power_2(int(size() / local_size())):
                raise NotImplementedError(
                    'Running GPU Adasum with non-power of 2 nodes is not supported yet.'
                )
            divisor = local_size()
        else:
            warnings.warn(
                'Adasum reduction does not currently support GPU reduction using MPI. Tensors are '
                'copied to CPU memory instead. To use Adasum for GPU reduction, please compile Horovod '
                'with HOROVOD_GPU_OPERATIONS=NCCL.')

    native_name = _check_function(_allreduce_function_factory, tensor)
    try:
        native_op = getattr(mpi_lib, native_name)
        encoded_name = _NULL if name is None else name.encode()
        handle = native_op(tensor, output, divisor, encoded_name, true_op)
    except RuntimeError as err:
        raise HorovodInternalError(err)

    # Remember the tensors that belong to this in-flight operation.
    _handle_map[handle] = (tensor, output)
    return handle

예제 #8

파일 보기

파일: mpi_ops.py 프로젝트: rongou/horovod

def barrier(process_set=global_process_set):
    """
    Simple synchronization point for the ranks of the given process set
    (the global process set by default). Ranks that reach this call stall
    until all other ranks have reached it.

    Arguments:
        process_set: Process set object to limit this operation to a subset of
                     Horovod processes. Default is the global process set.

    Raises:
        HorovodInternalError: If the native call raises ``RuntimeError``.
    """
    try:
        native_barrier = mpi_lib.horovod_torch_barrier
        barrier_handle = native_barrier(process_set.process_set_id)
    except RuntimeError as err:
        raise HorovodInternalError(err)

    # A barrier has no tensors; register the handle so synchronize() waits on it.
    _handle_map[barrier_handle] = (None, None)
    synchronize(barrier_handle)

예제 #9

파일 보기

def synchronize(handle):
    """
    Synchronizes an asynchronous allreduce, allgather, alltoall or broadcast operation until
    it's completed. Returns the result of the operation.

    Arguments:
        handle: A handle returned by an allreduce, allgather, alltoall or broadcast asynchronous
                operation.

    Returns:
        A single output tensor of the operation or a tuple of multiple output tensors.
        Returns None when the handle is not present in ``_handle_map``.

    Raises:
        HorovodInternalError: If waiting on the native operation raises ``RuntimeError``.
    """
    if handle not in _handle_map:
        return

    try:
        mpi_lib.horovod_torch_wait_and_clear(handle)
    except RuntimeError as e:
        # Drop the bookkeeping entry even on failure; otherwise the failed
        # handle stays in _handle_map and keeps its tensors referenced.
        _handle_map.pop(handle, None)
        raise HorovodInternalError(e)

    # The output is always the last element of the stored tuple.
    return _handle_map.pop(handle)[-1]

예제 #10

파일 보기

파일: mpi_ops.py 프로젝트: rongou/horovod

def join(device=-1) -> int:
    """Signal that this rank has finished processing data.

    Ranks that have not called join() keep processing allreduce operations.
    The calling Python thread blocks until all ranks have joined.

    Arguments:
        device: An id of the device to create temporary zero tensors
                (default -1, CPU)

    Returns:
        Id of the rank that joined last.

    Raises:
        HorovodInternalError: If the native call raises ``RuntimeError``.
    """
    # CPU scalar registered as the operation's output; initialised to -1.
    cpu = torch.device("cpu")
    last_joined = torch.tensor(-1, dtype=torch.int, device=cpu)

    try:
        join_handle = mpi_lib.horovod_torch_join(last_joined, device)
    except RuntimeError as err:
        raise HorovodInternalError(err)

    _handle_map[join_handle] = (None, last_joined)

    result = synchronize(join_handle)
    return result.item()