Example #1
def _get_size_helper(group, backend):
    """
    Helper for get_rank_size.

    Args:
        group (str): The communication group.
        backend (str): The backend, like "hccl".

    Raises:
        ValueError: If backend is invalid.

    Returns:
        Integer. The rank size of the specified group.
    """
    size = None
    if _is_role_pserver() or _is_role_sched():
        size = 1
        return size
    if backend == Backend.HCCL:
        if group == HCCL_WORLD_COMM_GROUP:
            size = hccl.get_rank_size()
        else:
            size = hccl.get_rank_size(group)
    elif backend == Backend.NCCL:
        size = mpi.get_rank_size(group)
    else:
        raise ValueError("Invalid backend: '{}'".format(backend))
    return size
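A minimal usage sketch for the helper above, assuming an Ascend process whose HCCL backend has already been initialized with init() (Example #4 below); the import path mirrors the module these snippets come from and is an assumption:

from mindspore.communication.management import init
init()  # sets up HCCL; see Example #4
# Assumed to run inside the same module, since _get_size_helper is private.
world_size = _get_size_helper(group=HCCL_WORLD_COMM_GROUP, backend=Backend.HCCL)
print("devices in the world group:", world_size)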
Example #2
def _get_rank_helper(group, backend):
    """
    Helper for get_rank_id.

    Args:
        group (str): The communication group.
        backend (str): The backend, like "hccl".

    Raises:
        ValueError: If backend is invalid.

    Returns:
        Integer. The local rank id of the calling process.
    """
    rank_id = None
    if _is_role_pserver() or _is_role_sched():
        rank_id = 0
        return rank_id
    if backend == Backend.HCCL:
        if group == HCCL_WORLD_COMM_GROUP:
            rank_id = hccl.get_rank_id()
        else:
            rank_id = hccl.get_rank_id(group)
    elif backend == Backend.NCCL:
        rank_id = mpi.get_rank_id(group)
    else:
        raise ValueError("Invalid backend: '{}'".format(backend))
    return rank_id
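The two helpers are typically read together, one rank/size pair per process. A short sketch, assuming an initialized NCCL backend on GPU; the group and backend names are the ones defined in these snippets:

# Pair the helpers to identify the calling process (sketch):
rank = _get_rank_helper(group=NCCL_WORLD_COMM_GROUP, backend=Backend.NCCL)
size = _get_size_helper(group=NCCL_WORLD_COMM_GROUP, backend=Backend.NCCL)
print("process {} of {}".format(rank, size))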
Example #3
def check_parameter_available(func):
    # Enclosing decorator restored so the fragment is self-contained;
    # the name is assumed from the surrounding MindSpore module.
    def wrapper(*args, **kargs):
        if _is_role_pserver() or _is_role_sched():
            return func(*args, **kargs)
        if not GlobalComm.INITED:
            raise RuntimeError("Distributed Communication has not been inited")
        group = None
        if "group" in kargs.keys():
            group = kargs.get("group")
            if group is not None and not isinstance(group, str):
                raise TypeError("Group should be str or None, "
                                "but got group {}".format(type(group)))

        if "backend" in kargs.keys():
            backend = kargs.get("backend")
            if backend is Backend.HCCL and not is_hccl_available():
                raise RuntimeError(
                    "Distributed Communication doesn't have HCCL built in")
            if backend is Backend.NCCL and not is_nccl_available():
                raise RuntimeError(
                    "Distributed Communication doesn't have NCCL built in")

        if group is None:
            if backend is Backend.HCCL:
                group = HCCL_WORLD_COMM_GROUP
            elif backend is Backend.NCCL:
                group = NCCL_WORLD_COMM_GROUP
        return func(*args, **kargs)
    return wrapper
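To show where the wrapper fits, here is a hedged sketch of a public API function built from the pieces above; the name get_rank and its defaults are assumptions modeled on the other snippets:

@check_parameter_available
def get_rank(group=GlobalComm.WORLD_COMM_GROUP, backend=GlobalComm.BACKEND):
    # Hypothetical public entry point: validation happens in the decorator,
    # the actual query in the helper from Example #2.
    return _get_rank_helper(group=group, backend=backend)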
Example #4
def init(backend_name=None):
    """
    Initialize the distributed backend, e.g., HCCL or NCCL; this is required before using the communication service.

    Note:
        The full name of HCCL is Huawei Collective Communication Library.
        The full name of NCCL is NVIDIA Collective Communication Library.
        This method should be used after set_context.

    Args:
        backend_name (str): The backend, either "hccl" or "nccl". If not set, it is inferred from device_target. Default: None.

    Raises:
        TypeError: If `backend_name` is not a string.
        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails,
                      or the environment variables RANK_ID/MINDSPORE_HCCL_CONFIG_PATH
                      have not been exported when backend is HCCL.
        ValueError: If the environment variable RANK_ID has not been exported as a number.

    Examples:
        >>> from mindspore.context import set_context
        >>> set_context(device_target="Ascend")
        >>> init()
    """
    if _is_role_pserver() or _is_role_sched():
        return
    device_target = context.get_context("device_target")
    if backend_name is None:
        if device_target == "Ascend":
            backend_name = "hccl"
        elif device_target == "GPU":
            backend_name = "nccl"
        else:
            raise RuntimeError(
                "Device target {} is not supported in parallel initialization, "
                "please use Ascend or GPU.".format(device_target))
    if not isinstance(backend_name, str):
        raise TypeError("Backend name must be a string, but got {}".format(
            type(backend_name)))

    if backend_name == "hccl":
        if device_target != "Ascend":
            raise RuntimeError(
                "Device target should be 'Ascend' to init hccl, but got {}".
                format(device_target))
        _check_parallel_envs()
        init_hccl()
        GlobalComm.BACKEND = Backend("hccl")
        GlobalComm.WORLD_COMM_GROUP = HCCL_WORLD_COMM_GROUP
        GlobalComm.INITED = True
    elif backend_name == "nccl":
        init_gpu_collective()
        GlobalComm.BACKEND = Backend("nccl")
        GlobalComm.WORLD_COMM_GROUP = NCCL_WORLD_COMM_GROUP
        GlobalComm.INITED = True
    else:
        raise RuntimeError(
            "Backend name {} is not supported.".format(backend_name))
Example #5
def init(backend_name=None):
    """
    Initialize the distributed backend, e.g., HCCL or NCCL; this is required before using the communication service.

    Note:
        The full name of HCCL is Huawei Collective Communication Library.
        The full name of NCCL is NVIDIA Collective Communication Library.

    Args:
        backend_name (str): The backend, either "hccl" or "nccl". If None, it is inferred from device_target.

    Raises:
        TypeError: If `backend_name` is not a string.
        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
    """
    if _is_role_pserver() or _is_role_sched():
        return
    device_target = context.get_context("device_target")
    if backend_name is None:
        if device_target == "Ascend":
            backend_name = "hccl"
        elif device_target == "GPU":
            backend_name = "nccl"
        else:
            raise RuntimeError(
                "Device target {} is not supported.".format(device_target))
    if not isinstance(backend_name, str):
        raise TypeError("Backend name must be a string, but got {}".format(
            type(backend_name)))

    if backend_name == "hccl":
        if device_target != "Ascend":
            raise RuntimeError(
                "Device target should be 'Ascend' to init hccl, but got {}".
                format(device_target))
        _check_parallel_envs()
        init_hccl()
        GlobalComm.BACKEND = Backend("hccl")
        GlobalComm.WORLD_COMM_GROUP = HCCL_WORLD_COMM_GROUP
        GlobalComm.INITED = True
    elif backend_name == "nccl":
        init_gpu_collective()
        GlobalComm.BACKEND = Backend("nccl")
        GlobalComm.WORLD_COMM_GROUP = NCCL_WORLD_COMM_GROUP
        GlobalComm.INITED = True
    else:
        raise RuntimeError(
            "Backend name {} is not supported.".format(backend_name))
Example #6
def init(backend_name=None):
    """
    Initialize the distributed backend, e.g., hccl or nccl; it is required before the communication service can be used.

    Note:
        The full name of hccl is Huawei Collective Communication Library.
        The full name of nccl is NVIDIA Collective Communication Library.

    Args:
        backend_name (str): The backend, either "hccl" or "nccl". If None, it is inferred from device_target.

    Raises:
        TypeError: If `backend_name` is not a string.
        RuntimeError: If device target is invalid.
        RuntimeError: If backend is invalid or distributed init fails.
    """
    if _is_role_pserver() or _is_role_sched():
        return
    if backend_name is None:
        device_target = context.get_context("device_target")
        if device_target == "Ascend":
            backend_name = "hccl"
        elif device_target == "GPU":
            backend_name = "nccl"
        else:
            raise RuntimeError(
                "Device target {} is not supported.".format(device_target))
    if not isinstance(backend_name, str):
        raise TypeError("Backend name must be a string, but got {}".format(
            type(backend_name)))

    if backend_name == "hccl":
        init_hccl()
        GlobalComm.BACKEND = Backend("hccl")
        GlobalComm.WORLD_COMM_GROUP = HCCL_WORLD_COMM_GROUP
        GlobalComm.INITED = True  # mark init complete, matching the variants above
    elif backend_name == "nccl":
        init_gpu_collective()
        GlobalComm.BACKEND = Backend("nccl")
        GlobalComm.WORLD_COMM_GROUP = NCCL_WORLD_COMM_GROUP
        GlobalComm.INITED = True
    else:
        raise RuntimeError(
            "Backend name {} is not supported.".format(backend_name))