def _get_size_helper(group, backend):
    """
    Helper for get_rank_size.

    Args:
        group (str): The communication group.
        backend (str): The backend, like "hccl".

    Raises:
        ValueError: If the backend is invalid.

    Returns:
        Integer. The rank size of the specified group.
    """
    size = None
    if _is_role_pserver() or _is_role_sched():
        size = 1
        return size
    if backend == Backend.HCCL:
        if group == HCCL_WORLD_COMM_GROUP:
            size = hccl.get_rank_size()
        else:
            size = hccl.get_rank_size(group)
    elif backend == Backend.NCCL:
        size = mpi.get_rank_size(group)
    else:
        raise ValueError("Invalid backend: '{}'".format(backend))
    return size
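
# Hedged sketch (not in the original source): a public-facing wrapper that
# delegates to the helper above. The name, default, and wiring are
# illustrative assumptions; parameter validation is attached later via the
# check_parameter_available decorator defined further below.
def get_group_size(group=GlobalComm.WORLD_COMM_GROUP):
    """Return the number of ranks in `group`."""
    return _get_size_helper(group=group, backend=GlobalComm.BACKEND)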
def _get_rank_helper(group, backend):
    """
    Helper for get_rank_id.

    Args:
        group (str): The communication group.
        backend (str): The backend, like "hccl".

    Raises:
        ValueError: If the backend is invalid.

    Returns:
        Integer. The local rank id of the calling process.
    """
    rank_id = None
    if _is_role_pserver() or _is_role_sched():
        rank_id = 0
        return rank_id
    if backend == Backend.HCCL:
        if group == HCCL_WORLD_COMM_GROUP:
            rank_id = hccl.get_rank_id()
        else:
            rank_id = hccl.get_rank_id(group)
    elif backend == Backend.NCCL:
        rank_id = mpi.get_rank_id(group)
    else:
        raise ValueError("Invalid backend: '{}'".format(backend))
    return rank_id
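
# Hedged sketch mirroring get_group_size above: a public get_rank wrapper
# that delegates to _get_rank_helper. The name and default here are
# illustrative assumptions, not a verbatim copy of the library's API.
def get_rank(group=GlobalComm.WORLD_COMM_GROUP):
    """Return the rank id of the calling process within `group`."""
    return _get_rank_helper(group=group, backend=GlobalComm.BACKEND)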
def check_parameter_available(func):
    """
    Check whether distributed communication is available before calling the
    wrapped communication function; raise an error if it is not.
    """
    def wrapper(*args, **kargs):
        if _is_role_pserver() or _is_role_sched():
            return func(*args, **kargs)
        if not GlobalComm.INITED:
            raise RuntimeError("Distributed Communication has not been inited")
        group = None
        if "group" in kargs.keys():
            group = kargs.get("group")
            if group is not None and not isinstance(group, str):
                raise TypeError("Group should be str or None, "
                                "but got group {}".format(type(group)))
        # Default to the globally initialized backend so the group fallback
        # below cannot hit an undefined name when "backend" is not passed.
        backend = GlobalComm.BACKEND
        if "backend" in kargs.keys():
            backend = kargs.get("backend")
            if backend is Backend.HCCL and not is_hccl_available():
                raise RuntimeError(
                    "Distributed Communication doesn't have HCCL built in")
            if backend is Backend.NCCL and not is_nccl_available():
                raise RuntimeError(
                    "Distributed Communication doesn't have NCCL built in")
        if group is None:
            if backend is Backend.HCCL:
                group = HCCL_WORLD_COMM_GROUP
            elif backend is Backend.NCCL:
                group = NCCL_WORLD_COMM_GROUP
        return func(*args, **kargs)
    return wrapper
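
# Hedged sketch of intended use (an assumption, not the library's verbatim
# code): with the decorator now in scope, the wrappers sketched earlier can
# be guarded explicitly by re-binding them through check_parameter_available.
get_group_size = check_parameter_available(get_group_size)
get_rank = check_parameter_available(get_rank)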
def init(backend_name=None):
    """
    Initialize the distributed backend, e.g. HCCL/NCCL. It is required
    before using the communication service.

    Note:
        The full name of HCCL is Huawei Collective Communication Library.
        The full name of NCCL is NVIDIA Collective Communication Library.
        This method should be used after set_context.

    Args:
        backend_name (str): Backend, using HCCL/NCCL. If not set, it is
            inferred from the device target. Default: None.

    Raises:
        TypeError: If `backend_name` is not a string.
        RuntimeError: If the device target is invalid, the backend is
            invalid, distributed initialization fails, or the environment
            variables RANK_ID/MINDSPORE_HCCL_CONFIG_PATH have not been
            exported when the backend is HCCL.
        ValueError: If the environment variable RANK_ID has not been
            exported as a number.

    Examples:
        >>> from mindspore.context import set_context
        >>> set_context(device_target="Ascend")
        >>> init()
    """
    if _is_role_pserver() or _is_role_sched():
        return
    device_target = context.get_context("device_target")
    if backend_name is None:
        if device_target == "Ascend":
            backend_name = "hccl"
        elif device_target == "GPU":
            backend_name = "nccl"
        else:
            raise RuntimeError(
                "Device target {} is not supported in parallel initialization, "
                "please use Ascend or GPU.".format(device_target))
    if not isinstance(backend_name, str):
        raise TypeError("Backend name must be a string, but got {}".format(
            type(backend_name)))

    if backend_name == "hccl":
        if device_target != "Ascend":
            raise RuntimeError(
                "Device target should be 'Ascend' to init hccl, but got {}"
                .format(device_target))
        _check_parallel_envs()
        init_hccl()
        GlobalComm.BACKEND = Backend("hccl")
        GlobalComm.WORLD_COMM_GROUP = HCCL_WORLD_COMM_GROUP
        GlobalComm.INITED = True
    elif backend_name == "nccl":
        init_gpu_collective()
        GlobalComm.BACKEND = Backend("nccl")
        GlobalComm.WORLD_COMM_GROUP = NCCL_WORLD_COMM_GROUP
        GlobalComm.INITED = True
    else:
        raise RuntimeError(
            "Backend name {} is not supported.".format(backend_name))
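
# Hedged usage sketch: typical initialization on an Ascend device, relying
# on the wrappers sketched earlier in this file. Guarded so that nothing
# runs at import time; the required environment variables (e.g. RANK_ID,
# MINDSPORE_HCCL_CONFIG_PATH) are assumed to be exported.
if __name__ == "__main__":
    from mindspore import context
    context.set_context(device_target="Ascend")
    init()  # backend_name is inferred as "hccl" from the device target
    print("world size:", get_group_size())
    print("my rank:", get_rank())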