Exemplo n.º 1
0
def get_closest_net_devices(gpu_dev):
    """
    Get the names of the closest net devices to `gpu_dev`

    The result is a comma-separated list built from the first "openfabrics"
    entry (suffixed with ``:1``) followed by the first "network" entry that
    ``TopologicalDistance`` reports for the GPU. A part is omitted when the
    corresponding lookup returns nothing.

    Parameters
    ----------
    gpu_dev : int or str
        GPU device index; it is converted with ``int()`` before the lookup

    Returns
    -------
    dev_names : str
        Comma-separated names of the closest net devices, or an empty
        string if neither lookup returned a device

    Examples
    --------
    The actual names depend on the host hardware.

    >>> get_closest_net_devices(0)  # doctest: +SKIP
    'mlx5_0:1,ib0'
    """
    # Function-scope import: ucp is only needed once this lookup is called.
    from ucp._libs.topological_distance import TopologicalDistance

    dev = int(gpu_dev)
    td = TopologicalDistance()

    names = []
    ibs = td.get_cuda_distances_from_device_index(dev, "openfabrics")
    if len(ibs) > 0:
        names.append(ibs[0]["name"] + ":1")
    ifnames = td.get_cuda_distances_from_device_index(dev, "network")
    if len(ifnames) > 0:
        names.append(ifnames[0]["name"])

    # Joining the parts (rather than appending "," eagerly) avoids a dangling
    # trailing comma when an openfabrics device exists but no network
    # interface was found.
    return ",".join(names)
Exemplo n.º 2
0
def get_ucx_net_devices(cuda_device_index, ucx_net_devices):
    """
    Resolve the UCX net devices setting for a CUDA device.

    Parameters
    ----------
    cuda_device_index : int, str or None
        Index of the CUDA device. When not None it is converted with
        ``int()``. It is required when `ucx_net_devices` is callable or
        ``"auto"``.
    ucx_net_devices : callable, str or None
        - callable: called with the integer device index; its result is
          returned as-is.
        - ``"auto"``: the device list is built from the first "openfabrics"
          entry (suffixed with ``:1``) and the first "network" entry that
          ``TopologicalDistance`` reports for the device.
        - any other string: returned unchanged.
        - anything else: ignored, ``None`` is returned.

    Returns
    -------
    net_dev : str or None
        The resolved device specification (or whatever the callable
        returned), ``None`` when `ucx_net_devices` is neither callable nor a
        string.

    Raises
    ------
    ValueError
        If `cuda_device_index` is None while `ucx_net_devices` is callable
        or ``"auto"``.
    """
    if cuda_device_index is None:
        if callable(ucx_net_devices) or ucx_net_devices == "auto":
            raise ValueError(
                "A CUDA device index must be specified if the "
                "ucx_net_devices variable is either callable or 'auto'")
        dev = None
    else:
        # Converted once up front; an invalid index fails here regardless
        # of which branch below would have been taken.
        dev = int(cuda_device_index)

    if callable(ucx_net_devices):
        return ucx_net_devices(dev)
    if not isinstance(ucx_net_devices, str):
        return None
    if ucx_net_devices != "auto":
        return ucx_net_devices

    # "auto": set the UCX net device explicitly from what
    # TopologicalDistance reports. Imported here so ucp is only required
    # for this mode.
    from ucp._libs.topological_distance import TopologicalDistance

    td = TopologicalDistance()
    names = []
    ibs = td.get_cuda_distances_from_device_index(dev, "openfabrics")
    if len(ibs) > 0:
        names.append(ibs[0]["name"] + ":1")
    ifnames = td.get_cuda_distances_from_device_index(dev, "network")
    if len(ifnames) > 0:
        names.append(ifnames[0]["name"])

    # Joining avoids a dangling trailing comma when only an openfabrics
    # device is found.
    return ",".join(names)
Exemplo n.º 3
0
def test_topological_distance_dgx():
    """Check TopologicalDistance results against known DGX-1/DGX-2 layouts.

    The test is skipped unless ``/etc/dgx-release`` exists and its
    ``DGX_PLATFORM`` line identifies a DGX-1 or DGX-2 server. For every GPU
    index reported by NVML, the first "network" and "openfabrics" entries
    returned by ``TopologicalDistance`` must match the expected device names.
    """
    if not os.path.isfile("/etc/dgx-release"):
        pytest.skip("This test can only be executed on an NVIDIA DGX Server")

    dgx_server = None
    # Context manager so the release file is closed deterministically
    # instead of being left to the garbage collector.
    with open("/etc/dgx-release") as release_file:
        for line in release_file:
            if line.startswith("DGX_PLATFORM"):
                if "DGX Server for DGX-1" in line:
                    dgx_server = 1
                elif "DGX Server for DGX-2" in line:
                    dgx_server = 2
                # Only the first DGX_PLATFORM line is considered.
                break

    pynvml.nvmlInit()
    dev_count = pynvml.nvmlDeviceGetCount()

    # Expected network interface per GPU: each pair of GPUs shares one "ibN".
    dgx_network = ["ib" + str(i // 2) for i in range(dev_count)]
    # Expected openfabrics device per GPU index, by platform.
    if dgx_server == 1:
        dgx_openfabrics = [
            "mlx5_0",
            "mlx5_0",
            "mlx5_1",
            "mlx5_1",
            "mlx5_2",
            "mlx5_2",
            "mlx5_3",
            "mlx5_3",
        ]
    elif dgx_server == 2:
        dgx_openfabrics = [
            "mlx5_0",
            "mlx5_0",
            "mlx5_1",
            "mlx5_1",
            "mlx5_2",
            "mlx5_2",
            "mlx5_3",
            "mlx5_3",
            "mlx5_6",
            "mlx5_6",
            "mlx5_7",
            "mlx5_7",
            "mlx5_8",
            "mlx5_8",
            "mlx5_9",
            "mlx5_9",
        ]
    else:
        pytest.skip("DGX Server not recognized or not supported")

    td = TopologicalDistance()

    for i in range(dev_count):
        closest_network = td.get_cuda_distances_from_device_index(i, "network")
        closest_openfabrics = td.get_cuda_distances_from_device_index(
            i, "openfabrics")

        assert dgx_network[i] == closest_network[0]["name"]
        assert dgx_openfabrics[i] == closest_openfabrics[0]["name"]
def get_environment_variables(cuda_device_index):
    """Build a copy of the process environment pinned to one CUDA device.

    Parameters
    ----------
    cuda_device_index : int or str
        Device index written (as a string) to ``CUDA_VISIBLE_DEVICES``. It is
        also passed unchanged to ``TopologicalDistance`` when the openfabrics
        lookup is performed.

    Returns
    -------
    env : dict
        Copy of ``os.environ`` with ``CUDA_VISIBLE_DEVICES`` set. If
        ``UCX_TLS`` is present and contains the substring ``"rc"``,
        ``UCX_NET_DEVICES`` is set to the first openfabrics device reported
        for the GPU, suffixed with ``:1``. ``os.environ`` itself is not
        modified.
    """
    env = os.environ.copy()

    env["CUDA_VISIBLE_DEVICES"] = str(cuda_device_index)

    tls = env.get("UCX_TLS")
    if tls is not None and "rc" in tls:
        td = TopologicalDistance()
        closest_openfabrics = td.get_cuda_distances_from_device_index(
            cuda_device_index, "openfabrics")
        # Guard the empty case: indexing [0] unconditionally raised
        # IndexError when no openfabrics device was reported. In that case
        # UCX_NET_DEVICES is left as inherited from the parent environment.
        if len(closest_openfabrics) > 0:
            env["UCX_NET_DEVICES"] = closest_openfabrics[0]["name"] + ":1"

    return env
Exemplo n.º 5
0
def _ucx_net_devices(dev, ucx_net_devices):
    """
    Resolve the UCX net devices setting for device `dev`.

    Parameters
    ----------
    dev : int or str
        Device index; always converted with ``int()`` first, so a value that
        cannot be converted raises before any branch is taken.
    ucx_net_devices : callable, str or None
        - callable: called with the integer device index; its result is
          returned as-is.
        - ``"auto"``: the device list is built from the first "openfabrics"
          entry (suffixed with ``:1``) and the first "network" entry that
          ``TopologicalDistance`` reports for the device.
        - any other string: returned unchanged.
        - anything else: ignored, ``None`` is returned.

    Returns
    -------
    net_dev : str or None
        The resolved device specification (or whatever the callable
        returned), ``None`` when `ucx_net_devices` is neither callable nor a
        string.
    """
    dev = int(dev)

    if callable(ucx_net_devices):
        return ucx_net_devices(dev)
    if not isinstance(ucx_net_devices, str):
        return None
    if ucx_net_devices != "auto":
        return ucx_net_devices

    # "auto": set the UCX net device explicitly from what
    # TopologicalDistance reports. Imported here so ucp is only required
    # for this mode.
    from ucp._libs.topological_distance import TopologicalDistance

    td = TopologicalDistance()
    names = []
    ibs = td.get_cuda_distances_from_device_index(dev, "openfabrics")
    if len(ibs) > 0:
        names.append(ibs[0]["name"] + ":1")
    ifnames = td.get_cuda_distances_from_device_index(dev, "network")
    if len(ifnames) > 0:
        names.append(ifnames[0]["name"])

    # Joining avoids a dangling trailing comma when only an openfabrics
    # device is found.
    return ",".join(names)