async def test_local_cuda_cluster():
    """LocalCUDACluster starts one worker per GPU, rotates CUDA_VISIBLE_DEVICES
    per worker, uses the full host memory, and refuses to over-scale."""
    async with LocalCUDACluster(asynchronous=True) as cluster:
        async with Client(cluster, asynchronous=True) as client:
            assert len(cluster.workers) == utils.get_n_gpus()

            # CUDA_VISIBLE_DEVICES cycles properly
            def get_visible_devices():
                return os.environ["CUDA_VISIBLE_DEVICES"]

            result = await client.run(get_visible_devices)

            assert all(
                len(v.split(",")) == utils.get_n_gpus() for v in result.values()
            )
            for i in range(utils.get_n_gpus()):
                assert {int(v.split(",")[i]) for v in result.values()} == set(
                    range(utils.get_n_gpus())
                )

            # Use full memory
            assert (
                sum(w.memory_limit for w in cluster.workers.values()) == TOTAL_MEMORY
            )

            for w, devices in result.items():
                # BUG FIX: `devices` is a comma-separated string such as
                # "3,0,1,2"; indexing [0] only grabs the first *character*,
                # which is wrong for device ids >= 10.  Split first (same
                # idiom as the sibling test variant).
                ident = devices.split(",")[0]
                assert int(ident) == cluster.scheduler.workers[w].name

            with pytest.raises(ValueError):
                cluster.scale(1000)
async def test_local_cuda_cluster():
    """Check that LocalCUDACluster spins up one worker per GPU, rotates
    CUDA_VISIBLE_DEVICES across workers, claims (approximately) the full
    host memory, and raises when scaled beyond the GPU count."""
    async with LocalCUDACluster(
        scheduler_port=0, asynchronous=True, device_memory_limit=1
    ) as cluster:
        async with Client(cluster, asynchronous=True) as client:
            n_gpus = utils.get_n_gpus()
            assert len(cluster.workers) == n_gpus

            # Every worker should see all GPUs, each in a rotated order.
            def get_visible_devices():
                return os.environ["CUDA_VISIBLE_DEVICES"]

            visible = await client.run(get_visible_devices)

            for value in visible.values():
                assert len(value.split(",")) == n_gpus
            for position in range(n_gpus):
                seen = {int(v.split(",")[position]) for v in visible.values()}
                assert seen == set(range(n_gpus))

            # Use full memory, checked with some buffer to ignore rounding
            # differences.
            full_mem = sum(w.memory_limit for w in cluster.workers.values())
            assert MEMORY_LIMIT - 1024 <= full_mem < MEMORY_LIMIT + 1024

            # The first visible device identifies the worker.
            for addr, devices in visible.items():
                first = devices.split(",")[0]
                assert int(first) == cluster.scheduler.workers[addr].name

            with pytest.raises(ValueError):
                cluster.scale(1000)
def test_get_n_gpus():
    """get_n_gpus returns an int and honours CUDA_VISIBLE_DEVICES."""
    assert isinstance(get_n_gpus(), int)

    # BUG FIX: the original unconditionally deleted CUDA_VISIBLE_DEVICES in
    # the finally clause, clobbering any value that was set before the test
    # ran.  Save and restore it instead.
    original = os.environ.get("CUDA_VISIBLE_DEVICES")
    try:
        os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2"
        assert get_n_gpus() == 3
    finally:
        if original is None:
            del os.environ["CUDA_VISIBLE_DEVICES"]
        else:
            os.environ["CUDA_VISIBLE_DEVICES"] = original
def test_cpu_affinity_and_cuda_visible_devices():
    """CPU affinity for a device must be identical whether the device is
    resolved through CUDA_VISIBLE_DEVICES or addressed by its bare index."""
    expected = {}
    for idx in range(get_n_gpus()):
        # The negative here would be `device = 0` as required for CUDA
        # runtime calls.
        dev = nvml_device_index(0, cuda_visible_devices(idx))
        expected[dev] = get_cpu_affinity(dev)

    for idx in range(get_n_gpus()):
        assert get_cpu_affinity(idx) == expected[idx]
def test_get_device_total_memory():
    """get_device_total_memory must agree with the CUDA context's own report
    for every visible GPU."""
    for dev in range(get_n_gpus()):
        with cuda.gpus[dev]:
            # get_memory_info() returns (free, total); compare totals.
            context_total = cuda.current_context().get_memory_info()[1]
            assert get_device_total_memory(dev) == context_total
def test_get_ucx_net_devices_auto():
    """Smoke-test "auto" net-device detection for every GPU index."""
    pytest.importorskip("ucp")
    # Since the actual device is system-dependent, we just check that this
    # call doesn't fail.  If any InfiniBand devices are available it returns
    # one, otherwise an empty string.
    for device_index in range(get_n_gpus()):
        get_ucx_net_devices(device_index, "auto")
def setup_rmm_pool(request, pytestconfig):
    """When --use-rmm-pool is passed, initialise a 1 GiB RMM memory pool on
    every visible GPU.  Requires the RMM package."""
    if not pytestconfig.getoption('--use-rmm-pool'):
        return

    if not has_rmm():
        raise ImportError('The --use-rmm-pool option requires the RMM package')

    import rmm
    from dask_cuda.utils import get_n_gpus

    one_gib = 1024 * 1024 * 1024
    rmm.reinitialize(
        pool_allocator=True,
        initial_pool_size=one_gib,
        devices=list(range(get_n_gpus())),
    )
async def test_local_cuda_cluster():
    """LocalCUDACluster starts one worker per GPU, cycles
    CUDA_VISIBLE_DEVICES across workers, and uses the full host memory."""
    async with LocalCUDACluster(
        scheduler_port=0, asynchronous=True, diagnostics_port=None
    ) as cluster:
        async with Client(cluster, asynchronous=True) as client:
            assert len(cluster.workers) == utils.get_n_gpus()

            # CUDA_VISIBLE_DEVICES cycles properly
            def get_visible_devices():
                return os.environ["CUDA_VISIBLE_DEVICES"]

            result = await client.run(get_visible_devices)

            assert all(len(v.split(",")) == utils.get_n_gpus() for v in result.values())
            for i in range(utils.get_n_gpus()):
                assert {int(v.split(",")[i]) for v in result.values()} == set(
                    range(utils.get_n_gpus())
                )

            # Use full memory
            # BUG FIX: `cluster.workers` is a mapping; iterating it directly
            # yields keys, which have no `memory_limit`.  Sum over the
            # values, as the sibling test variants do.
            assert sum(w.memory_limit for w in cluster.workers.values()) == TOTAL_MEMORY
def test_dashboard_address(loop):  # noqa: F811
    """dask-cuda-worker must honour --dashboard-address on every worker."""
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    scheduler_cmd = ["dask-scheduler", "--port", "9369", "--no-dashboard"]
    worker_cmd = [
        "dask-cuda-worker",
        "127.0.0.1:9369",
        "--dashboard-address",
        "127.0.0.1:9370",
    ]
    with popen(scheduler_cmd):
        with popen(worker_cmd):
            with Client("127.0.0.1:9369", loop=loop) as client:
                assert wait_workers(client, n_gpus=get_n_gpus())

                addresses = client.run(
                    lambda dask_worker: dask_worker._dashboard_address
                )
                for address in addresses.values():
                    assert address == "127.0.0.1:9370"
def test_rmm_managed(loop):  # noqa: F811
    """Workers started with --rmm-managed-memory must install
    ManagedMemoryResource as the current device resource."""
    rmm = pytest.importorskip("rmm")
    with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]):
        worker_cmd = [
            "dask-cuda-worker",
            "127.0.0.1:9369",
            "--host",
            "127.0.0.1",
            "--rmm-managed-memory",
            "--no-dashboard",
        ]
        with popen(worker_cmd):
            with Client("127.0.0.1:9369", loop=loop) as client:
                assert wait_workers(client, n_gpus=get_n_gpus())

                resource_types = client.run(
                    rmm.mr.get_current_device_resource_type
                )
                for resource_type in resource_types.values():
                    assert resource_type is rmm.mr.ManagedMemoryResource
def test_rmm_logging(loop):  # noqa: F811
    """A worker started with --rmm-log-directory must wrap its pool in a
    LoggingResourceAdaptor."""
    rmm = pytest.importorskip("rmm")
    with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]):
        worker_cmd = [
            "dask-cuda-worker",
            "127.0.0.1:9369",
            "--host",
            "127.0.0.1",
            "--rmm-pool-size",
            "2 GB",
            "--rmm-log-directory",
            ".",
            "--no-dashboard",
        ]
        with popen(worker_cmd):
            with Client("127.0.0.1:9369", loop=loop) as client:
                assert wait_workers(client, n_gpus=get_n_gpus())

                resource_types = client.run(
                    rmm.mr.get_current_device_resource_type
                )
                for resource_type in resource_types.values():
                    assert resource_type is rmm.mr.LoggingResourceAdaptor
def initialize_cluster(use_gpu=True, n_cpu=None, n_gpu=-1):
    """Create a Dask cluster and return a connected Client.

    Parameters
    ----------
    use_gpu : bool
        When True, start a UCX-enabled LocalCUDACluster; otherwise a plain
        CPU LocalCluster.
    n_cpu : int or None
        Number of CPU workers (CPU path only).
    n_gpu : int
        Number of GPUs to use; -1 means all visible GPUs.

    Returns
    -------
    distributed.Client connected to the new cluster.
    """
    enable_tcp_over_ucx = True
    enable_nvlink = True
    enable_infiniband = True
    # NOTE(review): "dash" looks like a typo for "dask" — confirm before
    # changing the log text.
    logger.info('Starting dash cluster...')
    if use_gpu:
        initialize.initialize(
            create_cuda_context=True,
            enable_tcp_over_ucx=enable_tcp_over_ucx,
            enable_nvlink=enable_nvlink,
            enable_infiniband=enable_infiniband,
        )

        if n_gpu == -1:
            n_gpu = get_n_gpus()

        device_list = cuda_visible_devices(1, range(n_gpu)).split(',')
        CUDA_VISIBLE_DEVICES = []
        for device in device_list:
            try:
                CUDA_VISIBLE_DEVICES.append(int(device))
            except ValueError as vex:
                # BUG FIX: logger.warn is deprecated; use logger.warning.
                logger.warning(vex)

        logger.info('Using GPUs {} ...'.format(CUDA_VISIBLE_DEVICES))

        cluster = LocalCUDACluster(
            protocol="ucx",
            dashboard_address=':8787',
            CUDA_VISIBLE_DEVICES=CUDA_VISIBLE_DEVICES,
            enable_tcp_over_ucx=enable_tcp_over_ucx,
            enable_nvlink=enable_nvlink,
            enable_infiniband=enable_infiniband,
        )
        client = Client(cluster)
        # BUG FIX: resetting the CuPy allocator only makes sense on GPU
        # workers; the original ran it on the CPU-only cluster too, where
        # cupy may not even be importable on the workers.
        client.run(cupy.cuda.set_allocator)
    else:
        logger.info('Using {} CPUs ...'.format(n_cpu))
        cluster = LocalCluster(
            dashboard_address=':8787',
            n_workers=n_cpu,
            threads_per_worker=4,
        )
        client = Client(cluster)
    return client
def test_get_device_total_memory():
    """get_device_total_memory returns a positive int for every GPU."""
    for i in range(get_n_gpus()):
        with cuda.gpus[i]:
            total_mem = get_device_total_memory(i)
            # IDIOM FIX: use isinstance rather than `type(x) is int`.
            assert isinstance(total_mem, int)
            assert total_mem > 0
def test_cpu_affinity():
    """The affinity reported for each GPU can actually be applied to (and
    read back from) the current process."""
    for device in range(get_n_gpus()):
        expected = set(get_cpu_affinity(device))
        os.sched_setaffinity(0, expected)
        assert os.sched_getaffinity(0) == expected
def test_get_n_gpus():
    """get_n_gpus returns an int and reflects CUDA_VISIBLE_DEVICES."""
    assert isinstance(get_n_gpus(), int)
    # BUG FIX: the hard-coded `get_n_gpus() == 3` assumed exactly three GPUs
    # on the test machine.  Pin CUDA_VISIBLE_DEVICES so the expectation is
    # deterministic, restoring any previous value afterwards (matches the
    # sibling test variant).
    original = os.environ.get("CUDA_VISIBLE_DEVICES")
    try:
        os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2"
        assert get_n_gpus() == 3
    finally:
        if original is None:
            os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        else:
            os.environ["CUDA_VISIBLE_DEVICES"] = original