async def test_scale_up_down(cleanup):
    """Scale a two-worker cluster down to one worker and then to zero.

    The cluster is declared with a ``SlowWorker`` (``delay`` of 5) named
    ``"slow"`` and a plain ``Worker`` named ``"fast"``.  After ``scale(1)``
    only ``"fast"`` must remain in ``worker_spec``; after ``scale(0)`` the
    spec must be empty.
    """
    async with SpecCluster(
        scheduler=scheduler,
        workers={
            "slow": {"cls": SlowWorker, "options": {"delay": 5}},
            "fast": {"cls": Worker, "options": {}},
        },
        asynchronous=True,
    ) as cluster:
        cluster.scale(1)  # remove a worker, hopefully the one we don't have
        await cluster

        assert list(cluster.worker_spec) == ["fast"]

        cluster.scale(0)
        await cluster
        assert not cluster.worker_spec
async def test_run_spec_cluster_worker_names(cleanup):
    """Check worker naming for ``SpecCluster`` and for a subclass override.

    A plain ``SpecCluster`` scaled to two workers is expected to key them
    ``0`` and ``1``.  ``MyCluster`` overrides ``_new_worker_name`` and is
    expected to key its workers with the formatted prefix/suffix strings.
    """
    worker = {"cls": Worker, "options": {"nthreads": 1}}

    class MyCluster(SpecCluster):
        def _new_worker_name(self, worker_number):
            return f"prefix-{self.name}-{worker_number}-suffix"

    # Default naming scheme.
    async with SpecCluster(
        asynchronous=True, scheduler=scheduler, worker=worker
    ) as cluster:
        cluster.scale(2)
        await cluster
        expected_names = [0, 1]
        assert list(cluster.worker_spec) == expected_names
        assert sorted(cluster.workers) == expected_names

    # Naming scheme supplied by the subclass.
    async with MyCluster(
        asynchronous=True, scheduler=scheduler, worker=worker, name="test-name"
    ) as cluster:
        expected_names = [
            "prefix-test-name-0-suffix",
            "prefix-test-name-1-suffix",
        ]
        cluster.scale(2)
        await cluster
        assert list(cluster.worker_spec) == expected_names
        assert sorted(cluster.workers) == expected_names
async def test_unexpected_closed_worker(cleanup):
    """Close one worker directly and check that awaiting the cluster restores it.

    Runs with ``distributed.deploy.lost-worker-timeout`` set to ``"10ms"``.
    After the worker is closed, ``cluster.workers`` must drop to one entry
    within two seconds while ``worker_spec`` still lists two; awaiting the
    cluster must bring the worker count back to two.
    """
    worker = {"cls": Worker, "options": {"nthreads": 1}}
    lost_worker_config = {"distributed.deploy.lost-worker-timeout": "10ms"}

    with dask.config.set(lost_worker_config):
        async with SpecCluster(
            asynchronous=True, scheduler=scheduler, worker=worker
        ) as cluster:
            assert not cluster.workers
            assert not cluster.worker_spec

            # Scale up
            cluster.scale(2)
            assert not cluster.workers
            assert cluster.worker_spec

            await cluster
            assert len(cluster.workers) == 2

            # Close one
            first_worker = next(iter(cluster.workers.values()))
            await first_worker.close()

            deadline = time() + 2
            while len(cluster.workers) > 1:  # wait for messages to flow around
                await asyncio.sleep(0.01)
                assert time() < deadline
            assert len(cluster.workers) == 1
            assert len(cluster.worker_spec) == 2

            await cluster
            assert len(cluster.workers) == 2
async def test_startup(cleanup):
    """Entering the cluster must not wait for the ``SlowWorker`` with ``delay`` 5.

    Three workers are declared (a ``Worker`` and two ``SlowWorker`` instances
    with ``delay`` 5 and 0).  Once the context is entered all three must be
    present in ``cluster.workers``, less than five seconds must have passed,
    and the scheduler must report at most two workers.
    """
    start = time()
    spec = {
        0: {"cls": Worker, "options": {}},
        1: {"cls": SlowWorker, "options": {"delay": 5}},
        2: {"cls": SlowWorker, "options": {"delay": 0}},
    }

    async with SpecCluster(
        scheduler=scheduler, workers=spec, asynchronous=True
    ) as cluster:
        assert len(cluster.workers) == len(cluster.worker_spec) == 3
        assert time() < start + 5
        assert 0 <= len(cluster.scheduler_info["workers"]) <= 2

        async with Client(cluster, asynchronous=True) as client:
            await client.wait_for_workers(n_workers=2)
async def test_adaptive_killed_worker(cleanup):
    """Kill the only worker of an adaptive cluster while a task is pending.

    The cluster adapts between a minimum and maximum of one ``Nanny`` worker.
    A ``sleep`` task is submitted, the worker is killed, and the task result
    must still arrive within the five second timeout.
    """
    nanny_spec = {"cls": Nanny, "options": {"nthreads": 1}}
    scheduler_spec = {"cls": Scheduler, "options": {"port": 0}}

    with dask.config.set({"distributed.deploy.lost-worker-timeout": 0.1}):
        async with SpecCluster(
            asynchronous=True, worker=nanny_spec, scheduler=scheduler_spec
        ) as cluster:
            async with Client(cluster, asynchronous=True) as client:
                cluster.adapt(minimum=1, maximum=1)

                # Scale up a cluster with 1 worker.
                while len(cluster.workers) != 1:
                    await asyncio.sleep(0.01)

                future = client.submit(sleep, 0.1)

                # Kill the only worker.
                (worker_id,) = cluster.workers
                await cluster.workers[worker_id].kill()

                # Wait for the worker to re-spawn and finish sleeping.
                await future.result(timeout=5)
async def test_logs(cleanup):
    """Exercise ``cluster.logs()`` with each scheduler/workers selection.

    Checks the HTML repr is valid XML, that the scheduler and every worker
    known to the scheduler appear in the full logs, and that the
    ``scheduler=`` / ``workers=`` arguments restrict the returned keys.
    """
    worker = {"cls": Worker, "options": {"nthreads": 1}}

    async with SpecCluster(
        asynchronous=True, scheduler=scheduler, worker=worker
    ) as cluster:
        cluster.scale(2)
        await cluster

        # Everything: scheduler plus all workers.
        logs = await cluster.logs()
        assert is_valid_xml("<div>" + logs._repr_html_() + "</div>")
        assert "Scheduler" in logs
        for address in cluster.scheduler.workers:
            assert address in logs

        assert "Registered" in str(logs)

        # Scheduler only.
        logs = await cluster.logs(scheduler=True, workers=False)
        assert list(logs) == ["Scheduler"]

        # Nothing selected.
        logs = await cluster.logs(scheduler=False, workers=False)
        assert list(logs) == []

        # Workers only.
        logs = await cluster.logs(scheduler=False, workers=True)
        assert set(logs) == set(cluster.scheduler.workers)

        # A single, explicitly named worker.
        chosen = toolz.first(cluster.scheduler.workers)
        logs = await cluster.logs(scheduler=False, workers=[chosen])
        assert set(logs) == {chosen}
def test_spec_close_clusters(loop):
    """``close_clusters()`` must close a cluster listed in ``SpecCluster._instances``."""
    # Local scheduler spec on port 0; it shadows any module-level ``scheduler``.
    scheduler = {"cls": Scheduler, "options": {"port": 0}}
    workers = {0: {"cls": Worker}}

    cluster = SpecCluster(workers=workers, scheduler=scheduler, loop=loop)
    assert cluster in SpecCluster._instances

    close_clusters()
    assert cluster.status == "closed"
async def test_scale(cleanup):
    """Scale a cluster up, down, and up again, checking the worker counts.

    ``scale`` only updates ``worker_spec``; ``cluster.workers`` is asserted to
    change only after the cluster has been awaited.
    """
    worker = {"cls": Worker, "options": {"nthreads": 1}}

    async with SpecCluster(
        asynchronous=True, scheduler=scheduler, worker=worker
    ) as cluster:
        # Nothing requested yet.
        assert not cluster.workers
        assert not cluster.worker_spec

        # Scale up
        cluster.scale(2)
        assert not cluster.workers
        assert cluster.worker_spec

        await cluster
        assert len(cluster.workers) == 2

        # Scale down
        cluster.scale(1)
        assert len(cluster.workers) == 2

        await cluster
        assert len(cluster.workers) == 1

        # Can use with await
        await cluster.scale(2)
        await cluster
        assert len(cluster.workers) == 2
async def test_adaptive(cleanup):
    """Adaptive scaling must grow under load and shrink back to ``"fast"``.

    The cluster starts with one plain ``Worker`` named ``"fast"``; additional
    workers come from the ``SlowWorker`` template (``delay`` of 5).  The test
    waits for ``worker_spec`` to grow beyond one entry while 200 ``slowinc``
    tasks are pending, drops the futures, waits for the spec to shrink again,
    and then requires ``"fast"`` to be the only remaining worker.
    """
    async with SpecCluster(
        scheduler=scheduler,
        workers={"fast": {"cls": Worker, "options": {}}},
        worker={"cls": SlowWorker, "options": {"delay": 5}},
        asynchronous=True,
    ) as cluster:
        cluster.adapt(minimum=1, maximum=4, target_duration="1s", interval="20ms")
        async with Client(cluster, asynchronous=True) as client:
            futures = client.map(slowinc, range(200), delay=0.1)

            # Wait for the adaptive policy to request more workers.
            while len(cluster.worker_spec) <= 1:
                await asyncio.sleep(0.05)

            # Release the work so the cluster can scale back down.
            del futures
            while len(cluster.worker_spec) > 1:
                await asyncio.sleep(0.05)

            assert list(cluster.worker_spec) == ["fast"]
async def test_bad_close(cleanup):
    """Closing a cluster that was constructed but never awaited emits no warnings."""
    with warnings.catch_warnings(record=True) as caught:
        cluster = SpecCluster(
            workers=worker_spec,
            scheduler=scheduler,
            asynchronous=True,
        )
        await cluster.close()

    assert not caught
async def test_nanny_port():
    """Start and stop a cluster whose single ``Nanny`` is given ``port=9200``."""
    nanny_spec = {"cls": Nanny, "options": {"port": 9200}}

    async with SpecCluster(
        scheduler={"cls": Scheduler},
        workers={0: nanny_spec},
        asynchronous=True,
    ) as cluster:
        # Entering and leaving the context is the whole test.
        pass
def test_loop_started():
    """Start a synchronous ``SpecCluster`` without passing an event loop.

    The cluster is used as a context manager so that it is closed when the
    test finishes rather than being left open.
    """
    with SpecCluster(
        worker_spec, scheduler={"cls": Scheduler, "options": {"port": 0}}
    ):
        pass
async def test_broken_worker():
    """Starting a cluster containing a ``BrokenWorker`` must raise.

    The raised exception's text is required to contain ``"Broken"``.
    """
    spec = {"good": {"cls": Worker}, "bad": {"cls": BrokenWorker}}

    with pytest.raises(Exception) as info:
        async with SpecCluster(asynchronous=True, workers=spec) as cluster:
            pass

    assert "Broken" in str(info.value)
async def test_dashboard_link(cleanup):
    """``dashboard_link`` must contain the port given as ``dashboard_address``."""
    scheduler_spec = {
        "cls": Scheduler,
        "options": {"port": 0, "dashboard_address": ":12345"},
    }

    async with SpecCluster(
        workers=worker_spec, scheduler=scheduler_spec, asynchronous=True
    ) as cluster:
        assert "12345" in cluster.dashboard_link
async def test_MultiWorker(cleanup):
    """Scale a cluster whose worker spec starts two workers per entry.

    Each spec entry uses ``MultiWorker`` with ``n=2`` and a two-element
    ``"group"``; the assertions expect the scheduler to see twice as many
    workers as there are spec entries.  Covers scaling by count, by memory
    and by cores, the text and HTML reprs, and adaptive scaling down to zero
    and back up to one spec entry.
    """
    multi_spec = {
        "cls": MultiWorker,
        "options": {"n": 2, "nthreads": 4, "memory_limit": "4 GB"},
        "group": ["-0", "-1"],
    }

    async with SpecCluster(
        scheduler=scheduler, worker=multi_spec, asynchronous=True
    ) as cluster:
        sched = cluster.scheduler
        async with Client(cluster, asynchronous=True) as client:
            # Two spec entries -> four scheduler workers.
            cluster.scale(2)
            await cluster
            assert len(cluster.worker_spec) == 2
            await client.wait_for_workers(4)
            while len(cluster.scheduler_info["workers"]) < 4:
                await asyncio.sleep(0.01)

            # Poll until the text repr reports four workers.
            while "workers=4" not in repr(cluster):
                await asyncio.sleep(0.1)

            html = cluster._repr_html_()
            workers_line = re.search("(Workers.+)", html).group(1)
            assert re.match("Workers.*4", workers_line)

            # One spec entry -> two scheduler workers.
            cluster.scale(1)
            await cluster
            assert len(sched.workers) == 2

            # Scale by memory.
            cluster.scale(memory="6GB")
            await cluster
            assert len(cluster.worker_spec) == 2
            assert len(sched.workers) == 4
            assert cluster.plan == {ws.name for ws in sched.workers.values()}

            # Scale by cores.
            cluster.scale(cores=10)
            await cluster
            assert len(cluster.workers) == 3

            adapt = cluster.adapt(minimum=0, maximum=4)

            for _ in range(adapt.wait_count):  # relax down to 0 workers
                await adapt.adapt()
            await cluster
            assert not sched.workers

            # Submitting work must bring one spec entry back.
            future = client.submit(lambda x: x + 1, 10)
            await future
            assert len(cluster.workers) == 1
async def test_ProcessInterfaceValid(cleanup):
    """A bare ``ProcessInterface`` is accepted as a worker class and can be scaled."""
    async with SpecCluster(
        scheduler=scheduler,
        worker={"cls": ProcessInterface},
        asynchronous=True,
    ) as cluster:
        # Up to two workers.
        cluster.scale(2)
        await cluster
        assert len(cluster.worker_spec) == len(cluster.workers) == 2

        # Back down to one.
        cluster.scale(1)
        await cluster
        assert len(cluster.worker_spec) == len(cluster.workers) == 1
async def test_scale_cores_memory(cleanup):
    """``scale(cores=...)`` works; ``scale(memory=...)`` raises ``ValueError`` here.

    With single-threaded workers, ``scale(cores=2)`` must yield two spec
    entries.  The worker options give no memory setting, and
    ``scale(memory="5GB")`` must raise a ``ValueError`` that mentions
    ``"memory"``.
    """
    single_thread_worker = {"cls": Worker, "options": {"nthreads": 1}}

    async with SpecCluster(
        scheduler=scheduler, worker=single_thread_worker, asynchronous=True
    ) as cluster:
        cluster.scale(cores=2)
        assert len(cluster.worker_spec) == 2

        with pytest.raises(ValueError) as info:
            cluster.scale(memory="5GB")

        assert "memory" in str(info.value)
async def test_restart():
    """After ``client.restart()`` the cluster must have two workers again.

    Regression test for https://github.com/dask/distributed/issues/3062
    """
    worker = {"cls": Nanny, "options": {"nthreads": 1}}

    async with SpecCluster(
        asynchronous=True, scheduler=scheduler, worker=worker
    ) as cluster:
        async with Client(cluster, asynchronous=True) as client:
            cluster.scale(2)
            await cluster
            assert len(cluster.workers) == 2

            await client.restart()

            # Poll until both workers are listed again.
            while len(cluster.workers) < 2:
                await asyncio.sleep(0.01)
async def test_restart(cleanup):
    """After ``client.restart()`` both workers must be back within 60 seconds.

    Regression test for https://github.com/dask/distributed/issues/3062
    """
    worker = {"cls": Nanny, "options": {"nthreads": 1}}
    lost_worker_config = {"distributed.deploy.lost-worker-timeout": "2s"}

    with dask.config.set(lost_worker_config):
        async with SpecCluster(
            asynchronous=True, scheduler=scheduler, worker=worker
        ) as cluster:
            async with Client(cluster, asynchronous=True) as client:
                cluster.scale(2)
                await cluster
                assert len(cluster.workers) == 2

                await client.restart()

                # Poll for both workers, failing once the deadline passes.
                deadline = time() + 60
                while len(cluster.workers) < 2:
                    await asyncio.sleep(0.5)
                    assert time() < deadline
async def test_MultiWorker(cleanup):
    """Scale a cluster whose worker spec starts two workers per entry.

    Each spec entry uses ``MultiWorker`` with ``n=2`` and a two-element
    ``"group"``; the assertions expect the scheduler to see twice as many
    workers as there are spec entries.  Covers scaling by count, by memory
    and by cores, the text repr, and adaptive scaling down to zero and back
    up to one spec entry.
    """
    multi_spec = {
        "cls": MultiWorker,
        "options": {"n": 2, "nthreads": 4, "memory_limit": "4 GB"},
        "group": ["-0", "-1"],
    }

    async with SpecCluster(
        scheduler=scheduler, worker=multi_spec, asynchronous=True
    ) as cluster:
        sched = cluster.scheduler
        async with Client(cluster, asynchronous=True) as client:
            # Two spec entries -> four scheduler workers.
            cluster.scale(2)
            await cluster
            assert len(cluster.worker_spec) == 2
            await client.wait_for_workers(4)
            assert "workers=4" in repr(cluster)

            # One spec entry -> two scheduler workers.
            cluster.scale(1)
            await cluster
            assert len(sched.workers) == 2

            # Scale by memory.
            cluster.scale(memory="6GB")
            await cluster
            assert len(cluster.worker_spec) == 2
            assert len(sched.workers) == 4
            assert cluster.plan == {ws.name for ws in sched.workers.values()}

            # Scale by cores.
            cluster.scale(cores=10)
            await cluster
            assert len(cluster.workers) == 3

            adapt = cluster.adapt(minimum=0, maximum=4)

            for _ in range(adapt.wait_count):  # relax down to 0 workers
                await adapt.adapt()
            await cluster
            assert not sched.workers

            # Submitting work must bring one spec entry back.
            future = client.submit(lambda x: x + 1, 10)
            await future
            assert len(cluster.workers) == 1
async def test_scheduler_info(cleanup):
    """``cluster.scheduler_info`` must agree with the scheduler's own identity.

    The scheduler id must be available immediately; the worker list is given
    up to one second to catch up with ``cluster.workers``.
    """
    async with SpecCluster(
        workers=worker_spec, scheduler=scheduler, asynchronous=True
    ) as cluster:
        # present at startup
        assert cluster.scheduler_info["id"] == cluster.scheduler.id

        # wait for all workers
        deadline = time() + 1
        while len(cluster.scheduler_info["workers"]) < len(cluster.workers):
            await asyncio.sleep(0.01)
            assert time() < deadline

        assert set(cluster.scheduler.identity()["workers"]) == set(
            cluster.scheduler_info["workers"]
        )
        assert (
            cluster.scheduler.identity()["services"]
            == cluster.scheduler_info["services"]
        )
        assert len(cluster.scheduler_info["workers"]) == len(cluster.workers)
async def test_widget(cleanup):
    """``_widget_status()`` must report worker counts and memory.

    After all spec'd workers have registered (within one second) the status
    text must contain ``"3"`` and ``"GB"``; right after ``scale(5)`` it must
    contain ``"3 / 5"``.
    """
    template = {"cls": Worker, "options": {"nthreads": 1}}

    async with SpecCluster(
        workers=worker_spec,
        scheduler=scheduler,
        asynchronous=True,
        worker=template,
    ) as cluster:
        # wait for all workers
        deadline = time() + 1
        while len(cluster.scheduler_info["workers"]) < len(cluster.worker_spec):
            await asyncio.sleep(0.01)
            assert time() < deadline

        assert "3" in cluster._widget_status()
        assert "GB" in cluster._widget_status()

        cluster.scale(5)
        assert "3 / 5" in cluster._widget_status()
def test_spec_sync(loop):
    """Run a three-worker ``SpecCluster`` through the synchronous API.

    Verifies that the cluster keeps the very spec object it was given, that
    each worker has the declared class and ``nthreads``, that cluster,
    scheduler and client share one loop, and that a simple task computes.
    """
    worker_spec = {
        0: {"cls": Worker, "options": {"nthreads": 1}},
        1: {"cls": Worker, "options": {"nthreads": 2}},
        "my-worker": {"cls": MyWorker, "options": {"nthreads": 3}},
    }

    with SpecCluster(workers=worker_spec, scheduler=scheduler, loop=loop) as cluster:
        workers = cluster.workers

        # The spec is stored by identity, not copied.
        assert cluster.worker_spec is worker_spec

        assert len(workers) == 3
        assert set(workers) == set(worker_spec)

        assert isinstance(workers[0], Worker)
        assert isinstance(workers[1], Worker)
        assert isinstance(workers["my-worker"], MyWorker)

        assert workers[0].nthreads == 1
        assert workers[1].nthreads == 2
        assert workers["my-worker"].nthreads == 3

        with Client(cluster, loop=loop) as client:
            assert cluster.loop is cluster.scheduler.loop
            assert cluster.loop is client.loop
            future = client.submit(lambda x: x + 1, 10)
            assert future.result() == 11
async def test_specification():
    """Start a cluster from the module-level ``worker_spec`` and inspect it.

    Verifies that the cluster keeps the very spec object it was given, that
    each worker has the expected class, ``nthreads`` and ``name``, and that a
    simple task computes through an asynchronous client.
    """
    async with SpecCluster(
        workers=worker_spec, scheduler=scheduler, asynchronous=True
    ) as cluster:
        workers = cluster.workers

        # The spec is stored by identity, not copied.
        assert cluster.worker_spec is worker_spec

        assert len(workers) == 3
        assert set(workers) == set(worker_spec)

        assert isinstance(workers[0], Worker)
        assert isinstance(workers[1], Worker)
        assert isinstance(workers["my-worker"], MyWorker)

        assert workers[0].nthreads == 1
        assert workers[1].nthreads == 2
        assert workers["my-worker"].nthreads == 3

        async with Client(cluster, asynchronous=True) as client:
            result = await client.submit(lambda x: x + 1, 10)
            assert result == 11

        # Every worker object is named after its key in the spec.
        for name in cluster.workers:
            assert cluster.workers[name].name == name
def DGX(
    interface=None,
    dashboard_address=":8787",
    threads_per_worker=1,
    silence_logs=True,
    CUDA_VISIBLE_DEVICES=None,
    protocol=None,
    enable_tcp_over_ucx=False,
    enable_infiniband=False,
    enable_nvlink=False,
    **kwargs,
):
    """ A Local Cluster for a DGX 1 machine

    NVIDIA's DGX-1 machine has a complex architecture mapping CPUs, GPUs, and
    network hardware.  This function creates a local cluster that tries to
    respect this hardware as much as possible.

    It creates one Dask worker process per GPU, and assigns each worker process
    the correct CPU cores and Network interface cards to maximize performance.
    If UCX and UCX-Py are also available, it's possible to use InfiniBand and
    NVLink connections for optimal data transfer performance.

    That being said, things aren't perfect.  Today a DGX has very high
    performance between certain sets of GPUs and not others.  A Dask DGX
    cluster that uses only certain tightly coupled parts of the computer will
    have significantly higher bandwidth than a deployment on the entire thing.

    Parameters
    ----------
    interface: str
        The external interface used to connect to the scheduler, usually
        the ethernet interface is used for connection (not the InfiniBand!).
    dashboard_address: str
        The address for the scheduler dashboard.  Defaults to ":8787".
    threads_per_worker: int
        Number of threads to be used for each CUDA worker process.
    silence_logs: bool
        Disable logging for all worker processes
    CUDA_VISIBLE_DEVICES: str
        String like ``"0,1,2,3"`` or ``[0, 1, 2, 3]`` to restrict activity to
        different GPUs
    protocol: str
        Protocol to use for communication, e.g., "tcp" or "ucx"
    enable_tcp_over_ucx: bool
        Set environment variables to enable TCP over UCX, even if InfiniBand
        and NVLink are not supported or disabled.  Requires protocol='ucx'.
    enable_infiniband: bool
        Set environment variables to enable UCX InfiniBand support, requires
        protocol='ucx' and implies enable_tcp_over_ucx=True.
    enable_nvlink: bool
        Set environment variables to enable UCX NVLink support, requires
        protocol='ucx' and implies enable_tcp_over_ucx=True.
    **kwargs:
        Forwarded both to ``worker_spec`` and to ``SpecCluster``.

    Returns
    -------
    The ``SpecCluster`` built from the worker and scheduler specifications.

    Raises
    ------
    TypeError
        If enable_tcp_over_ucx, enable_infiniband or enable_nvlink is True and
        protocol is not 'ucx'

    Examples
    --------
    >>> from dask_cuda import DGX
    >>> from dask.distributed import Client
    >>> cluster = DGX()
    >>> client = Client(cluster)
    """
    ucx_requested = enable_tcp_over_ucx or enable_infiniband or enable_nvlink
    if ucx_requested and protocol != "ucx":
        # The message names all three flags because any one of them triggers
        # this check.
        raise TypeError(
            "Enabling TCP over UCX, InfiniBand or NVLink requires protocol='ucx'"
        )

    ucx_net_devices = ""
    if enable_infiniband:

        def ucx_net_devices(i):
            # Device string computed from the index handed in by worker_spec:
            # indices 2k and 2k + 1 both map to "mlx5_k:1".
            return "mlx5_%d:1" % (i // 2)

    spec = worker_spec(
        interface=interface,
        dashboard_address=dashboard_address,
        threads_per_worker=threads_per_worker,
        silence_logs=silence_logs,
        CUDA_VISIBLE_DEVICES=CUDA_VISIBLE_DEVICES,
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_infiniband=enable_infiniband,
        ucx_net_devices=ucx_net_devices,
        enable_nvlink=enable_nvlink,
        protocol=protocol,
        **kwargs,
    )

    scheduler = {
        "cls": Scheduler,
        "options": {
            "interface": interface,
            "protocol": protocol,
            "dashboard_address": dashboard_address,
            **get_preload_options(
                protocol=protocol,
                enable_tcp_over_ucx=enable_tcp_over_ucx,
                enable_infiniband=enable_infiniband,
                enable_nvlink=enable_nvlink,
            ),
        },
    }

    return SpecCluster(
        workers=spec, scheduler=scheduler, silence_logs=silence_logs, **kwargs
    )
def test_spec_close_clusters(loop):
    """``close_clusters()`` must close a cluster listed in ``SpecCluster._instances``."""
    single_worker = {0: {"cls": Worker}}

    cluster = SpecCluster(workers=single_worker, scheduler=scheduler, loop=loop)
    assert cluster in SpecCluster._instances

    close_clusters()
    assert cluster.status == Status.closed
def DGX(
    interface="ib",
    dashboard_address=":8787",
    threads_per_worker=1,
    silence_logs=True,
    CUDA_VISIBLE_DEVICES=None,
    **kwargs,
):
    """ A Local Cluster for a DGX 1 machine

    NVIDIA's DGX-1 machine has a complex architecture mapping CPUs, GPUs, and
    network hardware.  This function creates a local cluster that tries to
    respect this hardware as much as possible.

    It creates one Dask worker process per GPU, and assigns each worker process
    the correct CPU cores and Network interface cards to maximize performance.

    That being said, things aren't perfect.  Today a DGX has very high
    performance between certain sets of GPUs and not others.  A Dask DGX
    cluster that uses only certain tightly coupled parts of the computer will
    have significantly higher bandwidth than a deployment on the entire thing.

    Parameters
    ----------
    interface: str
        The interface prefix for the infiniband networking cards.  This is
        often "ib" or "bond".  A numeric suffix (the GPU index divided by two,
        rounded down) is appended for each worker.  Defaults to "ib".
    dashboard_address: str
        The address for the scheduler dashboard.  Defaults to ":8787".
    threads_per_worker: int
        Passed to every worker as its ``nthreads`` option.  Defaults to 1.
    silence_logs: bool
        Passed to every worker's options and to ``SpecCluster``.
    CUDA_VISIBLE_DEVICES: str
        String like ``"0,1,2,3"`` or ``[0, 1, 2, 3]`` to restrict activity to
        different GPUs.  When omitted, the ``CUDA_VISIBLE_DEVICES`` environment
        variable is used, falling back to ``"0,1,2,3,4,5,6,7"``.
    **kwargs:
        Forwarded to ``SpecCluster``.

    Examples
    --------
    >>> from dask_cuda import DGX
    >>> from dask.distributed import Client
    >>> cluster = DGX(interface='ib')
    >>> client = Client(cluster)
    """
    # Normalise the device selection to a list of integers.
    if CUDA_VISIBLE_DEVICES is None:
        CUDA_VISIBLE_DEVICES = os.environ.get(
            "CUDA_VISIBLE_DEVICES", "0,1,2,3,4,5,6,7"
        )
    if isinstance(CUDA_VISIBLE_DEVICES, str):
        CUDA_VISIBLE_DEVICES = CUDA_VISIBLE_DEVICES.split(",")
    devices = [int(device) for device in CUDA_VISIBLE_DEVICES]

    # Each worker is limited to one eighth of TOTAL_MEMORY.
    memory_limit = TOTAL_MEMORY / 8

    spec = {}
    for position, device in enumerate(devices):
        # UCX_NET_DEVICES is not set here (a per-device "mlx5_%d:1" entry was
        # left commented out in the original environment mapping).
        worker_env = {
            "CUDA_VISIBLE_DEVICES": cuda_visible_devices(position, devices),
            "UCX_TLS": "rc,cuda_copy,cuda_ipc",
        }
        worker_options = {
            "env": worker_env,
            "interface": interface + str(device // 2),
            "protocol": "ucx",
            "nthreads": threads_per_worker,
            "data": dict,
            "preload": ["dask_cuda.initialize_context"],
            "dashboard_address": ":0",
            "plugins": [CPUAffinity(affinity[device])],
            "silence_logs": silence_logs,
            "memory_limit": memory_limit,
        }
        spec[device] = {"cls": Nanny, "options": worker_options}

    # The scheduler uses the interface that matches the first selected device.
    scheduler_options = {
        "interface": interface + str(devices[0] // 2),
        "protocol": "ucx",
        "dashboard_address": dashboard_address,
    }
    scheduler = {"cls": Scheduler, "options": scheduler_options}

    return SpecCluster(
        workers=spec, scheduler=scheduler, silence_logs=silence_logs, **kwargs
    )
def test_loop_started():
    """Start a synchronous ``SpecCluster`` without passing an event loop.

    The cluster is used as a context manager so that it is closed when the
    test finishes rather than being left open.
    """
    with SpecCluster(worker_spec):
        pass
def test_loop_started():
    """Start and close a synchronous ``SpecCluster`` without passing a loop."""
    cluster = SpecCluster(worker_spec, scheduler=scheduler)
    with cluster:
        # Entering and leaving the context is the whole test.
        pass