async def test_stress():
    from distributed.utils import get_ip_interface

    try:  # this check should be removed once UCX + TCP works
        get_ip_interface("ib0")
    except Exception:
        pytest.skip("ib0 interface not found")

    import dask.array as da
    from distributed import wait

    chunksize = "10 MB"
    async with LocalCluster(
        protocol="ucx", interface="ib0", asynchronous=True
    ) as cluster:
        async with Client(cluster, asynchronous=True) as client:
            rs = da.random.RandomState()
            x = rs.random((10000, 10000), chunks=(-1, chunksize))
            x = x.persist()
            await wait(x)

            for i in range(10):
                x = x.rechunk((chunksize, -1))
                x = x.rechunk((-1, chunksize))
                x = x.persist()
                await wait(x)
def test_get_ip_interface():
    if sys.platform == "darwin":
        assert get_ip_interface("lo0") == "127.0.0.1"
    elif sys.platform.startswith("linux"):
        assert get_ip_interface("lo") == "127.0.0.1"
    else:
        pytest.skip("test needs to be enhanced for platform %r" % (sys.platform,))
    with pytest.raises(KeyError):
        get_ip_interface("__non-existent-interface")
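# For context on what the tests above exercise: a minimal sketch of a
# psutil-based get_ip_interface helper. This is an illustration only -- the
# real implementation lives in distributed.utils and differs in its exact
# error types and messages (the versions in this corpus variously expect
# KeyError or ValueError).
import socket

import psutil


def get_ip_interface_sketch(ifname):
    """Return the first IPv4 address bound to the named network interface."""
    addrs = psutil.net_if_addrs()
    if ifname not in addrs:
        raise ValueError(
            "%r is not a valid network interface. "
            "Valid network interfaces are: %s" % (ifname, sorted(addrs))
        )
    for info in addrs[ifname]:
        if info.family == socket.AF_INET:  # pick the IPv4 entry
            return info.address
    raise ValueError("interface %r has no IPv4 address" % ifname)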
async def test_interface_async(loop, Worker):
    from distributed.utils import get_ip_interface

    psutil = pytest.importorskip("psutil")
    if_names = sorted(psutil.net_if_addrs())
    for if_name in if_names:
        try:
            ipv4_addr = get_ip_interface(if_name)
        except ValueError:
            pass
        else:
            if ipv4_addr == "127.0.0.1":
                break
    else:
        pytest.skip(
            "Could not find loopback interface. "
            "Available interfaces are: %s." % (if_names,)
        )

    async with Scheduler(interface=if_name) as s:
        assert s.address.startswith("tcp://127.0.0.1")
        async with Worker(s.address, interface=if_name) as w:
            assert w.address.startswith("tcp://127.0.0.1")
            assert w.ip == "127.0.0.1"
            async with Client(s.address, asynchronous=True) as c:
                info = c.scheduler_info()
                assert "tcp://127.0.0.1" in info["address"]
                assert all(
                    "127.0.0.1" == d["host"] for d in info["workers"].values()
                )
def test_interface(loop):
    if_names = sorted(psutil.net_if_addrs())
    for if_name in if_names:
        try:
            ipv4_addr = get_ip_interface(if_name)
        except ValueError:
            pass
        else:
            if ipv4_addr == "127.0.0.1":
                break
    else:
        pytest.skip(
            "Could not find loopback interface. "
            "Available interfaces are: %s." % (if_names,)
        )

    with popen(["dask-scheduler", "--no-dashboard", "--interface", if_name]) as s:
        with popen(
            ["dask-worker", "127.0.0.1:8786", "--no-dashboard", "--interface", if_name]
        ) as a:
            with Client("tcp://127.0.0.1:%d" % Scheduler.default_port, loop=loop) as c:
                start = time()
                while not len(c.nthreads()):
                    sleep(0.1)
                    assert time() - start < 30
                info = c.scheduler_info()
                assert "tcp://127.0.0.1" in info["address"]
                assert all(
                    "127.0.0.1" == d["host"] for d in info["workers"].values()
                )
def test_interface(loop):
    psutil = pytest.importorskip('psutil')
    if_names = sorted(psutil.net_if_addrs())
    for if_name in if_names:
        try:
            ipv4_addr = get_ip_interface(if_name)
        except ValueError:
            pass
        else:
            if ipv4_addr == '127.0.0.1':
                break
    else:
        pytest.skip("Could not find loopback interface. "
                    "Available interfaces are: %s." % (if_names,))

    with popen(['dask-scheduler', '--no-bokeh', '--interface', if_name]) as s:
        with popen(['dask-worker', '127.0.0.1:8786', '--no-bokeh',
                    '--interface', if_name]) as a:
            with Client('tcp://127.0.0.1:%d' % Scheduler.default_port,
                        loop=loop) as c:
                start = time()
                while not len(c.ncores()):
                    sleep(0.1)
                    assert time() - start < 5
                info = c.scheduler_info()
                assert 'tcp://127.0.0.1' in info['address']
                assert all('127.0.0.1' == d['host']
                           for d in info['workers'].values())
def test_get_ip_interface():
    if sys.platform == "darwin":
        assert get_ip_interface("lo0") == "127.0.0.1"
    elif sys.platform.startswith("linux"):
        assert get_ip_interface("lo") == "127.0.0.1"
    else:
        pytest.skip("test needs to be enhanced for platform %r" % (sys.platform,))

    non_existent_interface = "__non-existent-interface"
    expected_error_message = "{!r}.+network interface.+".format(non_existent_interface)
    if sys.platform == "darwin":
        expected_error_message += "'lo0'"
    elif sys.platform.startswith("linux"):
        expected_error_message += "'lo'"
    with pytest.raises(ValueError, match=expected_error_message):
        get_ip_interface(non_existent_interface)
def main(scheduler_file, interface, nthreads, local_directory, memory_limit,
         scheduler, bokeh_port, bokeh_prefix, nanny, bokeh_worker_port):
    if interface:
        host = get_ip_interface(interface)
    else:
        host = None

    if rank == 0 and scheduler:
        try:
            from distributed.bokeh.scheduler import BokehScheduler
        except ImportError:
            services = {}
        else:
            services = {('bokeh', bokeh_port): partial(BokehScheduler,
                                                       prefix=bokeh_prefix)}
        scheduler = Scheduler(scheduler_file=scheduler_file,
                              loop=loop,
                              services=services)
        addr = uri_from_host_port(host, None, 8786)
        scheduler.start(addr)
        try:
            loop.start()
            loop.close()
        finally:
            scheduler.stop()
    else:
        W = Nanny if nanny else Worker
        worker = W(scheduler_file=scheduler_file,
                   loop=loop,
                   name=rank if scheduler else None,
                   ncores=nthreads,
                   local_dir=local_directory,
                   services={('bokeh', bokeh_worker_port): BokehWorker},
                   memory_limit=memory_limit)
        addr = uri_from_host_port(host, None, 0)

        @gen.coroutine
        def run():
            yield worker._start(addr)
            while worker.status != 'closed':
                yield gen.sleep(0.2)

        try:
            loop.run_sync(run)
            loop.close()
        finally:
            pass

        @gen.coroutine
        def close():
            yield worker._close(timeout=2)

        loop.run_sync(close)
def main(scheduler_file, interface, nthreads, local_directory, memory_limit,
         scheduler):
    if interface:
        host = get_ip_interface(interface)
    else:
        host = None

    if rank == 0 and scheduler:
        scheduler = Scheduler(scheduler_file=scheduler_file,
                              loop=loop,
                              services={('bokeh', 8787): BokehScheduler})
        addr = uri_from_host_port(host, None, 8786)
        scheduler.start(addr)
        try:
            loop.start()
            loop.close()
        finally:
            scheduler.stop()
    else:
        worker = Worker(scheduler_file=scheduler_file,
                        loop=loop,
                        name=rank if scheduler else None,
                        ncores=nthreads,
                        local_dir=local_directory,
                        services={'bokeh': BokehWorker},
                        memory_limit=memory_limit)
        addr = uri_from_host_port(host, None, 0)

        @gen.coroutine
        def run():
            yield worker._start(addr)
            while worker.status != 'closed':
                yield gen.sleep(0.2)

        try:
            loop.run_sync(run)
            loop.close()
        finally:
            pass

        @gen.coroutine
        def close():
            yield worker._close(timeout=2)

        loop.run_sync(close)
def address_from_user_args(
    host=None,
    port=None,
    interface=None,
    protocol=None,
    peer=None,
    security=None,
    default_port=0,
) -> str:
    """Get an address to listen on from common user provided arguments"""

    if security and security.require_encryption and not protocol:
        protocol = "tls"

    if protocol and protocol.rstrip("://") == "inproc":
        if host or port or interface:
            raise ValueError(
                "Can not specify inproc protocol and host or port or interface"
            )
        else:
            return "inproc://"

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host", interface, host)
        else:
            host = get_ip_interface(interface)

    if protocol and host and "://" not in host:
        host = protocol.rstrip("://") + "://" + host

    if host or port:
        addr = uri_from_host_port(host, port, default_port)
    else:
        addr = ""

    if protocol:
        addr = protocol.rstrip("://") + "://" + addr.split("://")[-1]

    return addr
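# A few illustrative calls to address_from_user_args (hypothetical values;
# the interface example assumes a Linux host whose loopback device is named
# "lo", and the exact URI text depends on uri_from_host_port):
#
#     address_from_user_args(host="127.0.0.1", port=8786, protocol="tcp")
#         -> "tcp://127.0.0.1:8786"
#     address_from_user_args(protocol="inproc")
#         -> "inproc://"
#     address_from_user_args(interface="lo", port=8786, protocol="tcp")
#         -> "tcp://127.0.0.1:8786"  (loopback resolved via get_ip_interface)
#     address_from_user_args(interface="lo", host="10.0.0.1")
#         -> raises ValueError: can not specify both interface and host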
def _start(self, ip=None, n_workers=0):
    """ Start all cluster services. """
    if self.status == "running":
        return

    if self.protocol == "inproc://":
        address = self.protocol
    else:
        if ip is None:
            if self.interface:
                ip = get_ip_interface(self.interface)
            else:
                ip = "127.0.0.1"

        if "://" in ip:
            address = ip
        else:
            address = self.protocol + ip
        if self.scheduler_port:
            address += ":" + str(self.scheduler_port)

    self.scheduler.start(address)

    yield [
        self._start_worker(
            **self.worker_kwargs,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            name="gpu-" + str(i),
        )
        for i in range(n_workers)
    ]

    self.status = "running"

    raise gen.Return(self)
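# The CUDA_VISIBLE_DEVICES values built above rotate the device list so each
# worker sees "its" GPU first. A minimal sketch of such a helper (assumption:
# dask_cuda's real cuda_visible_devices also honors an existing
# CUDA_VISIBLE_DEVICES environment variable and may differ in details):
def cuda_visible_devices_sketch(i, n_gpus):
    """Return a comma-separated GPU ordering starting at device ``i``."""
    devices = list(range(n_gpus))
    rotated = devices[i:] + devices[:i]  # e.g. i=1, n_gpus=4 -> "1,2,3,0"
    return ",".join(map(str, rotated))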
def main(host, port, bokeh_port, show, _bokeh, bokeh_whitelist, bokeh_prefix,
         use_xheaders, pid_file, scheduler_file, interface, local_directory,
         preload, preload_argv, tls_ca_file, tls_cert, tls_key):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(tls_ca_file=tls_ca_file,
                   tls_scheduler_cert=tls_cert,
                   tls_scheduler_key=tls_key,
                   )

    if not host and (tls_ca_file or tls_cert or tls_key):
        host = 'tls://'

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)
        atexit.register(del_pid_file)

    local_directory_created = False
    if local_directory:
        if not os.path.exists(local_directory):
            os.mkdir(local_directory)
            local_directory_created = True
    else:
        local_directory = tempfile.mkdtemp(prefix='scheduler-')
        local_directory_created = True
    if local_directory not in sys.path:
        sys.path.insert(0, local_directory)

    if sys.platform.startswith('linux'):
        import resource   # module fails importing on Windows
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    addr = uri_from_host_port(host, port, 8786)

    loop = IOLoop.current()
    logger.info('-' * 47)

    services = {}
    if _bokeh:
        with ignoring(ImportError):
            from distributed.bokeh.scheduler import BokehScheduler
            services[('bokeh', bokeh_port)] = (BokehScheduler,
                                               {'prefix': bokeh_prefix})
    scheduler = Scheduler(loop=loop, services=services,
                          scheduler_file=scheduler_file,
                          security=sec)
    scheduler.start(addr)
    if not preload:
        preload = dask.config.get('distributed.scheduler.preload')
    if not preload_argv:
        preload_argv = dask.config.get('distributed.scheduler.preload-argv')
    preload_modules(preload, parameter=scheduler, file_dir=local_directory,
                    argv=preload_argv)

    logger.info('Local Directory: %26s', local_directory)
    logger.info('-' * 47)

    install_signal_handlers(loop)

    try:
        loop.start()
        loop.close()
    finally:
        scheduler.stop()
        if local_directory_created:
            shutil.rmtree(local_directory)

        logger.info("End scheduler at %r", addr)
def get_host_from_interface(interface=None):
    if interface:
        host = get_ip_interface(interface)
    else:
        host = None
    return host
def main(host, port, bokeh_port, show, _bokeh, bokeh_whitelist, bokeh_prefix,
         use_xheaders, pid_file, scheduler_file, interface, local_directory,
         preload, preload_argv, tls_ca_file, tls_cert, tls_key):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(tls_ca_file=tls_ca_file,
                   tls_scheduler_cert=tls_cert,
                   tls_scheduler_key=tls_key,
                   )

    if not host and (tls_ca_file or tls_cert or tls_key):
        host = 'tls://'

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)
        atexit.register(del_pid_file)

    local_directory_created = False
    if local_directory:
        if not os.path.exists(local_directory):
            os.mkdir(local_directory)
            local_directory_created = True
    else:
        local_directory = tempfile.mkdtemp(prefix='scheduler-')
        local_directory_created = True
    if local_directory not in sys.path:
        sys.path.insert(0, local_directory)

    if sys.platform.startswith('linux'):
        import resource   # module fails importing on Windows
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    addr = uri_from_host_port(host, port, 8786)

    loop = IOLoop.current()
    logger.info('-' * 47)

    services = {}
    if _bokeh:
        try:
            from distributed.bokeh.scheduler import BokehScheduler
            services[('bokeh', bokeh_port)] = (BokehScheduler,
                                               {'prefix': bokeh_prefix})
        except ImportError as error:
            if str(error).startswith('No module named'):
                logger.info('Web dashboard not loaded. Unable to import bokeh')
            else:
                logger.info('Unable to import bokeh: %s' % str(error))
    scheduler = Scheduler(loop=loop, services=services,
                          scheduler_file=scheduler_file,
                          security=sec)
    scheduler.start(addr)
    if not preload:
        preload = dask.config.get('distributed.scheduler.preload')
    if not preload_argv:
        preload_argv = dask.config.get('distributed.scheduler.preload-argv')
    preload_modules(preload, parameter=scheduler, file_dir=local_directory,
                    argv=preload_argv)

    logger.info('Local Directory: %26s', local_directory)
    logger.info('-' * 47)

    install_signal_handlers(loop)

    try:
        loop.start()
        loop.close()
    finally:
        scheduler.stop()
        if local_directory_created:
            shutil.rmtree(local_directory)

        logger.info("End scheduler at %r", addr)
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    pid_file,
    reconnect,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    preload_argv,
    bokeh_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(
        tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key
    )

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        nthreads = min(1, _ncores // nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            pass
        else:
            if bokeh_prefix:
                result = (BokehWorker, {"prefix": bokeh_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if host:
        addr = uri_from_host_port(host, 0, 0)
    else:
        # Choose appropriate address for scheduler
        addr = None

    if death_timeout is not None:
        death_timeout = parse_timedelta(death_timeout, "s")

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            ncores=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            reconnect=reconnect,
            local_dir=local_directory,
            death_timeout=death_timeout,
            preload=(preload or []) + ["dask_cuda.initialize_context"],
            preload_argv=preload_argv,
            security=sec,
            contact_address=None,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            **kwargs
        )
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield [n._start(addr) for n in nannies]
        while all(n.status != "closed" for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
def main(host, port, http_port, bokeh_port, bokeh_internal_port, show, _bokeh,
         bokeh_whitelist, bokeh_prefix, use_xheaders, pid_file, scheduler_file,
         interface, local_directory, preload, prefix, tls_ca_file, tls_cert,
         tls_key):
    if bokeh_internal_port:
        print("The --bokeh-internal-port keyword has been removed.\n"
              "The internal bokeh server is now the default bokeh server.\n"
              "Use --bokeh-port %d instead" % bokeh_internal_port)
        sys.exit(1)

    if prefix:
        print("The --prefix keyword has moved to --bokeh-prefix")
        sys.exit(1)

    sec = Security(tls_ca_file=tls_ca_file,
                   tls_scheduler_cert=tls_cert,
                   tls_scheduler_key=tls_key,
                   )

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)
        atexit.register(del_pid_file)

    local_directory_created = False
    if local_directory:
        if not os.path.exists(local_directory):
            os.mkdir(local_directory)
            local_directory_created = True
    else:
        local_directory = tempfile.mkdtemp(prefix='scheduler-')
        local_directory_created = True
    if local_directory not in sys.path:
        sys.path.insert(0, local_directory)

    if sys.platform.startswith('linux'):
        import resource   # module fails importing on Windows
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    addr = uri_from_host_port(host, port, 8786)

    loop = IOLoop.current()
    logger.info('-' * 47)

    services = {('http', http_port): HTTPScheduler}
    if _bokeh:
        with ignoring(ImportError):
            from distributed.bokeh.scheduler import BokehScheduler
            services[('bokeh', bokeh_port)] = partial(BokehScheduler,
                                                      prefix=bokeh_prefix)
    scheduler = Scheduler(loop=loop, services=services,
                          scheduler_file=scheduler_file,
                          security=sec)
    scheduler.start(addr)

    preload_modules(preload, parameter=scheduler, file_dir=local_directory)

    logger.info('Local Directory: %26s', local_directory)
    logger.info('-' * 47)

    try:
        loop.start()
        loop.close()
    finally:
        scheduler.stop()
        if local_directory_created:
            shutil.rmtree(local_directory)

        logger.info("End scheduler at %r", addr)
def main(scheduler, host, worker_port, http_port, nanny_port, nthreads, nprocs,
         nanny, name, memory_limit, pid_file, reconnect, resources, bokeh,
         bokeh_port, local_directory, scheduler_file, interface, death_timeout,
         preload, bokeh_prefix, tls_ca_file, tls_cert, tls_key):
    sec = Security(tls_ca_file=tls_ca_file,
                   tls_worker_cert=tls_cert,
                   tls_worker_key=tls_key,
                   )

    if nanny:
        port = nanny_port
    else:
        port = worker_port

    if nprocs > 1 and worker_port != 0:
        logger.error("Failed to launch worker. You cannot use the --port "
                     "argument when nprocs > 1.")
        exit(1)

    if nprocs > 1 and name:
        logger.error("Failed to launch worker. You cannot use the --name "
                     "argument when nprocs > 1.")
        exit(1)

    if nprocs > 1 and not nanny:
        logger.error("Failed to launch worker. You cannot use the --no-nanny "
                     "argument when nprocs > 1.")
        exit(1)

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)
        atexit.register(del_pid_file)

    services = {('http', http_port): HTTPWorker}

    if bokeh:
        try:
            from distributed.bokeh.worker import BokehWorker
        except ImportError:
            pass
        else:
            if bokeh_prefix:
                result = (BokehWorker, {'prefix': bokeh_prefix})
            else:
                result = BokehWorker
            services[('bokeh', bokeh_port)] = result

    if resources:
        resources = resources.replace(',', ' ').split()
        resources = dict(pair.split('=') for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs = {'worker_port': worker_port}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs['service_ports'] = {'nanny': nanny_port}
        t = Worker

    if scheduler_file:
        while not os.path.exists(scheduler_file):
            sleep(0.01)
        for i in range(10):
            try:
                with open(scheduler_file) as f:
                    cfg = json.load(f)
                scheduler = cfg['address']
                break
            except (ValueError, KeyError):  # race with scheduler on file
                sleep(0.01)

    if not scheduler:
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if host or port:
        addr = uri_from_host_port(host, port, 0)
    else:
        # Choose appropriate address for scheduler
        addr = None

    nannies = [t(scheduler, ncores=nthreads, services=services, name=name,
                 loop=loop, resources=resources, memory_limit=memory_limit,
                 reconnect=reconnect, local_dir=local_directory,
                 death_timeout=death_timeout, preload=preload,
                 security=sec,
                 **kwargs)
               for i in range(nprocs)]

    @gen.coroutine
    def close_all():
        try:
            if nanny:
                yield [n._close(timeout=2) for n in nannies]
        finally:
            loop.stop()

    def handle_signal(signum, frame):
        logger.info("Exiting on signal %d", signum)
        if loop._running:
            loop.add_callback_from_signal(loop.stop)
        else:
            exit(0)

    # NOTE: We can't use the generic install_signal_handlers() function from
    # distributed.cli.utils because we're handling the signal differently.
    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)

    for n in nannies:
        n.start(addr)

    @gen.coroutine
    def run():
        while all(n.status != 'closed' for n in nannies):
            yield gen.sleep(0.2)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")

    # Clean exit: unregister all workers from scheduler
    loop.run_sync(close_all)
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    device_memory_limit,
    pid_file,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    dashboard_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    **kwargs,
):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(
        tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key
    )

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        nthreads = min(1, multiprocessing.cpu_count() // nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            pass
        else:
            if dashboard_prefix:
                result = (BokehWorker, {"prefix": dashboard_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            host=host,
            preload=(preload or []) + ["dask_cuda.initialize_context"],
            security=sec,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            data=(
                DeviceHostFile,
                {
                    "device_memory_limit": get_device_total_memory(index=i)
                    if (device_memory_limit == "auto" or device_memory_limit == int(0))
                    else parse_bytes(device_memory_limit),
                    "memory_limit": parse_memory_limit(
                        memory_limit, nthreads, total_cores=nprocs
                    ),
                    "local_directory": local_directory,
                },
            ),
            **kwargs,
        )
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield nannies
        yield [n.finished() for n in nannies]

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
def __init__(self,
             name=None,
             cores=None,
             memory=None,
             processes=None,
             interface=None,
             death_timeout=None,
             local_directory=None,
             extra=None,
             env_extra=None,
             log_directory=None,
             threads=None,
             shebang=None,
             python=sys.executable,
             config_name=None,
             **kwargs):
    """ """
    # """
    # This initializer should be considered as Abstract, and never used directly.
    # """
    super(JobQueueCluster, self).__init__()

    if threads is not None:
        raise ValueError(threads_deprecation_message)

    if config_name is None:
        raise NotImplementedError(
            "JobQueueCluster is an abstract class that should not be instantiated."
        )

    if name is None:
        name = dask.config.get("jobqueue.%s.name" % config_name)
    if cores is None:
        cores = dask.config.get("jobqueue.%s.cores" % config_name)
    if memory is None:
        memory = dask.config.get("jobqueue.%s.memory" % config_name)
    if processes is None:
        processes = dask.config.get("jobqueue.%s.processes" % config_name)
    if interface is None:
        interface = dask.config.get("jobqueue.%s.interface" % config_name)
    if death_timeout is None:
        death_timeout = dask.config.get("jobqueue.%s.death-timeout" % config_name)
    if local_directory is None:
        local_directory = dask.config.get("jobqueue.%s.local-directory" % config_name)
    if extra is None:
        extra = dask.config.get("jobqueue.%s.extra" % config_name)
    if env_extra is None:
        env_extra = dask.config.get("jobqueue.%s.env-extra" % config_name)
    if log_directory is None:
        log_directory = dask.config.get("jobqueue.%s.log-directory" % config_name)
    if shebang is None:
        shebang = dask.config.get("jobqueue.%s.shebang" % config_name)

    if dask.config.get("jobqueue.%s.threads" % config_name, None):
        warnings.warn(threads_deprecation_message)

    if cores is None:
        raise ValueError(
            "You must specify how many cores to use per job like ``cores=8``"
        )

    if memory is None:
        raise ValueError(
            "You must specify how much memory to use per job like ``memory='24 GB'``"
        )

    # This attribute should be overridden
    self.job_header = None

    if interface:
        extra += ["--interface", interface]
        kwargs.setdefault("ip", get_ip_interface(interface))
    else:
        kwargs.setdefault("ip", "")

    # Bokeh diagnostics server should listen on all interfaces
    kwargs.setdefault("dashboard_address", ("", 8787))
    self.local_cluster = LocalCluster(n_workers=0, **kwargs)

    # Keep information on process, cores, and memory, for use in subclasses
    self.worker_memory = parse_bytes(memory) if memory is not None else None
    self.worker_processes = processes
    self.worker_cores = cores
    self.name = name

    # plugin for tracking job status
    self._scheduler_plugin = JobQueuePlugin()
    self.local_cluster.scheduler.add_plugin(self._scheduler_plugin)

    self._adaptive = None
    self.shebang = shebang

    self._env_header = "\n".join(env_extra)

    # dask-worker command line build
    dask_worker_command = "%(python)s -m distributed.cli.dask_worker" % dict(
        python=python
    )
    command_args = [dask_worker_command, self.scheduler.address]
    command_args += ["--nthreads", self.worker_process_threads]
    if processes is not None and processes > 1:
        command_args += ["--nprocs", processes]
    command_args += ["--memory-limit", self.worker_process_memory]
    command_args += ["--name", "%s--${JOB_ID}--" % name]

    if death_timeout is not None:
        command_args += ["--death-timeout", death_timeout]
    if local_directory is not None:
        command_args += ["--local-directory", local_directory]
    if extra is not None:
        command_args += extra

    self._command_template = " ".join(map(str, command_args))

    self.log_directory = log_directory
    if self.log_directory is not None:
        if not os.path.exists(self.log_directory):
            os.makedirs(self.log_directory)
def __init__(self, name='dask', queue='dav', project=None,
             threads_per_worker=4, processes=8, memory='7GB',
             walltime='00:30:00', interface=None, extra='', **kwargs):
    """ Initialize a SLURM Cluster

    Parameters
    ----------
    name : str
        Name of worker jobs. Passed to `#SBATCH -J` option.
    queue : str
        Destination queue for each worker job.
        Passed to `#SBATCH -p` option.
    project : str
        Accounting string associated with each worker job.
        Passed to `#SBATCH -A` option.
    threads_per_worker : int
        Number of threads per process.
    processes : int
        Number of processes per node.
    memory : str
        Bytes of memory that the worker can use. This should be a string
        like "7GB" that can be interpreted both by PBS and Dask.
    walltime : str
        Walltime for each worker job.
    interface : str
        Network interface like 'eth0' or 'ib0'.
    extra : str
        Additional arguments to pass to `dask-worker`
    kwargs : dict
        Additional keyword arguments to pass to `LocalCluster`
    """
    self._template = """
#!/bin/bash

#SBATCH -J %(name)s
#SBATCH -n %(processes)d
#SBATCH -p %(queue)s
#SBATCH -A %(project)s
#SBATCH -t %(walltime)s
#SBATCH -e %(name)s.err
#SBATCH -o %(name)s.out

%(base_path)s/dask-worker %(scheduler)s \
    --nthreads %(threads_per_worker)d \
    --nprocs %(processes)s \
    --memory-limit %(memory)s \
    --name %(name)s-%(n)d \
    %(extra)s
""".lstrip()

    if interface:
        host = get_ip_interface(interface)
        extra += ' --interface %s ' % interface
    else:
        host = socket.gethostname()

    project = project or os.environ.get('SLURM_ACCOUNT')
    if not project:
        raise ValueError("Must specify a project like `project='UCLB1234' "
                         "or set SLURM_ACCOUNT environment variable")

    self.cluster = LocalCluster(n_workers=0, ip=host, **kwargs)
    memory = memory.replace(' ', '')
    self.config = {'name': name,
                   'queue': queue,
                   'project': project,
                   'threads_per_worker': threads_per_worker,
                   'processes': processes,
                   'scheduler': self.scheduler.address,
                   'walltime': walltime,
                   'base_path': dirname,
                   'memory': memory,
                   'extra': extra}
    self.jobs = dict()
    self.n = 0
    self._adaptive = None
    self._submitcmd = 'sbatch'
    self._cancelcmd = 'scancel'

    logger.debug("Job script: \n %s" % self.job_script())
def __init__(self,
             name=dask.config.get('jobqueue.name'),
             threads=dask.config.get('jobqueue.threads'),
             processes=dask.config.get('jobqueue.processes'),
             memory=dask.config.get('jobqueue.memory'),
             interface=dask.config.get('jobqueue.interface'),
             death_timeout=dask.config.get('jobqueue.death-timeout'),
             local_directory=dask.config.get('jobqueue.local-directory'),
             extra=dask.config.get('jobqueue.extra'),
             env_extra=dask.config.get('jobqueue.env-extra'),
             **kwargs):
    """ """
    # """
    # This initializer should be considered as Abstract, and never used
    # directly.
    # """
    if not self.cancel_command or not self.submit_command:
        raise NotImplementedError('JobQueueCluster is an abstract class '
                                  'that should not be instantiated.')

    # This attribute should be overridden
    self.job_header = None

    if interface:
        host = get_ip_interface(interface)
        extra += ' --interface %s ' % interface
    else:
        host = socket.gethostname()

    self.cluster = LocalCluster(n_workers=0, ip=host, **kwargs)

    # Keep information on process, threads and memory, for use in
    # subclasses
    self.worker_memory = parse_bytes(memory) if memory is not None else None
    self.worker_processes = processes
    self.worker_threads = threads
    self.name = name

    self.jobs = dict()
    self.n = 0
    self._adaptive = None

    self._env_header = '\n'.join(env_extra)

    # dask-worker command line build
    dask_worker_command = ('%(python)s -m distributed.cli.dask_worker'
                           % dict(python=sys.executable))
    self._command_template = ' '.join([dask_worker_command,
                                       self.scheduler.address])
    if threads is not None:
        self._command_template += " --nthreads %d" % threads
    if processes is not None:
        self._command_template += " --nprocs %d" % processes
    if memory is not None:
        self._command_template += " --memory-limit %s" % memory
    if name is not None:
        self._command_template += " --name %s" % name
        self._command_template += "-%(n)d"  # Keep %(n) to be replaced later
    if death_timeout is not None:
        self._command_template += " --death-timeout %s" % death_timeout
    if local_directory is not None:
        self._command_template += " --local-directory %s" % local_directory
    if extra is not None:
        self._command_template += extra
def main(scheduler, host, worker_port, listen_address, contact_address,
         nanny_port, nthreads, nprocs, nanny, name, memory_limit, pid_file,
         reconnect, resources, bokeh, bokeh_port, local_directory,
         scheduler_file, interface, death_timeout, preload, preload_argv,
         bokeh_prefix, tls_ca_file, tls_cert, tls_key):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(tls_ca_file=tls_ca_file,
                   tls_worker_cert=tls_cert,
                   tls_worker_key=tls_key,
                   )

    if nprocs > 1 and worker_port != 0:
        logger.error("Failed to launch worker. You cannot use the --port "
                     "argument when nprocs > 1.")
        exit(1)

    if nprocs > 1 and not nanny:
        logger.error("Failed to launch worker. You cannot use the --no-nanny "
                     "argument when nprocs > 1.")
        exit(1)

    if contact_address and not listen_address:
        logger.error("Failed to launch worker. "
                     "Must specify --listen-address when --contact-address is given")
        exit(1)

    if nprocs > 1 and listen_address:
        logger.error("Failed to launch worker. "
                     "You cannot specify --listen-address when nprocs > 1.")
        exit(1)

    if (worker_port or host) and listen_address:
        logger.error("Failed to launch worker. "
                     "You cannot specify --listen-address when --worker-port "
                     "or --host is given.")
        exit(1)

    try:
        if listen_address:
            (host, worker_port) = get_address_host_port(listen_address, strict=True)

        if contact_address:
            # we only need this to verify it is getting parsed
            (_, _) = get_address_host_port(contact_address, strict=True)
        else:
            # if contact address is not present we use the listen_address for contact
            contact_address = listen_address
    except ValueError as e:
        logger.error("Failed to launch worker. " + str(e))
        exit(1)

    if nanny:
        port = nanny_port
    else:
        port = worker_port

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)
        atexit.register(del_pid_file)

    services = {}

    if bokeh:
        try:
            from distributed.bokeh.worker import BokehWorker
        except ImportError:
            pass
        else:
            if bokeh_prefix:
                result = (BokehWorker, {'prefix': bokeh_prefix})
            else:
                result = BokehWorker
            services[('bokeh', bokeh_port)] = result

    if resources:
        resources = resources.replace(',', ' ').split()
        resources = dict(pair.split('=') for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs = {'worker_port': worker_port, 'listen_address': listen_address}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs['service_ports'] = {'nanny': nanny_port}
        t = Worker

    if not scheduler and not scheduler_file and 'scheduler-address' not in config:
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if host or port:
        addr = uri_from_host_port(host, port, 0)
    else:
        # Choose appropriate address for scheduler
        addr = None

    if death_timeout is not None:
        death_timeout = parse_timedelta(death_timeout, 's')

    nannies = [t(scheduler, scheduler_file=scheduler_file,
                 ncores=nthreads,
                 services=services,
                 loop=loop,
                 resources=resources,
                 memory_limit=memory_limit,
                 reconnect=reconnect,
                 local_dir=local_directory,
                 death_timeout=death_timeout,
                 preload=preload,
                 preload_argv=preload_argv,
                 security=sec,
                 contact_address=contact_address,
                 name=name if nprocs == 1 or not name else name + '-' + str(i),
                 **kwargs)
               for i in range(nprocs)]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        if nanny:
            yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield [n._start(addr) for n in nannies]
        while all(n.status != 'closed' for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
def __init__(self,
             name=None,
             cores=None,
             memory=None,
             processes=None,
             interface=None,
             death_timeout=None,
             local_directory=None,
             extra=None,
             env_extra=None,
             walltime=None,
             threads=None,
             **kwargs):
    """ """
    # """
    # This initializer should be considered as Abstract, and never used
    # directly.
    # """
    if threads is not None:
        raise ValueError(threads_deprecation_message)

    if not self.scheduler_name:
        raise NotImplementedError('JobQueueCluster is an abstract class '
                                  'that should not be instantiated.')

    if name is None:
        name = dask.config.get('jobqueue.%s.name' % self.scheduler_name)
    if cores is None:
        cores = dask.config.get('jobqueue.%s.cores' % self.scheduler_name)
    if memory is None:
        memory = dask.config.get('jobqueue.%s.memory' % self.scheduler_name)
    if processes is None:
        processes = dask.config.get('jobqueue.%s.processes' % self.scheduler_name)
    if interface is None:
        interface = dask.config.get('jobqueue.%s.interface' % self.scheduler_name)
    if death_timeout is None:
        death_timeout = dask.config.get('jobqueue.%s.death-timeout'
                                        % self.scheduler_name)
    if local_directory is None:
        local_directory = dask.config.get('jobqueue.%s.local-directory'
                                          % self.scheduler_name)
    if extra is None:
        extra = dask.config.get('jobqueue.%s.extra' % self.scheduler_name)
    if env_extra is None:
        env_extra = dask.config.get('jobqueue.%s.env-extra' % self.scheduler_name)

    if dask.config.get('jobqueue.%s.threads' % self.scheduler_name, None):
        warnings.warn(threads_deprecation_message)

    if cores is None:
        raise ValueError("You must specify how many cores to use per job "
                         "like ``cores=8``")

    if memory is None:
        raise ValueError("You must specify how much memory to use per job "
                         "like ``memory='24 GB'``")

    # This attribute should be overridden
    self.job_header = None

    if interface:
        host = get_ip_interface(interface)
        extra += ' --interface %s ' % interface
    else:
        host = socket.gethostname()

    self.local_cluster = LocalCluster(n_workers=0, ip=host, **kwargs)

    # Keep information on process, cores, and memory, for use in subclasses
    self.worker_memory = parse_bytes(memory)
    self.worker_processes = processes
    self.worker_cores = cores
    self.name = name

    self.jobs = dict()
    self.n = 0
    self._adaptive = None

    self._env_header = '\n'.join(env_extra)

    # dask-worker command line build
    dask_worker_command = ('%(python)s -m distributed.cli.dask_worker'
                           % dict(python=sys.executable))
    self._command_template = ' '.join([dask_worker_command,
                                       self.scheduler.address])
    self._command_template += " --nthreads %d" % self.worker_threads
    if processes is not None and processes > 1:
        self._command_template += " --nprocs %d" % processes
    mem = format_bytes(self.worker_memory / self.worker_processes)
    mem = mem.replace(' ', '')
    self._command_template += " --memory-limit %s" % mem
    if name is not None:
        self._command_template += " --name %s" % name
        self._command_template += "-%(n)d"  # Keep %(n) to be replaced later
    if death_timeout is not None:
        self._command_template += " --death-timeout %s" % death_timeout
    if local_directory is not None:
        self._command_template += " --local-directory %s" % local_directory
    if extra is not None:
        self._command_template += extra
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    device_memory_limit,
    rmm_pool_size,
    pid_file,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    dashboard_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    enable_tcp_over_ucx,
    enable_infiniband,
    enable_nvlink,
    enable_rdmacm,
    net_devices,
    **kwargs,
):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if tls_ca_file and tls_cert and tls_key:
        sec = Security(
            tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key
        )
    else:
        sec = None

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        nthreads = min(1, multiprocessing.cpu_count() // nprocs)

    memory_limit = parse_memory_limit(memory_limit, nthreads, total_cores=nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            pass
        else:
            if dashboard_prefix:
                result = (BokehWorker, {"prefix": dashboard_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    preload_argv = kwargs.get("preload_argv", [])
    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if rmm_pool_size is not None:
        try:
            import rmm  # noqa F401
        except ImportError:
            raise ValueError(
                "RMM pool requested but module 'rmm' is not available. "
                "For installation instructions, please see "
                "https://github.com/rapidsai/rmm"
            )  # pragma: no cover
        rmm_pool_size = parse_bytes(rmm_pool_size)

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            interface=get_ucx_net_devices(
                cuda_device_index=i,
                ucx_net_devices=net_devices,
                get_openfabrics=False,
                get_network=True,
            ),
            preload=(list(preload) or []) + ["dask_cuda.initialize"],
            preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"],
            security=sec,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            plugins={CPUAffinity(get_cpu_affinity(i)), RMMPool(rmm_pool_size)},
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            local_directory=local_directory,
            config={
                "ucx": get_ucx_config(
                    enable_tcp_over_ucx=enable_tcp_over_ucx,
                    enable_infiniband=enable_infiniband,
                    enable_nvlink=enable_nvlink,
                    enable_rdmacm=enable_rdmacm,
                    net_devices=net_devices,
                    cuda_device_index=i,
                )
            },
            data=(
                DeviceHostFile,
                {
                    "device_memory_limit": get_device_total_memory(index=i)
                    if (device_memory_limit == "auto" or device_memory_limit == int(0))
                    else parse_bytes(device_memory_limit),
                    "memory_limit": memory_limit,
                    "local_directory": local_directory,
                },
            ),
            **kwargs,
        )
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield nannies
        yield [n.finished() for n in nannies]

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
def main(host, port, http_port, bokeh_port, bokeh_internal_port, show, _bokeh,
         bokeh_whitelist, prefix, use_xheaders, pid_file, scheduler_file,
         interface):
    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)
        atexit.register(del_pid_file)

    if sys.platform.startswith('linux'):
        import resource   # module fails importing on Windows
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    addr = uri_from_host_port(host, port, 8786)

    loop = IOLoop.current()
    logger.info('-' * 47)

    services = {('http', http_port): HTTPScheduler}
    if _bokeh:
        with ignoring(ImportError):
            from distributed.bokeh.scheduler import BokehScheduler
            services[('bokeh', bokeh_internal_port)] = BokehScheduler
    scheduler = Scheduler(loop=loop, services=services,
                          scheduler_file=scheduler_file)
    scheduler.start(addr)

    bokeh_proc = None
    if _bokeh:
        if bokeh_port == 0:          # This is a hack and not robust
            bokeh_port = open_port()  # This port may be taken by the OS
        try:                          # before we successfully pass it to Bokeh
            from distributed.bokeh.application import BokehWebInterface
            bokeh_proc = BokehWebInterface(http_port=http_port,
                                           scheduler_address=scheduler.address,
                                           bokeh_port=bokeh_port,
                                           bokeh_whitelist=bokeh_whitelist,
                                           show=show, prefix=prefix,
                                           use_xheaders=use_xheaders,
                                           quiet=False)
        except ImportError:
            logger.info("Please install Bokeh to get Web UI")
        except Exception as e:
            logger.warn("Could not start Bokeh web UI", exc_info=True)

    logger.info('-' * 47)
    try:
        loop.start()
        loop.close()
    finally:
        scheduler.stop()
        if bokeh_proc:
            bokeh_proc.close()

        logger.info("End scheduler at %r", addr)
def __init__(self,
             scheduler=None,
             name=None,
             cores=None,
             memory=None,
             processes=None,
             nanny=True,
             interface=None,
             death_timeout=None,
             local_directory=None,
             extra=None,
             env_extra=None,
             header_skip=None,
             log_directory=None,
             shebang=None,
             python=sys.executable,
             job_name=None,
             config_name=None,
             **kwargs):
    self.scheduler = scheduler
    self.job_id = None

    super().__init__()

    default_config_name = self.default_config_name()
    if config_name is None:
        config_name = default_config_name
    self.config_name = config_name

    if cores is None:
        cores = dask.config.get("jobqueue.%s.cores" % self.config_name)
    if memory is None:
        memory = dask.config.get("jobqueue.%s.memory" % self.config_name)

    if cores is None or memory is None:
        job_class_name = self.__class__.__name__
        cluster_class_name = job_class_name.replace("Job", "Cluster")
        raise ValueError(
            "You must specify how much cores and memory per job you want to use, for example:\n"
            "cluster = {}(cores={}, memory={!r})".format(
                cluster_class_name, cores or 8, memory or "24GB"
            )
        )

    if job_name is None:
        job_name = dask.config.get("jobqueue.%s.name" % self.config_name)
    if processes is None:
        processes = dask.config.get("jobqueue.%s.processes" % self.config_name)
    if processes is None:
        processes, _ = nprocesses_nthreads(cores)
    if interface is None:
        interface = dask.config.get("jobqueue.%s.interface" % self.config_name)
    if death_timeout is None:
        death_timeout = dask.config.get("jobqueue.%s.death-timeout" % self.config_name)
    if local_directory is None:
        local_directory = dask.config.get("jobqueue.%s.local-directory" % self.config_name)
    if extra is None:
        extra = dask.config.get("jobqueue.%s.extra" % self.config_name)
    if env_extra is None:
        env_extra = dask.config.get("jobqueue.%s.env-extra" % self.config_name)
    if header_skip is None:
        header_skip = dask.config.get("jobqueue.%s.header-skip" % self.config_name, ())
    if log_directory is None:
        log_directory = dask.config.get("jobqueue.%s.log-directory" % self.config_name)
    if shebang is None:
        shebang = dask.config.get("jobqueue.%s.shebang" % self.config_name)

    # This attribute should be set in the derived class
    self.job_header = None

    if interface:
        extra = extra + ["--interface", interface]
        kwargs.setdefault("host", get_ip_interface(interface))
    else:
        kwargs.setdefault("host", "")

    # Keep information on process, cores, and memory, for use in subclasses
    self.worker_memory = parse_bytes(memory) if memory is not None else None
    self.worker_processes = processes
    self.worker_cores = cores
    self.name = name
    self.job_name = job_name

    self.shebang = shebang

    self._env_header = "\n".join(filter(None, env_extra))
    self.header_skip = set(header_skip)

    # dask-worker command line build
    dask_worker_command = "%(python)s -m distributed.cli.dask_worker" % dict(
        python=python
    )
    command_args = [dask_worker_command, self.scheduler]
    command_args += ["--nthreads", self.worker_process_threads]
    if processes is not None and processes > 1:
        command_args += ["--nprocs", processes]
    command_args += ["--memory-limit", self.worker_process_memory]
    command_args += ["--name", str(name)]
    command_args += ["--nanny" if nanny else "--no-nanny"]

    if death_timeout is not None:
        command_args += ["--death-timeout", death_timeout]
    if local_directory is not None:
        command_args += ["--local-directory", local_directory]
    if extra is not None:
        command_args += extra

    self._command_template = " ".join(map(str, command_args))

    self.log_directory = log_directory
    if self.log_directory is not None:
        if not os.path.exists(self.log_directory):
            os.makedirs(self.log_directory)
def __init__(
    self,
    scheduler=None,
    name=None,
    cores=None,
    memory=None,
    processes=None,
    nanny=True,
    interface=None,
    death_timeout=None,
    local_directory=None,
    extra=None,
    env_extra=None,
    header_skip=None,
    log_directory=None,
    shebang=None,
    python=sys.executable,
    job_name=None,
    config_name=None,
    **kwargs
):
    self.scheduler = scheduler
    self.job_id = None

    super().__init__()

    if config_name is None:
        config_name = getattr(type(self), "config_name")
    if config_name is None:
        raise ValueError(
            "Looks like you are trying to create a class that inherits from dask_jobqueue.core.Job. "
            "If that is the case, you need to:\n"
            "- set the 'config_name' class variable to a non-None value\n"
            "- create a section in jobqueue.yaml with the value of 'config_name'\n"
            "If that is not the case, please open an issue in https://github.com/dask/dask-jobqueue/issues."
        )

    if job_name is None:
        job_name = dask.config.get("jobqueue.%s.name" % config_name)
    if cores is None:
        cores = dask.config.get("jobqueue.%s.cores" % config_name)
    if memory is None:
        memory = dask.config.get("jobqueue.%s.memory" % config_name)
    if processes is None:
        processes = dask.config.get("jobqueue.%s.processes" % config_name)
    if interface is None:
        interface = dask.config.get("jobqueue.%s.interface" % config_name)
    if death_timeout is None:
        death_timeout = dask.config.get("jobqueue.%s.death-timeout" % config_name)
    if local_directory is None:
        local_directory = dask.config.get(
            "jobqueue.%s.local-directory" % config_name
        )
    if extra is None:
        extra = dask.config.get("jobqueue.%s.extra" % config_name)
    if env_extra is None:
        env_extra = dask.config.get("jobqueue.%s.env-extra" % config_name)
    if header_skip is None:
        header_skip = dask.config.get("jobqueue.%s.header-skip" % config_name, ())
    if log_directory is None:
        log_directory = dask.config.get("jobqueue.%s.log-directory" % config_name)
    if shebang is None:
        shebang = dask.config.get("jobqueue.%s.shebang" % config_name)

    if cores is None or memory is None:
        raise ValueError(
            "You must specify how much cores and memory per job you want to use, for example:\n"
            "cluster = {}(cores={}, memory={!r})".format(
                self.__class__.__name__, cores or 8, memory or "24GB"
            )
        )

    # This attribute should be overridden
    self.job_header = None

    if interface:
        extra = extra + ["--interface", interface]
        kwargs.setdefault("host", get_ip_interface(interface))
    else:
        kwargs.setdefault("host", "")

    # Keep information on process, cores, and memory, for use in subclasses
    self.worker_memory = parse_bytes(memory) if memory is not None else None
    self.worker_processes = processes
    self.worker_cores = cores
    self.name = name
    self.job_name = job_name

    self.shebang = shebang

    self._env_header = "\n".join(filter(None, env_extra))
    self.header_skip = set(header_skip)

    # dask-worker command line build
    dask_worker_command = "%(python)s -m distributed.cli.dask_worker" % dict(
        python=python
    )
    command_args = [dask_worker_command, self.scheduler]
    command_args += ["--nthreads", self.worker_process_threads]
    if processes is not None and processes > 1:
        command_args += ["--nprocs", processes]
    command_args += ["--memory-limit", self.worker_process_memory]
    command_args += ["--name", str(name)]
    command_args += ["--nanny" if nanny else "--no-nanny"]

    if death_timeout is not None:
        command_args += ["--death-timeout", death_timeout]
    if local_directory is not None:
        command_args += ["--local-directory", local_directory]
    if extra is not None:
        command_args += extra

    self._command_template = " ".join(map(str, command_args))

    self.log_directory = log_directory
    if self.log_directory is not None:
        if not os.path.exists(self.log_directory):
            os.makedirs(self.log_directory)
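# For illustration, with hypothetical arguments cores=8, processes=2,
# memory="24GB", name="dask-worker", nanny=True and a scheduler at
# tcp://scheduler:8786, the _command_template assembled above would read
# roughly as follows (a sketch, not captured from a real run; the exact
# memory string depends on format_bytes):
#
#     /usr/bin/python -m distributed.cli.dask_worker tcp://scheduler:8786 \
#         --nthreads 4 --nprocs 2 --memory-limit 12.00GB \
#         --name dask-worker --nanny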
def main(scheduler, host, worker_port, http_port, nanny_port, nthreads, nprocs,
         nanny, name, memory_limit, pid_file, temp_filename, reconnect,
         resources, bokeh, bokeh_port, local_directory, scheduler_file,
         interface, death_timeout, preload):
    if nanny:
        port = nanny_port
    else:
        port = worker_port

    if nprocs > 1 and worker_port != 0:
        logger.error("Failed to launch worker. You cannot use the --port "
                     "argument when nprocs > 1.")
        exit(1)

    if nprocs > 1 and name:
        logger.error("Failed to launch worker. You cannot use the --name "
                     "argument when nprocs > 1.")
        exit(1)

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)
        atexit.register(del_pid_file)

    services = {('http', http_port): HTTPWorker}

    if bokeh:
        try:
            from distributed.bokeh.worker import BokehWorker
        except ImportError:
            pass
        else:
            services[('bokeh', bokeh_port)] = BokehWorker

    if resources:
        resources = resources.replace(',', ' ').split()
        resources = dict(pair.split('=') for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs = {'worker_port': worker_port}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs['service_ports'] = {'nanny': nanny_port}
        t = Worker

    if scheduler_file:
        while not os.path.exists(scheduler_file):
            sleep(0.01)
        for i in range(10):
            try:
                with open(scheduler_file) as f:
                    cfg = json.load(f)
                scheduler = cfg['address']
                break
            except (ValueError, KeyError):  # race with scheduler on file
                sleep(0.01)

    if not scheduler:
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    nannies = [t(scheduler, ncores=nthreads, services=services, name=name,
                 loop=loop, resources=resources, memory_limit=memory_limit,
                 reconnect=reconnect, local_dir=local_directory,
                 death_timeout=death_timeout, preload=preload,
                 **kwargs)
               for i in range(nprocs)]

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    for n in nannies:
        if host:
            n.start((host, port))
        else:
            n.start(port)
        if t is Nanny:
            global_nannies.append(n)

    if temp_filename:
        @gen.coroutine
        def f():
            while nannies[0].status != 'running':
                yield gen.sleep(0.01)
            import json
            msg = {'port': nannies[0].port,
                   'local_directory': nannies[0].local_dir}
            with open(temp_filename, 'w') as f:
                json.dump(msg, f)
        loop.add_callback(f)

    @gen.coroutine
    def run():
        while all(n.status != 'closed' for n in nannies):
            yield gen.sleep(0.2)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
        loop.close()

    # Clean exit: unregister all workers from scheduler
    loop2 = IOLoop()

    @gen.coroutine
    def f():
        with rpc(nannies[0].scheduler.address) as scheduler:
            if nanny:
                yield gen.with_timeout(
                    timeout=timedelta(seconds=2),
                    future=All([scheduler.unregister(address=n.worker_address,
                                                     close=True)
                                for n in nannies
                                if n.process and n.worker_address]),
                    io_loop=loop2)

    loop2.run_sync(f)

    if nanny:
        for n in nannies:
            if isalive(n.process):
                n.process.terminate()

    if nanny:
        start = time()
        while (any(isalive(n.process) for n in nannies)
               and time() < start + 1):
            sleep(0.1)

    for nanny in nannies:
        nanny.stop()
def _test_dask_cuda_worker_ucx_net_devices(enable_rdmacm):
    loop = IOLoop.current()
    ucp = pytest.importorskip("ucp")

    cm_protocol = "rdmacm" if enable_rdmacm else "sockcm"
    net_devices = _get_dgx_net_devices()
    openfabrics_devices = [d.split(",")[0] for d in net_devices]

    sched_addr = "127.0.0.1"

    # Enable proper variables for scheduler
    sched_env = os.environ.copy()
    sched_env["DASK_UCX__INFINIBAND"] = "True"
    sched_env["DASK_UCX__TCP"] = "True"
    sched_env["DASK_UCX__CUDA_COPY"] = "True"
    sched_env["DASK_UCX__NET_DEVICES"] = openfabrics_devices[0]

    if enable_rdmacm:
        sched_env["DASK_UCX__RDMACM"] = "True"
        sched_addr = get_ip_interface("ib0")

    sched_url = "ucx://" + sched_addr + ":9379"

    # Enable proper variables for workers
    worker_ucx_opts = [
        "--enable-infiniband",
        "--net-devices",
        "auto",
    ]
    if enable_rdmacm:
        worker_ucx_opts.append("--enable-rdmacm")

    # Enable proper variables for client
    initialize(
        enable_tcp_over_ucx=True,
        enable_infiniband=True,
        enable_rdmacm=enable_rdmacm,
        net_devices=openfabrics_devices[0],
    )

    with subprocess.Popen(
        [
            "dask-scheduler",
            "--protocol",
            "ucx",
            "--host",
            sched_addr,
            "--port",
            "9379",
            "--no-dashboard",
        ],
        env=sched_env,
    ) as sched_proc:
        # Scheduler with UCX will take a few seconds to fully start
        sleep(5)

        with subprocess.Popen(
            ["dask-cuda-worker", sched_url, "--no-dashboard"] + worker_ucx_opts
        ) as worker_proc:
            with Client(sched_url, loop=loop) as client:

                def _timeout_callback():
                    # We must ensure processes are terminated to avoid hangs
                    # if a timeout occurs
                    worker_proc.kill()
                    sched_proc.kill()

                assert wait_workers(client, timeout_callback=_timeout_callback)

                workers_tls = client.run(lambda: ucp.get_config()["TLS"])
                workers_tls_priority = client.run(
                    lambda: ucp.get_config()["SOCKADDR_TLS_PRIORITY"]
                )
                for tls, tls_priority in zip(
                    workers_tls.values(), workers_tls_priority.values()
                ):
                    assert cm_protocol in tls
                    assert cm_protocol in tls_priority

                worker_net_devices = client.run(
                    lambda: ucp.get_config()["NET_DEVICES"]
                )
                cuda_visible_devices = client.run(
                    lambda: os.environ["CUDA_VISIBLE_DEVICES"]
                )

                for i, v in enumerate(
                    zip(worker_net_devices.values(), cuda_visible_devices.values())
                ):
                    net_dev = v[0]
                    dev_idx = int(v[1].split(",")[0])
                    assert net_dev == openfabrics_devices[dev_idx]

            # A dask-worker with UCX protocol will not close until some work
            # is dispatched, therefore we kill the worker and scheduler to
            # ensure timely closing.
            worker_proc.kill()
            sched_proc.kill()
def test_get_ip_interface():
    iface = "lo0" if MACOS else "lo"
    assert get_ip_interface(iface) == "127.0.0.1"

    with pytest.raises(ValueError, match=f"'__notexist'.+network interface.+'{iface}'"):
        get_ip_interface("__notexist")
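get_ip_interface can be approximated with psutil, which these tests already depend on elsewhere; resolve_ipv4 below is a hypothetical stand-in rather than the library implementation, shaped to raise the kind of ValueError the test matches:

import socket

import psutil

def resolve_ipv4(if_name):
    addrs = psutil.net_if_addrs()
    if if_name not in addrs:
        raise ValueError(
            "%r is not a valid network interface. "
            "Valid network interfaces are: %s" % (if_name, sorted(addrs))
        )
    for sockaddr in addrs[if_name]:
        if sockaddr.family == socket.AF_INET:
            return sockaddr.address
    raise ValueError("interface %r has no IPv4 address" % if_name)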
def main(scheduler, host, worker_port, listen_address, contact_address,
         nanny_port, nthreads, nprocs, nanny, name, memory_limit, pid_file,
         reconnect, resources, bokeh, bokeh_port, local_directory,
         scheduler_file, interface, death_timeout, preload, bokeh_prefix,
         tls_ca_file, tls_cert, tls_key):
    sec = Security(tls_ca_file=tls_ca_file,
                   tls_worker_cert=tls_cert,
                   tls_worker_key=tls_key)

    if nprocs > 1 and worker_port != 0:
        logger.error("Failed to launch worker. "
                     "You cannot use the --port argument when nprocs > 1.")
        exit(1)

    if nprocs > 1 and name:
        logger.error("Failed to launch worker. "
                     "You cannot use the --name argument when nprocs > 1.")
        exit(1)

    if nprocs > 1 and not nanny:
        logger.error("Failed to launch worker. "
                     "You cannot use the --no-nanny argument when nprocs > 1.")
        exit(1)

    if contact_address and not listen_address:
        logger.error("Failed to launch worker. "
                     "Must specify --listen-address when --contact-address is given")
        exit(1)

    if nprocs > 1 and listen_address:
        logger.error("Failed to launch worker. "
                     "You cannot specify --listen-address when nprocs > 1.")
        exit(1)

    if (worker_port or host) and listen_address:
        logger.error("Failed to launch worker. "
                     "You cannot specify --listen-address when "
                     "--worker-port or --host is given.")
        exit(1)

    try:
        if listen_address:
            (host, worker_port) = get_address_host_port(listen_address, strict=True)

        if contact_address:
            # we only need this to verify it is getting parsed
            (_, _) = get_address_host_port(contact_address, strict=True)
        else:
            # if contact address is not present we use the listen_address for contact
            contact_address = listen_address
    except ValueError as e:
        logger.error("Failed to launch worker. " + str(e))
        exit(1)

    if nanny:
        port = nanny_port
    else:
        port = worker_port

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)
        atexit.register(del_pid_file)

    services = {}

    if bokeh:
        try:
            from distributed.bokeh.worker import BokehWorker
        except ImportError:
            pass
        else:
            if bokeh_prefix:
                result = (BokehWorker, {'prefix': bokeh_prefix})
            else:
                result = BokehWorker
            services[('bokeh', bokeh_port)] = result

    if resources:
        resources = resources.replace(',', ' ').split()
        resources = dict(pair.split('=') for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs = {'worker_port': worker_port, 'listen_address': listen_address}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs['service_ports'] = {'nanny': nanny_port}
        t = Worker

    if not scheduler and not scheduler_file:
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if host or port:
        addr = uri_from_host_port(host, port, 0)
    else:
        # Choose appropriate address for scheduler
        addr = None

    nannies = [t(scheduler, scheduler_file=scheduler_file, ncores=nthreads,
                 services=services, name=name, loop=loop, resources=resources,
                 memory_limit=memory_limit, reconnect=reconnect,
                 local_dir=local_directory, death_timeout=death_timeout,
                 preload=preload, security=sec,
                 contact_address=contact_address, **kwargs)
               for i in range(nprocs)]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        if nanny:
            yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield [n.start(addr) for n in nannies]
        while all(n.status != 'closed' for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
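The run() coroutine above captures the whole worker lifecycle: start every nanny concurrently, then poll until one of them reports itself closed. A self-contained sketch of that Tornado pattern, with a hypothetical DummyService standing in for Nanny/Worker:

from tornado import gen
from tornado.ioloop import IOLoop

class DummyService:
    """Hypothetical stand-in for a Nanny/Worker with a status attribute."""

    def __init__(self, lifetime):
        self.status = "starting"
        self.lifetime = lifetime

    @gen.coroutine
    def start(self, addr=None):
        self.status = "running"
        # pretend the service shuts itself down after `lifetime` seconds
        IOLoop.current().call_later(self.lifetime, self._close)

    def _close(self):
        self.status = "closed"

@gen.coroutine
def run(services):
    yield [s.start() for s in services]   # start all services concurrently
    while all(s.status != "closed" for s in services):
        yield gen.sleep(0.2)              # poll; exit once any service closes

services = [DummyService(0.5), DummyService(1.0)]
IOLoop.current().run_sync(lambda: run(services))  # returns after ~0.5 s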
def main(
    scheduler,
    host,
    worker_port,
    listen_address,
    contact_address,
    nanny_port,
    nthreads,
    nprocs,
    nanny,
    name,
    memory_limit,
    pid_file,
    reconnect,
    resources,
    bokeh,
    bokeh_port,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    preload_argv,
    bokeh_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    dashboard_address,
):
    g0, g1, g2 = gc.get_threshold()  # https://github.com/dask/distributed/issues/1653
    gc.set_threshold(g0 * 3, g1 * 3, g2 * 3)

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if bokeh_port is not None:
        warnings.warn(
            "The --bokeh-port flag has been renamed to --dashboard-address. "
            "Consider adding ``--dashboard-address :%d`` " % bokeh_port
        )
        dashboard_address = bokeh_port

    sec = Security(
        tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key
    )

    if nprocs > 1 and worker_port != 0:
        logger.error(
            "Failed to launch worker. "
            "You cannot use the --port argument when nprocs > 1."
        )
        exit(1)

    if nprocs > 1 and not nanny:
        logger.error(
            "Failed to launch worker. "
            "You cannot use the --no-nanny argument when nprocs > 1."
        )
        exit(1)

    if contact_address and not listen_address:
        logger.error(
            "Failed to launch worker. "
            "Must specify --listen-address when --contact-address is given"
        )
        exit(1)

    if nprocs > 1 and listen_address:
        logger.error(
            "Failed to launch worker. "
            "You cannot specify --listen-address when nprocs > 1."
        )
        exit(1)

    if (worker_port or host) and listen_address:
        logger.error(
            "Failed to launch worker. "
            "You cannot specify --listen-address when --worker-port or --host is given."
        )
        exit(1)

    try:
        if listen_address:
            (host, worker_port) = get_address_host_port(listen_address, strict=True)

        if contact_address:
            # we only need this to verify it is getting parsed
            (_, _) = get_address_host_port(contact_address, strict=True)
        else:
            # if contact address is not present we use the listen_address for contact
            contact_address = listen_address
    except ValueError as e:
        logger.error("Failed to launch worker. " + str(e))
        exit(1)

    if nanny:
        port = nanny_port
    else:
        port = worker_port

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if bokeh:
        try:
            from distributed.bokeh.worker import BokehWorker
        except ImportError:
            pass
        else:
            if bokeh_prefix:
                result = (BokehWorker, {"prefix": bokeh_prefix})
            else:
                result = BokehWorker
            services[("bokeh", dashboard_address)] = result

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs = {"worker_port": worker_port, "listen_address": listen_address}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs["service_ports"] = {"nanny": nanny_port}
        t = Worker

    if (
        not scheduler
        and not scheduler_file
        and dask.config.get("scheduler-address", None) is None
    ):
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if host or port:
        addr = uri_from_host_port(host, port, 0)
    else:
        # Choose appropriate address for scheduler
        addr = None

    if death_timeout is not None:
        death_timeout = parse_timedelta(death_timeout, "s")

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            ncores=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            reconnect=reconnect,
            local_dir=local_directory,
            death_timeout=death_timeout,
            preload=preload,
            preload_argv=preload_argv,
            security=sec,
            contact_address=contact_address,
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            **kwargs
        )
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        if nanny:
            yield [n.close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield [n._start(addr) for n in nannies]
        while all(n.status != "closed" for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")