def worker(argv=None):
    """Command-line entry point that runs a single gateway worker.

    Parses ``argv`` with ``worker_parser``, builds the gateway client and
    security objects, starts the worker on the current IOLoop and then
    blocks until the worker reports the ``"closed"`` status.

    Parameters
    ----------
    argv : list of str, optional
        Arguments handed to ``worker_parser.parse_args``.
    """
    opts = worker_parser.parse_args(argv)
    # The environment variable is only consulted when no name is given.
    name = opts.name or getenv("DASK_GATEWAY_WORKER_NAME")
    thread_count = opts.nthreads
    mem_limit = opts.memory_limit

    gateway = make_gateway_client()
    security = make_security()

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    io_loop = IOLoop.current()

    async def serve():
        started = await start_worker(
            gateway, security, name, thread_count, mem_limit
        )
        # Poll the status every 0.2 seconds until the worker has closed.
        while True:
            if started.status == "closed":
                return
            await gen.sleep(0.2)

    try:
        io_loop.run_sync(serve)
    except (KeyboardInterrupt, TimeoutError):
        # Interrupts and loop-stop timeouts are treated as a normal exit.
        pass
def worker(nthreads=None, memory_limit=None):
    """Start a dask ``Nanny`` inside the current skein container.

    Parameters
    ----------
    nthreads : int, optional
        Passed to the nanny as ``ncores``. Defaults to the container's
        ``vcores`` as reported by ``skein.properties``.
    memory_limit : int, optional
        Memory limit handed to the nanny. Defaults to the container's
        ``memory`` resource multiplied by ``2**20``
        (presumably MiB -> bytes; confirm against the skein docs).
    """
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if memory_limit is None:
        memory_limit = int(skein.properties.container_resources.memory * 2**20)
    if nthreads is None:
        nthreads = skein.properties.container_resources.vcores

    app_client = skein.ApplicationClient.from_current()
    # Blocks until the scheduler has published its address in the KV store.
    scheduler = app_client.kv.wait('dask.scheduler').decode()

    loop = IOLoop.current()

    worker = Nanny(scheduler, ncores=nthreads, loop=loop,
                   memory_limit=memory_limit, worker_port=0)

    @gen.coroutine
    def close(signalnum):
        # Yield the close future so this coroutine only completes once the
        # nanny has actually closed (or its 2 second timeout has expired).
        # Without the yield the close was started but never waited for.
        yield worker._close(timeout=2)

    install_signal_handlers(loop, cleanup=close)

    @gen.coroutine
    def run():
        yield worker._start(None)
        # Poll until the nanny reports that it has closed.
        while worker.status != 'closed':
            yield gen.sleep(0.2)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        # Interrupts and loop-stop timeouts are treated as a normal exit.
        pass
def worker(nthreads=None, memory_limit=None):  # pragma: nocover
    """Start a dask ``Nanny`` inside the current skein container.

    Parameters
    ----------
    nthreads : int, optional
        Number of threads for the nanny. Defaults to the container's
        ``vcores`` as reported by ``skein.properties``.
    memory_limit : int, optional
        Memory limit handed to the nanny. Defaults to the container's
        ``memory`` resource multiplied by ``2**20``
        (presumably MiB -> bytes; confirm against the skein docs).
    """
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if memory_limit is None:
        memory_limit = int(skein.properties.container_resources.memory * 2**20)
    if nthreads is None:
        nthreads = skein.properties.container_resources.vcores

    app_client = skein.ApplicationClient.from_current()
    # Blocks until the scheduler has published its address in the KV store.
    scheduler = app_client.kv.wait('dask.scheduler').decode()

    loop = IOLoop.current()

    worker = Nanny(scheduler, loop=loop, memory_limit=memory_limit,
                   worker_port=0, nthreads=nthreads)

    async def cleanup(signum=None):
        # The signal-handler cleanup callback is expected to be invoked with
        # the signal number; a zero-argument callable would raise TypeError
        # when called that way and the nanny would never be closed.
        # ``signum`` is optional so a bare ``cleanup()`` call keeps working.
        await worker.close(timeout=2)

    install_signal_handlers(loop, cleanup=cleanup)

    async def run():
        await worker
        await worker.finished()

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        # Interrupts and loop-stop timeouts are treated as a normal exit.
        pass
def scheduler(argv=None):
    """Command-line entry point that runs the gateway scheduler.

    Parses ``argv`` with ``scheduler_parser``, installs signal handlers on
    the current IOLoop, raises the open-file soft limit on Linux, then
    starts the scheduler and blocks until it has finished.

    Parameters
    ----------
    argv : list of str, optional
        Arguments handed to ``scheduler_parser.parse_args``.
    """
    opts = scheduler_parser.parse_args(argv)

    gateway = make_gateway_client()
    security = make_security()

    io_loop = IOLoop.current()
    install_signal_handlers(io_loop)

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if sys.platform.startswith("linux"):
        import resource  # module fails importing on Windows

        # Lift the soft limit to at least half of the hard limit.
        soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
        raised = max(soft_limit, hard_limit // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (raised, hard_limit))

    async def serve():
        running = await start_scheduler(
            gateway, security, adaptive_period=opts.adaptive_period
        )
        await running.finished()

    try:
        io_loop.run_sync(serve)
    except (KeyboardInterrupt, TimeoutError):
        # Interrupts and loop-stop timeouts are treated as a normal exit.
        pass
def worker(argv=None):
    """Command-line entry point that runs a single gateway worker.

    Parses ``argv`` with ``worker_parser``, builds the gateway client and
    security objects, starts the worker on the current IOLoop and blocks
    until the worker has finished.

    Parameters
    ----------
    argv : list of str, optional
        Arguments handed to ``worker_parser.parse_args``.
    """
    opts = worker_parser.parse_args(argv)
    # The environment variable is only consulted when no name is given.
    name = opts.name or getenv("DASK_GATEWAY_WORKER_NAME")
    thread_count = opts.nthreads
    mem_limit = opts.memory_limit
    sched_address = opts.scheduler_address
    use_nanny = opts.nanny
    dash_address = opts.dashboard_address

    gateway = make_gateway_client()
    security = make_security()

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    io_loop = IOLoop.current()

    async def serve():
        started = await start_worker(
            gateway,
            security,
            name,
            thread_count,
            mem_limit,
            sched_address,
            dashboard_address=dash_address,
            nanny=use_nanny,
        )
        await started.finished()

    try:
        io_loop.run_sync(serve)
    except (KeyboardInterrupt, TimeoutError):
        # Interrupts and loop-stop timeouts are treated as a normal exit.
        pass
def scheduler():  # pragma: nocover
    """Run a dask scheduler inside the current skein application.

    Starts a ``Scheduler`` on ``tcp://`` (with the bokeh dashboard service
    when it can be imported), publishes the dashboard and scheduler
    addresses in the application's key-value store and then runs the
    IOLoop until it stops.
    """
    app_client = skein.ApplicationClient.from_current()

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if sys.platform.startswith('linux'):
        import resource  # module fails importing on Windows

        # Lift the soft limit to at least half of the hard limit.
        soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
        raised = max(soft_limit, hard_limit // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (raised, hard_limit))

    listen_addr = 'tcp://'
    io_loop = IOLoop.current()

    services = {}
    has_dashboard = False
    # If neither import location works the dashboard is simply left out.
    with ignoring(ImportError):
        try:
            from distributed.dashboard.scheduler import BokehScheduler
        except ImportError:
            # Old import location
            from distributed.bokeh.scheduler import BokehScheduler
        services[('bokeh', 0)] = (BokehScheduler, {})
        has_dashboard = True

    sched = Scheduler(loop=io_loop, services=services)
    sched.start(listen_addr)

    install_signal_handlers(io_loop)

    # Set dask.dashboard before dask.scheduler since the YarnCluster object
    # waits on dask.scheduler only
    if has_dashboard:
        dash_port = sched.services['bokeh'].port
        dash_host = urlparse(sched.address).hostname
        dash_url = 'http://%s:%d' % (dash_host, dash_port)
        app_client.kv['dask.dashboard'] = dash_url.encode()

    app_client.kv['dask.scheduler'] = sched.address.encode()

    try:
        io_loop.start()
        io_loop.close()
    finally:
        sched.stop()
def start_worker(nthreads=None, memory_limit=None):
    """Start a dask worker (or nanny) inside the current skein container.

    Parameters
    ----------
    nthreads : int, optional
        Passed to the worker as ``ncores``. Defaults to the container's
        ``vcores`` as reported by ``skein.properties``.
    memory_limit : int, optional
        Memory limit handed to the worker. Defaults to the container's
        ``memory`` resource multiplied by ``2**20``
        (presumably MiB -> bytes; confirm against the skein docs).
    """
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if memory_limit is None:
        # Scale by 2**20 rather than 1e6: a decimal factor under-reports the
        # container size when the resource is expressed in binary megabytes.
        memory_limit = int(skein.properties.container_resources.memory * 2**20)
    if nthreads is None:
        nthreads = skein.properties.container_resources.vcores

    app_client = skein.ApplicationClient.from_current()
    # Blocks until the scheduler has published its address in the KV store.
    scheduler = app_client.kv.wait('dask.scheduler').decode()

    loop = IOLoop.current()

    # Until the config patch is merged, we can't use the nanny process since
    # there's no way to monkey patch config inside the forkserver process
    if hasattr(dask.config, 'PATH'):
        worker = Nanny(scheduler, ncores=nthreads, loop=loop,
                       memory_limit=memory_limit, worker_port=0)

        @gen.coroutine
        def close(signalnum):
            # Yield the close future so this coroutine only completes once
            # the nanny has closed (or its 2 second timeout has expired).
            yield worker._close(timeout=2)

        install_signal_handlers(loop, cleanup=close)
    else:
        worker = Worker(scheduler, ncores=nthreads, loop=loop,
                        memory_limit=memory_limit)

    @gen.coroutine
    def run():
        yield worker._start(None)
        # Poll until the worker reports that it has closed.
        while worker.status != 'closed':
            yield gen.sleep(0.2)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        # Interrupts and loop-stop timeouts are treated as a normal exit.
        pass
def main():
    """Run a dask scheduler inside the current skein application.

    Starts a ``Scheduler`` (with the bokeh dashboard service when it can be
    imported), publishes the dashboard and scheduler addresses in the
    application's key-value store and then runs the IOLoop until it stops.
    """
    app_client = skein.ApplicationClient.from_current()

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if sys.platform.startswith('linux'):
        import resource  # module fails importing on Windows

        # Lift the soft limit to at least half of the hard limit.
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    addr = uri_from_host_port('', None, 0)

    loop = IOLoop.current()

    services = {}
    bokeh = False
    # When bokeh cannot be imported the dashboard is simply left out.
    with ignoring(ImportError):
        from distributed.bokeh.scheduler import BokehScheduler
        services[('bokeh', 0)] = (BokehScheduler, {})
        bokeh = True

    scheduler = Scheduler(loop=loop, services=services)
    scheduler.start(addr)

    install_signal_handlers(loop)

    # Publish dask.dashboard *before* dask.scheduler: a client that blocks
    # only on the dask.scheduler key must be able to read the dashboard
    # address as soon as that wait returns.
    if bokeh:
        bokeh_port = scheduler.services['bokeh'].port
        bokeh_host = urlparse(scheduler.address).hostname
        bokeh_address = 'http://%s:%d' % (bokeh_host, bokeh_port)
        app_client.kv['dask.dashboard'] = bokeh_address.encode()

    app_client.kv['dask.scheduler'] = scheduler.address.encode()

    try:
        loop.start()
        loop.close()
    finally:
        scheduler.stop()
def scheduler(argv=None):
    """Command-line entry point that runs the gateway scheduler.

    Validates ``argv`` with ``scheduler_parser`` (the parsed result is not
    used), installs signal handlers, raises the open-file soft limit on
    Linux, schedules ``start_scheduler`` on the current IOLoop and starts
    the loop.

    Parameters
    ----------
    argv : list of str, optional
        Arguments handed to ``scheduler_parser.parse_args``.
    """
    scheduler_parser.parse_args(argv)

    gateway = make_gateway_client()
    security = make_security()

    io_loop = IOLoop.current()
    install_signal_handlers(io_loop)

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    on_linux = sys.platform.startswith("linux")
    if on_linux:
        import resource  # module fails importing on Windows

        # Lift the soft limit to at least half of the hard limit.
        soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
        raised = max(soft_limit, hard_limit // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (raised, hard_limit))

    # The scheduler is started from within the loop once it is running.
    io_loop.add_callback(start_scheduler, gateway, security)
    io_loop.start()
def scheduler():  # pragma: nocover
    """Run a dask scheduler inside the current skein application.

    Starts a ``Scheduler`` with a dashboard on an automatically chosen
    port, publishes the dashboard and scheduler addresses in the
    application's key-value store from an executor thread, and blocks
    until the scheduler has finished.
    """
    app_client = skein.ApplicationClient.from_current()

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if sys.platform.startswith("linux"):
        import resource  # module fails importing on Windows

        # Lift the soft limit to at least half of the hard limit.
        soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
        raised = max(soft_limit, hard_limit // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (raised, hard_limit))

    io_loop = IOLoop.current()
    sched = Scheduler(loop=io_loop, dashboard_address=("", 0))
    install_signal_handlers(io_loop)

    def publish_addresses():
        # Set dask.dashboard before dask.scheduler since the YarnCluster object
        # waits on dask.scheduler only
        if "dashboard" in sched.services:
            dash_port = sched.services["dashboard"].port
            dash_host = urlparse(sched.address).hostname
            dash_url = "http://%s:%d" % (dash_host, dash_port)
            app_client.kv["dask.dashboard"] = dash_url.encode()
        app_client.kv["dask.scheduler"] = sched.address.encode()

    async def serve():
        await sched
        # The key-value writes run in the default executor, off the loop.
        await io_loop.run_in_executor(None, publish_addresses)
        await sched.finished()

    try:
        io_loop.run_sync(serve)
    except (KeyboardInterrupt, TimeoutError):
        # Interrupts and loop-stop timeouts are treated as a normal exit.
        pass
    finally:
        sched.stop()
def main(
    host,
    port,
    bokeh_port,
    show,
    _bokeh,
    bokeh_whitelist,
    bokeh_prefix,
    use_xheaders,
    pid_file,
    scheduler_file,
    interface,
    local_directory,
    preload,
    preload_argv,
    tls_ca_file,
    tls_cert,
    tls_key,
    dashboard_address,
):
    """Command-line entry point that runs a dask scheduler.

    Sets up TLS security, an optional pid file, a local directory (a
    temporary one when none is given), the open-file limit on Linux and
    the optional bokeh dashboard service; then starts the scheduler, runs
    the preload modules and runs the IOLoop until it stops.

    ``show``, ``bokeh_whitelist`` and ``use_xheaders`` are accepted but not
    used by this function.

    Raises
    ------
    ValueError
        If both ``interface`` and ``host`` are given.
    """
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    # --bokeh-port is the deprecated spelling of --dashboard-address.
    if bokeh_port is not None:
        warnings.warn(
            "The --bokeh-port flag has been renamed to --dashboard-address. "
            "Consider adding ``--dashboard-address :%d`` " % bokeh_port
        )
        dashboard_address = bokeh_port

    security = Security(
        tls_ca_file=tls_ca_file,
        tls_scheduler_cert=tls_cert,
        tls_scheduler_key=tls_key,
    )

    # Any TLS setting without an explicit host selects the tls:// protocol.
    if not host and (tls_ca_file or tls_cert or tls_key):
        host = "tls://"

    if pid_file:
        with open(pid_file, "w") as handle:
            handle.write(str(os.getpid()))

        def _remove_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(_remove_pid_file)

    # Track whether this process created the directory so that only a
    # directory we made ourselves is removed on shutdown.
    created_local_dir = False
    if not local_directory:
        local_directory = tempfile.mkdtemp(prefix="scheduler-")
        created_local_dir = True
    elif not os.path.exists(local_directory):
        os.mkdir(local_directory)
        created_local_dir = True

    if local_directory not in sys.path:
        sys.path.insert(0, local_directory)

    if sys.platform.startswith("linux"):
        import resource  # module fails importing on Windows

        # Lift the soft limit to at least half of the hard limit.
        soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
        raised = max(soft_limit, hard_limit // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (raised, hard_limit))

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        host = get_ip_interface(interface)

    addr = uri_from_host_port(host, port, 8786)

    io_loop = IOLoop.current()
    logger.info("-" * 47)

    services = {}
    if _bokeh:
        try:
            from distributed.bokeh.scheduler import BokehScheduler

            service_key = ("bokeh", dashboard_address)
            services[service_key] = (BokehScheduler, {"prefix": bokeh_prefix})
        except ImportError as error:
            message = str(error)
            if message.startswith("No module named"):
                logger.info("Web dashboard not loaded. Unable to import bokeh")
            else:
                logger.info("Unable to import bokeh: %s" % message)

    sched = Scheduler(
        loop=io_loop,
        services=services,
        scheduler_file=scheduler_file,
        security=security,
    )
    sched.start(addr)

    # Fall back to the configured preload settings when none were given.
    if not preload:
        preload = dask.config.get("distributed.scheduler.preload")
    if not preload_argv:
        preload_argv = dask.config.get("distributed.scheduler.preload-argv")
    preload_modules(
        preload, parameter=sched, file_dir=local_directory, argv=preload_argv
    )

    logger.info("Local Directory: %26s", local_directory)
    logger.info("-" * 47)

    install_signal_handlers(io_loop)

    try:
        io_loop.start()
        io_loop.close()
    finally:
        sched.stop()
        if created_local_dir:
            shutil.rmtree(local_directory)

        logger.info("End scheduler at %r", addr)
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    device_memory_limit,
    rmm_pool_size,
    pid_file,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    dashboard_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    enable_tcp_over_ucx,
    enable_infiniband,
    enable_nvlink,
    enable_rdmacm,
    net_devices,
    **kwargs,
):
    """Command-line entry point that starts one ``Nanny`` per visible GPU.

    The number of processes is the number of entries in the
    ``CUDA_VISIBLE_DEVICES`` environment variable, or ``get_n_gpus()`` when
    that variable is unset. Each nanny gets its own ``CUDA_VISIBLE_DEVICES``
    ordering, CPU affinity and RMM pool plugins, UCX configuration and a
    ``DeviceHostFile`` data store. The function blocks until every nanny
    has finished.

    Of ``**kwargs`` only ``preload_argv`` is read; the mapping is then
    replaced by fixed nanny options. ``death_timeout`` is accepted but not
    used here.
    NOTE(review): ``host`` resolved from ``interface`` is not passed on to
    the nannies in this function -- confirm whether that is intended.

    Raises
    ------
    ValueError
        If no scheduler address is available, if both ``interface`` and
        ``host`` are given, or if an RMM pool is requested but ``rmm``
        cannot be imported.
    """
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    # TLS is only enabled when all three settings are provided.
    if tls_ca_file and tls_cert and tls_key:
        sec = Security(
            tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key
        )
    else:
        sec = None

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        # The previous ``min(1, cpu_count // nprocs)`` could only ever yield
        # 1 or 0, and 0 (more GPUs than CPU cores) produced workers with no
        # threads at all. Default to one thread per worker instead.
        nthreads = 1

    memory_limit = parse_memory_limit(memory_limit, nthreads, total_cores=nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            # Without bokeh the worker simply runs without a dashboard.
            pass
        else:
            if dashboard_prefix:
                result = (BokehWorker, {"prefix": dashboard_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    # "a=1,b=2" or "a=1 b=2" -> {"a": 1.0, "b": 2.0}
    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    preload_argv = kwargs.get("preload_argv", [])
    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if rmm_pool_size is not None:
        try:
            import rmm  # noqa F401
        except ImportError:
            raise ValueError(
                "RMM pool requested but module 'rmm' is not available. "
                "For installation instructions, please see "
                "https://github.com/rapidsai/rmm"
            )  # pragma: no cover
        rmm_pool_size = parse_bytes(rmm_pool_size)

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            interface=get_ucx_net_devices(
                cuda_device_index=i,
                ucx_net_devices=net_devices,
                get_openfabrics=False,
                get_network=True,
            ),
            preload=(list(preload) or []) + ["dask_cuda.initialize"],
            preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"],
            security=sec,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            plugins={CPUAffinity(get_cpu_affinity(i)), RMMPool(rmm_pool_size)},
            # Only suffix the name with the index when there are several
            # processes and a name was actually given.
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            local_directory=local_directory,
            config={
                "ucx": get_ucx_config(
                    enable_tcp_over_ucx=enable_tcp_over_ucx,
                    enable_infiniband=enable_infiniband,
                    enable_nvlink=enable_nvlink,
                    enable_rdmacm=enable_rdmacm,
                    net_devices=net_devices,
                    cuda_device_index=i,
                )
            },
            data=(
                DeviceHostFile,
                {
                    # "auto" or 0 selects the device's total memory.
                    "device_memory_limit": get_device_total_memory(index=i)
                    if (device_memory_limit == "auto" or device_memory_limit == int(0))
                    else parse_bytes(device_memory_limit),
                    "memory_limit": memory_limit,
                    "local_directory": local_directory,
                },
            ),
            **kwargs,
        )
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield nannies
        yield [n.finished() for n in nannies]

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        # Interrupts and loop-stop timeouts are treated as a normal exit.
        pass
    finally:
        logger.info("End worker")
def main(
    host,
    port,
    bokeh_port,
    show,
    dashboard,
    bokeh,
    dashboard_prefix,
    use_xheaders,
    pid_file,
    local_directory,
    tls_ca_file,
    tls_cert,
    tls_key,
    dashboard_address,
    **kwargs
):
    """Command-line entry point that runs a dask scheduler.

    Triples the garbage-collection thresholds, maps the deprecated bokeh
    flags onto their dashboard equivalents, sets up TLS security, an
    optional pid file, a local directory (a temporary one when none is
    given) and the open-file limit on Linux, then runs the scheduler until
    it has finished. Extra ``**kwargs`` are forwarded to ``Scheduler``.

    ``show`` and ``use_xheaders`` are accepted but not used here.
    """
    # https://github.com/dask/distributed/issues/1653
    gc.set_threshold(*(3 * threshold for threshold in gc.get_threshold()))

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    # --bokeh-port is the deprecated spelling of --dashboard-address.
    if bokeh_port is not None:
        warnings.warn(
            "The --bokeh-port flag has been renamed to --dashboard-address. "
            "Consider adding ``--dashboard-address :%d`` " % bokeh_port
        )
        dashboard_address = bokeh_port
    # --bokeh/--no-bokeh is the deprecated spelling of --dashboard.
    if bokeh is not None:
        warnings.warn(
            "The --bokeh/--no-bokeh flag has been renamed to --dashboard/--no-dashboard. "
        )
        dashboard = bokeh

    # Use the default port unless the host string already carries one.
    if port is None and not (host and re.search(r":\d", host)):
        port = 8786

    # Only forward the TLS settings that were actually supplied.
    tls_settings = {}
    for key, value in (
        ("tls_ca_file", tls_ca_file),
        ("tls_scheduler_cert", tls_cert),
        ("tls_scheduler_key", tls_key),
    ):
        if value is not None:
            tls_settings[key] = value
    security = Security(**tls_settings)

    if not host and (tls_ca_file or tls_cert or tls_key):
        host = "tls://"

    if pid_file:
        with open(pid_file, "w") as handle:
            handle.write(str(os.getpid()))

        def _remove_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(_remove_pid_file)

    # Track whether this process created the directory so that only a
    # directory we made ourselves is removed on shutdown.
    created_local_dir = False
    if not local_directory:
        local_directory = tempfile.mkdtemp(prefix="scheduler-")
        created_local_dir = True
    elif not os.path.exists(local_directory):
        os.mkdir(local_directory)
        created_local_dir = True

    if local_directory not in sys.path:
        sys.path.insert(0, local_directory)

    if sys.platform.startswith("linux"):
        import resource  # module fails importing on Windows

        # Lift the soft limit to at least half of the hard limit.
        soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
        raised = max(soft_limit, hard_limit // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (raised, hard_limit))

    io_loop = IOLoop.current()
    logger.info("-" * 47)

    sched = Scheduler(
        loop=io_loop,
        security=security,
        host=host,
        port=port,
        dashboard_address=dashboard_address if dashboard else None,
        service_kwargs={"dashboard": {"prefix": dashboard_prefix}},
        **kwargs,
    )
    logger.info("Local Directory: %26s", local_directory)
    logger.info("-" * 47)

    install_signal_handlers(io_loop)

    async def serve():
        await sched
        await sched.finished()

    try:
        io_loop.run_sync(serve)
    finally:
        sched.stop()
        if created_local_dir:
            shutil.rmtree(local_directory)

        logger.info("End scheduler at %r", sched.address)
def main(host, port, bokeh_port, show, dashboard, bokeh, dashboard_prefix,
         use_xheaders, pid_file, tls_ca_file, tls_cert, tls_key,
         dashboard_address, **kwargs):
    """Command-line entry point that runs a dask scheduler.

    Triples the garbage-collection thresholds, maps the deprecated bokeh
    flags onto their dashboard equivalents, merges any configuration
    inherited through ``DASK_INTERNAL_INHERIT_CONFIG``, writes an optional
    pid file, raises the open-file limit on Linux and runs the scheduler
    until it has finished. Extra ``**kwargs`` are forwarded to
    ``Scheduler``.

    ``show`` and ``use_xheaders`` are accepted but not used here.
    """
    # https://github.com/dask/distributed/issues/1653
    gc.set_threshold(*(3 * threshold for threshold in gc.get_threshold()))

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    # --bokeh-port is the deprecated spelling of --dashboard-address.
    if bokeh_port is not None:
        warnings.warn(
            "The --bokeh-port flag has been renamed to --dashboard-address. "
            "Consider adding ``--dashboard-address :%d`` " % bokeh_port)
        dashboard_address = bokeh_port
    # --bokeh/--no-bokeh is the deprecated spelling of --dashboard.
    if bokeh is not None:
        warnings.warn(
            "The --bokeh/--no-bokeh flag has been renamed to --dashboard/--no-dashboard. "
        )
        dashboard = bokeh

    # Use the default port unless the host string already carries one.
    if port is None and not (host and re.search(r":\d", host)):
        port = 8786

    # Security settings are passed on as a plain mapping containing only
    # the TLS values that were actually supplied.
    security = {}
    for key, value in (
        ("tls_ca_file", tls_ca_file),
        ("tls_scheduler_cert", tls_cert),
        ("tls_scheduler_key", tls_key),
    ):
        if value is not None:
            security[key] = value

    if "DASK_INTERNAL_INHERIT_CONFIG" in os.environ:
        inherited = deserialize_for_cli(
            os.environ["DASK_INTERNAL_INHERIT_CONFIG"])
        # Update the global config given priority to the existing global config
        dask.config.update(dask.config.global_config, inherited, priority="old")

    if not host and (tls_ca_file or tls_cert or tls_key):
        host = "tls://"

    if pid_file:
        with open(pid_file, "w") as handle:
            handle.write(str(os.getpid()))

        def _remove_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(_remove_pid_file)

    if sys.platform.startswith("linux"):
        import resource  # module fails importing on Windows

        # Lift the soft limit to at least half of the hard limit.
        soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
        raised = max(soft_limit, hard_limit // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (raised, hard_limit))

    io_loop = IOLoop.current()
    logger.info("-" * 47)

    sched = Scheduler(loop=io_loop,
                      security=security,
                      host=host,
                      port=port,
                      dashboard=dashboard,
                      dashboard_address=dashboard_address,
                      http_prefix=dashboard_prefix,
                      **kwargs)
    logger.info("-" * 47)

    install_signal_handlers(io_loop)

    async def serve():
        await sched
        await sched.finished()

    try:
        io_loop.run_sync(serve)
    finally:
        sched.stop()

        logger.info("End scheduler at %r", sched.address)
def main(host, port, bokeh_port, show, _bokeh, bokeh_whitelist, bokeh_prefix,
         use_xheaders, pid_file, scheduler_file, interface, local_directory,
         preload, preload_argv, tls_ca_file, tls_cert, tls_key):
    """Command-line entry point that runs a dask scheduler with EDAS extras.

    In addition to the usual scheduler setup (TLS security, pid file, local
    directory, open-file limit, optional bokeh dashboard, preload modules)
    this adds an ``EDASSchedulerPlugin`` when the ``log.metrics`` setting is
    enabled and starts a ``Comm`` object next to the scheduler.

    Shutdown is reachable from two places -- the ``finally`` clause after
    the IOLoop returns and an ``atexit`` hook -- so both the loop teardown
    and the scheduler teardown are guarded to run at most once.

    ``show``, ``bokeh_whitelist`` and ``use_xheaders`` are accepted but not
    used here.

    Raises
    ------
    ValueError
        If both ``interface`` and ``host`` are given.
    """
    logger = SchedulerLogger.getLogger()
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    log_metrics = EdasEnv.getBool("log.metrics", False)
    logger.info(f"Log Metrics: {log_metrics}")
    plugins = [EDASSchedulerPlugin(logger)] if log_metrics else []

    sec = Security(
        tls_ca_file=tls_ca_file,
        tls_scheduler_cert=tls_cert,
        tls_scheduler_key=tls_key,
    )

    # Any TLS setting without an explicit host selects the tls:// protocol.
    if not host and (tls_ca_file or tls_cert or tls_key):
        host = 'tls://'

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    # Track whether this process created the directory so that only a
    # directory we made ourselves is removed on shutdown.
    local_directory_created = False
    if local_directory:
        if not os.path.exists(local_directory):
            os.mkdir(local_directory)
            local_directory_created = True
    else:
        local_directory = tempfile.mkdtemp(prefix='scheduler-')
        local_directory_created = True
    if local_directory not in sys.path:
        sys.path.insert(0, local_directory)

    if sys.platform.startswith('linux'):
        import resource  # module fails importing on Windows

        # Lift the soft limit to at least half of the hard limit.
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    addr = uri_from_host_port(host, port, 8786)

    loop = IOLoop.current()
    logger.info('-' * 47)

    services = {}
    if _bokeh:
        try:
            from distributed.bokeh.scheduler import BokehScheduler
            services[('bokeh', bokeh_port)] = (BokehScheduler, {
                'prefix': bokeh_prefix
            })
        except ImportError as error:
            if str(error).startswith('No module named'):
                logger.info(
                    'Web dashboard not loaded. Unable to import bokeh')
            else:
                logger.info('Unable to import bokeh: %s' % str(error))

    scheduler = Scheduler(loop=loop,
                          services=services,
                          scheduler_file=scheduler_file,
                          security=sec)

    for plugin in plugins:
        logger.info(f"@SP: Adding scheduler plugin: {plugin}")
        scheduler.add_plugin(plugin)
    scheduler.start(addr)
    comm = Comm(scheduler)
    comm.start()

    # Fall back to the configured preload settings when none were given.
    if not preload:
        preload = dask.config.get('distributed.scheduler.preload', {})
    if not preload_argv:
        preload_argv = dask.config.get('distributed.scheduler.preload-argv', {})
    preload_modules(preload,
                    parameter=scheduler,
                    file_dir=local_directory,
                    argv=preload_argv)

    logger.info('Local Directory: %26s', local_directory)
    logger.info('-' * 47)

    install_signal_handlers(loop)

    shutdown_done = False
    loop_closed = False

    def shutdown_scheduler():
        # Runs at most once: a second pass would terminate the comm again
        # and call rmtree on a directory that has already been removed.
        nonlocal shutdown_done
        if shutdown_done:
            return
        shutdown_done = True
        comm.terminate()
        scheduler.stop()
        if local_directory_created:
            shutil.rmtree(local_directory)
        logger.info("End scheduler at %r", addr)

    def close_loop():
        # atexit hook: only tear the loop down if the normal path below did
        # not already close it.
        nonlocal loop_closed
        if not loop_closed:
            loop_closed = True
            loop.stop()
            loop.close()
        shutdown_scheduler()

    atexit.register(close_loop)

    try:
        loop.start()
        loop.close()
        loop_closed = True
    finally:
        shutdown_scheduler()
def main(
    host,
    port,
    bokeh_port,
    show,
    dashboard,
    dashboard_prefix,
    use_xheaders,
    pid_file,
    scheduler_file,
    interface,
    protocol,
    local_directory,
    preload,
    preload_argv,
    tls_ca_file,
    tls_cert,
    tls_key,
    dashboard_address,
):
    """Command-line entry point that runs a dask scheduler.

    Triples the garbage-collection thresholds, maps the deprecated
    ``--bokeh-port`` flag onto ``dashboard_address``, sets up TLS security,
    an optional pid file, a local directory (a temporary one when none is
    given) and the open-file limit on Linux; then starts the scheduler,
    runs the preload modules and runs the IOLoop until it stops.

    ``show`` and ``use_xheaders`` are accepted but not used here.
    """
    # https://github.com/dask/distributed/issues/1653
    gc.set_threshold(*(3 * threshold for threshold in gc.get_threshold()))

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    # --bokeh-port is the deprecated spelling of --dashboard-address.
    if bokeh_port is not None:
        warnings.warn(
            "The --bokeh-port flag has been renamed to --dashboard-address. "
            "Consider adding ``--dashboard-address :%d`` " % bokeh_port)
        dashboard_address = bokeh_port

    # Use the default port unless the host string already carries one.
    if port is None and not (host and re.search(r":\d", host)):
        port = 8786

    security = Security(
        tls_ca_file=tls_ca_file,
        tls_scheduler_cert=tls_cert,
        tls_scheduler_key=tls_key,
    )

    if not host and (tls_ca_file or tls_cert or tls_key):
        host = "tls://"

    if pid_file:
        with open(pid_file, "w") as handle:
            handle.write(str(os.getpid()))

        def _remove_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(_remove_pid_file)

    # Track whether this process created the directory so that only a
    # directory we made ourselves is removed on shutdown.
    created_local_dir = False
    if not local_directory:
        local_directory = tempfile.mkdtemp(prefix="scheduler-")
        created_local_dir = True
    elif not os.path.exists(local_directory):
        os.mkdir(local_directory)
        created_local_dir = True

    if local_directory not in sys.path:
        sys.path.insert(0, local_directory)

    if sys.platform.startswith("linux"):
        import resource  # module fails importing on Windows

        # Lift the soft limit to at least half of the hard limit.
        soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
        raised = max(soft_limit, hard_limit // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (raised, hard_limit))

    io_loop = IOLoop.current()
    logger.info("-" * 47)

    sched = Scheduler(
        loop=io_loop,
        scheduler_file=scheduler_file,
        security=security,
        host=host,
        port=port,
        interface=interface,
        protocol=protocol,
        dashboard_address=dashboard_address if dashboard else None,
        service_kwargs={"dashboard": {"prefix": dashboard_prefix}},
    )
    sched.start()

    # Fall back to the configured preload settings when none were given.
    if not preload:
        preload = dask.config.get("distributed.scheduler.preload")
    if not preload_argv:
        preload_argv = dask.config.get("distributed.scheduler.preload-argv")
    preload_modules(
        preload, parameter=sched, file_dir=local_directory, argv=preload_argv
    )

    logger.info("Local Directory: %26s", local_directory)
    logger.info("-" * 47)

    install_signal_handlers(io_loop)

    try:
        io_loop.start()
        io_loop.close()
    finally:
        sched.stop()
        if created_local_dir:
            shutil.rmtree(local_directory)

        logger.info("End scheduler at %r", sched.address)
def main(host, port, bokeh_port, show, _bokeh, bokeh_whitelist, bokeh_prefix,
         use_xheaders, pid_file, scheduler_file, interface, local_directory,
         preload, preload_argv, tls_ca_file, tls_cert, tls_key):
    """Command-line entry point that runs a dask scheduler.

    Sets up TLS security, an optional pid file, a local directory (a
    temporary one when none is given), the open-file limit on Linux and
    the optional bokeh dashboard service; then starts the scheduler, runs
    the preload modules and runs the IOLoop until it stops.

    ``show``, ``bokeh_whitelist`` and ``use_xheaders`` are accepted but not
    used here.

    Raises
    ------
    ValueError
        If both ``interface`` and ``host`` are given.
    """
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    security = Security(
        tls_ca_file=tls_ca_file,
        tls_scheduler_cert=tls_cert,
        tls_scheduler_key=tls_key,
    )

    # Any TLS setting without an explicit host selects the tls:// protocol.
    if not host and (tls_ca_file or tls_cert or tls_key):
        host = 'tls://'

    if pid_file:
        with open(pid_file, 'w') as handle:
            handle.write(str(os.getpid()))

        def _remove_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(_remove_pid_file)

    # Track whether this process created the directory so that only a
    # directory we made ourselves is removed on shutdown.
    created_local_dir = False
    if not local_directory:
        local_directory = tempfile.mkdtemp(prefix='scheduler-')
        created_local_dir = True
    elif not os.path.exists(local_directory):
        os.mkdir(local_directory)
        created_local_dir = True

    if local_directory not in sys.path:
        sys.path.insert(0, local_directory)

    if sys.platform.startswith('linux'):
        import resource  # module fails importing on Windows

        # Lift the soft limit to at least half of the hard limit.
        soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
        raised = max(soft_limit, hard_limit // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (raised, hard_limit))

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        host = get_ip_interface(interface)

    addr = uri_from_host_port(host, port, 8786)

    io_loop = IOLoop.current()
    logger.info('-' * 47)

    services = {}
    if _bokeh:
        # When bokeh cannot be imported the dashboard is simply left out.
        with ignoring(ImportError):
            from distributed.bokeh.scheduler import BokehScheduler
            service_key = ('bokeh', bokeh_port)
            services[service_key] = (BokehScheduler, {'prefix': bokeh_prefix})

    sched = Scheduler(loop=io_loop,
                      services=services,
                      scheduler_file=scheduler_file,
                      security=security)
    sched.start(addr)

    # Fall back to the configured preload settings when none were given.
    if not preload:
        preload = dask.config.get('distributed.scheduler.preload')
    if not preload_argv:
        preload_argv = dask.config.get('distributed.scheduler.preload-argv')
    preload_modules(preload,
                    parameter=sched,
                    file_dir=local_directory,
                    argv=preload_argv)

    logger.info('Local Directory: %26s', local_directory)
    logger.info('-' * 47)

    install_signal_handlers(io_loop)

    try:
        io_loop.start()
        io_loop.close()
    finally:
        sched.stop()
        if created_local_dir:
            shutil.rmtree(local_directory)

        logger.info("End scheduler at %r", addr)
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    pid_file,
    reconnect,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    preload_argv,
    bokeh_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
):
    """Command-line entry point that starts one ``Nanny`` per visible GPU.

    The number of processes is the number of entries in the
    ``CUDA_VISIBLE_DEVICES`` environment variable, or ``get_n_gpus()`` when
    that variable is unset. Each nanny gets its own ``CUDA_VISIBLE_DEVICES``
    ordering and preloads ``dask_cuda.initialize_context``. The function
    blocks until a nanny reports the ``"closed"`` status.

    Raises
    ------
    ValueError
        If no scheduler address is available or if both ``interface`` and
        ``host`` are given.
    """
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(tls_ca_file=tls_ca_file,
                   tls_worker_cert=tls_cert,
                   tls_worker_key=tls_key)

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        # The previous ``min(1, _ncores // nprocs)`` could only ever yield
        # 1 or 0, and 0 (more GPUs than CPU cores) produced workers with no
        # threads at all. Default to one thread per worker instead.
        nthreads = 1

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            # Without bokeh the worker simply runs without a dashboard.
            pass
        else:
            if bokeh_prefix:
                result = (BokehWorker, {"prefix": bokeh_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    # "a=1,b=2" or "a=1 b=2" -> {"a": 1.0, "b": 2.0}
    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if host:
        addr = uri_from_host_port(host, 0, 0)
    else:
        # Choose appropriate address for scheduler
        addr = None

    if death_timeout is not None:
        death_timeout = parse_timedelta(death_timeout, "s")

    # Normalise to a list first: a non-empty tuple of preload modules cannot
    # be concatenated with a list and would raise TypeError.
    preload_modules_list = list(preload or []) + ["dask_cuda.initialize_context"]

    nannies = [
        t(scheduler,
          scheduler_file=scheduler_file,
          ncores=nthreads,
          services=services,
          loop=loop,
          resources=resources,
          memory_limit=memory_limit,
          reconnect=reconnect,
          local_dir=local_directory,
          death_timeout=death_timeout,
          preload=list(preload_modules_list),
          preload_argv=preload_argv,
          security=sec,
          contact_address=None,
          env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
          # Only suffix the name with the index when there are several
          # processes and a name was actually given.
          name=name if nprocs == 1 or not name else name + "-" + str(i),
          **kwargs) for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield [n._start(addr) for n in nannies]
        # Keep polling while every nanny is still open.
        while all(n.status != "closed" for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        # Interrupts and loop-stop timeouts are treated as a normal exit.
        pass
    finally:
        logger.info("End worker")
def main(scheduler, host, worker_port, listen_address, contact_address,
         nanny_port, nthreads, nprocs, nanny, name, memory_limit, pid_file,
         reconnect, resources, bokeh, bokeh_port, local_directory,
         scheduler_file, interface, death_timeout, preload, bokeh_prefix,
         tls_ca_file, tls_cert, tls_key):
    """Command-line entry point that starts ``nprocs`` dask workers.

    Validates the combination of port/address options (logging an error and
    exiting with status 1 on an invalid combination), writes an optional
    pid file, builds the optional bokeh service and the resources mapping,
    then starts ``nprocs`` ``Nanny`` (or ``Worker`` when ``nanny`` is false)
    instances and blocks until one of them reports the ``'closed'`` status.

    Raises
    ------
    ValueError
        If neither ``scheduler`` nor ``scheduler_file`` is given, or if both
        ``interface`` and ``host`` are given.
    """
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(
        tls_ca_file=tls_ca_file,
        tls_worker_cert=tls_cert,
        tls_worker_key=tls_key,
    )

    if nprocs > 1 and worker_port != 0:
        logger.error(
            "Failed to launch worker.  You cannot use the --port argument when nprocs > 1."
        )
        exit(1)

    if nprocs > 1 and not nanny:
        logger.error(
            "Failed to launch worker.  You cannot use the --no-nanny argument when nprocs > 1."
        )
        exit(1)

    if contact_address and not listen_address:
        logger.error(
            "Failed to launch worker. "
            "Must specify --listen-address when --contact-address is given")
        exit(1)

    if nprocs > 1 and listen_address:
        logger.error("Failed to launch worker. "
                     "You cannot specify --listen-address when nprocs > 1.")
        exit(1)

    if (worker_port or host) and listen_address:
        logger.error(
            "Failed to launch worker. "
            "You cannot specify --listen-address when --worker-port or --host is given."
        )
        exit(1)

    try:
        if listen_address:
            (host, worker_port) = get_address_host_port(listen_address,
                                                        strict=True)

        if contact_address:
            # we only need this to verify it is getting parsed
            (_, _) = get_address_host_port(contact_address, strict=True)
        else:
            # if contact address is not present we use the listen_address for contact
            contact_address = listen_address
    except ValueError as e:
        logger.error("Failed to launch worker. " + str(e))
        exit(1)

    if nanny:
        port = nanny_port
    else:
        port = worker_port

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if bokeh:
        try:
            from distributed.bokeh.worker import BokehWorker
        except ImportError:
            # Without bokeh the worker simply runs without a dashboard.
            pass
        else:
            if bokeh_prefix:
                result = (BokehWorker, {'prefix': bokeh_prefix})
            else:
                result = BokehWorker
            services[('bokeh', bokeh_port)] = result

    # "a=1,b=2" or "a=1 b=2" -> {"a": 1.0, "b": 2.0}
    if resources:
        resources = resources.replace(',', ' ').split()
        resources = dict(pair.split('=') for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs = {'worker_port': worker_port, 'listen_address': listen_address}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs['service_ports'] = {'nanny': nanny_port}
        t = Worker

    if not scheduler and not scheduler_file:
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if host or port:
        addr = uri_from_host_port(host, port, 0)
    else:
        # Choose appropriate address for scheduler
        addr = None

    nannies = [
        t(scheduler,
          scheduler_file=scheduler_file,
          ncores=nthreads,
          services=services,
          loop=loop,
          resources=resources,
          memory_limit=memory_limit,
          reconnect=reconnect,
          local_dir=local_directory,
          death_timeout=death_timeout,
          preload=preload,
          security=sec,
          contact_address=contact_address,
          # Only suffix the name with the index when a name was given:
          # concatenating onto None raises TypeError, and an empty name
          # would otherwise turn into "-0", "-1", ...
          name=name if nprocs == 1 or not name else name + '-' + str(i),
          **kwargs) for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        if nanny:
            yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield [n._start(addr) for n in nannies]
        # Keep polling while every worker is still open.
        while all(n.status != 'closed' for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        # Interrupts and loop-stop timeouts are treated as a normal exit.
        pass
    finally:
        logger.info("End worker")
def main(scheduler, host, worker_port, listen_address, contact_address,
         nanny_port, nthreads, nprocs, nanny, name, pid_file, resources,
         dashboard, bokeh, bokeh_port, scheduler_file, dashboard_prefix,
         tls_ca_file, tls_cert, tls_key, dashboard_address, **kwargs):
    # Worker launcher: checks that the option combination is legal, builds
    # ``nprocs`` Nanny objects (or a bare Worker when ``nanny`` is falsy) and
    # runs them on the current IOLoop until all of them have finished.
    # Extra ``**kwargs`` are forwarded to every worker constructor.
    # (Plain comments are used instead of a docstring on purpose, so the
    # function's ``__doc__`` stays empty.)

    # https://github.com/dask/distributed/issues/1653
    gc.set_threshold(*(3 * threshold for threshold in gc.get_threshold()))

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    # Deprecated --bokeh* flags override their --dashboard* replacements.
    if bokeh_port is not None:
        warnings.warn(
            "The --bokeh-port flag has been renamed to --dashboard-address. "
            "Consider adding ``--dashboard-address :%d`` " % bokeh_port)
        dashboard_address = bokeh_port
    if bokeh is not None:
        warnings.warn(
            "The --bokeh/--no-bokeh flag has been renamed to --dashboard/--no-dashboard. "
        )
        dashboard = bokeh

    # Only the TLS settings that were actually supplied reach Security().
    tls_options = {
        "tls_ca_file": tls_ca_file,
        "tls_worker_cert": tls_cert,
        "tls_worker_key": tls_key,
    }
    sec = Security(**{
        key: value for key, value in tls_options.items() if value is not None
    })

    def abort(message):
        # Log the reason and terminate with exit status 1.
        logger.error(message)
        exit(1)

    if nprocs > 1 and worker_port != 0:
        abort(
            "Failed to launch worker. You cannot use the --port argument when nprocs > 1."
        )
    if nprocs > 1 and not nanny:
        abort(
            "Failed to launch worker. You cannot use the --no-nanny argument when nprocs > 1."
        )
    if contact_address and not listen_address:
        abort("Failed to launch worker. "
              "Must specify --listen-address when --contact-address is given")
    if nprocs > 1 and listen_address:
        abort("Failed to launch worker. "
              "You cannot specify --listen-address when nprocs > 1.")
    if (worker_port or host) and listen_address:
        abort(
            "Failed to launch worker. "
            "You cannot specify --listen-address when --worker-port or --host is given."
        )

    try:
        if listen_address:
            host, worker_port = get_address_host_port(listen_address,
                                                      strict=True)
        if contact_address:
            # Parsed only to validate it; the pieces are thrown away.
            _, _ = get_address_host_port(contact_address, strict=True)
        else:
            # Without a contact address the listen address doubles as one.
            contact_address = listen_address
    except ValueError as e:
        abort("Failed to launch worker. " + str(e))

    port = nanny_port if nanny else worker_port

    if not nthreads:
        nthreads = CPU_COUNT // nprocs

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    # "A=1,B=2" or "A=1 B=2"  ->  {"A": 1.0, "B": 2.0}
    if resources:
        pairs = resources.replace(",", " ").split()
        resources = valmap(float, dict(pair.split("=") for pair in pairs))
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs.update(worker_port=worker_port, listen_address=listen_address)
        t = Nanny
    else:
        if nanny_port:
            kwargs["service_ports"] = {"nanny": nanny_port}
        t = Worker

    if not (scheduler or scheduler_file
            or dask.config.get("scheduler-address", None) is not None):
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    # A numeric name is turned into an int; anything else is left alone.
    with ignoring(TypeError, ValueError):
        name = int(name)

    def worker_name(index):
        # A single process, or no usable name, keeps the name untouched;
        # otherwise each process gets a "-<index>" suffix.
        if nprocs == 1 or name is None or name == "":
            return name
        return str(name) + "-" + str(index)

    nannies = [
        t(scheduler,
          scheduler_file=scheduler_file,
          nthreads=nthreads,
          loop=loop,
          resources=resources,
          security=sec,
          contact_address=contact_address,
          host=host,
          port=port,
          dashboard_address=dashboard_address if dashboard else None,
          service_kwargs={"dashboard": {"prefix": dashboard_prefix}},
          name=worker_name(i),
          **kwargs)
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        if not nanny:
            return
        yield [n.close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        # Start every worker, then wait for all of them to finish.
        yield nannies
        yield [n.finished() for n in nannies]

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except TimeoutError:
        # We already log the exception in nanny / worker. Don't do it again.
        raise TimeoutError("Timed out starting worker.") from None
    except KeyboardInterrupt:
        pass
    finally:
        logger.info("End worker")
def __init__(
    self,
    scheduler=None,
    host=None,
    nthreads=0,
    name=None,
    memory_limit="auto",
    device_memory_limit="auto",
    rmm_pool_size=None,
    rmm_managed_memory=False,
    pid_file=None,
    resources=None,
    dashboard=True,
    dashboard_address=":0",
    local_directory=None,
    scheduler_file=None,
    interface=None,
    death_timeout=None,
    preload=[],
    dashboard_prefix=None,
    security=None,
    enable_tcp_over_ucx=False,
    enable_infiniband=False,
    enable_nvlink=False,
    enable_rdmacm=False,
    net_devices=None,
    **kwargs,
):
    """Build one ``Nanny`` per GPU and store the list in ``self.nannies``.

    The number of nannies is the number of comma-separated entries in the
    ``CUDA_VISIBLE_DEVICES`` environment variable, or ``get_n_gpus()`` when
    that variable is not set.

    Parameters worth calling out:

    nthreads
        Threads per worker. A falsy value selects one thread per worker.
    resources
        Optional string such as ``"A=1,B=2"`` (commas or whitespace as
        separators); it is converted to a ``{name: float}`` mapping.
    pid_file
        If given, the current PID is written there and the file is removed
        at interpreter exit.
    preload
        Sequence of preload modules; it is copied, never mutated, so the
        shared ``[]`` default is safe here.
    **kwargs
        Only ``preload_argv`` is read; every other extra keyword is
        discarded, because the mapping is replaced before it is forwarded.

    Raises
    ------
    ValueError
        If no scheduler address is available (argument, scheduler file or
        the ``scheduler-address`` config key), if both ``interface`` and
        ``host`` are given, if an RMM option is requested but ``rmm`` cannot
        be imported, or if NVLink is combined with RMM managed memory.
    """
    # Required by RAPIDS libraries (e.g., cuDF) to ensure no context
    # initialization happens before we can set CUDA_VISIBLE_DEVICES
    os.environ["RAPIDS_NO_INITIALIZE"] = "True"

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        # Default to a single thread per GPU worker.  The former expression,
        # ``min(1, cpu_count // nprocs)``, could never exceed 1 and dropped
        # to 0 on hosts with fewer CPU cores than GPUs; that 0 was then fed
        # to parse_memory_limit() and to every Nanny.
        nthreads = 1

    memory_limit = parse_memory_limit(memory_limit, nthreads, total_cores=nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            # Dashboard support is optional; run without it.
            pass
        else:
            if dashboard_prefix:
                result = (BokehWorker, {"prefix": dashboard_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    preload_argv = kwargs.get("preload_argv", [])
    # NOTE: this replaces (rather than extends) the caller's extra keywords.
    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if (
        not scheduler
        and not scheduler_file
        and dask.config.get("scheduler-address", None) is None
    ):
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if interface and host:
        raise ValueError("Can not specify both interface and host")

    if rmm_pool_size is not None or rmm_managed_memory:
        try:
            import rmm  # noqa F401
        except ImportError:
            raise ValueError(
                "RMM pool requested but module 'rmm' is not available. "
                "For installation instructions, please see "
                "https://github.com/rapidsai/rmm"
            )  # pragma: no cover
        if rmm_pool_size is not None:
            rmm_pool_size = parse_bytes(rmm_pool_size)
    else:
        if enable_nvlink:
            warnings.warn(
                "When using NVLink we recommend setting a "
                "`rmm_pool_size`. Please see: "
                "https://dask-cuda.readthedocs.io/en/latest/ucx.html"
                "#important-notes for more details"
            )

    if enable_nvlink and rmm_managed_memory:
        raise ValueError(
            "RMM managed memory and NVLink are currently incompatible."
        )

    # Ensure this parent dask-cuda-worker process uses the same UCX
    # configuration as child worker processes created by it.
    initialize(
        create_cuda_context=False,
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_infiniband=enable_infiniband,
        enable_nvlink=enable_nvlink,
        enable_rdmacm=enable_rdmacm,
        net_devices=net_devices,
        cuda_device_index=0,
    )

    self.nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            interface=_get_interface(interface, host, i, net_devices),
            host=host,
            preload=(list(preload) or []) + ["dask_cuda.initialize"],
            preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"],
            security=security,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            plugins={
                CPUAffinity(get_cpu_affinity(i)),
                RMMSetup(rmm_pool_size, rmm_managed_memory),
            },
            # Suffix the name with the worker index when there are several.
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            local_directory=local_directory,
            config={
                "ucx": get_ucx_config(
                    enable_tcp_over_ucx=enable_tcp_over_ucx,
                    enable_infiniband=enable_infiniband,
                    enable_nvlink=enable_nvlink,
                    enable_rdmacm=enable_rdmacm,
                    net_devices=net_devices,
                    cuda_device_index=i,
                )
            },
            data=(
                DeviceHostFile,
                {
                    "device_memory_limit": parse_device_memory_limit(
                        device_memory_limit, device_index=i
                    ),
                    "memory_limit": memory_limit,
                    "local_directory": local_directory,
                },
            ),
            **kwargs,
        )
        for i in range(nprocs)
    ]
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    pid_file,
    reconnect,
    resources,
    bokeh,
    bokeh_port,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    preload_argv,
    bokeh_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
):
    # Launches exactly one ResourcedWorker on the current IOLoop and blocks
    # until its status becomes "closed".  (Comments instead of a docstring,
    # so the function's ``__doc__`` stays empty.)
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(
        tls_ca_file=tls_ca_file,
        tls_worker_cert=tls_cert,
        tls_worker_key=tls_key,
    )

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}
    if bokeh:
        try:
            from distributed.bokeh.worker import BokehWorker
        except ImportError:
            pass
        else:
            if bokeh_prefix:
                services[("bokeh", bokeh_port)] = (
                    BokehWorker, {"prefix": bokeh_prefix})
            else:
                services[("bokeh", bokeh_port)] = BokehWorker

    # Start from the resources reported by gpu_rscs(), then overlay whatever
    # was given as "A=1,B=2"; an empty value ("A=") counts as 0.0.
    rscs = gpu_rscs(ResourcedWorker)
    if resources:
        parsed = dict(
            pair.split("=") for pair in resources.replace(",", " ").split())
        resources = {
            key: (float(value) if value else 0.0)
            for key, value in parsed.items()
        }
        rscs.update(resources)

    loop = IOLoop.current()

    if not (scheduler or scheduler_file or "scheduler-address" in config):
        raise ValueError(
            "Need to provide scheduler address like\ndask-worker SCHEDULER_ADDRESS:8786"
        )

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        host = get_ip_interface(interface)

    if host:
        addr = uri_from_host_port(host, 0, 0)
    else:
        addr = None
    kwargs = dict(port=None, host=addr)

    if death_timeout is not None:
        death_timeout = parse_timedelta(death_timeout, "s")

    name = name or dask.config.get('client-name') or socket.gethostname()

    # Only a single worker (index 0) is ever created; its name gets a "-0"
    # suffix unless ``nthreads`` equals 1.
    worker_name = name if nthreads == 1 else name + "-" + "0"
    nannies = [
        ResourcedWorker(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=rscs,
            memory_limit=memory_limit,
            reconnect=reconnect,
            local_directory=local_directory,
            death_timeout=death_timeout,
            preload=preload,
            preload_argv=preload_argv,
            security=sec,
            contact_address=None,
            name=worker_name,
            **kwargs)
    ]

    @gen.coroutine
    def run():
        yield [worker.start() for worker in nannies]
        # Poll until a worker reports itself closed.
        while not any(worker.status == "closed" for worker in nannies):
            yield gen.sleep(0.1)

    # dask_global.py:global_signal_master() will receive all signal.
    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
def main(
    scheduler,
    host,
    worker_port,
    listen_address,
    contact_address,
    nanny_port,
    nthreads,
    nprocs,
    nanny,
    name,
    memory_limit,
    pid_file,
    reconnect,
    resources,
    dashboard,
    bokeh_port,
    local_directory,
    scheduler_file,
    interface,
    protocol,
    death_timeout,
    preload,
    preload_argv,
    dashboard_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    dashboard_address,
):
    # Worker launcher: validates the option combination, builds ``nprocs``
    # Nanny objects (or one bare Worker when ``nanny`` is falsy) and runs
    # them on the current IOLoop until one of them reports "closed".
    # Invalid option combinations are logged and end the process with exit
    # status 1; a missing scheduler address raises ValueError.
    # (Comments instead of a docstring, so ``__doc__`` stays empty.)

    # https://github.com/dask/distributed/issues/1653
    g0, g1, g2 = gc.get_threshold()
    gc.set_threshold(g0 * 3, g1 * 3, g2 * 3)

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if bokeh_port is not None:
        warnings.warn(
            "The --bokeh-port flag has been renamed to --dashboard-address. "
            "Consider adding ``--dashboard-address :%d`` " % bokeh_port)
        # The deprecated flag still wins over --dashboard-address.
        dashboard_address = bokeh_port

    sec = Security(tls_ca_file=tls_ca_file,
                   tls_worker_cert=tls_cert,
                   tls_worker_key=tls_key)

    if nprocs > 1 and worker_port != 0:
        logger.error(
            "Failed to launch worker. You cannot use the --port argument when nprocs > 1."
        )
        exit(1)

    if nprocs > 1 and not nanny:
        logger.error(
            "Failed to launch worker. You cannot use the --no-nanny argument when nprocs > 1."
        )
        exit(1)

    if contact_address and not listen_address:
        logger.error(
            "Failed to launch worker. "
            "Must specify --listen-address when --contact-address is given")
        exit(1)

    if nprocs > 1 and listen_address:
        logger.error("Failed to launch worker. "
                     "You cannot specify --listen-address when nprocs > 1.")
        exit(1)

    if (worker_port or host) and listen_address:
        logger.error(
            "Failed to launch worker. "
            "You cannot specify --listen-address when --worker-port or --host is given."
        )
        exit(1)

    try:
        if listen_address:
            (host, worker_port) = get_address_host_port(listen_address,
                                                        strict=True)

        if contact_address:
            # we only need this to verify it is getting parsed
            (_, _) = get_address_host_port(contact_address, strict=True)
        else:
            # if contact address is not present we use the listen_address for contact
            contact_address = listen_address
    except ValueError as e:
        logger.error("Failed to launch worker. " + str(e))
        exit(1)

    if nanny:
        port = nanny_port
    else:
        port = worker_port

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    # "A=1,B=2" or "A=1 B=2"  ->  {"A": 1.0, "B": 2.0}
    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs = {"worker_port": worker_port, "listen_address": listen_address}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs["service_ports"] = {"nanny": nanny_port}
        t = Worker

    if (not scheduler and not scheduler_file
            and dask.config.get("scheduler-address", None) is None):
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if death_timeout is not None:
        death_timeout = parse_timedelta(death_timeout, "s")

    nannies = [
        t(scheduler,
          scheduler_file=scheduler_file,
          ncores=nthreads,
          services=services,
          loop=loop,
          resources=resources,
          memory_limit=memory_limit,
          reconnect=reconnect,
          local_dir=local_directory,
          death_timeout=death_timeout,
          preload=preload,
          preload_argv=preload_argv,
          security=sec,
          contact_address=contact_address,
          interface=interface,
          protocol=protocol,
          host=host,
          port=port,
          dashboard_address=dashboard_address if dashboard else None,
          # Fixed key: this was spelled "bokhe", so the prefix could never be
          # found under the service's name and --dashboard-prefix was lost.
          # NOTE(review): confirm the key the installed Worker looks up.
          service_kwargs={"bokeh": {"prefix": dashboard_prefix}},
          # Suffix the name with the process index when there are several.
          name=name if nprocs == 1 or not name else name + "-" + str(i),
          **kwargs)
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        if nanny:
            yield [n.close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield nannies
        # Poll until at least one worker reports itself closed.
        while all(n.status != "closed" for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
def __init__(
    self,
    scheduler=None,
    host=None,
    nthreads=1,
    name=None,
    memory_limit="auto",
    device_memory_limit="auto",
    rmm_pool_size=None,
    rmm_maximum_pool_size=None,
    rmm_managed_memory=False,
    rmm_async=False,
    rmm_log_directory=None,
    pid_file=None,
    resources=None,
    dashboard=True,
    dashboard_address=":0",
    local_directory=None,
    shared_filesystem=None,
    scheduler_file=None,
    interface=None,
    preload=[],
    dashboard_prefix=None,
    security=None,
    enable_tcp_over_ucx=None,
    enable_infiniband=None,
    enable_nvlink=None,
    enable_rdmacm=None,
    net_devices=None,
    jit_unspill=None,
    worker_class=None,
    **kwargs,
):
    """Create one ``Nanny`` per GPU and keep the list in ``self.nannies``.

    The worker count is the number of comma-separated entries in the
    ``CUDA_VISIBLE_DEVICES`` environment variable, falling back to
    ``get_n_gpus()`` when the variable is unset.  ``self.jit_unspill`` is
    set from the argument, or from the ``jit-unspill`` config key when the
    argument is ``None``, and selects ``ProxifyHostFile`` over
    ``DeviceHostFile`` as each worker's data store.

    ``preload_argv`` is taken out of ``**kwargs``; the remaining extra
    keywords are forwarded to every ``Nanny``.

    Raises ``ValueError`` when ``nthreads`` is below 1, when no scheduler
    address is available, when both ``interface`` and ``host`` are given,
    when an RMM pool or managed memory is requested without an importable
    ``rmm`` or together with ``rmm_async``, or when NVLink is combined with
    RMM managed memory.
    """
    # Required by RAPIDS libraries (e.g., cuDF) to ensure no context
    # initialization happens before we can set CUDA_VISIBLE_DEVICES
    os.environ["RAPIDS_NO_INITIALIZE"] = "True"

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if nthreads < 1:
        raise ValueError("nthreads must be higher than 0.")

    memory_limit = parse_memory_limit(memory_limit, nthreads, total_cores=nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    # "A=1,B=2" or "A=1 B=2"  ->  {"A": 1.0, "B": 2.0}
    if resources:
        pairs = resources.replace(",", " ").split()
        resources = valmap(float, dict(pair.split("=") for pair in pairs))
    else:
        resources = None

    loop = IOLoop.current()

    preload_argv = kwargs.pop("preload_argv", [])
    # Caller-supplied keywords override these two defaults.
    kwargs = {"worker_port": None, "listen_address": None, **kwargs}

    if not (scheduler or scheduler_file
            or dask.config.get("scheduler-address", None) is not None):
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if isinstance(scheduler, Cluster):
        scheduler = scheduler.scheduler_address

    if interface and host:
        raise ValueError("Can not specify both interface and host")

    if rmm_pool_size is not None or rmm_managed_memory:
        try:
            import rmm  # noqa F401
        except ImportError:
            raise ValueError(
                "RMM pool requested but module 'rmm' is not available. "
                "For installation instructions, please see "
                "https://github.com/rapidsai/rmm")  # pragma: no cover
        if rmm_async:
            raise ValueError(
                "RMM pool and managed memory are incompatible with asynchronous "
                "allocator")
        if rmm_pool_size is not None:
            rmm_pool_size = parse_bytes(rmm_pool_size)
            if rmm_maximum_pool_size is not None:
                rmm_maximum_pool_size = parse_bytes(rmm_maximum_pool_size)
    elif enable_nvlink:
        warnings.warn(
            "When using NVLink we recommend setting a "
            "`rmm_pool_size`. Please see: "
            "https://dask-cuda.readthedocs.io/en/latest/ucx.html"
            "#important-notes for more details")

    if enable_nvlink and rmm_managed_memory:
        raise ValueError(
            "RMM managed memory and NVLink are currently incompatible.")

    if _ucx_111 and net_devices == "auto":
        warnings.warn(
            "Starting with UCX 1.11, `ucx_net_devices='auto' is deprecated, "
            "it should now be left unspecified for the same behavior. "
            "Please make sure to read the updated UCX Configuration section in "
            "https://docs.rapids.ai/api/dask-cuda/nightly/ucx.html, "
            "where significant performance considerations for InfiniBand with "
            "UCX 1.11 and above is documented.",
        )

    # Ensure this parent dask-cuda-worker process uses the same UCX
    # configuration as child worker processes created by it.
    initialize(
        create_cuda_context=False,
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_infiniband=enable_infiniband,
        enable_nvlink=enable_nvlink,
        enable_rdmacm=enable_rdmacm,
        net_devices=net_devices,
        cuda_device_index=0,
    )

    self.jit_unspill = (
        dask.config.get("jit-unspill", default=False)
        if jit_unspill is None else jit_unspill
    )

    def data(device_index):
        # (class, kwargs) pair describing the worker's data store for the
        # given device index.
        options = {
            "device_memory_limit": parse_device_memory_limit(
                device_memory_limit, device_index=device_index),
            "memory_limit": memory_limit,
            "local_directory": local_directory,
        }
        if self.jit_unspill:
            options["shared_filesystem"] = shared_filesystem
            return (ProxifyHostFile, options)
        return (DeviceHostFile, options)

    def make_nanny(i):
        # Build the Nanny for the i-th visible device.
        return Nanny(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            dashboard=dashboard,
            dashboard_address=dashboard_address,
            http_prefix=dashboard_prefix,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            interface=_get_interface(interface, host, i, net_devices),
            host=host,
            preload=(list(preload) or []) + ["dask_cuda.initialize"],
            preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"],
            security=security,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            plugins={
                CPUAffinity(
                    get_cpu_affinity(
                        nvml_device_index(i, cuda_visible_devices(i)))),
                RMMSetup(
                    rmm_pool_size,
                    rmm_maximum_pool_size,
                    rmm_managed_memory,
                    rmm_async,
                    rmm_log_directory,
                ),
            },
            name=name if nprocs == 1 or name is None else str(name) + "-" + str(i),
            local_directory=local_directory,
            config={
                "distributed.comm.ucx": get_ucx_config(
                    enable_tcp_over_ucx=enable_tcp_over_ucx,
                    enable_infiniband=enable_infiniband,
                    enable_nvlink=enable_nvlink,
                    enable_rdmacm=enable_rdmacm,
                    net_devices=net_devices,
                    cuda_device_index=i,
                )
            },
            data=data(nvml_device_index(i, cuda_visible_devices(i))),
            worker_class=worker_class,
            **kwargs,
        )

    self.nannies = [make_nanny(i) for i in range(nprocs)]
def main(scheduler, host, worker_port, listen_address, contact_address,
         nanny_port, nthreads, nprocs, nanny, name, memory_limit, pid_file,
         reconnect, resources, bokeh, bokeh_port, local_directory,
         scheduler_file, interface, death_timeout, preload, preload_argv,
         bokeh_prefix, tls_ca_file, tls_cert, tls_key):
    # Worker launcher: validates the option combination, builds ``nprocs``
    # Nanny objects (or one bare Worker when ``nanny`` is falsy), starts them
    # on the computed address and polls until one of them reports "closed".
    # (Comments instead of a docstring, so ``__doc__`` stays empty.)
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(
        tls_ca_file=tls_ca_file,
        tls_worker_cert=tls_cert,
        tls_worker_key=tls_key,
    )

    def abort(message):
        # Log the reason and terminate with exit status 1.
        logger.error(message)
        exit(1)

    if nprocs > 1 and worker_port != 0:
        abort("Failed to launch worker. You cannot use the --port argument when nprocs > 1.")
    if nprocs > 1 and not nanny:
        abort("Failed to launch worker. You cannot use the --no-nanny argument when nprocs > 1.")
    if contact_address and not listen_address:
        abort("Failed to launch worker. "
              "Must specify --listen-address when --contact-address is given")
    if nprocs > 1 and listen_address:
        abort("Failed to launch worker. "
              "You cannot specify --listen-address when nprocs > 1.")
    if (worker_port or host) and listen_address:
        abort("Failed to launch worker. "
              "You cannot specify --listen-address when --worker-port or --host is given.")

    try:
        if listen_address:
            host, worker_port = get_address_host_port(listen_address, strict=True)
        if contact_address:
            # Parsed only to validate it; the pieces are thrown away.
            _, _ = get_address_host_port(contact_address, strict=True)
        else:
            # Without a contact address the listen address doubles as one.
            contact_address = listen_address
    except ValueError as e:
        abort("Failed to launch worker. " + str(e))

    port = nanny_port if nanny else worker_port

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}
    if bokeh:
        try:
            from distributed.bokeh.worker import BokehWorker
        except ImportError:
            pass
        else:
            if bokeh_prefix:
                services[('bokeh', bokeh_port)] = (
                    BokehWorker, {'prefix': bokeh_prefix})
            else:
                services[('bokeh', bokeh_port)] = BokehWorker

    # "A=1,B=2" or "A=1 B=2"  ->  {"A": 1.0, "B": 2.0}
    if resources:
        pairs = resources.replace(',', ' ').split()
        resources = valmap(float, dict(pair.split('=') for pair in pairs))
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs = dict(worker_port=worker_port, listen_address=listen_address)
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs['service_ports'] = {'nanny': nanny_port}
        t = Worker

    if not (scheduler or scheduler_file or 'scheduler-address' in config):
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        host = get_ip_interface(interface)

    # Choose appropriate address for scheduler (None) unless a host or port
    # was requested explicitly.
    addr = uri_from_host_port(host, port, 0) if (host or port) else None

    if death_timeout is not None:
        death_timeout = parse_timedelta(death_timeout, 's')

    def worker_name(index):
        # A single process, or no name at all, keeps the name untouched;
        # otherwise each process gets a "-<index>" suffix.
        if nprocs == 1 or not name:
            return name
        return name + '-' + str(index)

    nannies = [
        t(scheduler,
          scheduler_file=scheduler_file,
          ncores=nthreads,
          services=services,
          loop=loop,
          resources=resources,
          memory_limit=memory_limit,
          reconnect=reconnect,
          local_dir=local_directory,
          death_timeout=death_timeout,
          preload=preload,
          preload_argv=preload_argv,
          security=sec,
          contact_address=contact_address,
          name=worker_name(i),
          **kwargs)
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        if not nanny:
            return
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield [n._start(addr) for n in nannies]
        # Poll until at least one worker reports itself closed.
        while not any(n.status == 'closed' for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    device_memory_limit,
    pid_file,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    dashboard_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    **kwargs,
):
    # GPU worker launcher: creates one Nanny per GPU (one per entry in
    # CUDA_VISIBLE_DEVICES, or get_n_gpus() when that variable is unset),
    # starts them on the current IOLoop and waits until all have finished.
    # Raises ValueError when no scheduler address is available or when both
    # ``interface`` and ``host`` are given.
    # NOTE: the incoming ``**kwargs`` are replaced below, so extra keywords
    # never reach the Nanny constructor.
    # (Comments instead of a docstring, so ``__doc__`` stays empty.)
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(
        tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key
    )

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        # Default to a single thread per GPU worker.  The former expression,
        # ``min(1, cpu_count // nprocs)``, could never exceed 1 and dropped
        # to 0 on hosts with fewer CPU cores than GPUs.
        nthreads = 1

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            # Dashboard support is optional; run without it.
            pass
        else:
            if dashboard_prefix:
                result = (BokehWorker, {"prefix": dashboard_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    # "A=1,B=2" or "A=1 B=2"  ->  {"A": 1.0, "B": 2.0}
    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            host=host,
            # Copy into a list first: ``preload`` may be a tuple, and
            # ``tuple + list`` raises TypeError.
            preload=list(preload or []) + ["dask_cuda.initialize_context"],
            security=sec,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            # Suffix the name with the worker index when there are several.
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            data=(
                DeviceHostFile,
                {
                    # "auto" or 0 means: use the device's total memory.
                    "device_memory_limit": get_device_total_memory(index=i)
                    if (device_memory_limit == "auto" or device_memory_limit == int(0))
                    else parse_bytes(device_memory_limit),
                    "memory_limit": parse_memory_limit(
                        memory_limit, nthreads, total_cores=nprocs
                    ),
                    "local_directory": local_directory,
                },
            ),
            **kwargs,
        )
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        # Start every nanny, then wait for all of them to finish.
        yield nannies
        yield [n.finished() for n in nannies]

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
def main(scheduler, host, worker_port, listen_address, contact_address,
         nanny_port, nthreads, nprocs, nanny, name, pid_file, resources,
         dashboard, bokeh, bokeh_port, scheduler_file, dashboard_prefix,
         tls_ca_file, tls_cert, tls_key, dashboard_address, worker_class,
         preload_nanny, **kwargs):
    # Worker launcher: validates the option combination, builds ``nprocs``
    # Nanny objects (or one instance of ``worker_class`` when ``nanny`` is
    # falsy) and runs them on the current IOLoop until all have finished.
    # Extra ``**kwargs`` are forwarded to every worker constructor.
    # (Comments instead of a docstring, so ``__doc__`` stays empty.)

    # https://github.com/dask/distributed/issues/1653
    gc.set_threshold(*(3 * threshold for threshold in gc.get_threshold()))

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    # Deprecated --bokeh* flags override their --dashboard* replacements.
    if bokeh_port is not None:
        warnings.warn(
            "The --bokeh-port flag has been renamed to --dashboard-address. "
            "Consider adding ``--dashboard-address :%d`` " % bokeh_port)
        dashboard_address = bokeh_port
    if bokeh is not None:
        warnings.warn(
            "The --bokeh/--no-bokeh flag has been renamed to --dashboard/--no-dashboard. "
        )
        dashboard = bokeh

    # Security settings are passed on as a plain dict holding only the TLS
    # options that were actually supplied.
    tls_options = {
        "tls_ca_file": tls_ca_file,
        "tls_worker_cert": tls_cert,
        "tls_worker_key": tls_key,
    }
    sec = {
        key: value for key, value in tls_options.items() if value is not None
    }

    def abort(message):
        # Log the reason and terminate with exit status 1.
        logger.error(message)
        sys.exit(1)

    # A negative count is relative to the CPU count: -1 means CPU_COUNT.
    if nprocs < 0:
        nprocs = CPU_COUNT + 1 + nprocs

    if nprocs <= 0:
        abort(
            "Failed to launch worker. Must specify --nprocs so that there's at least one process."
        )
    if nprocs > 1 and not nanny:
        abort(
            "Failed to launch worker. You cannot use the --no-nanny argument when nprocs > 1."
        )
    if contact_address and not listen_address:
        abort("Failed to launch worker. "
              "Must specify --listen-address when --contact-address is given")
    if nprocs > 1 and listen_address:
        abort("Failed to launch worker. "
              "You cannot specify --listen-address when nprocs > 1.")
    if (worker_port or host) and listen_address:
        abort(
            "Failed to launch worker. "
            "You cannot specify --listen-address when --worker-port or --host is given."
        )

    try:
        if listen_address:
            host, worker_port = get_address_host_port(listen_address,
                                                      strict=True)
        if contact_address:
            # Parsed only to validate it; the pieces are thrown away.
            _, _ = get_address_host_port(contact_address, strict=True)
        else:
            # Without a contact address the listen address doubles as one.
            contact_address = listen_address
    except ValueError as e:
        abort("Failed to launch worker. " + str(e))

    port = nanny_port if nanny else worker_port

    if not nthreads:
        nthreads = CPU_COUNT // nprocs

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    # "A=1,B=2" or "A=1 B=2"  ->  {"A": 1.0, "B": 2.0}
    if resources:
        pairs = resources.replace(",", " ").split()
        resources = valmap(float, dict(pair.split("=") for pair in pairs))
    else:
        resources = None

    loop = IOLoop.current()

    worker_class = import_term(worker_class)

    if nanny:
        # The nanny is told which worker class to use and what to preload.
        kwargs["worker_class"] = worker_class
        kwargs["preload_nanny"] = preload_nanny
        kwargs.update(worker_port=worker_port, listen_address=listen_address)
        t = Nanny
    else:
        if nanny_port:
            kwargs["service_ports"] = {"nanny": nanny_port}
        t = worker_class

    if not (scheduler or scheduler_file
            or dask.config.get("scheduler-address", None) is not None):
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    # A numeric name is turned into an int; anything else is left alone.
    with suppress(TypeError, ValueError):
        name = int(name)

    inherited = os.environ.get("DASK_INTERNAL_INHERIT_CONFIG")
    if inherited is not None:
        config = deserialize_for_cli(inherited)
        # Update the global config given priority to the existing global config
        dask.config.update(dask.config.global_config, config, priority="old")

    def worker_name(index):
        # A single process, or no usable name, keeps the name untouched;
        # otherwise each process gets a "-<index>" suffix.
        if nprocs == 1 or name is None or name == "":
            return name
        return str(name) + "-" + str(index)

    nannies = [
        t(scheduler,
          scheduler_file=scheduler_file,
          nthreads=nthreads,
          loop=loop,
          resources=resources,
          security=sec,
          contact_address=contact_address,
          host=host,
          port=port,
          dashboard=dashboard,
          dashboard_address=dashboard_address,
          name=worker_name(i),
          **kwargs)
        for i in range(nprocs)
    ]

    async def close_all():
        # Unregister all workers from scheduler
        if not nanny:
            return
        await asyncio.gather(*[n.close(timeout=2) for n in nannies])

    signal_fired = False

    def on_signal(signum):
        nonlocal signal_fired
        signal_fired = True
        if signum != signal.SIGINT:
            logger.info("Exiting on signal %d", signum)
        return asyncio.ensure_future(close_all())

    async def run():
        # Start every worker, then wait for all of them to finish.
        await asyncio.gather(*nannies)
        await asyncio.gather(*[n.finished() for n in nannies])

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except TimeoutError:
        # We already log the exception in nanny / worker. Don't do it again.
        if not signal_fired:
            logger.info("Timed out starting worker")
        sys.exit(1)
    except KeyboardInterrupt:
        pass
    finally:
        logger.info("End worker")
def main(host, port, bokeh_port, show, _bokeh, bokeh_whitelist, bokeh_prefix,
         use_xheaders, pid_file, scheduler_file, interface, local_directory,
         preload, preload_argv, tls_ca_file, tls_cert, tls_key):
    """Start a scheduler and run its event loop until the loop stops.

    Argument validation and address resolution happen before any filesystem
    side effect, and a working directory created by this call is removed
    even when start-up fails part-way through.

    Parameters
    ----------
    host : str or None
        Host/address to listen on.  When falsy and any TLS option is given
        it defaults to ``'tls://'``.
    port : int or None
        Port handed to ``uri_from_host_port`` (default port 8786).
    bokeh_port : int
        Port under which the bokeh service is registered.
    show, bokeh_whitelist, use_xheaders
        Accepted for interface compatibility; not used by this function.
    _bokeh : bool
        Whether to try to register the ``BokehScheduler`` service.
    bokeh_prefix : str
        Passed to the bokeh service as its ``'prefix'`` option.
    pid_file : str or None
        If set, the current PID is written there and the file is removed
        at interpreter exit.
    scheduler_file : str or None
        Forwarded to ``Scheduler``.
    interface : str or None
        Network interface resolved to a host with ``get_ip_interface``.
    local_directory : str or None
        Directory put on ``sys.path`` and given to ``preload_modules``.  It
        is created when missing; a temporary directory is made when falsy.
        A directory created here is removed again on the way out.
    preload, preload_argv
        Preload modules and their arguments; when falsy they fall back to
        the ``distributed.scheduler.preload`` / ``preload-argv`` config.
    tls_ca_file, tls_cert, tls_key : str or None
        TLS material handed to ``Security``.

    Raises
    ------
    ValueError
        If both ``interface`` and a host are in effect.
    """
    # TLS options without an explicit host imply a TLS listener.
    if not host and (tls_ca_file or tls_cert or tls_key):
        host = 'tls://'

    # Reject conflicting options before touching the filesystem so that a
    # bad invocation leaves no PID file or temporary directory behind.
    # NOTE(review): because of the TLS default above, combining TLS options
    # with ``interface`` also ends up here -- confirm that this is intended.
    if interface and host:
        raise ValueError("Can not specify both interface and host")

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(tls_ca_file=tls_ca_file,
                   tls_scheduler_cert=tls_cert,
                   tls_scheduler_key=tls_key,
                   )

    if interface:
        host = get_ip_interface(interface)

    addr = uri_from_host_port(host, port, 8786)

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    # Remember whether the directory is ours to delete afterwards.
    local_directory_created = False
    if local_directory:
        if not os.path.exists(local_directory):
            os.mkdir(local_directory)
            local_directory_created = True
    else:
        local_directory = tempfile.mkdtemp(prefix='scheduler-')
        local_directory_created = True

    scheduler = None
    try:
        if local_directory not in sys.path:
            sys.path.insert(0, local_directory)

        if sys.platform.startswith('linux'):
            import resource   # module fails importing on Windows
            soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
            # Raise the soft open-file limit to at least half the hard limit.
            limit = max(soft, hard // 2)
            resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

        loop = IOLoop.current()
        logger.info('-' * 47)

        services = {}
        if _bokeh:
            try:
                from distributed.bokeh.scheduler import BokehScheduler
                services[('bokeh', bokeh_port)] = (BokehScheduler,
                                                   {'prefix': bokeh_prefix})
            except ImportError as error:
                # The dashboard is optional: log and carry on without it.
                if str(error).startswith('No module named'):
                    logger.info('Web dashboard not loaded. '
                                'Unable to import bokeh')
                else:
                    logger.info('Unable to import bokeh: %s', error)

        scheduler = Scheduler(loop=loop, services=services,
                              scheduler_file=scheduler_file,
                              security=sec)
        scheduler.start(addr)

        if not preload:
            preload = dask.config.get('distributed.scheduler.preload')
        if not preload_argv:
            preload_argv = dask.config.get(
                'distributed.scheduler.preload-argv')
        preload_modules(preload, parameter=scheduler,
                        file_dir=local_directory, argv=preload_argv)

        logger.info('Local Directory: %26s', local_directory)
        logger.info('-' * 47)

        install_signal_handlers(loop)

        loop.start()
        loop.close()
    finally:
        try:
            # The scheduler only exists if start-up got far enough.
            if scheduler is not None:
                scheduler.stop()
        finally:
            if local_directory_created:
                shutil.rmtree(local_directory)
        if scheduler is not None:
            logger.info("End scheduler at %r", addr)