def worker(nthreads=None, memory_limit=None):  # pragma: nocover
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if memory_limit is None:
        memory_limit = int(skein.properties.container_resources.memory * 2**20)
    if nthreads is None:
        nthreads = skein.properties.container_resources.vcores

    app_client = skein.ApplicationClient.from_current()

    scheduler = app_client.kv.wait('dask.scheduler').decode()

    loop = IOLoop.current()
    worker = Nanny(scheduler, loop=loop, memory_limit=memory_limit,
                   worker_port=0, nthreads=nthreads)

    async def cleanup():
        await worker.close(timeout=2)

    install_signal_handlers(loop, cleanup=cleanup)

    async def run():
        await worker
        await worker.finished()

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
def scheduler(argv=None):
    args = scheduler_parser.parse_args(argv)

    gateway = make_gateway_client()
    security = make_security()

    loop = IOLoop.current()
    install_signal_handlers(loop)
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if sys.platform.startswith("linux"):
        import resource  # module fails importing on Windows

        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    async def run():
        scheduler = await start_scheduler(
            gateway, security, adaptive_period=args.adaptive_period
        )
        await scheduler.finished()

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
async def start_worker(
    gateway,
    security,
    worker_name,
    nthreads=1,
    memory_limit="auto",
    local_directory="",
    nanny=True,
):
    loop = IOLoop.current()

    scheduler = await gateway.get_scheduler_address()

    typ = Nanny if nanny else Worker

    worker = typ(
        scheduler,
        loop=loop,
        nthreads=nthreads,
        memory_limit=memory_limit,
        security=security,
        name=worker_name,
        local_directory=local_directory,
    )

    if nanny:
        async def close(signalnum):
            await worker.close(timeout=2)

        install_signal_handlers(loop, cleanup=close)

    await worker
    return worker
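# A minimal driver sketch for start_worker() above (not from the original
# source): gateway/security come from make_gateway_client()/make_security()
# as in scheduler(), and "worker-0" is an arbitrary example name.
def _run_one_worker():  # hypothetical helper
    loop = IOLoop.current()
    worker = loop.run_sync(
        lambda: start_worker(make_gateway_client(), make_security(), "worker-0")
    )
    return worker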
def worker(nthreads=None, memory_limit=None):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if memory_limit is None:
        memory_limit = int(skein.properties.container_resources.memory * 2**20)
    if nthreads is None:
        nthreads = skein.properties.container_resources.vcores

    app_client = skein.ApplicationClient.from_current()

    scheduler = app_client.kv.wait('dask.scheduler').decode()

    loop = IOLoop.current()
    worker = Nanny(scheduler, ncores=nthreads, loop=loop,
                   memory_limit=memory_limit, worker_port=0)

    @gen.coroutine
    def close(signalnum):
        # Nanny._close is a coroutine; it must be yielded to actually run
        yield worker._close(timeout=2)

    install_signal_handlers(loop, cleanup=close)

    @gen.coroutine
    def run():
        yield worker._start(None)
        while worker.status != 'closed':
            yield gen.sleep(0.2)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
def scheduler():  # pragma: nocover
    app_client = skein.ApplicationClient.from_current()

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if sys.platform.startswith('linux'):
        import resource  # module fails importing on Windows

        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    addr = 'tcp://'

    loop = IOLoop.current()

    services = {}
    bokeh = False
    with ignoring(ImportError):
        try:
            from distributed.dashboard.scheduler import BokehScheduler
        except ImportError:
            # Old import location
            from distributed.bokeh.scheduler import BokehScheduler
        services[('bokeh', 0)] = (BokehScheduler, {})
        bokeh = True

    scheduler = Scheduler(loop=loop, services=services)
    scheduler.start(addr)
    install_signal_handlers(loop)

    # Set dask.dashboard before dask.scheduler since the YarnCluster object
    # waits on dask.scheduler only
    if bokeh:
        bokeh_port = scheduler.services['bokeh'].port
        bokeh_host = urlparse(scheduler.address).hostname
        bokeh_address = 'http://%s:%d' % (bokeh_host, bokeh_port)
        app_client.kv['dask.dashboard'] = bokeh_address.encode()
    app_client.kv['dask.scheduler'] = scheduler.address.encode()

    try:
        loop.start()
        loop.close()
    finally:
        scheduler.stop()
def start_worker(nthreads=None, memory_limit=None):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if memory_limit is None:
        memory_limit = int(skein.properties.container_resources.memory * 1e6)
    if nthreads is None:
        nthreads = skein.properties.container_resources.vcores

    app_client = skein.ApplicationClient.from_current()

    scheduler = app_client.kv.wait('dask.scheduler').decode()

    loop = IOLoop.current()

    # Until the config patch is merged, we can't use the nanny process since
    # there's no way to monkey patch config inside the forkserver process
    if hasattr(dask.config, 'PATH'):
        worker = Nanny(scheduler, ncores=nthreads, loop=loop,
                       memory_limit=memory_limit, worker_port=0)

        @gen.coroutine
        def close(signalnum):
            # Nanny._close is a coroutine; it must be yielded to actually run
            yield worker._close(timeout=2)

        install_signal_handlers(loop, cleanup=close)
    else:
        worker = Worker(scheduler, ncores=nthreads, loop=loop,
                        memory_limit=memory_limit)

    @gen.coroutine
    def run():
        yield worker._start(None)
        while worker.status != 'closed':
            yield gen.sleep(0.2)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
def main():
    app_client = skein.ApplicationClient.from_current()

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if sys.platform.startswith('linux'):
        import resource  # module fails importing on Windows

        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    addr = uri_from_host_port('', None, 0)

    loop = IOLoop.current()

    services = {}
    bokeh = False
    with ignoring(ImportError):
        from distributed.bokeh.scheduler import BokehScheduler
        services[('bokeh', 0)] = (BokehScheduler, {})
        bokeh = True

    scheduler = Scheduler(loop=loop, services=services)
    scheduler.start(addr)
    install_signal_handlers(loop)

    app_client.kv['dask.scheduler'] = scheduler.address.encode()

    if bokeh:
        bokeh_port = scheduler.services['bokeh'].port
        bokeh_host = urlparse(scheduler.address).hostname
        bokeh_address = 'http://%s:%d' % (bokeh_host, bokeh_port)
        app_client.kv['dask.dashboard'] = bokeh_address.encode()

    try:
        loop.start()
        loop.close()
    finally:
        scheduler.stop()
def scheduler(argv=None):
    scheduler_parser.parse_args(argv)

    gateway = make_gateway_client()
    security = make_security()

    loop = IOLoop.current()
    install_signal_handlers(loop)
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if sys.platform.startswith("linux"):
        import resource  # module fails importing on Windows

        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    loop.add_callback(start_scheduler, gateway, security)
    loop.start()
def scheduler():  # pragma: nocover
    app_client = skein.ApplicationClient.from_current()

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if sys.platform.startswith("linux"):
        import resource  # module fails importing on Windows

        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    loop = IOLoop.current()
    scheduler = Scheduler(loop=loop, dashboard_address=("", 0))
    install_signal_handlers(loop)

    def post_addresses():
        # Set dask.dashboard before dask.scheduler since the YarnCluster object
        # waits on dask.scheduler only
        if "dashboard" in scheduler.services:
            bokeh_port = scheduler.services["dashboard"].port
            bokeh_host = urlparse(scheduler.address).hostname
            bokeh_address = "http://%s:%d" % (bokeh_host, bokeh_port)
            app_client.kv["dask.dashboard"] = bokeh_address.encode()
        app_client.kv["dask.scheduler"] = scheduler.address.encode()

    async def run():
        await scheduler
        await loop.run_in_executor(None, post_addresses)
        await scheduler.finished()

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        scheduler.stop()
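# Client-side counterpart sketch (not from the original source): once the
# scheduler above has published its address into the skein key-value store,
# any process with access to the application can connect to it. `app_id` is
# a hypothetical YARN application id.
def _connect_client(app_id):  # hypothetical helper
    import skein
    from distributed import Client

    app = skein.Client().connect(app_id)
    address = app.kv.wait("dask.scheduler").decode()
    return Client(address)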
def main(host, port, bokeh_port, show, dashboard, bokeh, dashboard_prefix,
         use_xheaders, pid_file, tls_ca_file, tls_cert, tls_key,
         dashboard_address, **kwargs):
    g0, g1, g2 = gc.get_threshold()  # https://github.com/dask/distributed/issues/1653
    gc.set_threshold(g0 * 3, g1 * 3, g2 * 3)

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if bokeh_port is not None:
        warnings.warn(
            "The --bokeh-port flag has been renamed to --dashboard-address. "
            "Consider adding ``--dashboard-address :%d`` " % bokeh_port
        )
        dashboard_address = bokeh_port
    if bokeh is not None:
        warnings.warn(
            "The --bokeh/--no-bokeh flag has been renamed to --dashboard/--no-dashboard. "
        )
        dashboard = bokeh

    if port is None and (not host or not re.search(r":\d", host)):
        port = 8786

    sec = {
        k: v
        for k, v in [
            ("tls_ca_file", tls_ca_file),
            ("tls_scheduler_cert", tls_cert),
            ("tls_scheduler_key", tls_key),
        ]
        if v is not None
    }

    if "DASK_INTERNAL_INHERIT_CONFIG" in os.environ:
        config = deserialize_for_cli(os.environ["DASK_INTERNAL_INHERIT_CONFIG"])
        # Update the global config, giving priority to the existing global config
        dask.config.update(dask.config.global_config, config, priority="old")

    if not host and (tls_ca_file or tls_cert or tls_key):
        host = "tls://"

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    if sys.platform.startswith("linux"):
        import resource  # module fails importing on Windows

        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    loop = IOLoop.current()
    logger.info("-" * 47)

    scheduler = Scheduler(
        loop=loop,
        security=sec,
        host=host,
        port=port,
        dashboard=dashboard,
        dashboard_address=dashboard_address,
        http_prefix=dashboard_prefix,
        **kwargs,
    )
    logger.info("-" * 47)

    install_signal_handlers(loop)

    async def run():
        await scheduler
        await scheduler.finished()

    try:
        loop.run_sync(run)
    finally:
        scheduler.stop()
        logger.info("End scheduler at %r", scheduler.address)
def main(host, port, bokeh_port, show, _bokeh, bokeh_whitelist, bokeh_prefix,
         use_xheaders, pid_file, scheduler_file, interface, local_directory,
         preload, preload_argv, tls_ca_file, tls_cert, tls_key):
    logger = SchedulerLogger.getLogger()
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    log_metrics = EdasEnv.getBool("log.metrics", False)
    logger.info(f"Log Metrics: {log_metrics}")
    plugins = [EDASSchedulerPlugin(logger)] if log_metrics else []

    sec = Security(
        tls_ca_file=tls_ca_file,
        tls_scheduler_cert=tls_cert,
        tls_scheduler_key=tls_key,
    )

    if not host and (tls_ca_file or tls_cert or tls_key):
        host = 'tls://'

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    local_directory_created = False
    if local_directory:
        if not os.path.exists(local_directory):
            os.mkdir(local_directory)
            local_directory_created = True
    else:
        local_directory = tempfile.mkdtemp(prefix='scheduler-')
        local_directory_created = True
    if local_directory not in sys.path:
        sys.path.insert(0, local_directory)

    if sys.platform.startswith('linux'):
        import resource  # module fails importing on Windows

        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    addr = uri_from_host_port(host, port, 8786)

    loop = IOLoop.current()
    logger.info('-' * 47)

    services = {}
    if _bokeh:
        try:
            from distributed.bokeh.scheduler import BokehScheduler
            services[('bokeh', bokeh_port)] = (BokehScheduler,
                                               {'prefix': bokeh_prefix})
        except ImportError as error:
            if str(error).startswith('No module named'):
                logger.info('Web dashboard not loaded. Unable to import bokeh')
            else:
                logger.info('Unable to import bokeh: %s' % str(error))

    scheduler = Scheduler(loop=loop, services=services,
                          scheduler_file=scheduler_file, security=sec)
    for plugin in plugins:
        logger.info(f"@SP: Adding scheduler plugin: {plugin}")
        scheduler.add_plugin(plugin)
    scheduler.start(addr)

    comm = Comm(scheduler)
    comm.start()

    if not preload:
        preload = dask.config.get('distributed.scheduler.preload', {})
    if not preload_argv:
        preload_argv = dask.config.get('distributed.scheduler.preload-argv', {})
    preload_modules(preload, parameter=scheduler, file_dir=local_directory,
                    argv=preload_argv)

    logger.info('Local Directory: %26s', local_directory)
    logger.info('-' * 47)
    install_signal_handlers(loop)

    def shutdown_scheduler():
        comm.terminate()
        scheduler.stop()
        if local_directory_created:
            shutil.rmtree(local_directory)
        logger.info("End scheduler at %r", addr)

    def close_loop():
        loop.stop()
        loop.close()
        shutdown_scheduler()

    atexit.register(close_loop)

    try:
        loop.start()
        loop.close()
    finally:
        shutdown_scheduler()
def go():
    install_signal_handlers()
    check_python_3()
    main()
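# go() is the console-script entry point. A hedged sketch of how it would be
# wired up in setup.py (the module paths below are illustrative, not verbatim
# from this codebase):
#
#     entry_points={
#         "console_scripts": [
#             "dask-scheduler = distributed.cli.dask_scheduler:go",
#             "dask-worker = distributed.cli.dask_worker:go",
#         ]
#     }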
def main(host, port, bokeh_port, show, _bokeh, bokeh_whitelist, bokeh_prefix,
         use_xheaders, pid_file, scheduler_file, interface, local_directory,
         preload, preload_argv, tls_ca_file, tls_cert, tls_key):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(
        tls_ca_file=tls_ca_file,
        tls_scheduler_cert=tls_cert,
        tls_scheduler_key=tls_key,
    )

    if not host and (tls_ca_file or tls_cert or tls_key):
        host = 'tls://'

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    local_directory_created = False
    if local_directory:
        if not os.path.exists(local_directory):
            os.mkdir(local_directory)
            local_directory_created = True
    else:
        local_directory = tempfile.mkdtemp(prefix='scheduler-')
        local_directory_created = True
    if local_directory not in sys.path:
        sys.path.insert(0, local_directory)

    if sys.platform.startswith('linux'):
        import resource  # module fails importing on Windows

        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    addr = uri_from_host_port(host, port, 8786)

    loop = IOLoop.current()
    logger.info('-' * 47)

    services = {}
    if _bokeh:
        with ignoring(ImportError):
            from distributed.bokeh.scheduler import BokehScheduler
            services[('bokeh', bokeh_port)] = (BokehScheduler,
                                               {'prefix': bokeh_prefix})

    scheduler = Scheduler(loop=loop, services=services,
                          scheduler_file=scheduler_file, security=sec)
    scheduler.start(addr)

    if not preload:
        preload = dask.config.get('distributed.scheduler.preload')
    if not preload_argv:
        preload_argv = dask.config.get('distributed.scheduler.preload-argv')
    preload_modules(preload, parameter=scheduler, file_dir=local_directory,
                    argv=preload_argv)

    logger.info('Local Directory: %26s', local_directory)
    logger.info('-' * 47)
    install_signal_handlers(loop)

    try:
        loop.start()
        loop.close()
    finally:
        scheduler.stop()
        if local_directory_created:
            shutil.rmtree(local_directory)

        logger.info("End scheduler at %r", addr)
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    device_memory_limit,
    pid_file,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    dashboard_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    **kwargs,
):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(
        tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key
    )

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        nthreads = min(1, multiprocessing.cpu_count() // nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            pass
        else:
            if dashboard_prefix:
                result = (BokehWorker, {"prefix": dashboard_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            host=host,
            preload=(preload or []) + ["dask_cuda.initialize_context"],
            security=sec,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            data=(
                DeviceHostFile,
                {
                    "device_memory_limit": get_device_total_memory(index=i)
                    if (device_memory_limit == "auto" or device_memory_limit == int(0))
                    else parse_bytes(device_memory_limit),
                    "memory_limit": parse_memory_limit(
                        memory_limit, nthreads, total_cores=nprocs
                    ),
                    "local_directory": local_directory,
                },
            ),
            **kwargs,
        )
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield nannies
        yield [n.finished() for n in nannies]

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
def main(
    host,
    port,
    bokeh_port,
    show,
    dashboard,
    dashboard_prefix,
    use_xheaders,
    pid_file,
    scheduler_file,
    interface,
    protocol,
    local_directory,
    preload,
    preload_argv,
    tls_ca_file,
    tls_cert,
    tls_key,
    dashboard_address,
):
    g0, g1, g2 = gc.get_threshold()  # https://github.com/dask/distributed/issues/1653
    gc.set_threshold(g0 * 3, g1 * 3, g2 * 3)

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if bokeh_port is not None:
        warnings.warn(
            "The --bokeh-port flag has been renamed to --dashboard-address. "
            "Consider adding ``--dashboard-address :%d`` " % bokeh_port
        )
        dashboard_address = bokeh_port

    if port is None and (not host or not re.search(r":\d", host)):
        port = 8786

    sec = Security(tls_ca_file=tls_ca_file,
                   tls_scheduler_cert=tls_cert,
                   tls_scheduler_key=tls_key)

    if not host and (tls_ca_file or tls_cert or tls_key):
        host = "tls://"

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    local_directory_created = False
    if local_directory:
        if not os.path.exists(local_directory):
            os.mkdir(local_directory)
            local_directory_created = True
    else:
        local_directory = tempfile.mkdtemp(prefix="scheduler-")
        local_directory_created = True
    if local_directory not in sys.path:
        sys.path.insert(0, local_directory)

    if sys.platform.startswith("linux"):
        import resource  # module fails importing on Windows

        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    loop = IOLoop.current()
    logger.info("-" * 47)

    scheduler = Scheduler(
        loop=loop,
        scheduler_file=scheduler_file,
        security=sec,
        host=host,
        port=port,
        interface=interface,
        protocol=protocol,
        dashboard_address=dashboard_address if dashboard else None,
        service_kwargs={"dashboard": {"prefix": dashboard_prefix}},
    )
    scheduler.start()

    if not preload:
        preload = dask.config.get("distributed.scheduler.preload")
    if not preload_argv:
        preload_argv = dask.config.get("distributed.scheduler.preload-argv")
    preload_modules(preload, parameter=scheduler, file_dir=local_directory,
                    argv=preload_argv)

    logger.info("Local Directory: %26s", local_directory)
    logger.info("-" * 47)
    install_signal_handlers(loop)

    try:
        loop.start()
        loop.close()
    finally:
        scheduler.stop()
        if local_directory_created:
            shutil.rmtree(local_directory)

        logger.info("End scheduler at %r", scheduler.address)
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    pid_file,
    reconnect,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    preload_argv,
    bokeh_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(tls_ca_file=tls_ca_file,
                   tls_worker_cert=tls_cert,
                   tls_worker_key=tls_key)

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        nthreads = min(1, _ncores // nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            pass
        else:
            if bokeh_prefix:
                result = (BokehWorker, {"prefix": bokeh_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if host:
        addr = uri_from_host_port(host, 0, 0)
    else:
        # Choose appropriate address for scheduler
        addr = None

    if death_timeout is not None:
        death_timeout = parse_timedelta(death_timeout, "s")

    nannies = [
        t(scheduler,
          scheduler_file=scheduler_file,
          ncores=nthreads,
          services=services,
          loop=loop,
          resources=resources,
          memory_limit=memory_limit,
          reconnect=reconnect,
          local_dir=local_directory,
          death_timeout=death_timeout,
          preload=(preload or []) + ["dask_cuda.initialize_context"],
          preload_argv=preload_argv,
          security=sec,
          contact_address=None,
          env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
          name=name if nprocs == 1 or not name else name + "-" + str(i),
          **kwargs)
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield [n._start(addr) for n in nannies]
        while all(n.status != "closed" for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
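# Worked example of the per-GPU fan-out above (illustrative values, not from
# the original source): with CUDA_VISIBLE_DEVICES="0,1,2", nprocs == 3 and
# cuda_visible_devices(i) rotates the device list so each nanny's worker sees
# a different first (default) device:
#
#     cuda_visible_devices(0) -> "0,1,2"
#     cuda_visible_devices(1) -> "1,2,0"
#     cuda_visible_devices(2) -> "2,0,1"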
def main(
    host,
    port,
    bokeh_port,
    show,
    dashboard,
    bokeh,
    dashboard_prefix,
    use_xheaders,
    pid_file,
    local_directory,
    tls_ca_file,
    tls_cert,
    tls_key,
    dashboard_address,
    **kwargs
):
    g0, g1, g2 = gc.get_threshold()  # https://github.com/dask/distributed/issues/1653
    gc.set_threshold(g0 * 3, g1 * 3, g2 * 3)

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if bokeh_port is not None:
        warnings.warn(
            "The --bokeh-port flag has been renamed to --dashboard-address. "
            "Consider adding ``--dashboard-address :%d`` " % bokeh_port
        )
        dashboard_address = bokeh_port
    if bokeh is not None:
        warnings.warn(
            "The --bokeh/--no-bokeh flag has been renamed to --dashboard/--no-dashboard. "
        )
        dashboard = bokeh

    if port is None and (not host or not re.search(r":\d", host)):
        port = 8786

    sec = Security(
        **{
            k: v
            for k, v in [
                ("tls_ca_file", tls_ca_file),
                ("tls_scheduler_cert", tls_cert),
                ("tls_scheduler_key", tls_key),
            ]
            if v is not None
        }
    )

    if not host and (tls_ca_file or tls_cert or tls_key):
        host = "tls://"

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    local_directory_created = False
    if local_directory:
        if not os.path.exists(local_directory):
            os.mkdir(local_directory)
            local_directory_created = True
    else:
        local_directory = tempfile.mkdtemp(prefix="scheduler-")
        local_directory_created = True
    if local_directory not in sys.path:
        sys.path.insert(0, local_directory)

    if sys.platform.startswith("linux"):
        import resource  # module fails importing on Windows

        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    loop = IOLoop.current()
    logger.info("-" * 47)

    scheduler = Scheduler(
        loop=loop,
        security=sec,
        host=host,
        port=port,
        dashboard_address=dashboard_address if dashboard else None,
        service_kwargs={"dashboard": {"prefix": dashboard_prefix}},
        **kwargs,
    )

    logger.info("Local Directory: %26s", local_directory)
    logger.info("-" * 47)
    install_signal_handlers(loop)

    async def run():
        await scheduler
        await scheduler.finished()

    try:
        loop.run_sync(run)
    finally:
        scheduler.stop()
        if local_directory_created:
            shutil.rmtree(local_directory)

        logger.info("End scheduler at %r", scheduler.address)
def main(scheduler, host, worker_port, listen_address, contact_address,
         nanny_port, nthreads, nprocs, nanny, name, pid_file, resources,
         dashboard, bokeh, bokeh_port, scheduler_file, dashboard_prefix,
         tls_ca_file, tls_cert, tls_key, dashboard_address, worker_class,
         preload_nanny, **kwargs):
    g0, g1, g2 = gc.get_threshold()  # https://github.com/dask/distributed/issues/1653
    gc.set_threshold(g0 * 3, g1 * 3, g2 * 3)

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if bokeh_port is not None:
        warnings.warn(
            "The --bokeh-port flag has been renamed to --dashboard-address. "
            "Consider adding ``--dashboard-address :%d`` " % bokeh_port
        )
        dashboard_address = bokeh_port
    if bokeh is not None:
        warnings.warn(
            "The --bokeh/--no-bokeh flag has been renamed to --dashboard/--no-dashboard. "
        )
        dashboard = bokeh

    sec = {
        k: v
        for k, v in [
            ("tls_ca_file", tls_ca_file),
            ("tls_worker_cert", tls_cert),
            ("tls_worker_key", tls_key),
        ]
        if v is not None
    }

    if nprocs < 0:
        nprocs = CPU_COUNT + 1 + nprocs

    if nprocs <= 0:
        logger.error(
            "Failed to launch worker. Must specify --nprocs so that there's at least one process."
        )
        sys.exit(1)

    if nprocs > 1 and not nanny:
        logger.error(
            "Failed to launch worker. You cannot use the --no-nanny argument when nprocs > 1."
        )
        sys.exit(1)

    if contact_address and not listen_address:
        logger.error(
            "Failed to launch worker. "
            "Must specify --listen-address when --contact-address is given"
        )
        sys.exit(1)

    if nprocs > 1 and listen_address:
        logger.error(
            "Failed to launch worker. "
            "You cannot specify --listen-address when nprocs > 1."
        )
        sys.exit(1)

    if (worker_port or host) and listen_address:
        logger.error(
            "Failed to launch worker. "
            "You cannot specify --listen-address when --worker-port or --host is given."
        )
        sys.exit(1)

    try:
        if listen_address:
            (host, worker_port) = get_address_host_port(listen_address, strict=True)

        if contact_address:
            # we only need this to verify it is getting parsed
            (_, _) = get_address_host_port(contact_address, strict=True)
        else:
            # if contact address is not present we use the listen_address for contact
            contact_address = listen_address
    except ValueError as e:
        logger.error("Failed to launch worker. " + str(e))
        sys.exit(1)

    if nanny:
        port = nanny_port
    else:
        port = worker_port

    if not nthreads:
        nthreads = CPU_COUNT // nprocs

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    worker_class = import_term(worker_class)
    if nanny:
        kwargs["worker_class"] = worker_class
        kwargs["preload_nanny"] = preload_nanny

    if nanny:
        kwargs.update({"worker_port": worker_port, "listen_address": listen_address})
        t = Nanny
    else:
        if nanny_port:
            kwargs["service_ports"] = {"nanny": nanny_port}
        t = worker_class

    if (
        not scheduler
        and not scheduler_file
        and dask.config.get("scheduler-address", None) is None
    ):
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    with suppress(TypeError, ValueError):
        name = int(name)

    if "DASK_INTERNAL_INHERIT_CONFIG" in os.environ:
        config = deserialize_for_cli(os.environ["DASK_INTERNAL_INHERIT_CONFIG"])
        # Update the global config, giving priority to the existing global config
        dask.config.update(dask.config.global_config, config, priority="old")

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            loop=loop,
            resources=resources,
            security=sec,
            contact_address=contact_address,
            host=host,
            port=port,
            dashboard=dashboard,
            dashboard_address=dashboard_address,
            name=name
            if nprocs == 1 or name is None or name == ""
            else str(name) + "-" + str(i),
            **kwargs,
        )
        for i in range(nprocs)
    ]

    async def close_all():
        # Unregister all workers from scheduler
        if nanny:
            await asyncio.gather(*[n.close(timeout=2) for n in nannies])

    signal_fired = False

    def on_signal(signum):
        nonlocal signal_fired
        signal_fired = True
        if signum != signal.SIGINT:
            logger.info("Exiting on signal %d", signum)
        return asyncio.ensure_future(close_all())

    async def run():
        await asyncio.gather(*nannies)
        await asyncio.gather(*[n.finished() for n in nannies])

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except TimeoutError:
        # We already log the exception in nanny / worker. Don't do it again.
        if not signal_fired:
            logger.info("Timed out starting worker")
        sys.exit(1)
    except KeyboardInterrupt:
        pass
    finally:
        logger.info("End worker")
def main(host, port, bokeh_port, show, _bokeh, bokeh_whitelist, bokeh_prefix,
         use_xheaders, pid_file, scheduler_file, interface, local_directory,
         preload, preload_argv, tls_ca_file, tls_cert, tls_key):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(
        tls_ca_file=tls_ca_file,
        tls_scheduler_cert=tls_cert,
        tls_scheduler_key=tls_key,
    )

    if not host and (tls_ca_file or tls_cert or tls_key):
        host = 'tls://'

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    local_directory_created = False
    if local_directory:
        if not os.path.exists(local_directory):
            os.mkdir(local_directory)
            local_directory_created = True
    else:
        local_directory = tempfile.mkdtemp(prefix='scheduler-')
        local_directory_created = True
    if local_directory not in sys.path:
        sys.path.insert(0, local_directory)

    if sys.platform.startswith('linux'):
        import resource  # module fails importing on Windows

        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    addr = uri_from_host_port(host, port, 8786)

    loop = IOLoop.current()
    logger.info('-' * 47)

    services = {}
    if _bokeh:
        try:
            from distributed.bokeh.scheduler import BokehScheduler
            services[('bokeh', bokeh_port)] = (BokehScheduler,
                                               {'prefix': bokeh_prefix})
        except ImportError as error:
            if str(error).startswith('No module named'):
                logger.info('Web dashboard not loaded. Unable to import bokeh')
            else:
                logger.info('Unable to import bokeh: %s' % str(error))

    scheduler = Scheduler(loop=loop, services=services,
                          scheduler_file=scheduler_file, security=sec)
    scheduler.start(addr)

    if not preload:
        preload = dask.config.get('distributed.scheduler.preload')
    if not preload_argv:
        preload_argv = dask.config.get('distributed.scheduler.preload-argv')
    preload_modules(preload, parameter=scheduler, file_dir=local_directory,
                    argv=preload_argv)

    logger.info('Local Directory: %26s', local_directory)
    logger.info('-' * 47)
    install_signal_handlers(loop)

    try:
        loop.start()
        loop.close()
    finally:
        scheduler.stop()
        if local_directory_created:
            shutil.rmtree(local_directory)

        logger.info("End scheduler at %r", addr)
import asyncio

from tornado.ioloop import IOLoop

from distributed.cli.utils import install_signal_handlers

loop = IOLoop.current()
install_signal_handlers(loop)


class _AsyncTimedIterator:
    __slots__ = ("_iterator", "_timeout", "_sentinel")

    def __init__(self, iterable, timeout):
        self._iterator = iterable.__aiter__()
        self._timeout = timeout

    async def __anext__(self):
        return await asyncio.wait_for(self._iterator.__anext__(), self._timeout)


class AsyncTimedIterable:
    """Wrapper for an AsyncIterable that adds a timeout

    See https://stackoverflow.com/a/50245879/1003288
    """

    __slots__ = ("_factory",)

    # The original snippet ends at __slots__; the two methods below are a
    # plausible completion following the referenced Stack Overflow answer.
    def __init__(self, iterable, timeout=None):
        self._factory = lambda: _AsyncTimedIterator(iterable, timeout)

    def __aiter__(self):
        return self._factory()
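# Hedged usage sketch for AsyncTimedIterable: wrap any async iterator so a
# stalled producer raises asyncio.TimeoutError instead of hanging forever.
# `stream_of_events()` and `handle()` are hypothetical.
#
#     async def consume():
#         async for event in AsyncTimedIterable(stream_of_events(), timeout=5):
#             handle(event)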
def main(scheduler, host, worker_port, listen_address, contact_address,
         nanny_port, nthreads, nprocs, nanny, name, memory_limit, pid_file,
         reconnect, resources, bokeh, bokeh_port, local_directory,
         scheduler_file, interface, death_timeout, preload, preload_argv,
         bokeh_prefix, tls_ca_file, tls_cert, tls_key):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(
        tls_ca_file=tls_ca_file,
        tls_worker_cert=tls_cert,
        tls_worker_key=tls_key,
    )

    if nprocs > 1 and worker_port != 0:
        logger.error("Failed to launch worker. "
                     "You cannot use the --port argument when nprocs > 1.")
        exit(1)

    if nprocs > 1 and not nanny:
        logger.error("Failed to launch worker. "
                     "You cannot use the --no-nanny argument when nprocs > 1.")
        exit(1)

    if contact_address and not listen_address:
        logger.error("Failed to launch worker. "
                     "Must specify --listen-address when --contact-address is given")
        exit(1)

    if nprocs > 1 and listen_address:
        logger.error("Failed to launch worker. "
                     "You cannot specify --listen-address when nprocs > 1.")
        exit(1)

    if (worker_port or host) and listen_address:
        logger.error("Failed to launch worker. "
                     "You cannot specify --listen-address when --worker-port or --host is given.")
        exit(1)

    try:
        if listen_address:
            (host, worker_port) = get_address_host_port(listen_address, strict=True)

        if contact_address:
            # we only need this to verify it is getting parsed
            (_, _) = get_address_host_port(contact_address, strict=True)
        else:
            # if contact address is not present we use the listen_address for contact
            contact_address = listen_address
    except ValueError as e:
        logger.error("Failed to launch worker. " + str(e))
        exit(1)

    if nanny:
        port = nanny_port
    else:
        port = worker_port

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if bokeh:
        try:
            from distributed.bokeh.worker import BokehWorker
        except ImportError:
            pass
        else:
            if bokeh_prefix:
                result = (BokehWorker, {'prefix': bokeh_prefix})
            else:
                result = BokehWorker
            services[('bokeh', bokeh_port)] = result

    if resources:
        resources = resources.replace(',', ' ').split()
        resources = dict(pair.split('=') for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs = {'worker_port': worker_port, 'listen_address': listen_address}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs['service_ports'] = {'nanny': nanny_port}
        t = Worker

    if not scheduler and not scheduler_file and 'scheduler-address' not in config:
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if host or port:
        addr = uri_from_host_port(host, port, 0)
    else:
        # Choose appropriate address for scheduler
        addr = None

    if death_timeout is not None:
        death_timeout = parse_timedelta(death_timeout, 's')

    nannies = [
        t(scheduler, scheduler_file=scheduler_file, ncores=nthreads,
          services=services, loop=loop, resources=resources,
          memory_limit=memory_limit, reconnect=reconnect,
          local_dir=local_directory, death_timeout=death_timeout,
          preload=preload, preload_argv=preload_argv, security=sec,
          contact_address=contact_address,
          name=name if nprocs == 1 or not name else name + '-' + str(i),
          **kwargs)
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        if nanny:
            yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield [n._start(addr) for n in nannies]
        while all(n.status != 'closed' for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
def main(
    host,
    port,
    bokeh_port,
    show,
    _bokeh,
    bokeh_whitelist,
    bokeh_prefix,
    use_xheaders,
    pid_file,
    scheduler_file,
    interface,
    local_directory,
    preload,
    preload_argv,
    tls_ca_file,
    tls_cert,
    tls_key,
    dashboard_address,
):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if bokeh_port is not None:
        warnings.warn(
            "The --bokeh-port flag has been renamed to --dashboard-address. "
            "Consider adding ``--dashboard-address :%d`` " % bokeh_port
        )
        dashboard_address = bokeh_port

    sec = Security(tls_ca_file=tls_ca_file,
                   tls_scheduler_cert=tls_cert,
                   tls_scheduler_key=tls_key)

    if not host and (tls_ca_file or tls_cert or tls_key):
        host = "tls://"

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    local_directory_created = False
    if local_directory:
        if not os.path.exists(local_directory):
            os.mkdir(local_directory)
            local_directory_created = True
    else:
        local_directory = tempfile.mkdtemp(prefix="scheduler-")
        local_directory_created = True
    if local_directory not in sys.path:
        sys.path.insert(0, local_directory)

    if sys.platform.startswith("linux"):
        import resource  # module fails importing on Windows

        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    addr = uri_from_host_port(host, port, 8786)

    loop = IOLoop.current()
    logger.info("-" * 47)

    services = {}
    if _bokeh:
        try:
            from distributed.bokeh.scheduler import BokehScheduler
            services[("bokeh", dashboard_address)] = (
                BokehScheduler,
                {"prefix": bokeh_prefix},
            )
        except ImportError as error:
            if str(error).startswith("No module named"):
                logger.info("Web dashboard not loaded. Unable to import bokeh")
            else:
                logger.info("Unable to import bokeh: %s" % str(error))

    scheduler = Scheduler(loop=loop, services=services,
                          scheduler_file=scheduler_file, security=sec)
    scheduler.start(addr)

    if not preload:
        preload = dask.config.get("distributed.scheduler.preload")
    if not preload_argv:
        preload_argv = dask.config.get("distributed.scheduler.preload-argv")
    preload_modules(preload, parameter=scheduler, file_dir=local_directory,
                    argv=preload_argv)

    logger.info("Local Directory: %26s", local_directory)
    logger.info("-" * 47)
    install_signal_handlers(loop)

    try:
        loop.start()
        loop.close()
    finally:
        scheduler.stop()
        if local_directory_created:
            shutil.rmtree(local_directory)

        logger.info("End scheduler at %r", addr)
def run_cluster():
    description = 'Run a Dask Slurm cluster'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-q', '--queue', dest='queue', default=DEFAULT_QUEUE,
                        type=str,
                        help=(f'the Slurm queue to submit to. '
                              f'Default: {DEFAULT_QUEUE}'))
    parser.add_argument('-c', '--cores', dest='cores', type=int,
                        default=DEFAULT_NUM_CORES,
                        help=(f'the number of cores to use per job. '
                              f'Default: {DEFAULT_NUM_CORES}'))
    parser.add_argument('-m', '--memory', dest='memory', type=str,
                        default=DEFAULT_MEMORY,
                        help=(f'the amount of memory to use per job. '
                              f'Default: {DEFAULT_MEMORY}'))
    parser.add_argument('--minimum-workers', dest='minimum_workers', type=int,
                        default=DEFAULT_MINIMUM_WORKERS,
                        help=(f'the minimum number of workers to scale the '
                              f'cluster down to in the autoscale mode. '
                              f'Default: {DEFAULT_MINIMUM_WORKERS}'))
    parser.add_argument('--maximum-workers', dest='maximum_workers', type=int,
                        default=DEFAULT_MAXIMUM_WORKERS,
                        help=(f'the maximum number of workers to scale the '
                              f'cluster up to in the autoscale mode. '
                              f'Default: {DEFAULT_MAXIMUM_WORKERS}'))
    parser.add_argument('--address', dest='address', type=str,
                        default=DEFAULT_ADDRESS,
                        help=(f"the network address to be assigned to the "
                              f"cluster's scheduler. "
                              f'Default: {DEFAULT_ADDRESS}'))
    parser.add_argument('--port', dest='port', type=str, default=DEFAULT_PORT,
                        help=(f"the network port to be assigned to the "
                              f"cluster's scheduler. "
                              f'Default: {DEFAULT_PORT}'))
    args = parser.parse_args()

    cluster = dask_slurm_cluster(**args.__dict__)

    # Try to use dask's "distributed" logger.
    logger = logging.getLogger('distributed')
    logger.setLevel(logging.INFO)
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    logger.info(f'Starting cluster {cluster!r}\n{args.__dict__}')

    loop = IOLoop.current()
    install_signal_handlers(loop)

    async def run():
        await cluster
        await cluster.scheduler.finished()

    try:
        loop.run_sync(run)
    finally:
        logger.info(f'End cluster {cluster!r}')
        cluster.close()
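# Hedged invocation sketch for run_cluster() above: it is written as a console
# entry point, so a __main__ guard is enough to exercise it locally
# (dask_slurm_cluster and the DEFAULT_* constants are defined elsewhere in
# this module; the file name in the comment is illustrative).
if __name__ == '__main__':
    run_cluster()  # e.g. `python cluster.py --cores 4 --memory 16GB`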
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    device_memory_limit,
    rmm_pool_size,
    pid_file,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    dashboard_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    enable_tcp_over_ucx,
    enable_infiniband,
    enable_nvlink,
    enable_rdmacm,
    net_devices,
    **kwargs,
):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if tls_ca_file and tls_cert and tls_key:
        sec = Security(
            tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key
        )
    else:
        sec = None

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        nthreads = min(1, multiprocessing.cpu_count() // nprocs)

    memory_limit = parse_memory_limit(memory_limit, nthreads, total_cores=nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            pass
        else:
            if dashboard_prefix:
                result = (BokehWorker, {"prefix": dashboard_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    preload_argv = kwargs.get("preload_argv", [])
    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if rmm_pool_size is not None:
        try:
            import rmm  # noqa F401
        except ImportError:
            raise ValueError(
                "RMM pool requested but module 'rmm' is not available. "
                "For installation instructions, please see "
                "https://github.com/rapidsai/rmm"
            )  # pragma: no cover
        rmm_pool_size = parse_bytes(rmm_pool_size)

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            interface=get_ucx_net_devices(
                cuda_device_index=i,
                ucx_net_devices=net_devices,
                get_openfabrics=False,
                get_network=True,
            ),
            preload=(list(preload) or []) + ["dask_cuda.initialize"],
            preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"],
            security=sec,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            plugins={CPUAffinity(get_cpu_affinity(i)), RMMPool(rmm_pool_size)},
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            local_directory=local_directory,
            config={
                "ucx": get_ucx_config(
                    enable_tcp_over_ucx=enable_tcp_over_ucx,
                    enable_infiniband=enable_infiniband,
                    enable_nvlink=enable_nvlink,
                    enable_rdmacm=enable_rdmacm,
                    net_devices=net_devices,
                    cuda_device_index=i,
                )
            },
            data=(
                DeviceHostFile,
                {
                    "device_memory_limit": get_device_total_memory(index=i)
                    if (device_memory_limit == "auto" or device_memory_limit == int(0))
                    else parse_bytes(device_memory_limit),
                    "memory_limit": memory_limit,
                    "local_directory": local_directory,
                },
            ),
            **kwargs,
        )
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield nannies
        yield [n.finished() for n in nannies]

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    device_memory_limit,
    rmm_pool_size,
    rmm_managed_memory,
    pid_file,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    dashboard_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    enable_tcp_over_ucx,
    enable_infiniband,
    enable_nvlink,
    enable_rdmacm,
    net_devices,
    **kwargs,
):
    if tls_ca_file and tls_cert and tls_key:
        security = Security(
            tls_ca_file=tls_ca_file,
            tls_worker_cert=tls_cert,
            tls_worker_key=tls_key,
        )
    else:
        security = None

    worker = CUDAWorker(
        scheduler,
        host,
        nthreads,
        name,
        memory_limit,
        device_memory_limit,
        rmm_pool_size,
        rmm_managed_memory,
        pid_file,
        resources,
        dashboard,
        dashboard_address,
        local_directory,
        scheduler_file,
        interface,
        death_timeout,
        preload,
        dashboard_prefix,
        security,
        enable_tcp_over_ucx,
        enable_infiniband,
        enable_nvlink,
        enable_rdmacm,
        net_devices,
        **kwargs,
    )

    async def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        await worker.close()

    async def run():
        await worker
        await worker.finished()

    loop = IOLoop.current()
    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
def main(scheduler, host, worker_port, listen_address, contact_address,
         nanny_port, nthreads, nprocs, nanny, name, pid_file, resources,
         dashboard, bokeh, bokeh_port, scheduler_file, dashboard_prefix,
         tls_ca_file, tls_cert, tls_key, dashboard_address, **kwargs):
    g0, g1, g2 = gc.get_threshold()  # https://github.com/dask/distributed/issues/1653
    gc.set_threshold(g0 * 3, g1 * 3, g2 * 3)

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if bokeh_port is not None:
        warnings.warn(
            "The --bokeh-port flag has been renamed to --dashboard-address. "
            "Consider adding ``--dashboard-address :%d`` " % bokeh_port
        )
        dashboard_address = bokeh_port
    if bokeh is not None:
        warnings.warn(
            "The --bokeh/--no-bokeh flag has been renamed to --dashboard/--no-dashboard. "
        )
        dashboard = bokeh

    sec = Security(
        **{
            k: v
            for k, v in [
                ("tls_ca_file", tls_ca_file),
                ("tls_worker_cert", tls_cert),
                ("tls_worker_key", tls_key),
            ]
            if v is not None
        }
    )

    if nprocs > 1 and worker_port != 0:
        logger.error(
            "Failed to launch worker. You cannot use the --port argument when nprocs > 1."
        )
        exit(1)

    if nprocs > 1 and not nanny:
        logger.error(
            "Failed to launch worker. You cannot use the --no-nanny argument when nprocs > 1."
        )
        exit(1)

    if contact_address and not listen_address:
        logger.error(
            "Failed to launch worker. "
            "Must specify --listen-address when --contact-address is given"
        )
        exit(1)

    if nprocs > 1 and listen_address:
        logger.error(
            "Failed to launch worker. "
            "You cannot specify --listen-address when nprocs > 1."
        )
        exit(1)

    if (worker_port or host) and listen_address:
        logger.error(
            "Failed to launch worker. "
            "You cannot specify --listen-address when --worker-port or --host is given."
        )
        exit(1)

    try:
        if listen_address:
            (host, worker_port) = get_address_host_port(listen_address, strict=True)

        if contact_address:
            # we only need this to verify it is getting parsed
            (_, _) = get_address_host_port(contact_address, strict=True)
        else:
            # if contact address is not present we use the listen_address for contact
            contact_address = listen_address
    except ValueError as e:
        logger.error("Failed to launch worker. " + str(e))
        exit(1)

    if nanny:
        port = nanny_port
    else:
        port = worker_port

    if not nthreads:
        nthreads = CPU_COUNT // nprocs

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs.update({"worker_port": worker_port, "listen_address": listen_address})
        t = Nanny
    else:
        if nanny_port:
            kwargs["service_ports"] = {"nanny": nanny_port}
        t = Worker

    if (
        not scheduler
        and not scheduler_file
        and dask.config.get("scheduler-address", None) is None
    ):
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    with ignoring(TypeError, ValueError):
        name = int(name)

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            loop=loop,
            resources=resources,
            security=sec,
            contact_address=contact_address,
            host=host,
            port=port,
            dashboard_address=dashboard_address if dashboard else None,
            service_kwargs={"dashboard": {"prefix": dashboard_prefix}},
            name=name
            if nprocs == 1 or name is None or name == ""
            else str(name) + "-" + str(i),
            **kwargs,
        )
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        if nanny:
            yield [n.close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield nannies
        yield [n.finished() for n in nannies]

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except TimeoutError:
        # We already log the exception in nanny / worker. Don't do it again.
        raise TimeoutError("Timed out starting worker.") from None
    except KeyboardInterrupt:
        pass
    finally:
        logger.info("End worker")
def main(host, port, bokeh_port, bokeh_internal_port, show, _bokeh,
         bokeh_whitelist, bokeh_prefix, use_xheaders, pid_file, scheduler_file,
         interface, local_directory, preload, prefix, tls_ca_file, tls_cert,
         tls_key):
    if bokeh_internal_port:
        print("The --bokeh-internal-port keyword has been removed.\n"
              "The internal bokeh server is now the default bokeh server.\n"
              "Use --bokeh-port %d instead" % bokeh_internal_port)
        sys.exit(1)

    if prefix:
        print("The --prefix keyword has moved to --bokeh-prefix")
        sys.exit(1)

    sec = Security(
        tls_ca_file=tls_ca_file,
        tls_scheduler_cert=tls_cert,
        tls_scheduler_key=tls_key,
    )

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    local_directory_created = False
    if local_directory:
        if not os.path.exists(local_directory):
            os.mkdir(local_directory)
            local_directory_created = True
    else:
        local_directory = tempfile.mkdtemp(prefix='scheduler-')
        local_directory_created = True
    if local_directory not in sys.path:
        sys.path.insert(0, local_directory)

    if sys.platform.startswith('linux'):
        import resource  # module fails importing on Windows

        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    addr = uri_from_host_port(host, port, 8786)

    loop = IOLoop.current()
    logger.info('-' * 47)

    services = {}
    if _bokeh:
        with ignoring(ImportError):
            from distributed.bokeh.scheduler import BokehScheduler
            services[('bokeh', bokeh_port)] = partial(BokehScheduler,
                                                      prefix=bokeh_prefix)

    scheduler = Scheduler(loop=loop, services=services,
                          scheduler_file=scheduler_file, security=sec)
    scheduler.start(addr)

    preload_modules(preload, parameter=scheduler, file_dir=local_directory)

    logger.info('Local Directory: %26s', local_directory)
    logger.info('-' * 47)
    install_signal_handlers(loop)

    try:
        loop.start()
        loop.close()
    finally:
        scheduler.stop()
        if local_directory_created:
            shutil.rmtree(local_directory)

        logger.info("End scheduler at %r", addr)
def main(
    scheduler,
    host,
    worker_port,
    listen_address,
    contact_address,
    nanny_port,
    nthreads,
    nprocs,
    nanny,
    name,
    memory_limit,
    pid_file,
    reconnect,
    resources,
    dashboard,
    bokeh_port,
    local_directory,
    scheduler_file,
    interface,
    protocol,
    death_timeout,
    preload,
    preload_argv,
    dashboard_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    dashboard_address,
):
    g0, g1, g2 = gc.get_threshold()  # https://github.com/dask/distributed/issues/1653
    gc.set_threshold(g0 * 3, g1 * 3, g2 * 3)

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if bokeh_port is not None:
        warnings.warn(
            "The --bokeh-port flag has been renamed to --dashboard-address. "
            "Consider adding ``--dashboard-address :%d`` " % bokeh_port
        )
        dashboard_address = bokeh_port

    sec = Security(tls_ca_file=tls_ca_file,
                   tls_worker_cert=tls_cert,
                   tls_worker_key=tls_key)

    if nprocs > 1 and worker_port != 0:
        logger.error(
            "Failed to launch worker. You cannot use the --port argument when nprocs > 1."
        )
        exit(1)

    if nprocs > 1 and not nanny:
        logger.error(
            "Failed to launch worker. You cannot use the --no-nanny argument when nprocs > 1."
        )
        exit(1)

    if contact_address and not listen_address:
        logger.error(
            "Failed to launch worker. "
            "Must specify --listen-address when --contact-address is given"
        )
        exit(1)

    if nprocs > 1 and listen_address:
        logger.error(
            "Failed to launch worker. "
            "You cannot specify --listen-address when nprocs > 1."
        )
        exit(1)

    if (worker_port or host) and listen_address:
        logger.error(
            "Failed to launch worker. "
            "You cannot specify --listen-address when --worker-port or --host is given."
        )
        exit(1)

    try:
        if listen_address:
            (host, worker_port) = get_address_host_port(listen_address, strict=True)

        if contact_address:
            # we only need this to verify it is getting parsed
            (_, _) = get_address_host_port(contact_address, strict=True)
        else:
            # if contact address is not present we use the listen_address for contact
            contact_address = listen_address
    except ValueError as e:
        logger.error("Failed to launch worker. " + str(e))
        exit(1)

    if nanny:
        port = nanny_port
    else:
        port = worker_port

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs = {"worker_port": worker_port, "listen_address": listen_address}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs["service_ports"] = {"nanny": nanny_port}
        t = Worker

    if (
        not scheduler
        and not scheduler_file
        and dask.config.get("scheduler-address", None) is None
    ):
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if death_timeout is not None:
        death_timeout = parse_timedelta(death_timeout, "s")

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            ncores=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            reconnect=reconnect,
            local_dir=local_directory,
            death_timeout=death_timeout,
            preload=preload,
            preload_argv=preload_argv,
            security=sec,
            contact_address=contact_address,
            interface=interface,
            protocol=protocol,
            host=host,
            port=port,
            dashboard_address=dashboard_address if dashboard else None,
            # "bokhe" in the original was a typo for "bokeh"
            service_kwargs={"bokeh": {"prefix": dashboard_prefix}},
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            **kwargs,
        )
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        if nanny:
            yield [n.close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield nannies
        while all(n.status != "closed" for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    device_memory_limit,
    rmm_pool_size,
    rmm_managed_memory,
    pid_file,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    dashboard_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    enable_tcp_over_ucx,
    enable_infiniband,
    enable_nvlink,
    enable_rdmacm,
    net_devices,
    enable_jit_unspill,
    **kwargs,
):
    if tls_ca_file and tls_cert and tls_key:
        security = Security(
            tls_ca_file=tls_ca_file,
            tls_worker_cert=tls_cert,
            tls_worker_key=tls_key,
        )
    else:
        security = None

    if isinstance(scheduler, str) and scheduler.startswith("-"):
        raise ValueError(
            "The scheduler address can't start with '-'. Please check "
            "your command line arguments; you probably attempted to use "
            "an unsupported one. Scheduler address: %s" % scheduler
        )

    worker = CUDAWorker(
        scheduler,
        host,
        nthreads,
        name,
        memory_limit,
        device_memory_limit,
        rmm_pool_size,
        rmm_managed_memory,
        pid_file,
        resources,
        dashboard,
        dashboard_address,
        local_directory,
        scheduler_file,
        interface,
        death_timeout,
        preload,
        dashboard_prefix,
        security,
        enable_tcp_over_ucx,
        enable_infiniband,
        enable_nvlink,
        enable_rdmacm,
        net_devices,
        enable_jit_unspill,
        **kwargs,
    )

    async def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        await worker.close()

    async def run():
        await worker
        await worker.finished()

    loop = IOLoop.current()
    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
def main(scheduler, host, worker_port, listen_address, contact_address,
         nanny_port, nthreads, nprocs, nanny, name, memory_limit, pid_file,
         reconnect, resources, bokeh, bokeh_port, local_directory,
         scheduler_file, interface, death_timeout, preload, bokeh_prefix,
         tls_ca_file, tls_cert, tls_key):
    sec = Security(
        tls_ca_file=tls_ca_file,
        tls_worker_cert=tls_cert,
        tls_worker_key=tls_key,
    )

    if nprocs > 1 and worker_port != 0:
        logger.error(
            "Failed to launch worker. You cannot use the --port argument when nprocs > 1."
        )
        exit(1)

    if nprocs > 1 and name:
        logger.error(
            "Failed to launch worker. You cannot use the --name argument when nprocs > 1."
        )
        exit(1)

    if nprocs > 1 and not nanny:
        logger.error(
            "Failed to launch worker. You cannot use the --no-nanny argument when nprocs > 1."
        )
        exit(1)

    if contact_address and not listen_address:
        logger.error(
            "Failed to launch worker. "
            "Must specify --listen-address when --contact-address is given"
        )
        exit(1)

    if nprocs > 1 and listen_address:
        logger.error(
            "Failed to launch worker. "
            "You cannot specify --listen-address when nprocs > 1."
        )
        exit(1)

    if (worker_port or host) and listen_address:
        logger.error(
            "Failed to launch worker. "
            "You cannot specify --listen-address when --worker-port or --host is given."
        )
        exit(1)

    try:
        if listen_address:
            (host, worker_port) = get_address_host_port(listen_address, strict=True)

        if contact_address:
            # we only need this to verify it is getting parsed
            (_, _) = get_address_host_port(contact_address, strict=True)
        else:
            # if contact address is not present we use the listen_address for contact
            contact_address = listen_address
    except ValueError as e:
        logger.error("Failed to launch worker. " + str(e))
        exit(1)

    if nanny:
        port = nanny_port
    else:
        port = worker_port

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if bokeh:
        try:
            from distributed.bokeh.worker import BokehWorker
        except ImportError:
            pass
        else:
            if bokeh_prefix:
                result = (BokehWorker, {'prefix': bokeh_prefix})
            else:
                result = BokehWorker
            services[('bokeh', bokeh_port)] = result

    if resources:
        resources = resources.replace(',', ' ').split()
        resources = dict(pair.split('=') for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs = {'worker_port': worker_port, 'listen_address': listen_address}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs['service_ports'] = {'nanny': nanny_port}
        t = Worker

    if not scheduler and not scheduler_file:
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if host or port:
        addr = uri_from_host_port(host, port, 0)
    else:
        # Choose appropriate address for scheduler
        addr = None

    nannies = [
        t(scheduler, scheduler_file=scheduler_file, ncores=nthreads,
          services=services, name=name, loop=loop, resources=resources,
          memory_limit=memory_limit, reconnect=reconnect,
          local_dir=local_directory, death_timeout=death_timeout,
          preload=preload, security=sec, contact_address=contact_address,
          **kwargs)
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        if nanny:
            yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield [n.start(addr) for n in nannies]
        while all(n.status != 'closed' for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
def main(
    fargate,
    fargate_scheduler,
    fargate_workers,
    image,
    scheduler_cpu,
    scheduler_mem,
    scheduler_timeout,
    worker_cpu,
    worker_mem,
    n_workers,
    cluster_arn,
    cluster_name_template,
    execution_role_arn,
    task_role_arn,
    task_role_policy,
    cloudwatch_logs_group,
    cloudwatch_logs_stream_prefix,
    cloudwatch_logs_default_retention,
    vpc,
    subnet,
    security_group,
    environment,
    tag,
    skip_cleanup,
):
    tag = {v.split("=")[0]: v.split("=")[1] for v in tag} if tag else None
    environment = (
        {v.split("=")[0]: v.split("=")[1] for v in environment}
        if environment
        else None
    )
    subnet = subnet or None
    security_group = security_group or None
    task_role_policy = task_role_policy or None

    logger.info("Starting ECS cluster")
    try:
        cluster = ECSCluster(
            fargate_scheduler=fargate_scheduler or fargate,
            fargate_workers=fargate_workers or fargate,
            image=image,
            scheduler_cpu=scheduler_cpu,
            scheduler_mem=scheduler_mem,
            scheduler_timeout=scheduler_timeout,
            worker_cpu=worker_cpu,
            worker_mem=worker_mem,
            n_workers=n_workers,
            cluster_arn=cluster_arn,
            cluster_name_template=cluster_name_template,
            execution_role_arn=execution_role_arn,
            task_role_arn=task_role_arn,
            task_role_policies=task_role_policy,
            cloudwatch_logs_group=cloudwatch_logs_group,
            cloudwatch_logs_stream_prefix=cloudwatch_logs_stream_prefix,
            cloudwatch_logs_default_retention=cloudwatch_logs_default_retention,
            vpc=vpc,
            subnets=subnet,
            security_groups=security_group,
            environment=environment,
            tags=tag,
            skip_cleanup=skip_cleanup,
        )
    except Exception as e:
        ctx = click.get_current_context()
        logger.error(str(e) + "\n")
        click.echo(ctx.get_help())
        sys.exit(1)

    async def run():
        logger.info("Ready")
        while cluster.status != "closed":
            await sleep(0.2)

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        cluster.close(timeout=2)

    loop = IOLoop.current()
    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        logger.info("Shutting down")
    finally:
        logger.info("End dask-ecs")
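# Hedged CLI sketch: this main() is driven by click options, so a typical
# invocation might look like the following (flag names inferred from the
# parameters above, not verified against the actual option declarations):
#
#     dask-ecs --fargate --image daskdev/dask:latest --n-workers 4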