def test_workdir_simple(tmpdir):
    """Exercise the nominal WorkSpace / WorkDir lifecycle.

    Covers explicitly named directories, a purge while they are still held,
    explicit release, release through garbage collection, and generated
    names built from a prefix.
    """
    base_dir = str(tmpdir)

    def check(expected):
        # Compare the on-disk contents of ``base_dir`` with ``expected``
        assert_directory_contents(base_dir, expected)

    workspace = WorkSpace(base_dir)
    check([])

    # Named work directories each come with a companion lock file
    wd_a = workspace.new_work_dir(name='aa')
    check(['aa', 'aa.dirlock'])
    wd_b = workspace.new_work_dir(name='bb')
    check(['aa', 'aa.dirlock', 'bb', 'bb.dirlock'])

    # Purging must leave directories that are still held untouched
    workspace._purge_leftovers()
    check(['aa', 'aa.dirlock', 'bb', 'bb.dirlock'])

    # Explicit release removes the directory and its lock
    wd_a.release()
    check(['bb', 'bb.dirlock'])

    # Dropping the last reference has the same effect
    del wd_b
    gc.collect()
    check([])

    # Generated temporary name with a prefix
    wd_a = workspace.new_work_dir(prefix='foo-')
    wd_b = workspace.new_work_dir(prefix='bar-')
    wd_c = workspace.new_work_dir(prefix='bar-')
    check({
        path
        for wd in (wd_a, wd_b, wd_c)
        for path in (wd.dir_path, wd._lock_path)
    })

    expected_prefixes = ((wd_a, 'foo-'), (wd_b, 'bar-'), (wd_c, 'bar-'))
    for wd, prefix in expected_prefixes:
        assert os.path.basename(wd.dir_path).startswith(prefix)
    # Two directories sharing a prefix still get distinct paths
    assert wd_b.dir_path != wd_c.dir_path
def _test_workspace_concurrency(tmpdir, timeout, max_procs):
    """
    Stress a WorkSpace from several processes at once.

    The parent keeps creating work directories and detaching their
    finalizers for ``timeout`` seconds while child processes running
    ``_workspace_concurrency`` try to purge them.  We merely check that no
    exception or deadlock happens.

    Returns the ``(n_created, n_purged)`` pair.
    """
    base_dir = str(tmpdir)

    err_q = mp_context.Queue()
    purged_q = mp_context.Queue()
    stop_evt = mp_context.Event()

    ws = WorkSpace(base_dir)
    # Make sure purging only happens in the child processes
    ws._purge_leftovers = lambda: None

    # Run a bunch of child processes that will try to purge concurrently
    n_children = max_procs if sys.platform != 'win32' else 2
    child_args = (base_dir, purged_q, err_q, stop_evt)
    children = [
        mp_context.Process(target=_workspace_concurrency, args=child_args)
        for _ in range(n_children)
    ]
    for child in children:
        child.start()

    n_created = n_purged = 0
    try:
        started = time()
        while time() - started < timeout:
            # Add a bunch of locks, and simulate forgetting them.
            # The concurrent processes should try to purge them.
            for _ in range(50):
                forgotten = ws.new_work_dir(prefix='workspace-concurrency-')
                forgotten._finalizer.detach()
                n_created += 1
            sleep(1e-2)
    finally:
        # Always stop and reap the children, even if creation failed
        stop_evt.set()
        for child in children:
            child.join()

    # Any errors?
    try:
        err = err_q.get_nowait()
    except Empty:
        pass
    else:
        raise err

    # Sum the purge counts reported through the queue
    while True:
        try:
            n_purged += purged_q.get_nowait()
        except Empty:
            break

    # We attempted to purge most directories at some point
    assert n_purged >= 0.5 * n_created > 0
    return n_created, n_purged
def test_workspace_rmtree_failure(tmpdir):
    """Releasing a work dir whose directory is already gone logs errors."""
    workspace = WorkSpace(str(tmpdir))
    work_dir = workspace.new_work_dir(name='aa')

    # Remove the directory behind the WorkDir's back so release() fails
    shutil.rmtree(work_dir.dir_path)

    with captured_logger('distributed.diskutils', 'ERROR',
                         propagate=False) as sio:
        work_dir.release()

    logged = sio.getvalue().splitlines()
    expected_prefix = "Failed to remove %r" % (work_dir.dir_path,)

    # shutil.rmtree() may call its onerror callback several times
    assert logged
    assert all(entry.startswith(expected_prefix) for entry in logged)
def test_locking_disabled(tmpdir):
    """With file locking disabled, no ``.dirlock`` files are created and
    ``locket.lock_file`` is never called."""
    base_dir = str(tmpdir)
    check = functools.partial(assert_directory_contents, base_dir)

    no_locking = dask.config.set({'distributed.worker.use-file-locking': False})
    patched_lock = mock.patch('distributed.diskutils.locket.lock_file')

    with no_locking, patched_lock as lock_file:
        workspace = WorkSpace(base_dir)
        check([])

        # Only the directories themselves appear, without lock files
        first = workspace.new_work_dir(name='aa')
        check(['aa'])
        second = workspace.new_work_dir(name='bb')
        check(['aa', 'bb'])

        # Purging leaves directories that are still held alone
        workspace._purge_leftovers()
        check(['aa', 'bb'])

        first.release()
        check(['bb'])

        # Dropping the last reference also removes the directory
        del second
        gc.collect()
        check([])

        lock_file.assert_not_called()
def test_two_workspaces_in_same_directory(tmpdir):
    """Two WorkSpace instances handling the same directory should work ok.

    A purge issued by the second instance must not remove directories held
    through the first one.
    """
    base_dir = str(tmpdir)
    check = functools.partial(assert_directory_contents, base_dir)

    ws = WorkSpace(base_dir)
    check([])
    a = ws.new_work_dir(name="aa")
    check(["aa", "aa.dirlock"])

    # A second workspace over the same directory purges nothing that is held
    ws2 = WorkSpace(base_dir)
    ws2._purge_leftovers()
    check(["aa", "aa.dirlock"])

    b = ws.new_work_dir(name="bb")
    check(["aa", "aa.dirlock", "bb", "bb.dirlock"])

    # Dropping the first workspace and ``b`` only cleans up ``bb``
    del ws, b
    gc.collect()
    check(["aa", "aa.dirlock"], trials=5)

    # Dropping ``a`` empties the directory
    del a
    gc.collect()
    check([], trials=5)
def test_locking_disabled(tmpdir):
    """Check WorkSpace behaviour when file locking is switched off.

    Directories are created and removed as usual, but no lock files show up
    and ``locket.lock_file`` is never invoked.
    """
    root = str(tmpdir)

    def expect(names):
        # Compare the on-disk contents of ``root`` with ``names``
        assert_directory_contents(root, names)

    with dask.config.set(
        {"distributed.worker.use-file-locking": False}
    ), mock.patch("distributed.diskutils.locket.lock_file") as lock_file:
        space = WorkSpace(root)
        expect([])

        dir_a = space.new_work_dir(name="aa")
        expect(["aa"])
        dir_b = space.new_work_dir(name="bb")
        expect(["aa", "bb"])

        # Held directories survive a purge
        space._purge_leftovers()
        expect(["aa", "bb"])

        # Explicit release, then release through garbage collection
        dir_a.release()
        expect(["bb"])
        del dir_b
        gc.collect()
        expect([])

        lock_file.assert_not_called()
def test_locking_disabled(tmpdir):
    """With 'use-file-locking' set to False, work dirs get no lock files
    and ``locket.lock_file`` is never called."""
    base_dir = str(tmpdir)
    verify = functools.partial(assert_directory_contents, base_dir)

    with new_config({'use-file-locking': False}), \
            mock.patch('distributed.diskutils.locket.lock_file') as lock_file:
        workspace = WorkSpace(base_dir)
        verify([])

        # Directories are created without companion '.dirlock' files
        wd_a = workspace.new_work_dir(name='aa')
        verify(['aa'])
        wd_b = workspace.new_work_dir(name='bb')
        verify(['aa', 'bb'])

        # A purge does not touch directories that are still held
        workspace._purge_leftovers()
        verify(['aa', 'bb'])

        wd_a.release()
        verify(['bb'])

        # Garbage-collecting the WorkDir removes its directory as well
        del wd_b
        gc.collect()
        verify([])

        lock_file.assert_not_called()
def test_two_workspaces_in_same_directory(tmpdir):
    """If the same directory is handled by two WorkSpace instances,
    things should work ok too."""
    base_dir = str(tmpdir)

    def expect(names):
        # Compare the on-disk contents of ``base_dir`` with ``names``
        assert_directory_contents(base_dir, names)

    ws = WorkSpace(base_dir)
    expect([])
    a = ws.new_work_dir(name='aa')
    expect(['aa', 'aa.dirlock'])

    # Purging through a second workspace leaves the held directory alone
    ws2 = WorkSpace(base_dir)
    ws2._purge_leftovers()
    expect(['aa', 'aa.dirlock'])

    b = ws.new_work_dir(name='bb')
    expect(['aa', 'aa.dirlock', 'bb', 'bb.dirlock'])

    # Only 'bb' disappears once ``ws`` and ``b`` are collected
    del ws, b
    gc.collect()
    expect(['aa', 'aa.dirlock'])

    # Collecting ``a`` removes the remaining directory
    del a
    gc.collect()
    expect([])
def test_workspace_concurrency(tmpdir):
    """WorkSpace concurrency test.

    The parent creates work directories and detaches their finalizers for
    ten seconds while child processes running ``_workspace_concurrency``
    try to purge them.  We merely check that no exception or deadlock
    happens.
    """
    base_dir = str(tmpdir)

    err_q = mp_context.Queue()
    purged_q = mp_context.Queue()
    stop_evt = mp_context.Event()

    ws = WorkSpace(base_dir)
    # Make sure purging only happens in the child processes
    ws._purge_leftovers = lambda: None

    # Windows (or at least Windows GitHub CI) has been observed to be
    # exceptionally slow. Don't stress it too much.
    max_procs = 2 if WINDOWS else 16

    # Run a bunch of child processes that will try to purge concurrently.
    # The barrier has one party per child plus one for this process.
    barrier = mp_context.Barrier(parties=max_procs + 1)
    child_args = (base_dir, purged_q, err_q, stop_evt, barrier)
    children = [
        mp_context.Process(target=_workspace_concurrency, args=child_args)
        for _ in range(max_procs)
    ]
    for child in children:
        child.start()
    barrier.wait()

    n_created = n_purged = 0
    started = time()
    try:
        # On Linux, you will typically end with n_created > 10.000
        # On Windows, it can take 60 seconds to create 50 locks!
        while time() - started < 10:
            # Add a bunch of locks and simulate forgetting them.
            # The concurrent processes should try to purge them.
            for _ in range(100):
                forgotten = ws.new_work_dir(prefix="workspace-concurrency-")
                forgotten._finalizer.detach()
                n_created += 1
    finally:
        # Always stop and reap the children, even if creation failed
        stop_evt.set()
        for child in children:
            child.join()

    # Any errors?
    try:
        err = err_q.get_nowait()
    except queue.Empty:
        pass
    else:
        raise err

    # Sum the purge counts reported through the queue
    while True:
        try:
            n_purged += purged_q.get_nowait()
        except queue.Empty:
            break

    # We attempted to purge most directories at some point
    assert n_purged >= 0.5 * n_created > 0
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    device_memory_limit,
    pid_file,
    reconnect,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    preload_argv,
    bokeh_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
):
    # Start one Nanny per GPU and block until every nanny reports "closed".
    #
    # NOTE(review): documented with comments rather than a docstring in case
    # this function is wrapped as a CLI command whose help text is derived
    # from the docstring -- confirm against the decorators.
    #
    # Arguments whose handling is visible in this function:
    #   scheduler / scheduler_file: at least one must be given unless
    #       "scheduler-address" is present in ``config`` (else ValueError).
    #   host / interface: mutually exclusive (ValueError if both are set);
    #       ``interface`` is resolved to an address via get_ip_interface().
    #   nthreads: threads per worker; defaults to 1 when falsy.
    #   name: worker name; suffixed with "-<index>" when several workers
    #       are started.
    #   device_memory_limit: "auto" or 0 means the total memory of the
    #       worker's device, otherwise parsed with parse_bytes().
    #   pid_file: if set, the current PID is written there and the file is
    #       removed at interpreter exit.
    #   resources: "KEY=VALUE" pairs separated by commas and/or whitespace;
    #       values are converted to float.
    #   dashboard / dashboard_address / bokeh_prefix: optional dashboard
    #       service, silently skipped if it cannot be imported.
    #   local_directory: base directory for worker scratch data.
    #   death_timeout: parsed with parse_timedelta() (default unit "s").
    #   tls_ca_file / tls_cert / tls_key: forwarded to Security.
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(
        tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key
    )

    # One worker process per entry in CUDA_VISIBLE_DEVICES, falling back to
    # the GPU count reported by get_n_gpus() when the variable is unset.
    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        # Default to a single thread per GPU worker.  Deriving this from
        # min(1, cpu_count // nprocs) produced 0 threads on hosts with fewer
        # CPU cores than GPUs.
        nthreads = 1

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            # The dashboard is optional; run without it when unavailable
            pass
        else:
            if bokeh_prefix:
                result = (BokehWorker, {"prefix": bokeh_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    if resources:
        # Accept both comma- and whitespace-separated KEY=VALUE pairs
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    kwargs = {"worker_port": None, "listen_address": None}

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if host:
        addr = uri_from_host_port(host, 0, 0)
    else:
        # Choose appropriate address for scheduler
        addr = None

    if death_timeout is not None:
        death_timeout = parse_timedelta(death_timeout, "s")

    # Honour the caller-supplied local directory for the scratch workspace.
    # Looking the value up in ``kwargs`` could never find it, so scratch data
    # always landed in "dask-worker-space" under the current directory, even
    # though the warning below tells users to set a local directory.
    local_dir = local_directory or "dask-worker-space"
    with warn_on_duration(
        "1s",
        "Creating scratch directories is taking a surprisingly long time. "
        "This is often due to running workers on a network file system. "
        "Consider specifying a local-directory to point workers to write "
        "scratch data to a local disk.",
    ):
        # Keep references to both objects for the lifetime of main()
        _workspace = WorkSpace(os.path.abspath(local_dir))
        _workdir = _workspace.new_work_dir(prefix="worker-")
        local_dir = _workdir.dir_path

    nannies = [
        Nanny(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            reconnect=reconnect,
            local_dir=local_directory,
            death_timeout=death_timeout,
            preload=(preload or []) + ["dask_cuda.initialize_context"],
            preload_argv=preload_argv,
            security=sec,
            contact_address=None,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            data=(
                DeviceHostFile,
                {
                    "device_memory_limit": get_device_total_memory(index=i)
                    if (device_memory_limit == "auto" or device_memory_limit == 0)
                    else parse_bytes(device_memory_limit),
                    "memory_limit": parse_memory_limit(
                        memory_limit, nthreads, total_cores=nprocs
                    ),
                    "local_dir": local_dir,
                },
            ),
            **kwargs,
        )
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield [n._start(addr) for n in nannies]
        # Poll until any nanny reports "closed"
        while all(n.status != "closed" for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")