def test_environment_variable(c, s): a = Nanny(s.address, loop=s.loop, memory_limit=0, env={"FOO": "123"}) b = Nanny(s.address, loop=s.loop, memory_limit=0, env={"FOO": "456"}) yield [a._start(), b._start()] results = yield c.run(lambda: os.environ['FOO']) assert results == {a.worker_address: "123", b.worker_address: "456"} yield [a._close(), b._close()]
def test_environment_variable(c, s): a = Nanny(s.address, loop=s.loop, memory_limit=0, env={"FOO": "123"}) b = Nanny(s.address, loop=s.loop, memory_limit=0, env={"FOO": "456"}) yield [a, b] results = yield c.run(lambda: os.environ['FOO']) assert results == {a.worker_address: "123", b.worker_address: "456"} yield [a._close(), b._close()]
def test_nanny(s):
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start(0)
    with rpc(ip=n.ip, port=n.port) as nn:
        assert isalive(n.process)  # alive
        assert s.ncores[n.worker_address] == 2
        assert s.worker_info[n.worker_address]['services']['nanny'] > 1024

        yield nn.kill()
        assert not n.process
        assert n.worker_address not in s.ncores
        assert n.worker_address not in s.worker_info

        yield nn.kill()
        assert n.worker_address not in s.ncores
        assert n.worker_address not in s.worker_info
        assert not n.process

        yield nn.instantiate()
        assert isalive(n.process)
        assert s.ncores[n.worker_address] == 2
        assert s.worker_info[n.worker_address]['services']['nanny'] > 1024

        yield nn.terminate()
        assert not n.process

    yield n._close()

def test_nanny(s):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    yield n._start(0)
    with rpc(n.address) as nn:
        assert n.is_alive()
        assert s.ncores[n.worker_address] == 2
        assert s.workers[n.worker_address].services['nanny'] > 1024

        yield nn.kill()
        assert not n.is_alive()
        assert n.worker_address not in s.ncores
        assert n.worker_address not in s.workers

        yield nn.kill()
        assert not n.is_alive()
        assert n.worker_address not in s.ncores
        assert n.worker_address not in s.workers

        yield nn.instantiate()
        assert n.is_alive()
        assert s.ncores[n.worker_address] == 2
        assert s.workers[n.worker_address].services['nanny'] > 1024

        yield nn.terminate()
        assert not n.is_alive()

    yield n._close()

def test_broken_worker_during_computation(c, s, a, b):
    s.allowed_failures = 100
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    L = c.map(inc, range(256))
    for i in range(8):
        L = c.map(add, *zip(*partition_all(2, L)))

    from random import random
    yield gen.sleep(random() / 2)
    with ignoring(CommClosedError):  # comm will be closed abruptly
        yield c._run(os._exit, 1, workers=[n.worker_address])

    yield gen.sleep(random() / 2)
    with ignoring(CommClosedError, EnvironmentError):  # perhaps the new worker can't be contacted yet
        yield c._run(os._exit, 1, workers=[n.worker_address])

    result = yield c._gather(L)
    assert isinstance(result[0], int)

    yield n._close()

def test_worker_who_has_clears_after_failed_connection(c, s, a, b):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    futures = c.map(slowinc, range(20), delay=0.01,
                    key=['f%d' % i for i in range(20)])
    yield wait(futures)

    result = yield c.submit(sum, futures, workers=a.address)
    for dep in set(a.dep_state) - set(a.task_state):
        a.release_dep(dep, report=True)

    n_worker_address = n.worker_address
    with ignoring(CommClosedError):
        yield c._run(os._exit, 1, workers=[n_worker_address])

    while len(s.workers) > 2:
        yield gen.sleep(0.01)

    total = c.submit(sum, futures, workers=a.address)
    yield total

    assert not a.has_what.get(n_worker_address)
    assert not any(n_worker_address in s for s in a.who_has.values())

    yield n._close()

def test_broken_worker_during_computation(c, s, a, b):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    L = c.map(inc, range(256))
    for i in range(8):
        L = c.map(add, *zip(*partition_all(2, L)))

    from random import random
    yield gen.sleep(random() / 2)
    with ignoring(OSError):
        n.process.terminate()

    yield gen.sleep(random() / 2)
    with ignoring(OSError):
        n.process.terminate()

    result = yield c._gather(L)
    assert isinstance(result[0], int)

    yield n._close()

def test_many_kills(s):
    n = Nanny(s.address, ncores=2, loop=s.loop)
    yield n._start(0)
    assert n.is_alive()
    yield [n.kill() for i in range(5)]
    yield [n.kill() for i in range(5)]
    yield n._close()

def test_monitor_resources():
    pytest.importorskip('psutil')
    c = Center(ip='127.0.0.1')
    c.listen(0)
    n = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    yield n._start()
    nn = rpc(ip=n.ip, port=n.port)
    assert n.process.is_alive()
    d = n.resource_collect()
    assert {'cpu_percent', 'memory_percent'}.issubset(d)
    assert isinstance(d['timestamp'], datetime)

    stream = yield connect(ip=n.ip, port=n.port)
    yield write(stream, {'op': 'monitor_resources', 'interval': 0.01})

    for i in range(3):
        msg = yield read(stream)
        assert isinstance(msg, dict)
        assert {'cpu_percent', 'memory_percent'}.issubset(msg)

    stream.close()
    yield n._close()
    c.stop()

def test_nanny(s):
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start(0)
    nn = rpc(ip=n.ip, port=n.port)
    assert n.process.is_alive()
    assert s.ncores[n.worker_address] == 2
    assert s.worker_info[n.worker_address]['services']['nanny'] > 1024

    yield nn.kill()
    assert n.worker_address not in s.ncores
    assert n.worker_address not in s.worker_info
    assert not n.process

    yield nn.kill()
    assert n.worker_address not in s.ncores
    assert n.worker_address not in s.worker_info
    assert not n.process

    yield nn.instantiate()
    assert n.process.is_alive()
    assert s.ncores[n.worker_address] == 2
    assert s.worker_info[n.worker_address]['services']['nanny'] > 1024

    yield nn.terminate()
    assert not n.process

    yield n._close()

def test_nanny(s):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    yield n._start(0)
    with rpc(n.address) as nn:
        assert n.is_alive()
        assert s.ncores[n.worker_address] == 2
        assert s.worker_info[n.worker_address]['services']['nanny'] > 1024

        yield nn.kill()
        assert not n.is_alive()
        assert n.worker_address not in s.ncores
        assert n.worker_address not in s.worker_info

        yield nn.kill()
        assert not n.is_alive()
        assert n.worker_address not in s.ncores
        assert n.worker_address not in s.worker_info

        yield nn.instantiate()
        assert n.is_alive()
        assert s.ncores[n.worker_address] == 2
        assert s.worker_info[n.worker_address]['services']['nanny'] > 1024

        yield nn.terminate()
        assert not n.is_alive()

    yield n._close()

def test_worker_who_has_clears_after_failed_connection(c, s, a, b):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    futures = c.map(slowinc, range(20), delay=0.01)
    yield _wait(futures)

    result = yield c.submit(sum, futures, workers=a.address)
    for dep in set(a.dep_state) - set(a.task_state):
        a.release_dep(dep, report=True)

    n_worker_address = n.worker_address
    n.process.terminate()

    while len(s.workers) > 2:
        yield gen.sleep(0.01)

    total = c.submit(sum, futures, workers=a.address)
    yield total

    assert not a.has_what.get(n_worker_address)
    assert not any(n_worker_address in s for s in a.who_has.values())

    yield n._close()

def test_nanny():
    c = Center('127.0.0.1')
    c.listen(0)
    n = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    yield n._start(0)
    nn = rpc(ip=n.ip, port=n.port)
    assert n.process.is_alive()
    assert c.ncores[n.worker_address] == 2
    assert c.worker_services[n.worker_address]['nanny'] > 1024

    yield nn.kill()
    assert n.worker_address not in c.ncores
    assert n.worker_address not in c.worker_services
    assert not n.process

    yield nn.kill()
    assert n.worker_address not in c.ncores
    assert n.worker_address not in c.worker_services
    assert not n.process

    yield nn.instantiate()
    assert n.process.is_alive()
    assert c.ncores[n.worker_address] == 2
    assert c.worker_services[n.worker_address]['nanny'] > 1024

    yield nn.terminate()
    assert not n.process

    if n.process:
        n.process.terminate()

    yield n._close()
    c.stop()

def test_scheduler_file():
    with tmpfile() as fn:
        s = Scheduler(scheduler_file=fn)
        s.start(8008)
        w = Nanny(scheduler_file=fn)
        yield w._start()
        assert s.workers == {w.worker_address}
        yield w._close()
        s.stop()

def test_scheduler_file():
    with tmpfile() as fn:
        s = Scheduler(scheduler_file=fn)
        s.start(8008)
        w = Nanny(scheduler_file=fn)
        yield w._start()
        assert set(s.workers) == {w.worker_address}
        yield w._close()
        s.stop()

def create_and_destroy_worker(delay):
    start = time()
    while time() < start + 5:
        n = Nanny(s.address, ncores=2, loop=s.loop)
        n.start(0)
        yield gen.sleep(delay)
        yield n._close()
        print("Killed nanny")

def test_restart():
    from distributed import Nanny, rpc
    c = Center('127.0.0.1')
    c.listen(0)
    a = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    b = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    yield [a._start(), b._start()]

    e = Executor((c.ip, c.port), start=False, loop=IOLoop.current())
    yield e._start()
    assert e.scheduler.ncores == {a.worker_address: 2, b.worker_address: 2}

    x = e.submit(inc, 1)
    y = e.submit(inc, x)
    yield y._result()

    cc = rpc(ip=c.ip, port=c.port)
    who_has = yield cc.who_has()
    try:
        assert e.scheduler.who_has == who_has
        assert set(e.scheduler.who_has) == {x.key, y.key}

        f = yield e._restart()
        assert f is e

        assert len(e.scheduler.stacks) == 2
        assert len(e.scheduler.processing) == 2

        who_has = yield cc.who_has()
        assert not who_has
        assert not e.scheduler.who_has

        assert x.cancelled()
        assert y.cancelled()
    finally:
        yield a._close()
        yield b._close()
        yield e._shutdown(fast=True)
        c.stop()

def test_worker_uses_same_host_as_nanny(c, s):
    for host in ['tcp://0.0.0.0', 'tcp://127.0.0.2']:
        n = Nanny(s.address)
        yield n._start(host)

        def func(dask_worker):
            return dask_worker.listener.listen_address

        result = yield c.run(func)
        assert host in first(result.values())
        yield n._close()

def test_run(s):
    pytest.importorskip('psutil')
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    yield n._start()

    with rpc(n.address) as nn:
        response = yield nn.run(function=dumps(lambda: 1))
        assert response['status'] == 'OK'
        assert response['result'] == 1

    yield n._close()

def test_run(s):
    pytest.importorskip('psutil')
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start()

    with rpc(n.address) as nn:
        response = yield nn.run(function=dumps(lambda: 1))
        assert response['status'] == 'OK'
        assert response['result'] == 1

    yield n._close()

def test_monitor_resources():
    pytest.importorskip('psutil')
    c = Center('127.0.0.1')
    c.listen(0)
    a = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    b = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    s = Scheduler((c.ip, c.port), resource_interval=0.01, resource_log_size=3)

    yield a._start()
    yield b._start()
    yield s.sync_center()
    done = s.start()

    try:
        assert s.ncores == {('127.0.0.1', a.worker_port): 2,
                            ('127.0.0.1', b.worker_port): 2}
        assert s.nannies == {(n.ip, n.worker_port): n.port for n in [a, b]}

        while any(len(v) < 3 for v in s.resource_logs.values()):
            yield gen.sleep(0.01)

        yield gen.sleep(0.1)
        assert set(s.resource_logs) == {a.address, b.address}
        assert all(len(v) == 3 for v in s.resource_logs.values())

        d = s.diagnostic_resources(n=2)
        assert set(d) == {a.worker_address, b.worker_address}
        assert set(d[a.worker_address]).issubset({'cpu', 'memory', 'time'})
        assert all(len(v) == 2 for v in d[a.worker_address].values())

        s.put({'op': 'close'})
        yield done
    finally:
        with ignoring(TimeoutError, StreamClosedError, OSError):
            yield a._close(timeout=0.5)
        with ignoring(TimeoutError, StreamClosedError, OSError):
            yield b._close(timeout=0.5)
        c.stop()

def test_scheduler_address_config(c, s):
    with dask.config.set({'scheduler-address': s.address}):
        nanny = Nanny(loop=s.loop)
        yield nanny._start()
        assert nanny.scheduler.address == s.address

        start = time()
        while not s.workers:
            yield gen.sleep(0.1)
            assert time() < start + 10

    yield nanny._close()

def test_scheduler_address_config(c, s):
    config['scheduler-address'] = s.address
    try:
        nanny = Nanny(loop=s.loop)
        yield nanny._start()
        assert nanny.scheduler.address == s.address

        start = time()
        while not s.workers:
            yield gen.sleep(0.1)
            assert time() < start + 10
    finally:
        del config['scheduler-address']

    yield nanny._close()

def test_num_fds(s):
    psutil = pytest.importorskip('psutil')
    proc = psutil.Process()

    # Warm up
    w = Nanny(s.address)
    yield w._start()
    yield w._close()
    del w
    gc.collect()

    before = proc.num_fds()

    for i in range(3):
        w = Nanny(s.address)
        yield w._start()
        yield gen.sleep(0.1)
        yield w._close()

    start = time()
    while proc.num_fds() > before:
        print("fds:", before, proc.num_fds())
        yield gen.sleep(0.1)
        assert time() < start + 10

def test_submit_after_failed_worker_async(c, s, a, b):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)
    while len(s.workers) < 3:
        yield gen.sleep(0.1)

    L = c.map(inc, range(10))
    yield wait(L)

    s.loop.add_callback(n.kill)
    total = c.submit(sum, L)
    result = yield total
    assert result == sum(map(inc, range(10)))

    yield n._close()

def test_avoid_memory_monitor_if_zero_limit(c, s):
    nanny = Nanny(s.address, loop=s.loop, memory_limit=0)
    yield nanny._start()
    typ = yield c.run(lambda dask_worker: type(dask_worker.data))
    assert typ == {nanny.worker_address: dict}
    pcs = yield c.run(lambda dask_worker: list(dask_worker.periodic_callbacks))
    assert 'memory' not in pcs
    assert 'memory' not in nanny.periodic_callbacks

    future = c.submit(inc, 1)
    assert (yield future) == 2
    yield gen.sleep(0.02)
    yield c.submit(inc, 2)  # worker doesn't pause

    yield nanny._close()

def test_nanny_process_failure():
    c = Center('127.0.0.1')
    c.listen(0)
    n = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    yield n._start()
    nn = rpc(ip=n.ip, port=n.port)
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    ww = rpc(ip=n.ip, port=n.worker_port)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(StreamClosedError):
        yield ww.compute(function=dumps(sys.exit),
                         args=dumps((0,)),
                         key='z')

    start = time()
    while n.process.is_alive():  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 2

    start = time()
    while not n.process.is_alive():  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 2

    start = time()
    while n.worker_address not in c.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 2

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir

    nn.close_streams()
    c.stop()

def test_nanny_process_failure(c, s):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    yield n._start()
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    original_address = n.worker_address
    ww = rpc(n.worker_address)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    pid = n.pid
    assert pid is not None
    with ignoring(CommClosedError):
        yield c._run(os._exit, 0, workers=[n.worker_address])

    start = time()
    while n.pid == pid:  # wait while process dies and comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while not n.is_alive():  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    # assert n.worker_address != original_address  # most likely

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir

    ww.close_rpc()
    s.stop()

def test_nanny_process_failure(s):
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start()
    nn = rpc(ip=n.ip, port=n.port)
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    original_process = n.process
    ww = rpc(ip=n.ip, port=n.worker_port)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(StreamClosedError):
        yield ww.compute(function=dumps(sys.exit),
                         args=dumps((0,)),
                         key='z')

    start = time()
    while n.process is original_process:  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while n.process.poll() is not None:  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir

    nn.close_streams()
    s.stop()

def test_monitor_resources(s):
    pytest.importorskip('psutil')
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start()
    assert isalive(n.process)
    d = n.resource_collect()
    assert {'cpu_percent', 'memory_percent'}.issubset(d)
    assert 'timestamp' in d

    stream = yield connect(ip=n.ip, port=n.port)
    yield write(stream, {'op': 'monitor_resources', 'interval': 0.01})

    for i in range(3):
        msg = yield read(stream)
        assert isinstance(msg, dict)
        assert {'cpu_percent', 'memory_percent'}.issubset(msg)

    close(stream)
    yield n._close()
    s.stop()

def test_monitor_resources(s):
    pytest.importorskip('psutil')
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    yield n._start()
    assert isalive(n.process)
    d = n.resource_collect()
    assert {'cpu_percent', 'memory_percent'}.issubset(d)
    assert 'timestamp' in d

    comm = yield connect(n.address)
    yield comm.write({'op': 'monitor_resources', 'interval': 0.01})

    for i in range(3):
        msg = yield comm.read()
        assert isinstance(msg, dict)
        assert {'cpu_percent', 'memory_percent'}.issubset(msg)

    yield comm.close()
    yield n._close()
    s.stop()

def test_broken_worker_during_computation(c, s, a, b):
    s.allowed_failures = 100
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    N = 256
    expected_result = N * (N + 1) // 2
    i = 0
    L = c.map(inc, range(N), key=['inc-%d-%d' % (i, j) for j in range(N)])
    while len(L) > 1:
        i += 1
        L = c.map(slowadd, *zip(*partition_all(2, L)),
                  key=['add-%d-%d' % (i, j) for j in range(len(L) // 2)])

    yield gen.sleep(random.random() / 20)
    with ignoring(CommClosedError):  # comm will be closed abruptly
        yield c._run(os._exit, 1, workers=[n.worker_address])

    yield gen.sleep(random.random() / 20)
    while len(s.workers) < 3:
        yield gen.sleep(0.01)

    with ignoring(CommClosedError, EnvironmentError):  # perhaps the new worker can't be contacted yet
        yield c._run(os._exit, 1, workers=[n.worker_address])

    [result] = yield c.gather(L)
    assert isinstance(result, int)
    assert result == expected_result

    yield n._close()

def test_nanny_process_failure(c, s):
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start()
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    original_process = n.process
    ww = rpc(ip=n.ip, port=n.worker_port)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(StreamClosedError):
        yield c._run(sys.exit, 0, workers=[n.worker_address])

    start = time()
    while n.process is original_process:  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while not isalive(n.process):  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir

    ww.close_rpc()
    s.stop()