def test_nanny_process_failure(loop):
    """Check that a nanny brings its worker process back after it exits.

    A center and a nanny are created on fixed local ports.  The worker is
    asked to run ``sys.exit(0)``; the test then waits (at most two seconds
    per phase) for the process to stop being alive, to become alive again,
    and for the worker address to appear in the center's ``ncores`` mapping.

    Parameters
    ----------
    loop :
        Object whose ``run_sync`` drives the coroutine -- presumably a
        tornado IOLoop fixture (its definition is not visible here).
    """
    c = Center('127.0.0.1', 8036)
    n = Nanny('127.0.0.1', 8037, 8038, '127.0.0.1', 8036, ncores=2)
    c.listen(c.port)

    @gen.coroutine
    def f():
        try:
            yield n._start()
            ww = rpc(ip=n.ip, port=n.worker_port)
            yield ww.update_data(data={'x': 1, 'y': 2})
            # The worker exits while serving this call, so a closed stream
            # is the expected outcome rather than an error.
            with ignoring(StreamClosedError):
                yield ww.compute(function=sys.exit, args=(0,), key='z')

            start = time()
            while n.process.is_alive():  # wait while process dies
                yield gen.sleep(0.01)
                assert time() - start < 2

            start = time()
            while not n.process.is_alive():  # wait while process comes back
                yield gen.sleep(0.01)
                assert time() - start < 2

            start = time()
            while n.worker_address not in c.ncores:
                yield gen.sleep(0.01)
                assert time() - start < 2
        finally:
            # Tear down even when an assertion above fails; otherwise the
            # nanny and the fixed ports used here stay alive after the test.
            yield n._close()
            c.stop()

    loop.run_sync(f)
def test_fast_kill(loop):
    """Restart an executor with work in flight and check the outcome.

    Two nannies are attached to a center listening on port 0.  Ten ``sleep``
    tasks are mapped, the executor is restarted, and the test asserts that
    the restart took under five seconds, that every mapped future has status
    ``'cancelled'``, and that a fresh ``inc(1)`` submission yields ``2``.
    """
    from distributed import Nanny, rpc

    center = Center('127.0.0.1')
    center.listen(0)
    nanny_a = Nanny(center.ip, center.port, ncores=2, ip='127.0.0.1')
    nanny_b = Nanny(center.ip, center.port, ncores=2, ip='127.0.0.1')
    executor = Executor((center.ip, center.port), start=False, loop=loop)

    @gen.coroutine
    def scenario():
        yield nanny_a._start()
        yield nanny_b._start()

        # Poll until the center's ncores mapping holds two entries.
        while len(center.ncores) < 2:
            yield gen.sleep(0.01)

        yield executor._start()
        pending = executor.map(sleep, range(10))

        try:
            began = time()
            yield executor._restart()
            assert time() - began < 5

            assert all(fut.status == 'cancelled' for fut in pending)

            fresh = executor.submit(inc, 1)
            value = yield fresh._result()
            assert value == 2
        finally:
            yield nanny_a._close()
            yield nanny_b._close()
            yield executor._shutdown(fast=True)
            center.stop()

    loop.run_sync(scenario)
def g():
    """Build a center with two workers, run ``f`` against it, then clean up.

    Generator-style coroutine body.  ``f``, ``b_ip`` and ``logger`` come from
    the enclosing scope (not visible here).  Waiting for both workers to
    appear in the center's ``ncores`` mapping is capped at five seconds.
    Any exception is logged and re-raised; the workers are always closed and
    their ``local_dir`` removed if it exists.
    """
    center = Center('127.0.0.1')
    center.listen(0)

    first = Worker(center.ip, center.port, ncores=2, ip='127.0.0.1')
    yield first._start()
    second = Worker(center.ip, center.port, ncores=1, ip=b_ip)
    yield second._start()

    began = time()
    try:
        while len(center.ncores) < 2:
            yield gen.sleep(0.01)
            if time() - began > 5:
                raise Exception("Cluster creation timeout")

        yield f(center, first, second)
    except Exception as err:
        logger.exception(err)
        raise
    finally:
        logger.debug("Closing out test cluster")
        for worker in (first, second):
            # Errors while closing are ignored so the remaining cleanup runs.
            with ignoring(TimeoutError, StreamClosedError, OSError):
                yield worker._close()
            if os.path.exists(worker.local_dir):
                shutil.rmtree(worker.local_dir)
        center.stop()
def test_fast_kill(loop):
    """Restart an executor while tasks are in flight (fixed-port variant).

    The center listens on port 8006 and two nannies use ports 8007-8010.
    After mapping ten ``sleep`` tasks the executor is restarted; the restart
    must finish in under five seconds, leave every mapped future with status
    ``'cancelled'``, and still allow ``inc(1)`` to compute ``2`` afterwards.
    """
    from distributed import Nanny, rpc

    hub = Center('127.0.0.1', 8006)
    left = Nanny('127.0.0.1', 8007, 8008, '127.0.0.1', 8006, ncores=2)
    right = Nanny('127.0.0.1', 8009, 8010, '127.0.0.1', 8006, ncores=2)
    client = Executor((hub.ip, hub.port), start=False, loop=loop)
    hub.listen(hub.port)

    @gen.coroutine
    def body():
        yield left._start()
        yield right._start()

        # Wait for two entries in the center's ncores mapping.
        while len(hub.ncores) < 2:
            yield gen.sleep(0.01)

        yield client._start()
        in_flight = client.map(sleep, range(10))

        try:
            t0 = time()
            yield client._restart()
            assert time() - t0 < 5

            assert all(item.status == 'cancelled' for item in in_flight)

            after = client.submit(inc, 1)
            answer = yield after._result()
            assert answer == 2
        finally:
            yield left._close()
            yield right._close()
            yield client._shutdown(fast=True)
            hub.stop()

    loop.run_sync(body)
def test_monitor_resources(loop):
    """Check a nanny's resource report, both directly and over a stream.

    ``n.resource_collect()`` must contain ``cpu_percent`` and
    ``memory_percent`` and a ``datetime`` under ``'timestamp'``.  A
    ``monitor_resources`` request is then written to a stream connected to
    the nanny, and the next three messages must be dicts carrying the same
    two keys.

    Parameters
    ----------
    loop :
        Object whose ``run_sync`` drives the coroutine -- presumably a
        tornado IOLoop fixture (its definition is not visible here).
    """
    c = Center('127.0.0.1', 8026)
    n = Nanny('127.0.0.1', 8027, 8028, '127.0.0.1', 8026, ncores=2)
    c.listen(c.port)

    @gen.coroutine
    def f():
        try:
            yield n._start()
            assert n.process.is_alive()
            d = n.resource_collect()
            assert {'cpu_percent', 'memory_percent'}.issubset(d)
            assert isinstance(d['timestamp'], datetime)

            stream = yield connect(ip=n.ip, port=n.port)
            try:
                yield write(stream, {'op': 'monitor_resources',
                                     'interval': 0.01})
                for _ in range(3):
                    msg = yield read(stream)
                    assert isinstance(msg, dict)
                    assert {'cpu_percent', 'memory_percent'}.issubset(msg)
            finally:
                # Close the stream even if a message check fails.
                stream.close()
        finally:
            # Always release the nanny and the center so that the fixed
            # ports used here are not left occupied by a failed run.
            yield n._close()
            c.stop()

    loop.run_sync(f)
def run_center(port):
    """Serve a Center on ``127.0.0.1:port`` using the current IOLoop.

    The ``"tornado"`` logger is raised to CRITICAL before the center starts
    listening.  The call blocks inside the loop's ``start()``.
    """
    from distributed import Center
    from tornado.ioloop import IOLoop
    import logging

    tornado_log = logging.getLogger("tornado")
    tornado_log.setLevel(logging.CRITICAL)

    server = Center('127.0.0.1', port)
    server.listen(port)

    io_loop = IOLoop.current()
    io_loop.start()
    # Never reached (per the original note).  TODO: clean shutdown of IOLoop
    io_loop.close()
def main(host, port):
    """Run a Center on ``host:port`` until the event loop stops.

    Parameters
    ----------
    host :
        Address passed to ``Center``.  When ``None`` the result of
        ``get_ip()`` is used instead.
    port :
        Port handed to ``center.listen``; formatted with ``%d`` in the log
        messages, so it is expected to be an integer.
    """
    if host is None:
        host = get_ip()
    logger.info("Start center at %s:%d", host, port)

    center = Center(host)
    center.listen(port)

    loop = IOLoop.current()
    try:
        loop.start()
    finally:
        # Close the loop and log the shutdown even when start() is left via
        # an exception (for instance KeyboardInterrupt); the exception itself
        # still propagates to the caller.
        loop.close()
        logger.info("\nEnd center at %s:%d", host, port)
def run_center(port):
    """Serve a Center on ``127.0.0.1:port`` on a freshly created IOLoop.

    The existing IOLoop instance is cleared and a new loop is made current.
    A do-nothing periodic callback fires every 500 ms -- presumably to keep
    the loop waking up regularly (the reason is not stated in the code).
    """
    from distributed import Center
    from tornado.ioloop import IOLoop, PeriodicCallback
    import logging

    IOLoop.clear_instance()
    io_loop = IOLoop()
    io_loop.make_current()

    heartbeat = PeriodicCallback(lambda: None, 500)
    heartbeat.start()

    logging.getLogger("tornado").setLevel(logging.CRITICAL)

    server = Center('127.0.0.1', port)
    server.listen(port)
    io_loop.start()
def test_errors_dont_block():
    """Failing tasks must not stop other tasks from finishing.

    Four tasks are submitted, alternating ``inc`` and ``throws``.  The test
    polls until both ``inc`` futures report ``'finished'`` (failing on the
    1000th poll) and then checks that gathering them gives ``[2, 3]``.
    """
    center = Center('127.0.0.1', 8017)
    worker = Worker('127.0.0.2', 8018, center.ip, center.port, ncores=1)
    executor = Executor((center.ip, center.port), start=False)

    @gen.coroutine
    def scenario():
        center.listen(center.port)
        yield worker._start()
        IOLoop.current().spawn_callback(executor._go)

        calls = [(inc, 1), (throws, 1), (inc, 2), (throws, 2)]
        futures = [executor.submit(func, arg) for func, arg in calls]

        polls = 0
        while not (futures[0].status == futures[2].status == 'finished'):
            polls += 1
            assert polls != 1000
            yield gen.sleep(0.01)

        gathered = yield executor._gather([futures[0], futures[2]])
        assert gathered == [2, 3]

        yield worker._close()
        center.stop()

    IOLoop.current().run_sync(scenario)
def test_restart(loop):
    """Restarting an executor clears data and cancels existing futures.

    After computing ``inc(1)`` and ``inc`` of that result, the scheduler's
    ``who_has`` must match the center's.  After ``_restart()`` (which must
    return the executor itself) the scheduler has two ``stacks`` and two
    ``processing`` entries, both ``who_has`` views are empty, and both
    futures report ``cancelled()``.
    """
    from distributed import Nanny, rpc

    center = Center('127.0.0.1', 8006)
    nanny_a = Nanny('127.0.0.1', 8007, 8008, '127.0.0.1', 8006, ncores=2)
    nanny_b = Nanny('127.0.0.1', 8009, 8010, '127.0.0.1', 8006, ncores=2)
    center.listen(center.port)

    @gen.coroutine
    def scenario():
        yield nanny_a._start()
        yield nanny_b._start()

        executor = Executor((center.ip, center.port), start=False, loop=loop)
        yield executor._start()
        scheduler_cores = executor.scheduler.ncores
        assert scheduler_cores == {nanny_a.worker_address: 2,
                                   nanny_b.worker_address: 2}

        first = executor.submit(inc, 1)
        second = executor.submit(inc, first)
        yield second._result()

        center_rpc = rpc(ip=center.ip, port=center.port)
        center_view = yield center_rpc.who_has()

        try:
            assert executor.scheduler.who_has == center_view
            assert set(executor.scheduler.who_has) == {first.key, second.key}

            restarted = yield executor._restart()
            assert restarted is executor

            assert len(executor.scheduler.stacks) == 2
            assert len(executor.scheduler.processing) == 2

            center_view = yield center_rpc.who_has()
            assert not center_view
            assert not executor.scheduler.who_has

            assert first.cancelled()
            assert second.cancelled()
        finally:
            yield nanny_a._close()
            yield nanny_b._close()
            yield executor._shutdown(fast=True)
            center.stop()

    loop.run_sync(scenario)
def g():
    """Build a center with two workers on fixed ports and run ``f`` on it.

    Generator-style coroutine body; ``f`` comes from the enclosing scope
    (not visible here).  Waiting for both workers to appear in the center's
    ``ncores`` mapping is capped at five seconds.  The workers are closed
    and the center stopped whether or not ``f`` succeeds.

    Raises
    ------
    Exception
        With message ``"Cluster creation timeout"`` if fewer than two
        workers are registered after five seconds.
    """
    from time import time
    from tornado.iostream import StreamClosedError

    c = Center('127.0.0.1', 8017)
    c.listen(c.port)
    a = Worker('127.0.0.1', 8018, c.ip, c.port, ncores=2)
    yield a._start()
    b = Worker('127.0.0.1', 8019, c.ip, c.port, ncores=1)
    yield b._start()

    start = time()
    try:
        # Bounded wait: an unregistered worker fails the run instead of
        # hanging it forever.
        while len(c.ncores) < 2:
            yield gen.sleep(0.01)
            if time() - start > 5:
                raise Exception("Cluster creation timeout")

        yield f(c, a, b)
    finally:
        for w in (a, b):
            # ``ignoring()`` was previously called with no exception types.
            # Assuming it suppresses only the types it is given (its
            # definition is not visible here), that suppressed nothing, so a
            # failing close skipped the rest of the teardown.
            with ignoring(gen.TimeoutError, StreamClosedError, OSError):
                yield w._close()
        c.stop()
def test_restart():
    """Restarting an executor clears data and cancels existing futures.

    Generator-style test; it is expected to be driven by a coroutine runner
    (no decorator is visible here).  Two nannies are started together, two
    dependent ``inc`` tasks are computed, and after ``_restart()`` both
    ``who_has`` views must be empty and both futures cancelled.
    """
    from distributed import Nanny, rpc

    center = Center('127.0.0.1')
    center.listen(0)
    nanny_a = Nanny(center.ip, center.port, ncores=2, ip='127.0.0.1')
    nanny_b = Nanny(center.ip, center.port, ncores=2, ip='127.0.0.1')

    # Start both nannies in one yield.
    yield [nanny_a._start(), nanny_b._start()]

    executor = Executor((center.ip, center.port), start=False,
                        loop=IOLoop.current())
    yield executor._start()
    scheduler_cores = executor.scheduler.ncores
    assert scheduler_cores == {nanny_a.worker_address: 2,
                               nanny_b.worker_address: 2}

    first = executor.submit(inc, 1)
    second = executor.submit(inc, first)
    yield second._result()

    center_rpc = rpc(ip=center.ip, port=center.port)
    center_view = yield center_rpc.who_has()

    try:
        assert executor.scheduler.who_has == center_view
        assert set(executor.scheduler.who_has) == {first.key, second.key}

        restarted = yield executor._restart()
        assert restarted is executor

        assert len(executor.scheduler.stacks) == 2
        assert len(executor.scheduler.processing) == 2

        center_view = yield center_rpc.who_has()
        assert not center_view
        assert not executor.scheduler.who_has

        assert first.cancelled()
        assert second.cancelled()
    finally:
        yield nanny_a._close()
        yield nanny_b._close()
        yield executor._shutdown(fast=True)
        center.stop()
def run_center(q):
    """Serve a Center on a port picked by ``listen(0)`` and report the port.

    A fresh IOLoop is created and made current, the center starts listening,
    its ``port`` is put on ``q``, and the loop is then run.

    Parameters
    ----------
    q :
        Object with a ``put`` method that receives the center's port --
        presumably a multiprocessing queue (verify against the caller).

    Raises
    ------
    Exception
        Whatever ``center.listen(0)`` raised on the last of ten attempts.
    """
    from distributed import Center
    from tornado.ioloop import IOLoop, PeriodicCallback
    import logging

    IOLoop.clear_instance()
    loop = IOLoop()
    loop.make_current()
    # Do-nothing callback every 500 ms; kept exactly as in the original.
    PeriodicCallback(lambda: None, 500).start()
    logging.getLogger("tornado").setLevel(logging.CRITICAL)
    center = Center('127.0.0.1')

    # Retry a bounded number of times.  The original retried forever, which
    # turned a persistent failure into a silent busy loop.
    max_attempts = 10
    for attempt in range(1, max_attempts + 1):
        try:
            center.listen(0)
            break
        except Exception:
            logging.info("Could not start center on port. Retrying",
                         exc_info=True)
            if attempt == max_attempts:
                raise

    q.put(center.port)
    loop.start()
def test_sync_interactively():
    """Scatter and collect values through the blocking ``sync`` wrapper.

    A center and two single-core workers are created with ``start=True,
    block=False``.  Eight integers are scattered to the center; the merged
    worker data must map each returned key to its value, and collecting the
    same references must return the original list.
    """
    center = Center('127.0.0.1', 8017, start=True, block=False)
    worker_a = Worker('127.0.0.1', 8018, center.ip, center.port,
                      ncores=1, start=True, block=False)
    worker_b = Worker('127.0.0.1', 8019, center.ip, center.port,
                      ncores=1, start=True, block=False)

    try:
        # Blocking poll until two entries are in the center's ncores mapping.
        while len(center.ncores) < 2:
            sleep(0.01)

        payload = [1, 2, 3, 4, 5, 6, 7, 8]
        remote = sync(scatter_to_center(center.ip, center.port, payload))

        stored = merge(worker_a.data, worker_b.data)
        expected = {ref.key: val for ref, val in zip(remote, payload)}
        assert stored == expected

        fetched = sync(collect_from_center(center.ip, center.port, remote))
        assert fetched == payload
    finally:
        worker_a.close()
        worker_b.close()
        center.close()
def test_monitor_resources():
    """Check that a Scheduler gathers bounded resource logs from nannies.

    Generator-style test (no decorator is visible here); skipped when
    ``psutil`` cannot be imported.  With ``resource_log_size=3`` each log
    must hold exactly three entries once filled, and
    ``diagnostic_resources(n=2)`` must return two values per reported field
    for each worker.
    """
    pytest.importorskip('psutil')

    center = Center('127.0.0.1')
    center.listen(0)
    nanny_a = Nanny(center.ip, center.port, ncores=2, ip='127.0.0.1')
    nanny_b = Nanny(center.ip, center.port, ncores=2, ip='127.0.0.1')
    sched = Scheduler((center.ip, center.port), resource_interval=0.01,
                      resource_log_size=3)

    yield nanny_a._start()
    yield nanny_b._start()
    yield sched.sync_center()
    finished = sched.start()

    try:
        expected_cores = {('127.0.0.1', nanny_a.worker_port): 2,
                          ('127.0.0.1', nanny_b.worker_port): 2}
        assert sched.ncores == expected_cores

        expected_nannies = {(nanny.ip, nanny.worker_port): nanny.port
                            for nanny in [nanny_a, nanny_b]}
        assert sched.nannies == expected_nannies

        # Wait for every log to reach three entries, then allow extra time
        # so the size check below also sees entries that arrive afterwards.
        while any(len(log) < 3 for log in sched.resource_logs.values()):
            yield gen.sleep(0.01)
        yield gen.sleep(0.1)

        assert set(sched.resource_logs) == {nanny_a.address, nanny_b.address}
        assert all(len(log) == 3 for log in sched.resource_logs.values())

        report = sched.diagnostic_resources(n=2)
        assert set(report) == {nanny_a.worker_address, nanny_b.worker_address}
        per_worker = report[nanny_a.worker_address]
        assert set(per_worker).issubset({'cpu', 'memory', 'time'})
        assert all(len(series) == 2 for series in per_worker.values())

        sched.put({'op': 'close'})
        yield finished
    finally:
        for nanny in (nanny_a, nanny_b):
            with ignoring(TimeoutError, StreamClosedError, OSError):
                yield nanny._close(timeout=0.5)
        center.stop()
def test_monitor_resources():
    """Check a nanny's resource report, both directly and over a stream.

    Generator-style test (no decorator is visible here); skipped when
    ``psutil`` cannot be imported.  ``n.resource_collect()`` must contain
    ``cpu_percent`` and ``memory_percent`` and a ``datetime`` under
    ``'timestamp'``.  After a ``monitor_resources`` request is written to a
    stream connected to the nanny, the next three messages must be dicts
    carrying the same two keys.
    """
    pytest.importorskip('psutil')
    c = Center(ip='127.0.0.1')
    c.listen(0)
    n = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')

    try:
        yield n._start()
        assert n.process.is_alive()
        d = n.resource_collect()
        assert {'cpu_percent', 'memory_percent'}.issubset(d)
        assert isinstance(d['timestamp'], datetime)

        stream = yield connect(ip=n.ip, port=n.port)
        try:
            yield write(stream, {'op': 'monitor_resources', 'interval': 0.01})
            for _ in range(3):
                msg = yield read(stream)
                assert isinstance(msg, dict)
                assert {'cpu_percent', 'memory_percent'}.issubset(msg)
        finally:
            # Close the stream even if a message check fails.
            stream.close()
    finally:
        # Always shut the nanny and the center down so a failed assertion
        # does not leave them running.
        yield n._close()
        c.stop()
def test_nanny():
    """Drive a nanny through kill / kill / instantiate / terminate over rpc.

    Generator-style test (no decorator is visible here).  After start and
    after ``instantiate`` the process must be alive, the center must report
    two cores for the worker, and the registered ``'nanny'`` service port
    must exceed 1024.  After each ``kill`` the worker must be absent from
    the center's ``ncores`` and ``worker_services`` and ``n.process`` must
    be falsy; after ``terminate`` ``n.process`` must be falsy too.
    """
    c = Center('127.0.0.1')
    c.listen(0)
    n = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')

    try:
        yield n._start(0)
        nn = rpc(ip=n.ip, port=n.port)
        assert n.process.is_alive()
        assert c.ncores[n.worker_address] == 2
        assert c.worker_services[n.worker_address]['nanny'] > 1024

        yield nn.kill()
        assert n.worker_address not in c.ncores
        assert n.worker_address not in c.worker_services
        assert not n.process

        # A second kill with no live process must leave the state unchanged.
        yield nn.kill()
        assert n.worker_address not in c.ncores
        assert n.worker_address not in c.worker_services
        assert not n.process

        yield nn.instantiate()
        assert n.process.is_alive()
        assert c.ncores[n.worker_address] == 2
        assert c.worker_services[n.worker_address]['nanny'] > 1024

        yield nn.terminate()
        assert not n.process
    finally:
        # The process check used to sit right after ``assert not n.process``
        # and could never fire.  Here it also covers a failed assertion.
        process = getattr(n, 'process', None)
        if process:
            process.terminate()
        yield n._close()
        c.stop()
def test_scatter_delete():
    """Scatter data, delete one item, then scatter to named workers.

    Uses the module-level ``loop`` (defined outside this block).  After
    scattering ``[1, 2, 3]`` every key must be held by exactly one worker.
    Deleting the first item must remove it from the workers and from the
    center's ``who_has``.  A second scatter addressed to both workers must
    store ``[4, 5, 6]`` under the returned keys.
    """
    center = Center('127.0.0.1', 8017, loop=loop)
    worker_a = Worker('127.0.0.1', 8018, center.ip, center.port,
                      loop=loop, ncores=1)
    worker_b = Worker('127.0.0.1', 8019, center.ip, center.port,
                      loop=loop, ncores=1)

    @asyncio.coroutine
    def scenario():
        while len(center.ncores) < 2:
            yield from asyncio.sleep(0.01, loop=loop)

        refs = yield from scatter_to_center(center.ip, center.port,
                                            [1, 2, 3], loop=loop)

        stored = merge(worker_a.data, worker_b.data)
        assert stored == {ref.key: val for ref, val in zip(refs, [1, 2, 3])}

        assert set(center.who_has) == {ref.key for ref in refs}
        assert all(len(holders) == 1 for holders in center.who_has.values())

        assert [ref.get() for ref in refs] == [1, 2, 3]

        yield from refs[0]._delete()

        stored = merge(worker_a.data, worker_b.data)
        assert stored == {ref.key: val for ref, val in zip(refs[1:], [2, 3])}

        assert refs[0].key not in center.who_has

        targets = [worker_a.address, worker_b.address]
        refs = yield from scatter_to_workers(center.ip, center.port,
                                             targets, [4, 5, 6], loop=loop)

        stored = merge(worker_a.data, worker_b.data)
        for ref, val in zip(refs, [4, 5, 6]):
            assert stored[ref.key] == val

        yield from worker_a._close()
        yield from worker_b._close()
        yield from center._close()

    everything = asyncio.gather(center.go(), worker_a.go(), worker_b.go(),
                                scenario())
    loop.run_until_complete(everything)
def test_multiple_executors_restart(loop):
    """Restarting one executor cancels futures of all that share a scheduler.

    ``e1`` is created from the center address and ``e2`` from
    ``e1.scheduler``.  After each computes one ``inc`` task, restarting
    ``e1`` must leave both futures reporting ``cancelled()``.

    Parameters
    ----------
    loop :
        Passed to each ``Executor`` and used to run the coroutine through
        ``run_sync``.
    """
    from distributed import Nanny, rpc
    c = Center('127.0.0.1')
    c.listen(0)
    a = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    b = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')

    @gen.coroutine
    def f():
        yield a._start()
        yield b._start()
        while len(c.ncores) < 2:
            yield gen.sleep(0.01)

        # Remember each executor as soon as it exists.  The cleanup used to
        # name e1 and e2 directly, so an early failure raised
        # UnboundLocalError in ``finally`` and hid the real error.
        executors = []
        try:
            e1 = Executor((c.ip, c.port), start=False, loop=loop)
            executors.append(e1)
            yield e1._start()
            e2 = Executor(e1.scheduler, start=False, loop=loop)
            executors.append(e2)
            yield e2._start()

            x = e1.submit(inc, 1)
            y = e2.submit(inc, 2)
            xx = yield x._result()
            yy = yield y._result()
            assert xx == 2
            assert yy == 3

            yield e1._restart()

            assert x.cancelled()
            assert y.cancelled()
        finally:
            yield a._close()
            yield b._close()
            for executor in executors:
                yield executor._shutdown(fast=True)
            c.stop()

    loop.run_sync(f)
def test_multiple_executors_restart(loop):
    """Restarting one executor cancels futures of all that share a scheduler.

    Fixed-port variant: the center listens on 8006 and the nannies use
    8007-8010.  ``e2`` is built with ``scheduler=e1.scheduler``.  After each
    executor computes one ``inc`` task, restarting ``e1`` must leave both
    futures reporting ``cancelled()``.

    Parameters
    ----------
    loop :
        Passed to each ``Executor`` and used to run the coroutine through
        ``run_sync``.
    """
    from distributed import Nanny, rpc
    c = Center('127.0.0.1', 8006)
    a = Nanny('127.0.0.1', 8007, 8008, '127.0.0.1', 8006, ncores=2)
    b = Nanny('127.0.0.1', 8009, 8010, '127.0.0.1', 8006, ncores=2)
    c.listen(c.port)

    @gen.coroutine
    def f():
        yield a._start()
        yield b._start()
        while len(c.ncores) < 2:
            yield gen.sleep(0.01)

        # Remember each executor as soon as it exists.  The cleanup used to
        # name e1 and e2 directly, so an early failure raised
        # UnboundLocalError in ``finally`` and hid the real error.
        executors = []
        try:
            e1 = Executor((c.ip, c.port), start=False, loop=loop)
            executors.append(e1)
            yield e1._start()
            e2 = Executor(scheduler=e1.scheduler, start=False, loop=loop)
            executors.append(e2)
            yield e2._start()

            x = e1.submit(inc, 1)
            y = e2.submit(inc, 2)
            xx = yield x._result()
            yy = yield y._result()
            assert xx == 2
            assert yy == 3

            yield e1._restart()

            assert x.cancelled()
            assert y.cancelled()
        finally:
            yield a._close()
            yield b._close()
            for executor in executors:
                yield executor._shutdown(fast=True)
            c.stop()

    loop.run_sync(f)
def test_nanny(loop):
    """Drive a nanny through kill / kill / instantiate / terminate over rpc.

    Fixed-port variant.  After start and after ``instantiate`` the process
    must be alive, the center must report two cores for the worker, and the
    nanny port registered in ``c.nannies`` must exceed 8000.  After each
    ``kill`` the worker must be absent from ``c.ncores`` and ``c.nannies``
    and ``n.process`` must be falsy; after ``terminate`` ``n.process`` must
    be falsy too.

    Parameters
    ----------
    loop :
        Object whose ``run_sync`` drives the coroutine.
    """
    c = Center('127.0.0.1', 8026)
    n = Nanny('127.0.0.1', 8027, 8028, '127.0.0.1', 8026, ncores=2)
    c.listen(c.port)

    @gen.coroutine
    def f():
        nn = rpc(ip=n.ip, port=n.port)
        try:
            yield n._start()
            assert n.process.is_alive()
            assert c.ncores[n.worker_address] == 2
            assert c.nannies[n.worker_address] > 8000

            yield nn.kill()
            assert n.worker_address not in c.ncores
            assert n.worker_address not in c.nannies
            assert not n.process

            # A second kill with no live process must change nothing.
            yield nn.kill()
            assert n.worker_address not in c.ncores
            assert n.worker_address not in c.nannies
            assert not n.process

            yield nn.instantiate()
            assert n.process.is_alive()
            assert c.ncores[n.worker_address] == 2
            assert c.nannies[n.worker_address] > 8000

            yield nn.terminate()
            assert not n.process
        finally:
            # The process check used to sit right after
            # ``assert not n.process`` and could never fire.  Here it also
            # covers a failed assertion and frees the fixed ports.
            process = getattr(n, 'process', None)
            if process:
                process.terminate()
            yield n._close()
            c.stop()

    loop.run_sync(f)
def g():
    """Build a center with two workers on fixed ports and run ``f`` on it.

    Generator-style coroutine body; ``f`` and ``logger`` come from the
    enclosing scope (not visible here).  Waiting for both workers to appear
    in the center's ``ncores`` mapping is capped at five seconds.  The
    workers are closed and the center stopped whether or not ``f`` succeeds.

    Raises
    ------
    Exception
        With message ``"Cluster creation timeout"`` if fewer than two
        workers are registered after five seconds.
    """
    from tornado.iostream import StreamClosedError

    c = Center("127.0.0.1", 8017)
    c.listen(c.port)
    a = Worker("127.0.0.2", 8018, c.ip, c.port, ncores=2)
    yield a._start()
    b = Worker("127.0.0.3", 8019, c.ip, c.port, ncores=1)
    yield b._start()

    start = time()
    try:
        while len(c.ncores) < 2:
            yield gen.sleep(0.01)
            if time() - start > 5:
                raise Exception("Cluster creation timeout")

        yield f(c, a, b)
    finally:
        logger.debug("Closing out test cluster")
        for w in (a, b):
            # ``ignoring()`` was previously called with no exception types.
            # Assuming it suppresses only the types it is given (its
            # definition is not visible here), that suppressed nothing, so a
            # failing close skipped the rest of the teardown.
            with ignoring(gen.TimeoutError, StreamClosedError, OSError):
                yield w._close()
        c.stop()
def test_monitor_resources(loop):
    """Check that a Scheduler gathers bounded resource logs from nannies.

    Fixed-port variant, run with a ten second ``run_sync`` timeout.  With
    ``resource_log_size=3`` the logs, keyed by each nanny's ``(ip, port)``,
    must hold exactly three entries once filled.
    """
    center = Center('127.0.0.1', 8026)
    nanny_a = Nanny('127.0.0.1', 8027, 8028, '127.0.0.1', 8026, ncores=2)
    nanny_b = Nanny('127.0.0.1', 8029, 8030, '127.0.0.1', 8026, ncores=2)
    center.listen(center.port)
    sched = Scheduler((center.ip, center.port), resource_interval=0.01,
                      resource_log_size=3)

    @gen.coroutine
    def scenario():
        yield nanny_a._start()
        yield nanny_b._start()
        yield sched._sync_center()
        finished = sched.start()

        try:
            expected_cores = {('127.0.0.1', nanny_a.worker_port): 2,
                              ('127.0.0.1', nanny_b.worker_port): 2}
            assert sched.ncores == expected_cores

            expected_nannies = {(nanny.ip, nanny.worker_port): nanny.port
                                for nanny in [nanny_a, nanny_b]}
            assert sched.nannies == expected_nannies

            # Wait for every log to reach three entries, then allow extra
            # time so the size check also sees entries arriving afterwards.
            while any(len(log) < 3 for log in sched.resource_logs.values()):
                yield gen.sleep(0.01)
            yield gen.sleep(0.1)

            nanny_keys = {(nanny_a.ip, nanny_a.port),
                          (nanny_b.ip, nanny_b.port)}
            assert set(sched.resource_logs) == nanny_keys
            assert all(len(log) == 3 for log in sched.resource_logs.values())

            sched.put({'op': 'close'})
            yield finished
        finally:
            yield nanny_a._close()
            yield nanny_b._close()
            center.stop()

    loop.run_sync(scenario, timeout=10)
def g():
    """Build a center with two workers on fixed ports and run ``f`` on it.

    Generator-style coroutine body; ``f`` and ``logger`` come from the
    enclosing scope (not visible here).  Waiting for both workers to appear
    in the center's ``ncores`` mapping is capped at five seconds.  Each
    worker is closed and its ``local_dir`` removed if it exists, and the
    center is stopped, whether or not ``f`` succeeds.

    Raises
    ------
    Exception
        With message ``"Cluster creation timeout"`` if fewer than two
        workers are registered after five seconds.
    """
    from tornado.iostream import StreamClosedError

    c = Center('127.0.0.1', 8017)
    c.listen(c.port)
    a = Worker('127.0.0.2', 8018, c.ip, c.port, ncores=2)
    yield a._start()
    b = Worker('127.0.0.3', 8019, c.ip, c.port, ncores=1)
    yield b._start()

    start = time()
    try:
        while len(c.ncores) < 2:
            yield gen.sleep(0.01)
            if time() - start > 5:
                raise Exception("Cluster creation timeout")

        yield f(c, a, b)
    finally:
        logger.debug("Closing out test cluster")
        for w in [a, b]:
            # ``ignoring()`` was previously called with no exception types.
            # Assuming it suppresses only the types it is given (its
            # definition is not visible here), that suppressed nothing, so a
            # failing close skipped the directory removal and center stop.
            with ignoring(gen.TimeoutError, StreamClosedError, OSError):
                yield w._close()
            if os.path.exists(w.local_dir):
                shutil.rmtree(w.local_dir)
        c.stop()
def test_errors_dont_block():
    """Failing tasks must not stop other tasks from finishing.

    Generator-style test (no decorator is visible here).  Four tasks are
    submitted, alternating ``inc`` and ``throws``.  The test polls for up to
    five seconds until both ``inc`` futures report ``'finished'`` and then
    checks that gathering them gives ``[2, 3]``.
    """
    center = Center('127.0.0.1')
    center.listen(0)
    worker = Worker(center.ip, center.port, ncores=1, ip='127.0.0.1')
    executor = Executor((center.ip, center.port), start=False,
                        loop=IOLoop.current())

    yield worker._start()
    yield executor._start()

    calls = [(inc, 1), (throws, 1), (inc, 2), (throws, 2)]
    futures = [executor.submit(func, arg) for func, arg in calls]

    began = time()
    while not (futures[0].status == futures[2].status == 'finished'):
        assert time() < began + 5
        yield gen.sleep(0.01)

    gathered = yield executor._gather([futures[0], futures[2]])
    assert gathered == [2, 3]

    yield worker._close()
    center.stop()
def test_monitor_resources():
    """Scheduler resource logs stay bounded and feed the diagnostics view.

    Generator-style test (no decorator is visible here); skipped when
    ``psutil`` is unavailable.  Once every log holds three samples (the
    configured ``resource_log_size``), the logs must be keyed by the nanny
    addresses, and ``diagnostic_resources(n=2)`` must report two values per
    field, with fields drawn from ``cpu``, ``memory`` and ``time``.
    """
    pytest.importorskip('psutil')

    hub = Center('127.0.0.1')
    hub.listen(0)
    left = Nanny(hub.ip, hub.port, ncores=2, ip='127.0.0.1')
    right = Nanny(hub.ip, hub.port, ncores=2, ip='127.0.0.1')
    pair = [left, right]
    scheduler = Scheduler((hub.ip, hub.port), resource_interval=0.01,
                          resource_log_size=3)

    yield left._start()
    yield right._start()
    yield scheduler.sync_center()
    running = scheduler.start()

    try:
        assert scheduler.ncores == {
            ('127.0.0.1', left.worker_port): 2,
            ('127.0.0.1', right.worker_port): 2,
        }
        assert scheduler.nannies == {(n.ip, n.worker_port): n.port
                                     for n in pair}

        def logs_incomplete():
            # True while at least one log has fewer than three samples.
            return any(len(samples) < 3
                       for samples in scheduler.resource_logs.values())

        while logs_incomplete():
            yield gen.sleep(0.01)

        yield gen.sleep(0.1)

        assert set(scheduler.resource_logs) == {left.address, right.address}
        assert all(len(samples) == 3
                   for samples in scheduler.resource_logs.values())

        diagnostics = scheduler.diagnostic_resources(n=2)
        assert set(diagnostics) == {left.worker_address, right.worker_address}
        fields = diagnostics[left.worker_address]
        assert set(fields).issubset({'cpu', 'memory', 'time'})
        assert all(len(values) == 2 for values in fields.values())

        scheduler.put({'op': 'close'})
        yield running
    finally:
        with ignoring(TimeoutError, StreamClosedError, OSError):
            yield left._close(timeout=0.5)
        with ignoring(TimeoutError, StreamClosedError, OSError):
            yield right._close(timeout=0.5)
        hub.stop()
def test_nanny_process_failure():
    """A nanny restarts a worker that exits and cleans up its directories.

    Generator-style test (no decorator is visible here).  The worker is
    asked to run a serialized ``sys.exit(0)``.  The test waits, at most two
    seconds per phase, for the process to die, to come back, and for the
    worker to be listed in the center's ``ncores`` with a non-``None``
    ``worker_dir``.  After closing the nanny neither the first nor the
    second worker directory may exist.
    """
    center = Center('127.0.0.1')
    center.listen(0)
    nanny = Nanny(center.ip, center.port, ncores=2, ip='127.0.0.1')
    yield nanny._start()

    nanny_rpc = rpc(ip=nanny.ip, port=nanny.port)
    first_dir = nanny.worker_dir
    assert os.path.exists(first_dir)

    worker_rpc = rpc(ip=nanny.ip, port=nanny.worker_port)
    payload = valmap(dumps, {'x': 1, 'y': 2})
    yield worker_rpc.update_data(data=payload)

    # The worker exits while serving this call; a closed stream is expected.
    with ignoring(StreamClosedError):
        yield worker_rpc.compute(function=dumps(sys.exit),
                                 args=dumps((0,)),
                                 key='z')

    t0 = time()
    while nanny.process.is_alive():  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - t0 < 2

    t0 = time()
    while not nanny.process.is_alive():  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - t0 < 2

    t0 = time()
    while (nanny.worker_address not in center.ncores
           or nanny.worker_dir is None):
        yield gen.sleep(0.01)
        assert time() - t0 < 2

    second_dir = nanny.worker_dir

    yield nanny._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != nanny.worker_dir
    nanny_rpc.close_streams()
    center.stop()