Example #1
def test_broken_worker_during_computation(c, s, a, b):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    L = c.map(inc, range(256))
    for i in range(8):
        L = c.map(add, *zip(*partition_all(2, L)))

    from random import random
    yield gen.sleep(random() / 2)
    with ignoring(OSError):
        n.process.terminate()
    yield gen.sleep(random() / 2)
    with ignoring(OSError):
        n.process.terminate()

    result = yield c._gather(L)
    assert isinstance(result[0], int)

    yield n._close()
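
Every example on this page wraps cleanup or fault-injection code in the ignoring context manager from distributed.utils so that expected exceptions do not fail the surrounding test. For reference, a minimal sketch of such a helper, assuming it behaves like contextlib.suppress (called with no arguments it suppresses nothing):

from contextlib import contextmanager

@contextmanager
def ignoring(*exceptions):
    # Suppress the listed exception types; ignoring() with no
    # arguments catches nothing, so the block runs unprotected.
    try:
        yield
    except exceptions:
        pass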
Example #2
def test_bokeh():
    pytest.importorskip('bokeh')

    try:
        proc = Popen(['dscheduler'], stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:%d' % Scheduler.default_port)

        while True:
            line = proc.stderr.readline()
            if b'Bokeh UI' in line:
                break

        start = time()
        while True:
            try:
                for name in [socket.gethostname(), 'localhost', '127.0.0.1', get_ip()]:
                    response = requests.get('http://%s:8787/status/' % name)
                    assert response.ok
                break
            except:
                sleep(0.1)
                assert time() < start + 5

    finally:
        with ignoring(Exception):
            e.shutdown()
        with ignoring(Exception):
            os.kill(proc.pid, signal.SIGINT)
Example #3
def test_nanny_worker_ports(loop):
    try:
        worker = Popen(['dworker', '127.0.0.1:8989', '--host', '127.0.0.1',
                        '--worker-port', '8788', '--nanny-port', '8789'],
                        stdout=PIPE, stderr=PIPE)
        sched = Popen(['dscheduler', '--port', '8989'], stdout=PIPE, stderr=PIPE)
        with Executor('127.0.0.1:8989', loop=loop) as e:
            start = time()
            while True:
                d = sync(e.loop, e.scheduler.identity)
                if d['workers']:
                    break
                else:
                    assert time() - start < 5
                    sleep(0.1)
            assert d['workers']['127.0.0.1:8788']['services']['nanny'] == 8789
    finally:
        with ignoring(Exception):
            w = rpc('127.0.0.1:8789')
            sync(loop, w.terminate)

        with ignoring(Exception):
            os.kill(sched.pid, signal.SIGINT)

        with ignoring(Exception):
            worker.kill()
Example #4
def test_bokeh_non_standard_ports():
    pytest.importorskip('bokeh')

    try:
        proc = Popen(['dscheduler',
                      '--port', '3448',
                      '--http-port', '4824',
                      '--bokeh-port', '4832'], stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:3448')

        while True:
            line = proc.stderr.readline()
            if b'Bokeh UI' in line:
                break

        start = time()
        while True:
            try:
                response = requests.get('http://localhost:4832/status/')
                assert response.ok
                break
            except:
                sleep(0.1)
                assert time() < start + 5

    finally:
        with ignoring(Exception):
            e.shutdown()
        with ignoring(Exception):
            os.kill(proc.pid, signal.SIGINT)
Example #5
def cluster(nworkers=2):
    _port[0] += 1
    cport = _port[0]
    center = Process(target=run_center, args=(cport,))
    workers = []
    for i in range(nworkers):
        _port[0] += 1
        port = _port[0]
        proc = Process(target=run_worker, args=(port, cport), kwargs={'ncores': 1})
        workers.append({'port': port, 'proc': proc})

    center.start()
    for worker in workers:
        worker['proc'].start()

    sock = connect_sync('127.0.0.1', cport)
    while True:
        write_sync(sock, {'op': 'ncores'})
        ncores = read_sync(sock)
        if len(ncores) == nworkers:
            break

    try:
        yield {'proc': center, 'port': cport}, workers
    finally:
        for port in [cport] + [w['port'] for w in workers]:
            with ignoring(socket.error):
                sock = connect_sync('127.0.0.1', port)
                write_sync(sock, dict(op='terminate', close=True))
                response = read_sync(sock)
                sock.close()
        for proc in [center] + [w['proc'] for w in workers]:
            with ignoring(Exception):
                proc.terminate()
Example #6
def cluster(nworkers=2):
    _port[0] += 1
    cport = _port[0]
    center = Process(target=run_center, args=(cport,))
    workers = []
    for i in range(nworkers):
        _port[0] += 1
        port = _port[0]
        proc = Process(target=run_worker, args=(port, cport), kwargs={"ncores": 1})
        workers.append({"port": port, "proc": proc})

    center.start()
    for worker in workers:
        worker["proc"].start()

    sock = connect_sync("127.0.0.1", cport)
    while True:
        write_sync(sock, {"op": "ncores"})
        ncores = read_sync(sock)
        if len(ncores) == nworkers:
            break

    try:
        yield {"proc": center, "port": cport}, workers
    finally:
        for port in [cport] + [w["port"] for w in workers]:
            with ignoring(socket.error):
                sock = connect_sync("127.0.0.1", port)
                write_sync(sock, dict(op="terminate", close=True))
                response = read_sync(sock)
                sock.close()
        for proc in [center] + [w["proc"] for w in workers]:
            with ignoring(Exception):
                proc.terminate()
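
The cluster generators in the two examples above yield a center handle and a list of worker handles, then tear everything down in the finally block. A hedged usage sketch, assuming the generator is wrapped with contextlib.contextmanager in its source module (the decorator is not shown in this listing):

from contextlib import contextmanager

cluster_cm = contextmanager(cluster)  # hypothetical wrapper for illustration

with cluster_cm(nworkers=2) as (center, workers):
    # center is {'proc': Process, 'port': int}; workers is a list of
    # {'port': int, 'proc': Process} dicts
    print(center['port'], [w['port'] for w in workers])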
Example #7
def test_broken_worker_during_computation(c, s, a, b):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    L = c.map(inc, range(256))
    for i in range(8):
        L = c.map(add, *zip(*partition_all(2, L)))

    from random import random
    yield gen.sleep(random() / 2)
    with ignoring(OSError):
        n.process.terminate()
    yield gen.sleep(random() / 2)
    with ignoring(OSError):
        n.process.terminate()

    result = yield c._gather(L)
    assert isinstance(result[0], int)

    yield n._close()
Example #8
def test_nanny_worker_ports(loop):
    try:
        worker = Popen([
            'dworker', '127.0.0.1:8989', '--host', '127.0.0.1',
            '--worker-port', '8788', '--nanny-port', '8789'
        ],
                       stdout=PIPE,
                       stderr=PIPE)
        sched = Popen(['dscheduler', '--port', '8989'],
                      stdout=PIPE,
                      stderr=PIPE)
        with Executor('127.0.0.1:8989', loop=loop) as e:
            start = time()
            while True:
                d = sync(e.loop, e.scheduler.identity)
                if d['workers']:
                    break
                else:
                    assert time() - start < 5
                    sleep(0.1)
            assert d['workers']['127.0.0.1:8788']['services']['nanny'] == 8789
    finally:
        with ignoring(Exception):
            w = rpc('127.0.0.1:8789')
            sync(loop, w.terminate)

        with ignoring(Exception):
            os.kill(sched.pid, signal.SIGINT)

        with ignoring(Exception):
            worker.kill()
Example #9
def test_bokeh():
    pytest.importorskip('bokeh')

    try:
        proc = Popen(['dscheduler'], stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:%d' % Scheduler.default_port)

        while True:
            line = proc.stderr.readline()
            if b'Start Bokeh UI' in line:
                break

        start = time()
        while True:
            try:
                for name in [
                        socket.gethostname(), 'localhost', '127.0.0.1',
                        get_ip()
                ]:
                    response = requests.get('http://%s:8787/status/' % name)
                    assert response.ok
                break
            except:
                sleep(0.1)
                assert time() < start + 5

    finally:
        with ignoring(Exception):
            e.shutdown()
        with ignoring(Exception):
            os.kill(proc.pid, signal.SIGINT)
Example #10
def test_broken_worker_during_computation(c, s, a, b):
    s.allowed_failures = 100
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    L = c.map(inc, range(256))
    for i in range(8):
        L = c.map(add, *zip(*partition_all(2, L)))

    from random import random
    yield gen.sleep(random() / 2)
    with ignoring(CommClosedError):  # comm will be closed abruptly
        yield c._run(os._exit, 1, workers=[n.worker_address])
    yield gen.sleep(random() / 2)
    with ignoring(
            CommClosedError,
            EnvironmentError):  # perhaps new worker can't be contacted yet
        yield c._run(os._exit, 1, workers=[n.worker_address])

    result = yield c._gather(L)
    assert isinstance(result[0], int)

    yield n._close()
Example #11
def test_no_bokeh():
    pytest.importorskip('bokeh')

    try:
        proc = Popen(['dscheduler', '--no-bokeh'], stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:%d' % Scheduler.default_port)
        for i in range(3):
            assert b'bokeh' not in next(proc.stderr)
    finally:
        with ignoring(Exception):
            e.shutdown()
        with ignoring(Exception):
            os.kill(proc.pid, signal.SIGINT)
Example #12
def test_no_bokeh():
    pytest.importorskip('bokeh')

    try:
        proc = Popen(['dscheduler', '--no-bokeh'], stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:%d' % Scheduler.default_port)
        for i in range(3):
            assert b'bokeh' not in next(proc.stderr)
    finally:
        with ignoring(Exception):
            e.shutdown()
        with ignoring(Exception):
            os.kill(proc.pid, signal.SIGINT)
Example #13
async def start_scheduler(gateway, security, exit_on_failure=True):
    loop = IOLoop.current()
    plugin = GatewaySchedulerPlugin(gateway, loop)

    services = {("gateway", 0): (GatewaySchedulerService, {"plugin": plugin})}
    dashboard = False
    with ignoring(ImportError):
        from distributed.dashboard.scheduler import BokehScheduler

        services[("dashboard", 0)] = (BokehScheduler, {})
        dashboard = True

    scheduler = Scheduler(loop=loop, services=services, security=security)
    scheduler.add_plugin(plugin)
    await scheduler

    host = urlparse(scheduler.address).hostname
    gateway_port = scheduler.services["gateway"].port
    api_address = "http://%s:%d" % (host, gateway_port)

    if dashboard:
        dashboard_port = scheduler.services["dashboard"].port
        dashboard_address = "http://%s:%d" % (host, dashboard_port)
    else:
        dashboard_address = ""

    try:
        await gateway.send_addresses(scheduler.address, dashboard_address,
                                     api_address)
    except Exception as exc:
        logger.error("Failed to send addresses to gateway", exc_info=exc)
        if exit_on_failure:
            sys.exit(1)

    return scheduler
Example #14
    def f():
        nn = rpc(ip=n.ip, port=n.port)
        yield n._start()

        ww = rpc(ip=n.ip, port=n.worker_port)
        yield ww.update_data(data={'x': 1, 'y': 2})
        with ignoring(StreamClosedError):
            yield ww.compute(function=sys.exit, args=(0,), key='z')

        start = time()
        while n.process.is_alive():  # wait while process dies
            yield gen.sleep(0.01)
            assert time() - start < 2

        start = time()
        while not n.process.is_alive():  # wait while process comes back
            yield gen.sleep(0.01)
            assert time() - start < 2

        start = time()
        while n.worker_address not in c.ncores:
            yield gen.sleep(0.01)
            assert time() - start < 2

        yield n._close()
        c.stop()
Example #15
    def adapt(self, **kwargs):
        """ Turn on adaptivity

        For keyword arguments see dask_drmaa.adaptive.Adaptive

        Examples
        --------
        >>> cluster.adapt(minimum=0, maximum=10, interval='500ms')

        See Also
        --------
        Cluster: an interface for other clusters to inherit from
        """
        from .adaptive import Adaptive

        with ignoring(AttributeError):
            self._adaptive.stop()
        if not hasattr(self, '_adaptive_options'):
            self._adaptive_options = {}

        self._adaptive_options.update(kwargs)
        self._adaptive = Adaptive(
            self, self.scheduler, **self._adaptive_options
        )

        return self._adaptive
Example #16
def test_worker_who_has_clears_after_failed_connection(c, s, a, b):
    n = yield Nanny(s.address, nthreads=2, loop=s.loop)

    start = time()
    while len(s.nthreads) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    futures = c.map(slowinc, range(20), delay=0.01, key=["f%d" % i for i in range(20)])
    yield wait(futures)

    result = yield c.submit(sum, futures, workers=a.address)
    for dep in set(a.dep_state) - set(a.task_state):
        a.release_dep(dep, report=True)

    n_worker_address = n.worker_address
    with ignoring(CommClosedError):
        yield c._run(os._exit, 1, workers=[n_worker_address])

    while len(s.workers) > 2:
        yield gen.sleep(0.01)

    total = c.submit(sum, futures, workers=a.address)
    yield total

    assert not a.has_what.get(n_worker_address)
    assert not any(n_worker_address in s for s in a.who_has.values())

    yield n.close()
Example #17
 def scale_cb(b):
     with log_errors():
         n = request.value
         with ignoring(AttributeError):
             self._adaptive.stop()
         self.scale(n)
         update()
Example #18
def test_failed_worker_without_warning(c, s, a, b):
    L = c.map(inc, range(10))
    yield wait(L)

    original_pid = a.pid
    with ignoring(CommClosedError):
        yield c._run(os._exit, 1, workers=[a.worker_address])
    start = time()
    while a.pid == original_pid:
        yield gen.sleep(0.01)
        assert time() - start < 10

    yield gen.sleep(0.5)

    start = time()
    while len(s.ncores) < 2:
        yield gen.sleep(0.01)
        assert time() - start < 10

    yield wait(L)

    L2 = c.map(inc, range(10, 20))
    yield wait(L2)
    assert all(len(keys) > 0 for keys in s.has_what.values())
    ncores2 = dict(s.ncores)

    yield c._restart()

    L = c.map(inc, range(10))
    yield wait(L)
    assert all(len(keys) > 0 for keys in s.has_what.values())

    assert not (set(ncores2) & set(s.ncores))  # no overlap
Example #19
    def f():
        nn = rpc(ip=n.ip, port=n.port)
        yield n._start()
        first_dir = n.worker_dir

        assert os.path.exists(first_dir)

        ww = rpc(ip=n.ip, port=n.worker_port)
        yield ww.update_data(data={'x': 1, 'y': 2})
        with ignoring(StreamClosedError):
            yield ww.compute(function=sys.exit, args=(0,), key='z')

        start = time()
        while n.process.is_alive():  # wait while process dies
            yield gen.sleep(0.01)
            assert time() - start < 2

        start = time()
        while not n.process.is_alive():  # wait while process comes back
            yield gen.sleep(0.01)
            assert time() - start < 2

        start = time()
        while n.worker_address not in c.ncores or n.worker_dir is None:
            yield gen.sleep(0.01)
            assert time() - start < 2

        second_dir = n.worker_dir

        yield n._close()
        assert not os.path.exists(second_dir)
        assert not os.path.exists(first_dir)
        assert first_dir != n.worker_dir
        c.stop()
Example #20
def test_worker_who_has_clears_after_failed_connection(c, s, a, b):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    futures = c.map(slowinc, range(20), delay=0.01,
                    key=['f%d' % i for i in range(20)])
    yield wait(futures)

    result = yield c.submit(sum, futures, workers=a.address)
    for dep in set(a.dep_state) - set(a.task_state):
        a.release_dep(dep, report=True)

    n_worker_address = n.worker_address
    with ignoring(CommClosedError):
        yield c._run(os._exit, 1, workers=[n_worker_address])

    while len(s.workers) > 2:
        yield gen.sleep(0.01)

    total = c.submit(sum, futures, workers=a.address)
    yield total

    assert not a.has_what.get(n_worker_address)
    assert not any(n_worker_address in s for s in a.who_has.values())

    yield n._close()
Example #21
def test_bokeh(loop):
    from distributed.http import HTTPScheduler
    import requests
    with LocalCluster(
            scheduler_port=0,
            silence_logs=False,
            loop=loop,
            diagnostics_port=4724,
            services={('http', 0): HTTPScheduler},
    ) as c:
        start = time()
        while True:
            with ignoring(Exception):
                response = requests.get('http://127.0.0.1:%d/status/' %
                                        c.diagnostics.port)
                if response.ok:
                    break
            assert time() < start + 20
            sleep(0.01)

    start = time()
    while not raises(
            lambda: requests.get('http://127.0.0.1:%d/status/' % 4724)):
        assert time() < start + 10
        sleep(0.01)
Example #22
def test_failed_worker_without_warning(c, s, a, b):
    L = c.map(inc, range(10))
    yield wait(L)

    original_pid = a.pid
    with ignoring(CommClosedError):
        yield c._run(os._exit, 1, workers=[a.worker_address])
    start = time()
    while a.pid == original_pid:
        yield gen.sleep(0.01)
        assert time() - start < 10

    yield gen.sleep(0.5)

    start = time()
    while len(s.nthreads) < 2:
        yield gen.sleep(0.01)
        assert time() - start < 10

    yield wait(L)

    L2 = c.map(inc, range(10, 20))
    yield wait(L2)
    assert all(len(keys) > 0 for keys in s.has_what.values())
    nthreads2 = dict(s.nthreads)

    yield c.restart()

    L = c.map(inc, range(10))
    yield wait(L)
    assert all(len(keys) > 0 for keys in s.has_what.values())

    assert not (set(nthreads2) & set(s.nthreads))  # no overlap
Example #23
async def start_scheduler(
    gateway,
    security,
    adaptive_period=3,
    heartbeat_period=15,
    idle_timeout=0,
    exit_on_failure=True,
):
    loop = IOLoop.current()
    services = {
        ("gateway", 0): (
            GatewaySchedulerService,
            {
                "gateway": gateway,
                "adaptive_period": adaptive_period,
                "heartbeat_period": heartbeat_period,
                "idle_timeout": idle_timeout,
            },
        )
    }
    with ignoring(ImportError):
        from distributed.dashboard.scheduler import BokehScheduler

        services[("dashboard", 0)] = (BokehScheduler, {})

    scheduler = Scheduler(loop=loop, services=services, security=security)
    return await scheduler
Example #24
def main(host, port, http_port, bokeh_port, bokeh_internal_port, show, _bokeh,
         bokeh_whitelist, prefix, use_xheaders, pid_file):

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    if sys.platform.startswith('linux'):
        import resource  # module fails importing on Windows
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    addr = uri_from_host_port(host, port, 8786)

    loop = IOLoop.current()
    logger.info('-' * 47)

    services = {('http', http_port): HTTPScheduler}
    if _bokeh:
        with ignoring(ImportError):
            from distributed.bokeh.scheduler import BokehScheduler
            services[('bokeh', bokeh_internal_port)] = BokehScheduler
    scheduler = Scheduler(loop=loop, services=services)
    scheduler.start(addr)

    bokeh_proc = None
    if _bokeh:
        try:
            from distributed.bokeh.application import BokehWebInterface
            bokeh_proc = BokehWebInterface(http_port=http_port,
                                           tcp_port=scheduler.port,
                                           bokeh_port=bokeh_port,
                                           bokeh_whitelist=bokeh_whitelist,
                                           show=show,
                                           prefix=prefix,
                                           use_xheaders=use_xheaders,
                                           quiet=False)
        except ImportError:
            logger.info("Please install Bokeh to get Web UI")
        except Exception as e:
            logger.warn("Could not start Bokeh web UI", exc_info=True)

    logger.info('-' * 47)
    try:
        loop.start()
        loop.close()
    finally:
        scheduler.stop()
        if bokeh_proc:
            bokeh_proc.close()

        logger.info("End scheduler at %r", addr)
Example #25
 def request_cb(b):
     with log_errors():
         arg = request.value
         with ignoring(AttributeError):
             self._adaptive.stop()
         local_kwargs = dict()
         local_kwargs[kwarg] = arg
         self.scale(**local_kwargs)
Example #26
def cluster(nworkers=2, nanny=False):
    if nanny:
        _run_worker = run_nanny
    else:
        _run_worker = run_worker
    _port[0] += 1
    cport = _port[0]
    center = Process(target=run_center, args=(cport,))
    workers = []
    for i in range(nworkers):
        _port[0] += 1
        port = _port[0]
        proc = Process(target=_run_worker, args=(port, cport),
                        kwargs={'ncores': 1, 'local_dir': '_test_worker-%d' % port})
        workers.append({'port': port, 'proc': proc})

    center.start()
    for worker in workers:
        worker['proc'].start()

    sock = connect_sync('127.0.0.1', cport)
    start = time()
    try:
        while True:
            write_sync(sock, {'op': 'ncores'})
            ncores = read_sync(sock)
            if len(ncores) == nworkers:
                break
            if time() - start > 5:
                raise Exception("Timeout on cluster creation")

        yield {'proc': center, 'port': cport}, workers
    finally:
        logger.debug("Closing out test cluster")
        for port in [cport] + [w['port'] for w in workers]:
            with ignoring(socket.error):
                sock = connect_sync('127.0.0.1', port)
                write_sync(sock, dict(op='terminate', close=True))
                response = read_sync(sock)
                sock.close()
        for proc in [center] + [w['proc'] for w in workers]:
            with ignoring(Exception):
                proc.terminate()
        for fn in glob('_test_worker-*'):
            shutil.rmtree(fn)
Example #27
 def stop_workers(self, workers):
     if not workers:
         return
     workers = list(map(int, workers))
     jobs = [self.jobs[w] for w in workers]
     self._call([self._cancelcmd] + list(jobs))
     for w in workers:
         with ignoring(KeyError):
             del self.jobs[w]
Example #28
 def stop_workers(self, workers):
     """ Stop a list of workers"""
     if not workers:
         return
     workers = list(map(int, workers))
     jobs = [self.jobs[w] for w in workers]
     self._call([self.cancel_command] + list(jobs))
     for w in workers:
         with ignoring(KeyError):
             del self.jobs[w]
Example #29
    def g():
        c = Center('127.0.0.1', 8017)
        c.listen(c.port)
        a = Worker('127.0.0.1', 8018, c.ip, c.port, ncores=1)
        yield a._start()
        b = Worker('127.0.0.1', 8019, c.ip, c.port, ncores=1)
        yield b._start()

        while len(c.ncores) < 2:
            yield gen.sleep(0.01)

        try:
            yield f(c, a, b)
        finally:
            with ignoring(Exception):
                yield a._close()
            with ignoring(Exception):
                yield b._close()
            c.stop()
Example #30
    def g():
        c = Center('127.0.0.1', 8017)
        c.listen(c.port)
        a = Worker('127.0.0.1', 8018, c.ip, c.port, ncores=2)
        yield a._start()
        b = Worker('127.0.0.1', 8019, c.ip, c.port, ncores=1)
        yield b._start()

        while len(c.ncores) < 2:
            yield gen.sleep(0.01)

        try:
            yield f(c, a, b)
        finally:
            with ignoring():
                yield a._close()
            with ignoring():
                yield b._close()
            c.stop()
Example #31
def cluster(nworkers=2, nanny=False):
    if nanny:
        _run_worker = run_nanny
    else:
        _run_worker = run_worker
    _port[0] += 1
    cport = _port[0]
    center = Process(target=run_center, args=(cport,))
    workers = []
    for i in range(nworkers):
        _port[0] += 1
        port = _port[0]
        proc = Process(target=_run_worker, args=(port, cport), kwargs={"ncores": 1})
        workers.append({"port": port, "proc": proc})

    center.start()
    for worker in workers:
        worker["proc"].start()

    sock = connect_sync("127.0.0.1", cport)
    start = time()
    try:
        while True:
            write_sync(sock, {"op": "ncores"})
            ncores = read_sync(sock)
            if len(ncores) == nworkers:
                break
            if time() - start > 5:
                raise Exception("Timeout on cluster creation")

        yield {"proc": center, "port": cport}, workers
    finally:
        logger.debug("Closing out test cluster")
        for port in [cport] + [w["port"] for w in workers]:
            with ignoring(socket.error):
                sock = connect_sync("127.0.0.1", port)
                write_sync(sock, dict(op="terminate", close=True))
                response = read_sync(sock)
                sock.close()
        for proc in [center] + [w["proc"] for w in workers]:
            with ignoring(Exception):
                proc.terminate()
Example #32
def test_active_holds_tasks(e, s, w):
    future = e.submit(slowinc, 1, delay=0.2)
    yield gen.sleep(0.1)
    assert future.key in w.active
    yield future._result()
    assert future.key not in w.active

    future = e.submit(throws, 1)
    with ignoring(Exception):
        yield _wait([future])
    assert not w.active
Example #33
def test_active_holds_tasks(e, s, w):
    future = e.submit(slowinc, 1, delay=0.2)
    yield gen.sleep(0.1)
    assert future.key in w.active
    yield future._result()
    assert future.key not in w.active

    future = e.submit(throws, 1)
    with ignoring(Exception):
        yield _wait([future])
    assert not w.active
Example #34
    def _close(self):
        if self.status == 'closed':
            return

        logging.info('Stopping workers...')
        self.workers.close()

        with ignoring(gen.TimeoutError, CommClosedError, OSError):
            logging.info('Stopping scheduler...')
            yield self.scheduler.close(fast=True)

        self.status = 'closed'
Example #35
    def adapt(self, **kwargs):

        with ignoring(AttributeError):
            self._adaptive.stop()

        if not hasattr(self, '_adaptive_options'):
            self._adaptive_options = {}

        self._adaptive_options.update(kwargs)
        self._adaptive = _ImprovedAdaptive(self.scheduler, self,
                                           **self._adaptive_options)
        return self._adaptive
Example #36
def test_broken_worker_during_computation(c, s, a, b):
    s.allowed_failures = 100
    n = Nanny(s.address, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    N = 256
    expected_result = N * (N + 1) // 2
    i = 0
    L = c.map(inc, range(N), key=["inc-%d-%d" % (i, j) for j in range(N)])
    while len(L) > 1:
        i += 1
        L = c.map(
            slowadd,
            *zip(*partition_all(2, L)),
            key=["add-%d-%d" % (i, j) for j in range(len(L) // 2)]
        )

    yield gen.sleep(random.random() / 20)
    with ignoring(CommClosedError):  # comm will be closed abruptly
        yield c._run(os._exit, 1, workers=[n.worker_address])

    yield gen.sleep(random.random() / 20)
    while len(s.workers) < 3:
        yield gen.sleep(0.01)

    with ignoring(
        CommClosedError, EnvironmentError
    ):  # perhaps new worker can't be contacted yet
        yield c._run(os._exit, 1, workers=[n.worker_address])

    [result] = yield c.gather(L)
    assert isinstance(result, int)
    assert result == expected_result

    yield n.close()
Example #37
def test_stress_scatter_death(c, s, *workers):
    import random

    s.allowed_failures = 1000
    np = pytest.importorskip("numpy")
    L = yield c.scatter([np.random.random(10000) for i in range(len(workers))])
    yield c.replicate(L, n=2)

    adds = [
        delayed(slowadd, pure=True)(
            random.choice(L),
            random.choice(L),
            delay=0.05,
            dask_key_name="slowadd-1-%d" % i,
        ) for i in range(50)
    ]

    adds = [
        delayed(slowadd, pure=True)(a,
                                    b,
                                    delay=0.02,
                                    dask_key_name="slowadd-2-%d" % i)
        for i, (a, b) in enumerate(sliding_window(2, adds))
    ]

    futures = c.compute(adds)
    L = adds = None

    alive = list(workers)

    from distributed.scheduler import logger

    for i in range(7):
        yield gen.sleep(0.1)
        try:
            s.validate_state()
        except Exception as c:
            logger.exception(c)
            if config.get("log-on-err"):
                import pdb

                pdb.set_trace()
            else:
                raise
        w = random.choice(alive)
        yield w.close()
        alive.remove(w)

    with ignoring(CancelledError):
        yield c.gather(futures)

    futures = None
Example #38
    def g():
        c = Center("127.0.0.1", 8017)
        c.listen(c.port)
        a = Worker("127.0.0.2", 8018, c.ip, c.port, ncores=2)
        yield a._start()
        b = Worker("127.0.0.3", 8019, c.ip, c.port, ncores=1)
        yield b._start()

        start = time()
        try:
            while len(c.ncores) < 2:
                yield gen.sleep(0.01)
                if time() - start > 5:
                    raise Exception("Cluster creation timeout")

            yield f(c, a, b)
        finally:
            logger.debug("Closing out test cluster")
            with ignoring():
                yield a._close()
            with ignoring():
                yield b._close()
            c.stop()
Example #39
 def adapt(self,
           minimum_cores=None,
           maximum_cores=None,
           minimum_memory=None,
           maximum_memory=None,
           **kwargs):
     """ Turn on adaptivity
     For keyword arguments see dask.distributed.Adaptive
     Instead of the minimum and maximum parameters, which apply to the number of
     workers, if the Cluster object implements the jobqueue_worker_spec attribute
     one can use the following parameters:
     Parameters
     ----------
     minimum_cores: int
         Minimum number of cores for the cluster
     maximum_cores: int
         Maximum number of cores for the cluster
     minimum_memory: str
         Minimum amount of memory for the cluster
     maximum_memory: str
         Maximum amount of memory for the cluster
     Examples
     --------
     >>> cluster.adapt(minimum=0, maximum=10, interval='500ms')
     >>> cluster.adapt(minimum_cores=24, maximum_cores=96)
     >>> cluster.adapt(minimum_memory='60 GB', maximum_memory='1 TB')
     """
     with ignoring(AttributeError):
         self._adaptive.stop()
     if not hasattr(self, "_adaptive_options"):
         self._adaptive_options = {}
     if "minimum" not in kwargs:
         if minimum_cores is not None:
             kwargs["minimum"] = self._get_nb_workers_from_cores(
                 minimum_cores)
         elif minimum_memory is not None:
             kwargs["minimum"] = self._get_nb_workers_from_memory(
                 minimum_memory)
     if "maximum" not in kwargs:
         if maximum_cores is not None:
             kwargs["maximum"] = self._get_nb_workers_from_cores(
                 maximum_cores)
         elif maximum_memory is not None:
             kwargs["maximum"] = self._get_nb_workers_from_memory(
                 maximum_memory)
     self._adaptive_options.update(kwargs)
     self._adaptive = Adaptive(self.scheduler, self,
                               **self._adaptive_options)
     return self._adaptive
Example #40
def test_broken_worker_during_computation(c, s, a, b):
    s.allowed_failures = 100
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    N = 256
    expected_result = N * (N + 1) // 2
    i = 0
    L = c.map(inc, range(N),
              key=['inc-%d-%d' % (i, j) for j in range(N)])
    while len(L) > 1:
        i += 1
        L = c.map(slowadd, *zip(*partition_all(2, L)),
                  key=['add-%d-%d' % (i, j) for j in range(len(L) // 2)])

    yield gen.sleep(random.random() / 20)
    with ignoring(CommClosedError):  # comm will be closed abruptly
        yield c._run(os._exit, 1, workers=[n.worker_address])

    yield gen.sleep(random.random() / 20)
    while len(s.workers) < 3:
        yield gen.sleep(0.01)

    with ignoring(CommClosedError, EnvironmentError):  # perhaps new worker can't be contacted yet
        yield c._run(os._exit, 1, workers=[n.worker_address])

    [result] = yield c.gather(L)
    assert isinstance(result, int)
    assert result == expected_result

    yield n._close()
Example #41
def cluster(nworkers=2):
    _port[0] += 1
    cport = _port[0]
    center = Process(target=run_center, args=(cport, ))
    workers = []
    for i in range(nworkers):
        _port[0] += 1
        port = _port[0]
        proc = Process(target=run_worker,
                       args=(port, cport),
                       kwargs={'ncores': 1})
        workers.append({'port': port, 'proc': proc})

    center.start()
    for worker in workers:
        worker['proc'].start()

    sock = connect_sync('127.0.0.1', cport)
    while True:
        write_sync(sock, {'op': 'ncores'})
        ncores = read_sync(sock)
        if len(ncores) == nworkers:
            break

    try:
        yield {'proc': center, 'port': cport}, workers
    finally:
        for port in [cport] + [w['port'] for w in workers]:
            with ignoring(socket.error):
                sock = connect_sync('127.0.0.1', port)
                write_sync(sock, dict(op='terminate', close=True))
                response = read_sync(sock)
                sock.close()
        for proc in [center] + [w['proc'] for w in workers]:
            with ignoring(Exception):
                proc.terminate()
Example #42
def test_start_diagnostics(loop):
    from distributed.http import HTTPScheduler
    import requests
    with LocalCluster(scheduler_port=0, silence_logs=False, loop=loop) as c:
        c.start_diagnostics_server(show=False, port=3748)

        start = time()
        while True:
            with ignoring(Exception):
                response = requests.get('http://127.0.0.1:%d/status/' %
                                        c.diagnostics.port)
                if response.ok:
                    break
            assert time() < start + 20
            sleep(0.01)
Example #43
def scheduler():  # pragma: nocover
    app_client = skein.ApplicationClient.from_current()

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if sys.platform.startswith('linux'):
        import resource  # module fails importing on Windows
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    addr = 'tcp://'

    loop = IOLoop.current()

    services = {}
    bokeh = False
    with ignoring(ImportError):
        try:
            from distributed.dashboard.scheduler import BokehScheduler
        except ImportError:
            # Old import location
            from distributed.bokeh.scheduler import BokehScheduler
        services[('bokeh', 0)] = (BokehScheduler, {})
        bokeh = True

    scheduler = Scheduler(loop=loop, services=services)
    scheduler.start(addr)

    install_signal_handlers(loop)

    # Set dask.dashboard before dask.scheduler since the YarnCluster object
    # waits on dask.scheduler only
    if bokeh:
        bokeh_port = scheduler.services['bokeh'].port
        bokeh_host = urlparse(scheduler.address).hostname
        bokeh_address = 'http://%s:%d' % (bokeh_host, bokeh_port)

        app_client.kv['dask.dashboard'] = bokeh_address.encode()

    app_client.kv['dask.scheduler'] = scheduler.address.encode()

    try:
        loop.start()
        loop.close()
    finally:
        scheduler.stop()
Example #44
def test_BokehWebInterface(loop):
    with LocalCluster(2, loop=loop, scheduler_port=0,
                      services={('http', 0): HTTPScheduler}) as c:
        w = BokehWebInterface(
                tcp_port=c.scheduler.port,
                http_port=c.scheduler.services['http'].port,
                bokeh_port=8787)

        start = time()
        while True:
            with ignoring(Exception):
                response = requests.get('http://127.0.0.1:8787/status/')
                if response.ok:
                    break
            assert time() < start + 5
            sleep(0.01)
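
Several of these tests repeat the same polling pattern: retry a request inside with ignoring(Exception) and fail once a wall-clock deadline passes. A condensed, self-contained sketch of that pattern; wait_for and server_up are illustrative names, not taken from the listing:

from time import sleep, time

def wait_for(predicate, timeout=5, interval=0.01):
    # Poll predicate() until it returns True, or fail after `timeout` seconds.
    deadline = time() + timeout
    while not predicate():
        assert time() < deadline, "condition not met before timeout"
        sleep(interval)

# Example predicate: report whether an HTTP status page answers,
# swallowing connection errors with ignoring(Exception).
# def server_up():
#     with ignoring(Exception):
#         return requests.get('http://127.0.0.1:8787/status/').ok
#     return False
# wait_for(server_up)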
Example #45
def test_start_diagnostics(loop):
    pytest.importorskip('bokeh')
    from distributed.http import HTTPScheduler
    import requests
    with LocalCluster(scheduler_port=0, silence_logs=False, loop=loop,
            diagnostics_port=None) as c:
        c.start_diagnostics_server(show=False, port=3748)

        start = time()
        while True:
            with ignoring(Exception):
                response = requests.get('http://127.0.0.1:%d/status/' %
                                        c.diagnostics.port)
                if response.ok:
                    break
            assert time() < start + 20
            sleep(0.01)
Example #46
def test_bokeh(loop):
    from distributed.http import HTTPScheduler
    import requests
    with LocalCluster(scheduler_port=0, silence_logs=False, loop=loop,
            diagnostics_port=4724, services={('http', 0): HTTPScheduler}) as c:
        start = time()
        while True:
            with ignoring(Exception):
                response = requests.get('http://127.0.0.1:%d/status/' %
                                        c.diagnostics.port)
                if response.ok:
                    break
            assert time() < start + 20
            sleep(0.01)

    start = time()
    while not raises(lambda: requests.get('http://127.0.0.1:%d/status/' % 4724)):
        assert time() < start + 10
        sleep(0.01)
Example #47
def main():
    app_client = skein.ApplicationClient.from_current()

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if sys.platform.startswith('linux'):
        import resource  # module fails importing on Windows
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    addr = uri_from_host_port('', None, 0)

    loop = IOLoop.current()

    services = {}
    bokeh = False
    with ignoring(ImportError):
        from distributed.bokeh.scheduler import BokehScheduler
        services[('bokeh', 0)] = (BokehScheduler, {})
        bokeh = True

    scheduler = Scheduler(loop=loop, services=services)
    scheduler.start(addr)

    install_signal_handlers(loop)

    app_client.kv['dask.scheduler'] = scheduler.address.encode()

    if bokeh:
        bokeh_port = scheduler.services['bokeh'].port
        bokeh_host = urlparse(scheduler.address).hostname
        bokeh_address = 'http://%s:%d' % (bokeh_host, bokeh_port)

        app_client.kv['dask.dashboard'] = bokeh_address.encode()

    try:
        loop.start()
        loop.close()
    finally:
        scheduler.stop()
Example #48
def test_BokehWebInterface(loop):
    with LocalCluster(2, loop=loop, scheduler_port=0,
                      services={('http', 0): HTTPScheduler},
                      diagnostics_port=None) as c:
        with pytest.raises(Exception):
            response = requests.get('http://127.0.0.1:8787/status/')
        with BokehWebInterface(
                scheduler_address=c.scheduler.address,
                http_port=c.scheduler.services['http'].port,
                bokeh_port=8787) as w:
            start = time()
            while True:
                with ignoring(Exception):
                    response = requests.get('http://127.0.0.1:8787/status/')
                    if response.ok:
                        break
                assert time() < start + 5
                sleep(0.01)
    with pytest.raises(Exception):
        response = requests.get('http://127.0.0.1:8787/status/')
Example #49
def test_nanny_process_failure(c, s):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    yield n._start()
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    original_address = n.worker_address
    ww = rpc(n.worker_address)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    pid = n.pid
    assert pid is not None
    with ignoring(CommClosedError):
        yield c._run(os._exit, 0, workers=[n.worker_address])

    start = time()
    while n.pid == pid:  # wait while process dies and comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while not n.is_alive():  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    # assert n.worker_address != original_address  # most likely

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir
    ww.close_rpc()
    s.stop()
Example #50
def test_nanny_process_failure():
    c = Center('127.0.0.1')
    c.listen(0)
    n = Nanny(c.ip, c.port, ncores=2, ip='127.0.0.1')
    yield n._start()
    nn = rpc(ip=n.ip, port=n.port)
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    ww = rpc(ip=n.ip, port=n.worker_port)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(StreamClosedError):
        yield ww.compute(function=dumps(sys.exit),
                         args=dumps((0,)),
                         key='z')

    start = time()
    while n.process.is_alive():  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 2

    start = time()
    while not n.process.is_alive():  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 2

    start = time()
    while n.worker_address not in c.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 2

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir
    nn.close_streams()
    c.stop()
Example #51
def test_nanny_process_failure(c, s):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    yield n._start()
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    original_address = n.worker_address
    ww = rpc(n.worker_address)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    pid = n.pid
    assert pid is not None
    with ignoring(CommClosedError):
        yield c._run(os._exit, 0, workers=[n.worker_address])

    start = time()
    while n.pid == pid:  # wait while process dies and comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while not n.is_alive():  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    # assert n.worker_address != original_address  # most likely

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir
    ww.close_rpc()
    s.stop()
Example #52
def test_nanny_process_failure(s):
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start()
    nn = rpc(ip=n.ip, port=n.port)
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    original_process = n.process
    ww = rpc(ip=n.ip, port=n.worker_port)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(StreamClosedError):
        yield ww.compute(function=dumps(sys.exit),
                         args=dumps((0,)),
                         key='z')

    start = time()
    while n.process is original_process:  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while n.process.poll() is not None:  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir
    nn.close_streams()
    s.stop()
Example #53
def test_nanny_process_failure(c, s):
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start()
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    original_process = n.process
    ww = rpc(ip=n.ip, port=n.worker_port)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(StreamClosedError):
        yield c._run(sys.exit, 0, workers=[n.worker_address])

    start = time()
    while n.process is original_process:  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while not isalive(n.process):  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir
    ww.close_rpc()
    s.stop()
Example #54
    def g():
        c = Center('127.0.0.1', 8017)
        c.listen(c.port)
        a = Worker('127.0.0.2', 8018, c.ip, c.port, ncores=2)
        yield a._start()
        b = Worker('127.0.0.3', 8019, c.ip, c.port, ncores=1)
        yield b._start()

        start = time()
        try:
            while len(c.ncores) < 2:
                yield gen.sleep(0.01)
                if time() - start > 5:
                    raise Exception("Cluster creation timeout")

            yield f(c, a, b)
        finally:
            logger.debug("Closing out test cluster")
            for w in [a, b]:
                with ignoring(TimeoutError, StreamClosedError):
                    yield w._close()
                if os.path.exists(w.local_dir):
                    shutil.rmtree(w.local_dir)
            c.stop()
Example #55
def test_non_anonymous_access():
    with ignoring(NoCredentialsError):
        fs = S3FileSystem(anon=False)
        fs.ls('distributed-test')