Пример #1
0
    def f(c, a, b):
        e = Executor((c.ip, c.port), start=False, loop=loop)
        yield e._start()

        x_dsk = {('x', i, j): np.random.random((3, 3)) for i in range(3)
                                                       for j in range(2)}
        y_dsk = {('y', i, j): np.random.random((3, 3)) for i in range(2)
                                                       for j in range(3)}
        x_futures = yield e._scatter(x_dsk)
        y_futures = yield e._scatter(y_dsk)

        dt = np.random.random(0).dtype
        x_local = da.Array(x_dsk, 'x', ((3, 3, 3), (3, 3)), dt)
        y_local = da.Array(y_dsk, 'y', ((3, 3), (3, 3, 3)), dt)

        x_remote = da.Array(x_futures, 'x', ((3, 3, 3), (3, 3)), dt)
        y_remote = da.Array(y_futures, 'y', ((3, 3), (3, 3, 3)), dt)

        exprs = [lambda x, y: x.T + y,
                 lambda x, y: x.mean() + y.mean(),
                 lambda x, y: x.dot(y).std(axis=0),
                 lambda x, y: x - x.mean(axis=1)[:, None]]

        for expr in exprs:
            local = expr(x_local, y_local)
            local_results = dask.get(local.dask, local._keys())
            local_result = da.Array._finalize(local, local_results)

            remote = expr(x_remote, y_remote)
            remote_results = yield e._get(remote.dask, remote._keys())
            remote_result = da.Array._finalize(remote, remote_results)

            assert np.all(local_result == remote_result)

        yield e._shutdown()
Пример #2
0
def test_read_text(s, a, b):
    pytest.importorskip('dask.bag')
    import dask.bag as db
    from dask.imperative import Value
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    b = read_text(test_bucket_name, 'test/accounts', lazy=True,
                  collection=True, anon=True)
    assert isinstance(b, db.Bag)
    yield gen.sleep(0.2)
    assert not s.tasks

    future = e.compute(b.filter(None).map(json.loads).pluck('amount').sum())
    result = yield future._result()

    assert result == (1 + 2 + 3 + 4 + 5 + 6 + 7 + 8) * 100

    text = read_text(test_bucket_name, 'test/accounts', lazy=True,
                     collection=False, anon=True)
    assert all(isinstance(v, Value) for v in text)

    text = read_text(test_bucket_name, 'test/accounts', lazy=False,
                     collection=False, anon=True)
    assert all(isinstance(v, Future) for v in text)

    yield e._shutdown()
Пример #3
0
def dont_test_dataframes(s, a):  # slow
    pytest.importorskip('pandas')
    n = 3000000
    fn = '/tmp/test/file.csv'
    with make_hdfs() as hdfs:
        data = (b'name,amount,id\r\n' +
                b'Alice,100,1\r\nBob,200,2\r\n' * n)
        with hdfs.open(fn, 'w') as f:
            f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes(fn, hdfs=hdfs, delimiter=b'\r\n')
        assert len(futures) > 1

        def load(b, **kwargs):
            assert b
            from io import BytesIO
            import pandas as pd
            bio = BytesIO(b)
            return pd.read_csv(bio, **kwargs)

        dfs = e.map(load, futures, names=['name', 'amount', 'id'], skiprows=1)
        dfs2 = yield e._gather(dfs)
        assert sum(map(len, dfs2)) == n * 2 - 1
Пример #4
0
def test__dask_array_collections(s, a, b):
    import dask.array as da
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    x_dsk = {('x', i, j): np.random.random((3, 3)) for i in range(3)
                                                   for j in range(2)}
    y_dsk = {('y', i, j): np.random.random((3, 3)) for i in range(2)
                                                   for j in range(3)}
    x_futures = yield e._scatter(x_dsk)
    y_futures = yield e._scatter(y_dsk)

    dt = np.random.random(0).dtype
    x_local = da.Array(x_dsk, 'x', ((3, 3, 3), (3, 3)), dt)
    y_local = da.Array(y_dsk, 'y', ((3, 3), (3, 3, 3)), dt)

    x_remote = da.Array(x_futures, 'x', ((3, 3, 3), (3, 3)), dt)
    y_remote = da.Array(y_futures, 'y', ((3, 3), (3, 3, 3)), dt)

    exprs = [lambda x, y: x.T + y,
             lambda x, y: x.mean() + y.mean(),
             lambda x, y: x.dot(y).std(axis=0),
             lambda x, y: x - x.mean(axis=1)[:, None]]

    for expr in exprs:
        local = expr(x_local, y_local).compute(get=dask.get)

        remote = e.compute(expr(x_remote, y_remote))
        remote = yield remote._result()

        assert np.all(local == remote)

    yield e._shutdown()
Пример #5
0
def test_with_data(s, a, b):
    ss = HTTPScheduler(s)
    ss.listen(0)

    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    L = e.map(inc, [1, 2, 3])
    L2 = yield e._scatter(['Hello', 'world!'])
    yield _wait(L)

    client = AsyncHTTPClient()
    response = yield client.fetch('http://localhost:%s/memory-load.json' %
                                  ss.port)
    out = json.loads(response.body.decode())

    assert all(isinstance(v, int) for v in out.values())
    assert set(out) == {a.address_string, b.address_string}
    assert sum(out.values()) == sum(map(sys.getsizeof,
                                        [1, 2, 3, 'Hello', 'world!']))

    response = yield client.fetch('http://localhost:%s/memory-load-by-key.json'
                                  % ss.port)
    out = json.loads(response.body.decode())
    assert set(out) == {a.address_string, b.address_string}
    assert all(isinstance(v, dict) for v in out.values())
    assert all(k in {'inc', 'data'} for d in out.values() for k in d)
    assert all(isinstance(v, int) for d in out.values() for v in d.values())

    assert sum(v for d in out.values() for v in d.values()) == \
            sum(map(sys.getsizeof, [1, 2, 3, 'Hello', 'world!']))

    ss.stop()
    yield e._shutdown()
Пример #6
0
def test_bokeh():
    pytest.importorskip('bokeh')

    try:
        proc = Popen(['dscheduler'], stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:%d' % Scheduler.default_port)

        while True:
            line = proc.stderr.readline()
            if b'Bokeh UI' in line:
                break

        start = time()
        while True:
            try:
                for name in [socket.gethostname(), 'localhost', '127.0.0.1', get_ip()]:
                    response = requests.get('http://%s:8787/status/' % name)
                    assert response.ok
                break
            except:
                sleep(0.1)
                assert time() < start + 5

    finally:
        with ignoring(Exception):
            e.shutdown()
        with ignoring(Exception):
            os.kill(proc.pid, signal.SIGINT)
Пример #7
0
def test__futures_to_dask_bag(s, a, b):
    import dask.bag as db
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    L = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    futures = yield e._scatter(L)

    rb = yield _futures_to_dask_bag(futures)
    assert isinstance(rb, db.Bag)
    assert rb.npartitions == len(L)

    lb = db.from_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9], npartitions=3)

    exprs = [lambda x: x.map(lambda x: x + 1).sum(),
             lambda x: x.filter(lambda x: x % 2)]

    for expr in exprs:
        local = expr(lb).compute(get=dask.get)
        remote = e.compute(expr(rb))
        remote = yield remote._result()

        assert local == remote

    yield e._shutdown()
Пример #8
0
def test_bokeh_non_standard_ports():
    pytest.importorskip('bokeh')

    try:
        proc = Popen(['dscheduler',
                      '--port', '3448',
                      '--http-port', '4824',
                      '--bokeh-port', '4832'], stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:3448')

        while True:
            line = proc.stderr.readline()
            if b'Bokeh UI' in line:
                break

        start = time()
        while True:
            try:
                response = requests.get('http://localhost:4832/status/')
                assert response.ok
                break
            except:
                sleep(0.1)
                assert time() < start + 5

    finally:
        with ignoring(Exception):
            e.shutdown()
        with ignoring(Exception):
            os.kill(proc.pid, signal.SIGINT)
Пример #9
0
def test_avro(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    avro_files = {
        '/tmp/test/1.avro': avro_bytes,
        '/tmp/test/2.avro': avro_bytes
    }

    with make_hdfs() as hdfs:
        for k, v in avro_files.items():
            with hdfs.open(k, 'wb') as f:
                f.write(v)

            assert hdfs.info(k)['size'] > 0

        L = yield _read_avro('/tmp/test/*.avro', lazy=False)
        assert isinstance(L, list)
        assert all(isinstance(x, Future) for x in L)

        results = yield e._gather(L)
        assert all(isinstance(r, list) for r in results)
        assert results[0][:5] == data[:5]
        assert results[-1][-5:] == data[-5:]

        L = yield _read_avro('/tmp/test/*.avro', lazy=True)
        assert isinstance(L, list)
        assert all(isinstance(x, Value) for x in L)

    yield e._shutdown()
Пример #10
0
def test_procs(loop):
    with LocalCluster(2,
                      scheduler_port=0,
                      nanny=False,
                      threads_per_worker=3,
                      diagnostic_port=None,
                      silence_logs=False) as c:
        assert len(c.workers) == 2
        assert all(isinstance(w, Worker) for w in c.workers)
        with Executor((c.scheduler.ip, c.scheduler.port), loop=loop) as e:
            assert all(w.ncores == 3 for w in c.workers)
        repr(c)

    with LocalCluster(2,
                      scheduler_port=0,
                      nanny=True,
                      threads_per_worker=3,
                      diagnostic_port=None,
                      silence_logs=False) as c:
        assert len(c.workers) == 2
        assert all(isinstance(w, Nanny) for w in c.workers)
        with Executor((c.scheduler.ip, c.scheduler.port), loop=loop) as e:
            assert all(v == 3 for v in e.ncores().values())

            c.start_worker(nanny=False)
            assert isinstance(c.workers[-1], Worker)
        repr(c)
Пример #11
0
def test_avro(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    avro_files = {'/tmp/test/1.avro': avro_bytes,
                  '/tmp/test/2.avro': avro_bytes}

    with make_hdfs() as hdfs:
        for k, v in avro_files.items():
            with hdfs.open(k, 'w') as f:
                f.write(v)

            assert hdfs.info(k)['size'] > 0

        L = yield _read_avro('/tmp/test/*.avro', lazy=False)
        assert isinstance(L, list)
        assert all(isinstance(x, Future) for x in L)

        results = yield e._gather(L)
        assert all(isinstance(r, list) for r in results)
        assert results[0][:5] == data[:5]
        assert results[-1][-5:] == data[-5:]

        L = yield _read_avro('/tmp/test/*.avro', lazy=True)
        assert isinstance(L, list)
        assert all(isinstance(x, Value) for x in L)
Пример #12
0
def dont_test_dataframes(s, a):  # slow
    pytest.importorskip('pandas')
    n = 3000000
    fn = '/tmp/test/file.csv'
    with make_hdfs() as hdfs:
        data = (b'name,amount,id\r\n' + b'Alice,100,1\r\nBob,200,2\r\n' * n)
        with hdfs.open(fn, 'w') as f:
            f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_binary(fn, hdfs=hdfs, delimiter=b'\r\n')
        assert len(futures) > 1

        def load(b, **kwargs):
            assert b
            from io import BytesIO
            import pandas as pd
            bio = BytesIO(b)
            return pd.read_csv(bio, **kwargs)

        dfs = e.map(load, futures, names=['name', 'amount', 'id'], skiprows=1)
        dfs2 = yield e._gather(dfs)
        assert sum(map(len, dfs2)) == n * 2 - 1
Пример #13
0
def test_lazy_values(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'w', repl=1) as f:
                    f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        values = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True)
        assert all(isinstance(v, Value) for v in values)

        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.dask

        results = e.compute(*values, sync=False)
        results = yield e._gather(results)
        assert len(results) == 6
        assert all(x == b'a' for x in results)
Пример #14
0
def test_lazy_values(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'w', repl=1) as f:
                    f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        values = read_binary('/tmp/test/', hdfs=hdfs, lazy=True)
        assert all(isinstance(v, Value) for v in values)

        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.dask

        results = e.compute(*values, sync=False)
        results = yield e._gather(results)
        assert len(results) == 6
        assert all(x == b'a' for x in results)
Пример #15
0
def test_bokeh():
    pytest.importorskip('bokeh')

    try:
        proc = Popen(['dscheduler'], stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:%d' % Scheduler.default_port)

        while True:
            line = proc.stderr.readline()
            if b'Start Bokeh UI' in line:
                break

        start = time()
        while True:
            try:
                for name in [
                        socket.gethostname(), 'localhost', '127.0.0.1',
                        get_ip()
                ]:
                    response = requests.get('http://%s:8787/status/' % name)
                    assert response.ok
                break
            except:
                sleep(0.1)
                assert time() < start + 5

    finally:
        with ignoring(Exception):
            e.shutdown()
        with ignoring(Exception):
            os.kill(proc.pid, signal.SIGINT)
Пример #16
0
def test_no_divisions(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()
    dfs = e.map(tm.makeTimeDataFrame, range(5, 10))

    df = yield _futures_to_dask_dataframe(dfs)
    assert not df.known_divisions
    assert list(df.columns) == list(tm.makeTimeDataFrame(5).columns)
Пример #17
0
def test_no_divisions(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()
    dfs = e.map(tm.makeTimeDataFrame, range(5, 10))

    df = yield _futures_to_dask_dataframe(dfs)
    assert not df.known_divisions
    assert list(df.columns) == list(tm.makeTimeDataFrame(5).columns)
Пример #18
0
def test_hostport():
    try:
        proc = Popen(['dask-scheduler', '--no-bokeh',
                      '--host', '127.0.0.1:8978'],
                     stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:8978')
    finally:
        e.shutdown()
        os.kill(proc.pid, signal.SIGINT)
Пример #19
0
def test_read_bytes(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    futures = read_bytes(test_bucket_name, prefix='test/', anon=True)
    assert len(futures) >= len(files)
    results = yield e._gather(futures)
    assert set(results).issuperset(set(files.values()))

    yield e._shutdown()
Пример #20
0
def test_defaults():
    try:
        proc = Popen(['dscheduler', '--no-bokeh'], stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:%d' % Scheduler.default_port)

        response = requests.get('http://127.0.0.1:9786/info.json')
        assert response.ok
        assert response.json()['status'] == 'running'
    finally:
        e.shutdown()
        os.kill(proc.pid, signal.SIGINT)
Пример #21
0
def test_defaults():
    try:
        proc = Popen(['dscheduler', '--no-bokeh'], stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:%d' % Scheduler.default_port)

        response = requests.get('http://127.0.0.1:9786/info.json')
        assert response.ok
        assert response.json()['status'] == 'running'
    finally:
        e.shutdown()
        os.kill(proc.pid, signal.SIGINT)
Пример #22
0
def test_read_csv_with_names(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        df = yield _read_csv('/tmp/test/*.csv', names=['amount', 'name'],
                             lineterminator='\n', lazy=False)
        assert list(df.columns) == ['amount', 'name']

        yield e._shutdown()
Пример #23
0
def test_read_bytes(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    futures = read_bytes(test_bucket_name,
                         prefix='test/',
                         anon=True,
                         lazy=False)
    assert len(futures) >= len(files)
    results = yield e._gather(futures)
    assert set(results).issuperset(set(files.values()))

    yield e._shutdown()
Пример #24
0
def test_read_bytes_lazy(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    values = read_bytes(test_bucket_name, 'test/', lazy=True, anon=True)
    assert all(isinstance(v, Value) for v in values)

    results = e.compute(values, sync=False)
    results = yield e._gather(results)

    assert set(results).issuperset(set(files.values()))

    yield e._shutdown()
Пример #25
0
def test_no_bokeh():
    pytest.importorskip('bokeh')

    try:
        proc = Popen(['dscheduler', '--no-bokeh'], stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:%d' % Scheduler.default_port)
        for i in range(3):
            assert b'bokeh' not in next(proc.stderr)
    finally:
        with ignoring(Exception):
            e.shutdown()
        with ignoring(Exception):
            os.kill(proc.pid, signal.SIGINT)
Пример #26
0
    def _get_executor(self):
        loop = tornado.ioloop.IOLoop.current()

        IP = '127.0.0.1'
        PORT = 63000
        PORT_SCHEDULER = 63500

        from distributed import Executor
        executor = Executor('{}:{}'.format(IP, PORT_SCHEDULER), loop=loop,
                            start=False)
        yield executor._start()

        return executor
Пример #27
0
def test_no_bokeh():
    pytest.importorskip('bokeh')

    try:
        proc = Popen(['dscheduler', '--no-bokeh'], stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:%d' % Scheduler.default_port)
        for i in range(3):
            assert b'bokeh' not in next(proc.stderr)
    finally:
        with ignoring(Exception):
            e.shutdown()
        with ignoring(Exception):
            os.kill(proc.pid, signal.SIGINT)
Пример #28
0
    def f(c, a, b):
        e = Executor((c.ip, c.port), start=False)
        IOLoop.current().spawn_callback(e._go)

        remote_dfs = e.map(lambda x: x, dfs)
        ddf = yield _futures_to_dask_dataframe(e, remote_dfs, divisions=True)

        assert isinstance(ddf, dd.DataFrame)
        assert ddf.divisions == (0, 30, 60, 80)
        expr = ddf.x.sum()
        result = yield e._get(expr.dask, expr._keys())
        assert result == [sum([df.x.sum() for df in dfs])]

        yield e._shutdown()
Пример #29
0
def test_read_csv(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/1.csv', 'w') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        with hdfs.open('/tmp/test/2.csv', 'w') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = yield _read_csv('/tmp/test/*.csv', header=True, lineterminator='\n')
        result, = e.compute(df.id.sum(), sync=False)
        result = yield result._result()
        assert result == 1 + 2 + 3 + 4
Пример #30
0
def test__futures_to_dask_dataframe(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    remote_dfs = e.map(identity, dfs)
    ddf = yield _futures_to_dask_dataframe(remote_dfs, divisions=True,
            executor=e)

    assert isinstance(ddf, dd.DataFrame)
    assert ddf.divisions == (0, 30, 60, 80)
    expr = ddf.x.sum()
    result = yield e._get(expr.dask, expr._keys())
    assert result == [sum([df.x.sum() for df in dfs])]

    yield e._shutdown()
Пример #31
0
def test_read_csv_sync(loop):
    import dask.dataframe as dd
    import pandas as pd
    with cluster(nworkers=3) as (s, [a, b, c]):
        with make_hdfs() as hdfs:
            with hdfs.open('/tmp/test/1.csv', 'wb') as f:
                f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

            with hdfs.open('/tmp/test/2.csv', 'wb') as f:
                f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                futures = read_csv('/tmp/test/*.csv',
                                   lineterminator='\n',
                                   collection=False,
                                   lazy=False,
                                   header=0)
                assert all(isinstance(f, Future) for f in futures)
                L = e.gather(futures)
                assert isinstance(L[0], pd.DataFrame)
                assert list(L[0].columns) == ['name', 'amount', 'id']

                df = read_csv('/tmp/test/*.csv',
                              lineterminator='\n',
                              collection=True,
                              lazy=False,
                              header=0)
                assert isinstance(df, dd.DataFrame)
                assert list(df.head().iloc[0]) == ['Alice', 100, 1]
Пример #32
0
def dsubmit(*a, args=(), kwargs=None, rtn='', **kw):
    """Returns a distributed submission context manager, DSubmitter(),
    with a new executor instance.

    Parameters
    ----------
    args : Sequence of str, optional
        A tuple of argument names for DSubmitter.
    kwargs : Mapping of str to values or list of item tuples, optional
        Keyword argument names and values for DSubmitter.
    rtn : str, optional
        Name of object to return for DSubmitter.
    a, kw : Sequence and Mapping
        All other arguments and keyword arguments are used to construct
        the executor instance.

    Returns
    -------
    dsub : DSubmitter
        An instance of the DSubmitter context manager.
    """
    from distributed import Executor
    e = Executor(*a, **kw)
    dsub = DSubmitter(e, args=args, kwargs=kwargs, rtn=rtn)
    return dsub
Пример #33
0
def test_nanny_worker_ports(loop):
    try:
        worker = Popen([
            'dworker', '127.0.0.1:8989', '--host', '127.0.0.1',
            '--worker-port', '8788', '--nanny-port', '8789'
        ],
                       stdout=PIPE,
                       stderr=PIPE)
        sched = Popen(['dscheduler', '--port', '8989'],
                      stdout=PIPE,
                      stderr=PIPE)
        with Executor('127.0.0.1:8989', loop=loop) as e:
            start = time()
            while True:
                d = sync(e.loop, e.scheduler.identity)
                if d['workers']:
                    break
                else:
                    assert time() - start < 5
                    sleep(0.1)
            assert d['workers']['127.0.0.1:8788']['services']['nanny'] == 8789
    finally:
        with ignoring(Exception):
            w = rpc('127.0.0.1:8789')
            sync(loop, w.terminate)

        with ignoring(Exception):
            os.kill(sched.pid, signal.SIGINT)

        with ignoring(Exception):
            worker.kill()
Пример #34
0
def client_context(dask_client=None, dask_scheduler=None):
    '''client_context creates a dask distributed or threadpool client or None

    Parameters:
        dask_client:     str from choices ("DISTRIBUTED", 'THREAD_POOL', 'SERIAL')
                         or None to take DASK_CLIENT from environment
        dask_scheduler:  Distributed scheduler url or None to take
                         DASK_SCHEDULER from environment
    '''
    env = parse_env_vars()
    dask_client = dask_client or env.get('DASK_CLIENT', 'SERIAL')
    dask_scheduler = dask_scheduler or env.get('DASK_SCHEDULER')
    if dask_client == 'DISTRIBUTED':
        if Executor is None:
            raise ValueError('distributed is not installed - "conda install distributed"')
        client = Executor(dask_scheduler)
    elif dask_client == 'THREAD_POOL':
        client = ThreadPool(env.get('DASK_THREADS'))
    elif dask_client == 'SERIAL':
        client = None
    else:
        raise ValueError('Did not expect DASK_CLIENT to be {}'.format(dask_client))
    get_func = _find_get_func_for_client(client)
    with da.set_options(pool=dask_client):
       yield client
Пример #35
0
def test_dataframe_groupby_tasks(loop):
    df = pd.util.testing.makeTimeDataFrame()
    df['A'] = df.A // 0.1
    df['B'] = df.B // 0.1
    ddf = dd.from_pandas(df, npartitions=10)
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), loop=loop) as e:
            with dask.set_options(get=e.get):
                for ind in [lambda x: 'A', lambda x: x.A]:
                    a = df.groupby(ind(df)).apply(len)
                    b = ddf.groupby(ind(ddf)).apply(len)
                    assert_equal(a, b.compute(get=dask.get).sort_index())
                    assert not any('partd' in k[0] for k in b.dask)

                    a = df.groupby(ind(df)).B.apply(len)
                    b = ddf.groupby(ind(ddf)).B.apply(len)
                    assert_equal(a, b.compute(get=dask.get).sort_index())
                    assert not any('partd' in k[0] for k in b.dask)

                with pytest.raises(NotImplementedError):
                    ddf.groupby(ddf[['A', 'B']]).apply(len)

                a = df.groupby(['A', 'B']).apply(len)
                b = ddf.groupby(['A', 'B']).apply(len)

                assert_equal(a, b.compute(get=dask.get).sort_index())
Пример #36
0
def test_read_csv(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/1.csv', 'w') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        with hdfs.open('/tmp/test/2.csv', 'w') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = yield _read_csv('/tmp/test/*.csv',
                             header=True,
                             lineterminator='\n')
        result, = e.compute(df.id.sum(), sync=False)
        result = yield result._result()
        assert result == 1 + 2 + 3 + 4
Пример #37
0
def test_gather_after_failed_worker(loop):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            L = e.map(inc, range(10))
            wait(L)
            a['proc'].terminate()
            result = e.gather(L)
            assert result == list(map(inc, range(10)))
Пример #38
0
def test_submit_after_failed_worker(loop):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            L = e.map(inc, range(10))
            wait(L)
            a['proc'].terminate()
            total = e.submit(sum, L)
            assert total.result() == sum(map(inc, range(10)))
Пример #39
0
    def f(c, a, b):
        e = Executor((c.ip, c.port), start=False, loop=loop)
        yield e._start()

        arrays = e.map(np.ones, [(5, 5)] * 6)
        y = yield _stack(arrays, axis=0)
        assert y.shape == (6, 5, 5)
        assert y.chunks == ((1, 1, 1, 1, 1, 1), (5,), (5,))

        y_results = yield e._get(y.dask, y._keys())
        yy = da.Array._finalize(y, y_results)

        assert isinstance(yy, np.ndarray)
        assert yy.shape == y.shape
        assert (yy == 1).all()

        yield e._shutdown()
Пример #40
0
def test_read_text_bucket_key_inputs(loop):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            a = read_text(test_bucket_name, '/text/accounts', lazy=True)
            b = read_text(test_bucket_name, 'text/accounts', lazy=True)
            c = read_text(test_bucket_name + '/text/accounts', lazy=True)

            assert a._keys() == b._keys() == c._keys()
Пример #41
0
def test_stress_gc(loop, func, n):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            x = e.submit(func, 1)
            for i in range(n):
                x = e.submit(func, x)

            assert x.result() == n + 2
    def test_framework_runs(self):
        with MesosCluster() as cluster:
            time.sleep(2)
            driver = DistributedDriver().create_driver(DistributedScheduler)
            driver.start()
            time.sleep(5)

            expect(cluster).to(have_activated_slaves(1))
            expect(cluster).to(have_framework_name('distributed-framework'))

            # distributed test - this probably doesnt belong here
            executor = Executor('127.0.0.1:8787')
            A = executor.map(lambda x: x**2, range(10))
            B = executor.map(lambda x: -x, A)
            total = executor.submit(sum, B)
            expect(total.result()).to(equal(-285))
            driver.stop()
Пример #43
0
def test_progress_function(loop, capsys):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            f = e.submit(lambda: 1)
            g = e.submit(lambda: 2)

            progress([[f], [[g]]], notebook=False)
            check_bar_completed(capsys)
Пример #44
0
def test__read_text_unicode(s, a, b):
    fn = '/tmp/test/data.txt'
    data = b'abcd\xc3\xa9'
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open(fn, 'wb') as f:
            f.write(b'\n'.join([data, data]))

        f = yield _read_text(fn, collection=False, lazy=False)
        result = yield f[0]._result()
        assert len(result) == 2
        assert list(map(unicode.strip, result)) == [data.decode('utf-8')] * 2
        assert len(result[0]) == 5

        yield e._shutdown()
Пример #45
0
def test__read_text_unicode(s, a, b):
    fn = '/tmp/test/data.txt'
    data = b'abcd\xc3\xa9'
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open(fn, 'wb') as f:
            f.write(b'\n'.join([data, data]))

        f = yield _read_text(fn, collection=False, lazy=False)
        result = yield f[0]._result()
        assert len(result) == 2
        assert list(map(unicode.strip, result)) == [data.decode('utf-8')] * 2
        assert len(result[0]) == 5

        yield e._shutdown()
Пример #46
0
def test__futures_to_collection(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    remote_dfs = e.map(identity, dfs)
    ddf = yield _futures_to_collection(remote_dfs, divisions=True)
    ddf2 = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    assert isinstance(ddf, dd.DataFrame)

    assert ddf.dask == ddf2.dask

    remote_arrays = e.map(np.arange, range(3, 5))
    x = yield _futures_to_collection(remote_arrays)
    y = yield _futures_to_dask_array(remote_arrays)

    assert type(x) == type(y)
    assert x.dask == y.dask

    remote_lists = yield e._scatter([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    b = yield _futures_to_collection(remote_lists)
    c = yield _futures_to_dask_bag(remote_lists)

    assert type(b) == type(c)
    assert b.dask == b.dask

    yield e._shutdown()
Пример #47
0
def test_Executor_with_local(loop):
    with LocalCluster(1,
                      scheduler_port=0,
                      silence_logs=False,
                      diagnostic_port=None,
                      loop=loop) as c:
        with Executor(c, loop=loop) as e:
            assert len(e.ncores()) == len(c.workers)
            assert c.scheduler_address in repr(e)
Пример #48
0
def test_futures_to_dask_dataframe(loop):
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), loop=loop) as e:
            remote_dfs = e.map(lambda x: x, dfs)
            ddf = futures_to_dask_dataframe(remote_dfs, divisions=True)

            assert isinstance(ddf, dd.DataFrame)
            assert ddf.x.sum().compute(get=e.get) == sum(
                [df.x.sum() for df in dfs])
Пример #49
0
def test_read_text_sync(loop):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/data.txt', 'wb') as f:
            f.write(b'hello\nworld')

        with cluster(nworkers=3) as (s, [a, b, c]):
            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                b = read_text('/tmp/test/*.txt', lazy=False)
                assert list(b.str.upper()) == ['HELLO', 'WORLD']
Пример #50
0
def test__stack(s, a, b):
    import dask.array as da
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    arrays = e.map(np.ones, [(5, 5)] * 6)
    y = yield _stack(arrays, axis=0)
    assert y.shape == (6, 5, 5)
    assert y.chunks == ((1, 1, 1, 1, 1, 1), (5,), (5,))

    y_result = e.compute(y)
    yy = yield y_result._result()

    assert isinstance(yy, np.ndarray)
    assert yy.shape == y.shape
    assert (yy == 1).all()

    yield e._shutdown()
Пример #51
0
def test_restart_sync_no_center(loop):
    with cluster(nanny=True) as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            x = e.submit(inc, 1)
            e.restart()
            assert x.cancelled()
            y = e.submit(inc, 2)
            assert y.result() == 3
            assert len(e.ncores()) == 2
Пример #52
0
    def f(c, a, b):
        e = Executor((c.ip, c.port), start=False, loop=loop)
        yield e._start()

        remote_arrays = [[[e.submit(np.full, (2, 3, 4), i + j + k)
                            for i in range(2)]
                            for j in range(2)]
                            for k in range(4)]

        x = yield _futures_to_dask_array(remote_arrays, executor=e)
        assert x.chunks == ((2, 2, 2, 2), (3, 3), (4, 4))
        assert x.dtype == np.full((), 0).dtype

        assert isinstance(x, da.Array)
        expr = x.sum()
        result = yield e._get(expr.dask, expr._keys())
        assert isinstance(result[0], np.number)

        yield e._shutdown()
Пример #53
0
def test_futures_to_dask_arrays(loop):
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), loop=loop) as e:
            futures = e.map(np.ones, [(5, i) for i in range(1, 6)])
            x = future_to_dask_array(futures[0])
            assert x.shape == (5, 1)
            assert (x.compute(get=e.get) == 1).all()

            xs = futures_to_dask_arrays(futures)
            assert [x.shape for x in xs] == [(5, i) for i in range(1, 6)]
Пример #54
0
def test_write_bytes(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        data = [b'123', b'456', b'789']
        remote_data = yield e._scatter(data)

        futures = write_bytes('/tmp/test/data/file.*.dat', remote_data, hdfs=hdfs)
        yield _wait(futures)

        assert len(hdfs.ls('/tmp/test/data/')) == 3
        with hdfs.open('/tmp/test/data/file.1.dat') as f:
            assert f.read() == b'456'


        futures = write_bytes('/tmp/test/data2/', remote_data, hdfs=hdfs)
        yield _wait(futures)

        assert len(hdfs.ls('/tmp/test/data2/')) == 3
Пример #55
0
def test_read_bytes(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a' * int(1e8)
        fn = '/tmp/test/file'

        with hdfs.open(fn, 'w', repl=1) as f:
            f.write(data)

        blocks = hdfs.get_block_locations(fn)
        assert len(blocks) > 1

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes(fn, hdfs=hdfs)
        assert len(futures) == len(blocks)
        assert futures[0].executor is e
        results = yield e._gather(futures)
        assert b''.join(results) == data
        assert s.restrictions
        assert {f.key for f in futures}.issubset(s.loose_restrictions)
Пример #56
0
def test_get_block_locations_nested(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'w', repl=1) as f:
                    f.write(data)

        L =  get_block_locations(hdfs, '/tmp/test/')
        assert len(L) == 6

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes('/tmp/test/', hdfs=hdfs)
        results = yield e._gather(futures)
        assert len(results) == 6
        assert all(x == b'a' for x in results)
Пример #57
0
def test_read_csv_lazy(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        with hdfs.open('/tmp/test/2.csv', 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = yield _read_csv('/tmp/test/*.csv', lazy=True,
                             lineterminator='\n')
        assert df._known_dtype
        yield gen.sleep(0.5)
        assert not s.tasks

        result = yield e.compute(df.id.sum(), sync=False)._result()
        assert result == 1 + 2 + 3 + 4

        yield e._shutdown()