Example #1
def test_deterministic_key_names(e, s, a, b):
    with make_hdfs() as hdfs:
        data = b'abc\n' * int(1e3)
        fn = '/tmp/test/file'

        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

        x = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True, delimiter=b'\n')
        y = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True, delimiter=b'\n')
        z = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True, delimiter=b'c')

        assert [f.key for f in x] == [f.key for f in y]
        assert [f.key for f in x] != [f.key for f in z]
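The determinism checked above comes from content-based task naming: keys are derived from the call's arguments, so the delimiter participates in the key. A minimal standalone sketch of the same property, using plain dask.delayed with pure=True and dask.base.tokenize on an illustrative local path (the load helper is not part of the original test):

# Sketch only: `load` and the path are illustrative placeholders.
import dask
from dask.base import tokenize

def load(path, delimiter):
    with open(path, 'rb') as f:
        return f.read()

x = dask.delayed(load, pure=True)('/tmp/test/file', b'\n')
y = dask.delayed(load, pure=True)('/tmp/test/file', b'\n')
z = dask.delayed(load, pure=True)('/tmp/test/file', b'c')

assert x.key == y.key                     # same arguments -> same key
assert x.key != z.key                     # different delimiter -> different key
assert tokenize(b'\n') != tokenize(b'c')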
Example #2
def test_lazy_values(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'w', repl=1) as f:
                    f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        values = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True)
        assert all(isinstance(v, Value) for v in values)

        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.dask

        results = e.compute(*values, sync=False)
        results = yield e._gather(results)
        assert len(results) == 6
        assert all(x == b'a' for x in results)
Example #3
def test_read_bytes(e, s, a, b):
    with make_hdfs() as hdfs:
        data = b'a' * int(1e8)
        fn = '/tmp/test/file'

        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

        blocks = hdfs.get_block_locations(fn)
        assert len(blocks) > 1

        sample, values = read_bytes(fn, hdfs=hdfs)
        assert sample[:5] == b'aaaaa'
        assert len(values) == len(blocks)

        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.tasks

        assert {v.key for v in values} == set(s.restrictions)
        assert {v.key for v in values} == set(s.loose_restrictions)

        futures = e.compute(values)
        results = yield e._gather(futures)
        assert b''.join(results) == data
        assert s.restrictions
Example #4
def dont_test_dataframes(s, a):  # slow
    pytest.importorskip('pandas')
    n = 3000000
    fn = '/tmp/test/file.csv'
    with make_hdfs() as hdfs:
        data = (b'name,amount,id\r\n' + b'Alice,100,1\r\nBob,200,2\r\n' * n)
        with hdfs.open(fn, 'wb') as f:
            f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes(fn, hdfs=hdfs, delimiter=b'\r\n', lazy=False)
        assert len(futures) > 1

        def load(b, **kwargs):
            assert b
            from io import BytesIO
            import pandas as pd
            bio = BytesIO(b)
            return pd.read_csv(bio, **kwargs)

        dfs = e.map(load, futures, names=['name', 'amount', 'id'], skiprows=1)
        dfs2 = yield e._gather(dfs)
        assert sum(map(len, dfs2)) == n * 2 - 1

        yield e._shutdown()
Example #5
def dont_test_dataframes(s, a):  # slow
    pytest.importorskip('pandas')
    n = 3000000
    fn = '/tmp/test/file.csv'
    with make_hdfs() as hdfs:
        data = (b'name,amount,id\r\n' +
                b'Alice,100,1\r\nBob,200,2\r\n' * n)
        with hdfs.open(fn, 'w') as f:
            f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes(fn, hdfs=hdfs, delimiter=b'\r\n')
        assert len(futures) > 1

        def load(b, **kwargs):
            assert b
            from io import BytesIO
            import pandas as pd
            bio = BytesIO(b)
            return pd.read_csv(bio, **kwargs)

        dfs = e.map(load, futures, names=['name', 'amount', 'id'], skiprows=1)
        dfs2 = yield e._gather(dfs)
        assert sum(map(len, dfs2)) == n * 2 - 1
Example #6
def test_lazy_values(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        values = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True)
        assert all(isinstance(v, Value) for v in values)

        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.tasks

        results = e.compute(values, sync=False)
        results = yield e._gather(results)
        assert len(results) == 6
        assert all(x == b'a' for x in results)

        yield e._shutdown()
Example #7
def test_read_bytes_sync(loop):
    with cluster(nworkers=3) as (s, [a, b, c]):
        with make_hdfs() as hdfs:
            data = b'a' * int(1e3)

            for fn in ['/tmp/test/file.%d' % i for i in range(100)]:
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                futures = read_bytes('/tmp/test/file.*', lazy=False)
                results = e.gather(futures)
                assert b''.join(results) == 100 * data
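For context, Executor is the older name for what distributed now calls Client; the synchronous compute-and-gather pattern above maps directly onto that API. A hedged sketch against an in-process local cluster, with an illustrative load_part stand-in for the real read tasks:

# Sketch only: `load_part` stands in for the real read tasks.
import dask
from distributed import Client

@dask.delayed
def load_part(i):
    return b'a' * 10

values = [load_part(i) for i in range(3)]

with Client(processes=False) as client:   # in-process cluster, no setup needed
    futures = client.compute(values)      # one Future per delayed value
    results = client.gather(futures)      # blocks until all results arrive
    assert b''.join(results) == b'a' * 30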
Example #8
def test_read_bytes_sync(loop):
    with make_hdfs() as hdfs:
        data = b'a' * int(1e3)

        for fn in ['/tmp/test/file.%d' % i for i in range(100)]:
            with hdfs.open(fn, 'w', repl=1) as f:
                f.write(data)

        with cluster(nworkers=3) as (s, [a, b, c]):
            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                futures = read_bytes('/tmp/test/file.*')
                results = e.gather(futures)
                assert b''.join(results) == 100 * data
Example #9
def test_write_bytes(e, s, a, b):
    from dask.bytes.core import write_bytes, read_bytes
    with make_hdfs() as hdfs:
        path = 'hdfs:///tmp/test/'
        data = [b'test data %i' % i for i in range(5)]
        values = [delayed(d) for d in data]
        out = write_bytes(values, path, hdfs=hdfs)
        futures = e.compute(out)
        results = yield e._gather(futures)
        assert len(hdfs.ls('/tmp/test/')) == 5

        sample, vals = read_bytes('hdfs:///tmp/test/*.part',
                                  hdfs=hdfs, lazy=True)
        futures = e.compute(vals)
        results = yield e._gather(futures)
        assert data == results
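The same write-then-read round trip can be sketched against the local filesystem, assuming a dask version that exposes dask.bytes.read_bytes; the delayed write_part helper and the part-file naming are illustrative and only approximate what write_bytes does:

# Local round-trip sketch (no HDFS); helper, paths and naming are illustrative.
import os
import dask
from dask import delayed
from dask.bytes import read_bytes

os.makedirs('/tmp/test', exist_ok=True)
data = [b'test data %d' % i for i in range(5)]

@delayed
def write_part(payload, path):
    with open(path, 'wb') as f:
        f.write(payload)
    return path

writes = [write_part(d, '/tmp/test/%d.part' % i) for i, d in enumerate(data)]
dask.compute(*writes)                        # materialize the part files

sample, blocks = read_bytes('/tmp/test/*.part')
parts = dask.compute(*[b for per_file in blocks for b in per_file])
assert sorted(parts) == sorted(data)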
Example #10
def test_read_bytes(e, s, a, b):
    with make_hdfs() as hdfs:
        data = b'a' * int(1e8)
        fn = '/tmp/test/file'

        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

        blocks = hdfs.get_block_locations(fn)
        assert len(blocks) > 1

        futures = read_bytes(fn, hdfs=hdfs, lazy=False)
        assert len(futures) == len(blocks)
        assert futures[0].executor is e
        results = yield e._gather(futures)
        assert b''.join(results) == data
        assert s.restrictions
        assert {f.key for f in futures}.issubset(s.loose_restrictions)
Example #11
def test_get_block_locations_nested(e, s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

        L = get_block_locations(hdfs, '/tmp/test/')
        assert len(L) == 6

        futures = read_bytes('/tmp/test/', hdfs=hdfs, lazy=False)
        results = yield e._gather(futures)
        assert len(results) == 6
        assert all(x == b'a' for x in results)
Example #12
def test_get_block_locations_nested(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'w', repl=1) as f:
                    f.write(data)

        L = get_block_locations(hdfs, '/tmp/test/')
        assert len(L) == 6

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes('/tmp/test/', hdfs=hdfs)
        results = yield e._gather(futures)
        assert len(results) == 6
        assert all(x == b'a' for x in results)
Example #13
def test_lazy_values(e, s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

        sample, values = read_bytes('/tmp/test/', hdfs=hdfs)
        assert all(isinstance(v, Delayed) for v in values)

        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.tasks

        results = e.compute(values, sync=False)
        results = yield e._gather(results)
        assert len(results) == 6
        assert all(x == b'a' for x in results)
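Finally, the delimiter= argument used throughout aligns block boundaries on a byte pattern, so concatenating the blocks reproduces the input exactly and no record is cut in half. A minimal local sketch, with an illustrative path and a deliberately small blocksize to force several blocks:

# Sketch only: path and blocksize are illustrative.
import os
import dask
from dask.bytes import read_bytes

os.makedirs('/tmp/test', exist_ok=True)
payload = b'a,1\nb,2\nc,3\n' * 1000
with open('/tmp/test/rows.csv', 'wb') as f:
    f.write(payload)

sample, blocks = read_bytes('/tmp/test/rows.csv', delimiter=b'\n', blocksize=2000)
parts = dask.compute(*blocks[0])             # one list of blocks per input file
assert len(parts) > 1                        # the small blocksize forced a split
assert b''.join(parts) == payload            # delimiter-aligned, nothing lost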