Exemplo n.º 1
0
def test_read_text(fmt, bs, encoding):
    """Read compressed text fixtures back through ``read_text`` three ways.

    Each value of ``files`` is encoded with ``encoding``, compressed with the
    ``fmt`` entry of ``utils.compress`` and handed to ``filetexts`` with
    ``mode="b"``.  The data is then read via a glob pattern, via the sorted
    list of file names, and as uncollected blocks (``collection=False``); in
    every case the joined lines must equal ``expected``.
    """
    # The skip checks stay in their original order so the reported reason
    # is unchanged when both would apply.
    if fmt == "zip" and sys.version_info.minor == 5:
        pytest.skip("zipfile is read-only on py35")
    if fmt not in utils.compress:
        pytest.skip("compress function not provided for %s" % fmt)

    compressor = utils.compress[fmt]
    compressed = {
        name: compressor(text.encode(encoding)) for name, text in files.items()
    }
    # Keyword arguments shared by every read below.
    read_opts = dict(compression=fmt, blocksize=bs, encoding=encoding)
    pattern = ".test.accounts.*.json"

    with filetexts(compressed, mode="b"):
        # First the glob pattern, then the explicit list of names.
        for source in (pattern, sorted(files)):
            (lines,) = compute(read_text(source, **read_opts))
            assert "".join(lines) == expected

        delayed_blocks = read_text(pattern, collection=False, **read_opts)
        computed = compute(*delayed_blocks)
        assert "".join("".join(chunk) for chunk in computed) == expected
Exemplo n.º 2
0
def test_read_text(fmt, bs, encoding):
    """Round-trip compressed fixtures through ``read_text``.

    The values of ``files`` are encoded with ``encoding`` and compressed with
    ``compression.compress[fmt]``.  They are read back from a glob pattern,
    from the sorted list of names, and as uncollected blocks; each result,
    once joined, must equal ``expected``.
    """
    pattern = ".test.accounts.*.json"
    compress_fn = compression.compress[fmt]

    def load(source, **extra):
        # All three reads share the same compression/blocksize/encoding.
        return read_text(source, compression=fmt, blocksize=bs,
                         encoding=encoding, **extra)

    payloads = {name: compress_fn(text.encode(encoding))
                for name, text in files.items()}
    with filetexts(payloads, mode="b"):
        (from_glob,) = compute(load(pattern))
        assert "".join(from_glob) == expected

        (from_list,) = compute(load(sorted(files)))
        assert "".join(from_list) == expected

        parts = compute(*load(pattern, collection=False))
        assert "".join(text for part in parts for text in part) == expected
Exemplo n.º 3
0
def test_files_per_partition():
    """Check how ``files_per_partition`` groups files into partitions.

    Twenty single-line files are written and read with
    ``files_per_partition=10``.  Taking up to 100 items from one partition
    must yield 10 items, and the whole bag must count 20.  The read is then
    repeated with ``include_path=True`` to check that one partition covers
    10 distinct paths and the full result covers 20.  Both reads are expected
    to emit a ``UserWarning``.
    """
    # Fill the placeholder so each file names itself; previously the raw
    # template "line from {:02}" was written to every file.
    files3 = {f"{n:02}.txt": f"line from {n:02}" for n in range(20)}
    with filetexts(files3):
        # single-threaded scheduler to ensure the warning happens in the
        # same thread as the pytest.warns
        with config.set({"scheduler": "single-threaded"}):
            with pytest.warns(UserWarning):
                b = read_text("*.txt", files_per_partition=10)
                taken = len(b.take(100, npartitions=1))

            assert taken == 10, "10 files should be grouped into one partition"

            assert b.count().compute() == 20, "All 20 lines should be read"

            with pytest.warns(UserWarning):
                b = read_text("*.txt",
                              files_per_partition=10,
                              include_path=True)
                p = b.take(100, npartitions=1)

            # Each element is indexed at position 1 for its path.
            p_paths = tuple(zip(*p))[1]
            p_unique_paths = set(p_paths)
            assert len(p_unique_paths) == 10

            b_paths = tuple(zip(*b.compute()))[1]
            b_unique_paths = set(b_paths)
            assert len(b_unique_paths) == 20
Exemplo n.º 4
0
def test_errors():
    """Exercise the ``errors`` argument of ``read_text`` on non-ASCII bytes.

    The fixture contains the byte ``0xe9``, which is not valid ASCII.
    Computing with ``encoding="ascii"`` must raise ``UnicodeDecodeError``;
    with ``errors="ignore"`` the result must be ``["Jos\\n", "Alice"]``.
    """
    fixture = {".test.foo": b"Jos\xe9\nAlice"}
    with filetexts(fixture, mode="b"):
        with pytest.raises(UnicodeDecodeError):
            read_text(".test.foo", encoding="ascii").compute()

        lenient = read_text(".test.foo", encoding="ascii", errors="ignore")
        lines = lenient.compute(scheduler="sync")
        assert lines == ["Jos\n", "Alice"]
Exemplo n.º 5
0
def test_errors():
    """Verify strict and lenient ASCII decoding in ``read_text``.

    A file containing the non-ASCII byte ``0xe9`` must fail to compute with
    ``UnicodeDecodeError`` under ``encoding='ascii'``, and must compute to
    ``['Jos\\n', 'Alice']`` once ``errors='ignore'`` is passed.
    """
    path = '.test.foo'
    with filetexts({path: b'Jos\xe9\nAlice'}, mode='b'):
        with pytest.raises(UnicodeDecodeError):
            read_text(path, encoding='ascii').compute()

        tolerant = read_text(path, encoding='ascii', errors='ignore')
        assert tolerant.compute(scheduler='sync') == ['Jos\n', 'Alice']
Exemplo n.º 6
0
def test_errors():
    """Check ``read_text`` decoding errors and the ``errors='ignore'`` escape.

    The fixture holds the byte ``0xe9``, which ASCII cannot decode, so a
    plain compute must raise ``UnicodeDecodeError``.  With
    ``errors='ignore'`` the computed lines must be ``['Jos\\n', 'Alice']``.
    """
    name = '.test.foo'
    raw = b'Jos\xe9\nAlice'
    with filetexts({name: raw}, mode='b'):
        with pytest.raises(UnicodeDecodeError):
            read_text(name, encoding='ascii').compute()

        ignoring = read_text(name, encoding='ascii', errors='ignore')
        # ``get`` comes from the enclosing module and is passed through as-is.
        decoded = ignoring.compute(get=get)
        assert decoded == ['Jos\n', 'Alice']
Exemplo n.º 7
0
def test_errors():
    """Check ``read_text`` decoding errors and the ``errors='ignore'`` escape.

    The fixture is written as raw bytes containing ``0xe9`` so that the
    on-disk content does not depend on the platform's default text encoding.
    Computing with ``encoding='ascii'`` must raise ``UnicodeDecodeError``;
    with ``errors='ignore'`` the result must be ``['Jos\\n', 'Alice']``.
    """
    # Bytes + mode='b': a str fixture in text mode would be encoded with the
    # locale's default encoding, making the bytes on disk platform-dependent.
    with filetexts({'.test.foo': b'Jos\xe9\nAlice'}, mode='b'):
        with pytest.raises(UnicodeDecodeError):
            read_text('.test.foo', encoding='ascii').compute()

        result = read_text('.test.foo', encoding='ascii', errors='ignore')
        result = result.compute(get=get)
        assert result == ['Jos\n', 'Alice']
Exemplo n.º 8
0
def test_read_text(fmt, bs, encoding):
    """Read compressed fixtures with ``read_text`` as a collection and as blocks.

    Values of ``files`` are encoded with ``encoding`` and compressed via
    ``compression.compress[fmt]``.  Reading the glob pattern must reproduce
    ``expected`` both when computed as one collection and when computed from
    the pieces returned with ``collection=False``.
    """
    glob_pattern = '.test.accounts.*.json'
    common = {'compression': fmt, 'blocksize': bs, 'encoding': encoding}
    squeeze = compression.compress[fmt]

    packed = {}
    for name, text in files.items():
        packed[name] = squeeze(text.encode(encoding))

    with filetexts(packed, mode='b'):
        (lines,) = compute(read_text(glob_pattern, **common))
        assert ''.join(lines) == expected

        pieces = compute(*read_text(glob_pattern, collection=False, **common))
        assert ''.join(''.join(piece) for piece in pieces) == expected
Exemplo n.º 9
0
def test_complex_delimiter():
    """Split a file on the multi-character delimiter ``$$$$``.

    The fixture text contains the delimiter twice, so three records are
    expected.  Both with the default block size and with ``blocksize=2`` the
    count must be 3, the last record must be ``"hello"`` and the first record
    must end with the delimiter.
    """
    path = ".test.delim.txt"
    delim = "$$$$"
    longstr = "abc\ndef\n123\n$$$$\ndog\ncat\nfish\n\n\r\n$$$$hello"
    # Default block size first, then a block size of 2.
    variants = ({}, {"blocksize": 2})

    with filetexts({path: longstr}):
        for extra in variants:
            bag = read_text(path, linedelimiter=delim, **extra)
            assert bag.count().compute() == 3

        for extra in variants:
            records = read_text(path, linedelimiter=delim, **extra).compute()
            assert records[-1] == "hello"
            assert records[0].endswith("$$$$")
Exemplo n.º 10
0
def test_files_per_partition():
    """Check that ``files_per_partition=10`` groups ten files per partition.

    Twenty single-line files are written; taking up to 100 items from one
    partition must yield 10 items and the whole bag must count 20.
    """
    # Format the content too: the value used to be the raw template
    # 'line from {:02}', so every file held the same unformatted text.
    files3 = {'{:02}.txt'.format(n): 'line from {:02}'.format(n)
              for n in range(20)}
    with filetexts(files3):
        b = read_text('*.txt', files_per_partition=10)

        taken = len(b.take(100, npartitions=1))
        assert taken == 10, "10 files should be grouped into one partition"

        assert b.count().compute() == 20, "All 20 lines should be read"
Exemplo n.º 11
0
def test_files_per_partition():
    """Verify partition grouping when reading with ``files_per_partition=10``.

    Twenty files with one line each are created.  A take of up to 100 items
    restricted to one partition must return 10 items, and counting the whole
    bag must give 20.
    """
    # The content template is now filled in with the file number; before, the
    # literal 'line from {:02}' was stored in every file.
    files3 = {'{:02}.txt'.format(n): 'line from {:02}'.format(n)
              for n in range(20)}
    with filetexts(files3):
        b = read_text('*.txt', files_per_partition=10)

        first_partition_size = len(b.take(100, npartitions=1))
        assert first_partition_size == 10, \
            "10 files should be grouped into one partition"

        assert b.count().compute() == 20, "All 20 lines should be read"
Exemplo n.º 12
0
def test_read_text_unicode_no_collection(tmp_path):
    """Read a two-line non-ASCII file with ``collection=False``.

    The file holds the same byte string twice, separated by a newline and
    without a trailing newline; each copy ends in the two-byte UTF-8 sequence
    ``\\xc3\\xa9``.  Computing the first returned block must give 2 items.
    """
    line = b"abcd\xc3\xa9"
    target = tmp_path / "data.txt"
    target.write_bytes(line + b"\n" + line)

    blocks = read_text(target, collection=False)

    assert len(blocks[0].compute()) == 2
Exemplo n.º 13
0
def test_read_text(fmt, bs, encoding, include_path):
    """Read compressed fixtures with ``read_text``, optionally with paths.

    Values of ``files`` are encoded with ``encoding`` and compressed with
    ``utils.compress[fmt]`` (the test is skipped when no compressor exists).
    The data is read from a glob pattern, from the sorted list of names
    (optionally with ``include_path``), and as uncollected blocks; the joined
    text must equal ``expected`` each time.  When ``include_path`` is set,
    the path plucked from each element must end with the file name expected
    at that position.
    """
    if fmt not in utils.compress:
        pytest.skip("compress function not provided for %s" % fmt)
    compress = utils.compress[fmt]
    files2 = {k: compress(v.encode(encoding)) for k, v in files.items()}
    # Single source of truth for the order in which files are read below.
    names = sorted(files)
    with filetexts(files2, mode="b"):
        b = read_text(".test.accounts.*.json",
                      compression=fmt,
                      blocksize=bs,
                      encoding=encoding)
        (L, ) = compute(b)
        assert "".join(L) == expected

        o = read_text(
            names,
            compression=fmt,
            blocksize=bs,
            encoding=encoding,
            include_path=include_path,
        )
        # With include_path each element is indexed: 0 for the line, 1 for
        # the path.
        b = o.pluck(0) if include_path else o
        (L, ) = compute(b)
        assert "".join(L) == expected
        if include_path:
            (paths, ) = compute(o.pluck(1))
            # Build the expectation in the same sorted order that was passed
            # to read_text; iterating files.items() only matched when the
            # dict happened to be in sorted order already.
            expected_paths = [
                name
                for name in names
                for _ in range(files[name].count("\n"))
            ]
            assert len(paths) == len(expected_paths)
            for path, expected_path in zip(paths, expected_paths):
                assert path.endswith(expected_path)

        blocks = read_text(
            ".test.accounts.*.json",
            compression=fmt,
            blocksize=bs,
            encoding=encoding,
            collection=False,
        )
        L = compute(*blocks)
        assert "".join(line for block in L for line in block) == expected
Exemplo n.º 14
0
def test_files_per_partition():
    """Check that ``files_per_partition=10`` groups ten files per partition.

    Twenty single-line files are written and read under the single-threaded
    scheduler; the read and take are expected to emit a ``UserWarning``.
    Taking up to 100 items from one partition must yield 10 items and the
    whole bag must count 20.
    """
    # Fill the content placeholder as well; previously every file contained
    # the unformatted template "line from {:02}".
    files3 = {"{:02}.txt".format(n): "line from {:02}".format(n)
              for n in range(20)}
    with filetexts(files3):
        # single-threaded scheduler to ensure the warning happens in the
        # same thread as the pytest.warns
        with dask.config.set({"scheduler": "single-threaded"}):
            with pytest.warns(UserWarning):
                b = read_text("*.txt", files_per_partition=10)
                taken = len(b.take(100, npartitions=1))

            assert taken == 10, "10 files should be grouped into one partition"

            assert b.count().compute() == 20, "All 20 lines should be read"