# Assumed module-level imports for this excerpt; the original suite's exact
# import paths may differ:
import pytest
from toolz import concat, valmap

from dask import compute, delayed
from dask.bytes import core
from dask.bytes.s3 import read_bytes  # the S3-specific read_bytes under test


@pytest.mark.parametrize('blocksize', [5, 15, 45, 1500])  # assumed example values
def test_read_bytes_delimited(s3, blocksize):
    _, values = read_bytes(test_bucket_name + '/test/accounts*',
                           blocksize=blocksize, delimiter=b'\n', s3=s3)
    _, values2 = read_bytes(test_bucket_name + '/test/accounts*',
                            blocksize=blocksize, delimiter=b'foo', s3=s3)
    assert ([a.key for a in concat(values)] !=
            [b.key for b in concat(values2)])

    results = compute(*concat(values))
    res = [r for r in results if r]
    # Every block should end at the delimiter.
    assert all(r.endswith(b'\n') for r in res)
    ourlines = b''.join(res).split(b'\n')
    testlines = b''.join(files[k] for k in sorted(files)).split(b'\n')
    assert ourlines == testlines

    # Delimiter not at the end of the data.
    d = b'}'
    _, values = read_bytes(test_bucket_name + '/test/accounts*',
                           blocksize=blocksize, delimiter=d, s3=s3)
    results = compute(*concat(values))
    res = [r for r in results if r]
    # All blocks should end in b'}' except the trailing piece of each file.
    assert sum(r.endswith(b'}') for r in res) == len(res) - 2
    ours = b''.join(res)
    test = b''.join(files[v] for v in sorted(files))
    assert ours == test

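# The tests in this excerpt reference `test_bucket_name` and `files`, which are
# defined elsewhere in the module. A minimal sketch of the shape they assume;
# the bucket name and record contents here are illustrative, not the suite's
# actual data. Two newline-terminated JSON files are consistent with the
# `len(res) - 2` expectation above.
test_bucket_name = 'test-bucket'  # hypothetical bucket name
files = {'test/accounts.1.json': (b'{"amount": 100, "name": "Alice"}\n'
                                  b'{"amount": 200, "name": "Bob"}\n'),
         'test/accounts.2.json': (b'{"amount": 300, "name": "Charlie"}\n'
                                  b'{"amount": 400, "name": "Dennis"}\n')}
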
def test_read_bytes_blocksize_on_large_data():
    # Exercises real, public data (the dask-data bucket) rather than the test
    # fixture. blocksize=None means one block per file.
    _, L = read_bytes('dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv',
                      blocksize=None)
    assert len(L) == 1

    _, L = read_bytes('dask-data/nyc-taxi/2014/*.csv', blocksize=None)
    assert len(L) == 12

def test_read_bytes_sample_delimiter(s3):
    sample, values = read_bytes(test_bucket_name + '/test/accounts.*',
                                s3=s3, sample=80, delimiter=b'\n')
    assert sample.endswith(b'\n')
    sample, values = read_bytes(test_bucket_name + '/test/accounts.1.json',
                                s3=s3, sample=80, delimiter=b'\n')
    assert sample.endswith(b'\n')
    sample, values = read_bytes(test_bucket_name + '/test/accounts.1.json',
                                s3=s3, sample=2, delimiter=b'\n')
    assert sample.endswith(b'\n')

def double(x):
    # Helper assumed by the test below (defined elsewhere in the real suite):
    # doubling the file contents changes the data, which should change the
    # resulting task keys.
    return x * 2


def test_modification_time_read_bytes():
    with s3_context('compress', files) as s3:
        _, a = read_bytes('compress/test/accounts.*', s3=s3)
        _, b = read_bytes('compress/test/accounts.*', s3=s3)

    # Reading the same objects twice should give identical task keys.
    assert [aa._key for aa in concat(a)] == [bb._key for bb in concat(b)]

    with s3_context('compress', valmap(double, files)) as s3:
        _, c = read_bytes('compress/test/accounts.*', s3=s3)

    # Re-uploaded (modified) objects should give different task keys.
    assert [aa._key for aa in concat(a)] != [cc._key for cc in concat(c)]

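# `s3_context` above is assumed to be a helper from the real suite: a context
# manager that fills a bucket with a {key: bytes} mapping and yields an S3
# filesystem handle. A minimal sketch under that assumption; the constructor
# `make_s3_filesystem` is hypothetical:
from contextlib import contextmanager


@contextmanager
def s3_context_sketch(bucket, files):
    s3 = make_s3_filesystem()  # hypothetical; e.g. an s3fs.S3FileSystem
    s3.mkdir(bucket)
    for key, data in files.items():
        with s3.open(bucket + '/' + key, 'wb') as f:
            f.write(data)
    try:
        yield s3
    finally:
        for key in files:
            s3.rm(bucket + '/' + key)
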
def test_registered(s3):
    # The generic entry point in dask.bytes.core should dispatch to the S3
    # backend when given an 's3://' URL.
    from dask.bytes.core import read_bytes

    sample, values = read_bytes('s3://%s/test/accounts.*.json' % test_bucket_name,
                                s3=s3)

    results = compute(*concat(values))
    assert set(results) == set(files.values())

def test_write_bytes(s3):
    paths = ['s3://' + test_bucket_name + '/more/' + f for f in files]
    values = [delayed(v) for v in files.values()]
    out = core.write_bytes(values, paths, s3=s3)
    compute(*out)

    # Round-trip: read back what was just written.
    sample, values = read_bytes(test_bucket_name + '/more/test/accounts.*',
                                s3=s3)
    results = compute(*concat(values))
    assert set(files.values()) == set(results)

# Assumed parametrization; the original suite's values may differ. With
# non-seekable compression formats, whole-file reads (blocksize=None) are
# the typical case.
@pytest.mark.parametrize('fmt', ['gzip', 'bz2', 'zlib'])
@pytest.mark.parametrize('blocksize', [None])
def test_compression(s3, fmt, blocksize):
    with s3_context('compress', valmap(compress[fmt], files)) as s3:
        sample, values = read_bytes('compress/test/accounts.*', s3=s3,
                                    compression=fmt, blocksize=blocksize)
        # The sample is decompressed before being returned.
        assert sample.startswith(files[sorted(files)[0]][:10])

        results = compute(*concat(values))
        assert (b''.join(results) ==
                b''.join([files[k] for k in sorted(files)]))

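# `compress` above is assumed to map a compression-format name to a callable
# that compresses bytes. A minimal sketch using only standard-library codecs;
# the real suite's mapping may cover more formats:
import bz2
import gzip
import zlib

compress_sketch = {'gzip': gzip.compress,
                   'bz2': bz2.compress,
                   'zlib': zlib.compress}
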
@pytest.mark.parametrize('blocksize', [5, 15, 45, 1500])  # assumed example values
def test_read_bytes_block(s3, blocksize):
    _, vals = read_bytes(test_bucket_name + '/test/account*',
                         blocksize=blocksize, s3=s3)
    assert (list(map(len, vals)) ==
            [(len(v) // blocksize + 1) for v in files.values()])

    results = compute(*concat(vals))
    assert (sum(len(r) for r in results) ==
            sum(len(v) for v in files.values()))

    ourlines = b''.join(results).split(b'\n')
    testlines = b''.join(files.values()).split(b'\n')
    assert set(ourlines) == set(testlines)

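def test_block_count_arithmetic():
    # Standalone illustration (not from the original suite) of the block-count
    # expectation in test_read_bytes_block: a 100-byte object read with
    # blocksize=45 splits into 45 + 45 + 10 bytes, i.e. 100 // 45 + 1 == 3
    # blocks. Note the formula assumes file sizes are not exact multiples of
    # the blocksize.
    assert 100 // 45 + 1 == 3
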
def test_read_bytes(s3):
    sample, values = read_bytes(test_bucket_name + '/test/accounts.*', s3=s3)
    assert isinstance(sample, bytes)
    assert sample[:5] == files[sorted(files)[0]][:5]

    assert isinstance(values, (list, tuple))
    assert isinstance(values[0], (list, tuple))
    assert hasattr(values[0][0], 'dask')

    assert sum(map(len, values)) >= len(files)
    results = compute(*concat(values))
    assert set(results) == set(files.values())

def test_read_bytes_blocksize_none(s3):
    _, values = read_bytes(test_bucket_name + '/test/accounts.*',
                           blocksize=None, s3=s3)
    assert sum(map(len, values)) == len(files)

def test_read_bytes_non_existing_glob(s3):
    with pytest.raises(IOError):
        read_bytes(test_bucket_name + '/non-existing/*', s3=s3)

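# The `s3` fixture used throughout is defined elsewhere in the real module.
# A minimal sketch of one way to provide it, assuming the moto mocking
# library and the s3fs S3FileSystem wrapper; the real suite's fixture may
# differ:
import moto
import s3fs


@pytest.fixture
def s3_sketch():
    with moto.mock_s3():
        fs = s3fs.S3FileSystem()
        fs.mkdir(test_bucket_name)
        for key, data in files.items():
            with fs.open(test_bucket_name + '/' + key, 'wb') as f:
                f.write(data)
        yield fs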