def test_modification_time_open_files(open_files):
    with filetexts(files, mode='b'):
        a = open_files('.test.accounts.*')
        b = open_files('.test.accounts.*')
        assert [aa._key for aa in a] == [bb._key for bb in b]

    sleep(1)
    double = lambda x: x + x
    with filetexts(valmap(double, files), mode='b'):
        c = open_files('.test.accounts.*')
        assert [aa._key for aa in a] != [cc._key for cc in c]
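# All of the tests in this section assume a module-level `files` mapping of
# filename -> content and dask's `filetexts` helper, which materializes such
# a mapping on disk for the duration of a `with` block. Neither is defined
# here; the sketch below is an assumption of what they might look like (the
# real fixture data lives elsewhere in the test modules), included only so
# the tests read self-contained.
import contextlib
import os

files = {
    '.test.accounts.1.json': (b'{"amount": 100, "name": "Alice"}\n'
                              b'{"amount": 200, "name": "Bob"}\n'),
    '.test.accounts.2.json': (b'{"amount": 300, "name": "Charlie"}\n'
                              b'{"amount": 400, "name": "Dennis"}\n'),
}


@contextlib.contextmanager
def filetexts(d, mode=''):
    """Write a mapping of filename -> content to disk, then clean up."""
    for fn, text in d.items():
        with open(fn, 'w' + mode) as f:
            f.write(text)
    try:
        yield list(d)
    finally:
        for fn in d:
            if os.path.exists(fn):
                os.remove(fn)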
def test_registered_open_files():
    from dask.bytes.core import open_files

    with filetexts(files, mode='b'):
        myfiles = open_files('.test.accounts.*')
        assert len(myfiles) == len(files)
        data = compute(*[file.read() for file in myfiles])
        assert list(data) == [files[k] for k in sorted(files)]
def test_compression_binary(fmt):
    from dask.bytes.core import open_files

    files2 = valmap(compression.compress[fmt], files)
    with filetexts(files2, mode='b'):
        myfiles = open_files('.test.accounts.*', compression=fmt)
        data = compute(*[file.read() for file in myfiles])
        assert list(data) == [files[k] for k in sorted(files)]
def test_bad_compression():
    from dask.bytes.core import read_bytes, open_files, open_text_files

    with filetexts(files, mode='b'):
        for func in [read_bytes, open_files, open_text_files]:
            with pytest.raises(ValueError):
                sample, values = func('.test.accounts.*',
                                      compression='not-found')
def test_read_bytes_delimited():
    with filetexts(files, mode='b'):
        for bs in [5, 15, 45, 1500]:
            _, values = read_bytes('.test.accounts*',
                                   blocksize=bs, delimiter=b'\n')
            _, values2 = read_bytes('.test.accounts*',
                                    blocksize=bs, delimiter=b'foo')
            assert ([a.key for a in concat(values)] !=
                    [b.key for b in concat(values2)])

            results = compute(*concat(values))
            res = [r for r in results if r]
            assert all(r.endswith(b'\n') for r in res)
            ourlines = b''.join(res).split(b'\n')
            testlines = b"".join(files[k] for k in sorted(files)).split(b'\n')
            assert ourlines == testlines

            # delimiter not at the end
            d = b'}'
            _, values = read_bytes('.test.accounts*',
                                   blocksize=bs, delimiter=d)
            results = compute(*concat(values))
            res = [r for r in results if r]
            # All should end in } except EOF
            assert sum(r.endswith(b'}') for r in res) == len(res) - 2
            ours = b"".join(res)
            test = b"".join(files[v] for v in sorted(files))
            assert ours == test
def test_read_bytes_blocksize_types(blocksize):
    with filetexts(files, mode='b'):
        sample, vals = read_bytes('.test.account*', blocksize=blocksize)
        results = compute(*concat(vals))
        ourlines = b"".join(results).split(b'\n')
        testlines = b"".join(files.values()).split(b'\n')
        assert set(ourlines) == set(testlines)
def test_registered_read_bytes():
    from dask.bytes.core import read_bytes

    with filetexts(files, mode='b'):
        sample, values = read_bytes('.test.accounts.*')
        results = compute(*concat(values))
        assert set(results) == set(files.values())
def test_skiprows(dd_read, pd_read, files):
    files = {name: comment_header + b'\n' + content
             for name, content in files.items()}
    skip = len(comment_header.splitlines())
    with filetexts(files, mode='b'):
        df = dd_read('2014-01-*.csv', skiprows=skip)
        expected_df = pd.concat([pd_read(n, skiprows=skip)
                                 for n in sorted(files)])
        assert_eq(df, expected_df, check_dtype=False)
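# `comment_header` is not defined in this section. A plausible stand-in that
# is consistent with the skiprows arithmetic above (an assumption, not the
# real fixture) is a few comment lines prepended to each data file:
comment_header = b"\n".join([b"# some header lines",
                             b"# that may be present",
                             b"# in a data file",
                             b"# before any data"])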
def test_categorical_dtypes():
    text1 = normalize_text("""
    fruit,count
    apple,10
    apple,25
    pear,100
    orange,15
    """)

    text2 = normalize_text("""
    fruit,count
    apple,200
    banana,300
    orange,400
    banana,10
    """)

    with filetexts({'foo.1.csv': text1, 'foo.2.csv': text2}):
        df = dd.read_csv('foo.*.csv', dtype={'fruit': 'category'},
                         blocksize=25)
        assert df.fruit.dtype == 'category'
        assert not has_known_categories(df.fruit)
        res = df.compute()
        assert res.fruit.dtype == 'category'
        assert (sorted(res.fruit.cat.categories) ==
                ['apple', 'banana', 'orange', 'pear'])
def test_header_None():
    with filetexts({'.tmp.1.csv': '1,2',
                    '.tmp.2.csv': '',
                    '.tmp.3.csv': '3,4'}):
        df = dd.read_csv('.tmp.*.csv', header=None)
        expected = pd.DataFrame({0: [1, 3], 1: [2, 4]})
        assert_eq(df.compute().reset_index(drop=True), expected)
def test_open_files():
    with filetexts(files, mode='b'):
        myfiles = open_files('.test.accounts.*')
        assert len(myfiles) == len(files)
        for lazy_file, data_file in zip(myfiles, sorted(files)):
            with lazy_file as f:
                x = f.read()
                assert x == files[data_file]
def test_errors():
    with filetexts({'.test.foo': b'Jos\xe9\nAlice'}, mode='b'):
        with pytest.raises(UnicodeDecodeError):
            read_text('.test.foo', encoding='ascii').compute()

        result = read_text('.test.foo', encoding='ascii', errors='ignore')
        result = result.compute(get=get)
        assert result == ['Jos\n', 'Alice']
def test_skiprows_as_list(dd_read, pd_read, files, units):
    files = {name: (comment_header + b'\n' +
                    content.replace(b'\n', b'\n' + units, 1))
             for name, content in files.items()}
    skip = [0, 1, 2, 3, 5]
    with filetexts(files, mode='b'):
        df = dd_read('2014-01-*.csv', skiprows=skip)
        expected_df = pd.concat([pd_read(n, skiprows=skip)
                                 for n in sorted(files)])
        assert_eq(df, expected_df, check_dtype=False)
def test_read_csv_include_path_column_as_str(dd_read, files):
    with filetexts(files, mode='b'):
        df = dd_read('2014-01-*.csv', include_path_column='filename',
                     converters={'filename': parse_filename})
        filenames = df.filename.compute().unique()
        assert '2014-01-01.csv' in filenames
        assert '2014-01-02.csv' not in filenames
        assert '2014-01-03.csv' in filenames
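# `parse_filename` above is the converter applied to the injected path
# column. It is not defined in this section; a plausible stand-in consistent
# with the assertions (which compare bare file names) is simply:
import os


def parse_filename(path):
    # Strip the directory part: '/tmp/2014-01-01.csv' -> '2014-01-01.csv'
    return os.path.split(path)[1]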
def test_read_text():
    with filetexts({'a1.log': 'A\nB', 'a2.log': 'C\nD'}) as fns:
        assert (set(line.strip() for line in db.read_text(fns)) ==
                set('ABCD'))
        assert (set(line.strip() for line in db.read_text('a*.log')) ==
                set('ABCD'))

    pytest.raises(ValueError, lambda: db.read_text('non-existent-*-path'))
def test_urlpath_expand_read():
    """Make sure * is expanded in file paths when reading."""
    # when reading, globs should be expanded to read files by mask
    with filetexts(csv_files, mode='b'):
        _, _, paths = get_fs_token_paths('.*.csv')
        assert len(paths) == 2

        _, _, paths = get_fs_token_paths(['.*.csv'])
        assert len(paths) == 2
def test_from_filenames():
    with filetexts({'a1.log': 'A\nB', 'a2.log': 'C\nD'}) as fns:
        assert set(line.strip() for line in db.from_filenames(fns)) == \
            set('ABCD')
        assert set(line.strip() for line in db.from_filenames('a*.log')) == \
            set('ABCD')

    assert raises(ValueError, lambda: db.from_filenames('non-existent-*-path'))
def test_read_csv_files():
    with filetexts(files, mode='b'):
        df = read_csv('2014-01-*.csv')
        eq(df, expected, check_dtype=False)

        fn = '2014-01-01.csv'
        df = read_csv(fn)
        expected2 = pd.read_csv(BytesIO(files[fn]))
        eq(df, expected2, check_dtype=False)
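# The CSV tests here refer to module-level `csv_files` (older versions call
# the same mapping `files`, or receive it via the `files` fixture) and an
# `expected` DataFrame derived from it. Neither is defined in this section;
# the sketch below is an assumption about their shape. Contents are
# illustrative; note that '2014-01-02.csv' holds only a header row, which
# test_read_csv_include_path_column_as_str relies on. The bytes-layer
# test_urlpath_expand_read above assumes a different, two-file `csv_files`
# mapping of dot-prefixed names from its own module.
from io import BytesIO

import pandas as pd

csv_files = {
    '2014-01-01.csv': (b'name,amount,id\n'
                       b'Alice,100,1\n'
                       b'Bob,200,2\n'
                       b'Charlie,300,3\n'),
    '2014-01-02.csv': b'name,amount,id\n',
    '2014-01-03.csv': (b'name,amount,id\n'
                       b'Dennis,400,4\n'
                       b'Edith,-500,5\n'
                       b'Frank,600,6\n'),
}
expected = pd.concat([pd.read_csv(BytesIO(csv_files[k]))
                      for k in sorted(csv_files)])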
def test_read_csv_files(dd_read, pd_read, files):
    with filetexts(files, mode='b'):
        df = dd_read('2014-01-*.csv')
        assert_eq(df, expected, check_dtype=False)

        fn = '2014-01-01.csv'
        df = dd_read(fn)
        expected2 = pd_read(BytesIO(files[fn]))
        assert_eq(df, expected2, check_dtype=False)
def test_read_csv_files_list(dd_read, pd_read, files):
    with filetexts(files, mode='b'):
        subset = sorted(files)[:2]  # Just first 2
        sol = pd.concat([pd_read(BytesIO(files[k])) for k in subset])
        res = dd_read(subset)
        assert_eq(res, sol, check_dtype=False)

        with pytest.raises(ValueError):
            dd_read([])
def test_robust_column_mismatch():
    files = csv_files.copy()
    k = sorted(files)[-1]
    files[k] = files[k].replace(b'name', b'Name')
    with filetexts(files, mode='b'):
        ddf = dd.read_csv('2014-01-*.csv')
        df = pd.read_csv('2014-01-01.csv')
        assert (df.columns == ddf.columns).all()
        # Computing the collection against itself verifies the graph still
        # runs despite the header mismatch in the last file
        assert_eq(ddf, ddf)
def test_registered_open_files():
    with filetexts(files, mode='b'):
        myfiles = open_files('.test.accounts.*')
        assert len(myfiles) == len(files)
        data = []
        for file in myfiles:
            with file as f:
                data.append(f.read())
        assert list(data) == [files[k] for k in sorted(files)]
def test_compression_text(fmt):
    files2 = valmap(compression.compress[fmt], files)
    with filetexts(files2, mode='b'):
        myfiles = open_text_files('.test.accounts.*', compression=fmt)
        data = []
        for file in myfiles:
            with file as f:
                data.append(f.read())
        assert list(data) == [files[k].decode() for k in sorted(files)]
def test_files_per_partition():
    files3 = {'{:02}.txt'.format(n): 'line from {:02}'.format(n)
              for n in range(20)}
    with filetexts(files3):
        b = read_text('*.txt', files_per_partition=10)
        l = len(b.take(100, npartitions=1))
        assert l == 10, "10 files should be grouped into one partition"

        assert b.count().compute() == 20, "All 20 lines should be read"
def test_read_bytes_blocksize_float():
    with filetexts(files, mode='b'):
        sample, vals = read_bytes('.test.account*', blocksize=5.0)
        results = compute(*concat(vals))
        ourlines = b"".join(results).split(b'\n')
        testlines = b"".join(files.values()).split(b'\n')
        assert set(ourlines) == set(testlines)

        with pytest.raises(TypeError):
            read_bytes('.test.account*', blocksize=5.5)
def test_with_paths():
    pathlib = pytest.importorskip('pathlib')
    with filetexts(files, mode='b'):
        url = pathlib.Path('./.test.accounts.*')
        sample, values = read_bytes(url, blocksize=None)
        assert sum(map(len, values)) == len(files)
    with pytest.raises(OSError):
        # relative path doesn't work
        url = pathlib.Path('file://.test.accounts.*')
        read_bytes(url, blocksize=None)
def test_read_csv_include_path_column_is_dtype_category(dd_read, files):
    with filetexts(files, mode='b'):
        df = dd_read('2014-01-*.csv', include_path_column=True)
        assert df.path.dtype == 'category'
        assert has_known_categories(df.path)

        dfs = dd_read('2014-01-*.csv', include_path_column=True,
                      collection=False)
        result = dfs[0].compute()
        assert result.path.dtype == 'category'
        assert has_known_categories(result.path)
def test_open_files_text_mode(encoding):
    with filetexts(files, mode='b'):
        myfiles = open_files('.test.accounts.*', mode='rt', encoding=encoding)
        assert len(myfiles) == len(files)
        data = []
        for file in myfiles:
            with file as f:
                data.append(f.read())
        assert list(data) == [files[k].decode(encoding)
                              for k in sorted(files)]
def test_compression(fmt, blocksize):
    compress = compression.compress[fmt]
    files2 = valmap(compress, files)
    with filetexts(files2, mode='b'):
        sample, values = read_bytes('.test.accounts.*.json',
                                    blocksize=blocksize, delimiter=b'\n',
                                    compression=fmt)
        assert sample[:5] == files[sorted(files)[0]][:5]

        results = compute(*concat(values))
        assert (b''.join(results) ==
                b''.join([files[k] for k in sorted(files)]))
def test_read_bytes_sample_delimiter():
    with filetexts(files, mode='b'):
        sample, values = read_bytes('.test.accounts.*',
                                    sample=80, delimiter=b'\n')
        assert sample.endswith(b'\n')
        sample, values = read_bytes('.test.accounts.1.json',
                                    sample=80, delimiter=b'\n')
        assert sample.endswith(b'\n')
        sample, values = read_bytes('.test.accounts.1.json',
                                    sample=2, delimiter=b'\n')
        assert sample.endswith(b'\n')
def test_read_bytes_block():
    with filetexts(files, mode="b"):
        for bs in [5, 15, 45, 1500]:
            sample, vals = read_bytes(".test.account*", blocksize=bs)
            assert list(map(len, vals)) == [
                max((len(v) // bs), 1) for v in files.values()
            ]

            results = compute(*concat(vals))
            assert sum(len(r) for r in results) == sum(
                len(v) for v in files.values()
            )

            ourlines = b"".join(results).split(b"\n")
            testlines = b"".join(files.values()).split(b"\n")
            assert set(ourlines) == set(testlines)
def test_compression(fmt, blocksize):
    compress = compression.compress[fmt]
    files2 = valmap(compress, files)
    with filetexts(files2, mode="b"):
        sample, values = read_bytes(
            ".test.accounts.*.json",
            blocksize=blocksize,
            delimiter=b"\n",
            compression=fmt,
        )
        assert sample[:5] == files[sorted(files)[0]][:5]
        assert sample.endswith(b"\n")

        results = compute(*concat(values))
        assert b"".join(results) == b"".join([files[k] for k in sorted(files)])
def test_warn_non_seekable_files(capsys):
    files2 = valmap(compress['gzip'], files)
    with filetexts(files2, mode='b'):
        df = read_csv('2014-01-*.csv', compression='gzip')
        assert df.npartitions == 3
        out, err = capsys.readouterr()
        assert 'gzip' in err
        assert 'blocksize=None' in err

        df = read_csv('2014-01-*.csv', compression='gzip', blocksize=None)
        out, err = capsys.readouterr()
        assert not err and not out

        with pytest.raises(NotImplementedError):
            df = read_csv('2014-01-*.csv', compression='foo')
def test_read_csv_compression(fmt, blocksize):
    if fmt not in compress:
        pytest.skip("compress function not provided for %s" % fmt)
    files2 = valmap(compress[fmt], csv_files)
    with filetexts(files2, mode="b"):
        if fmt and blocksize:
            with pytest.warns(UserWarning):
                df = dd.read_csv(
                    "2014-01-*.csv", compression=fmt, blocksize=blocksize
                )
        else:
            df = dd.read_csv("2014-01-*.csv", compression=fmt, blocksize=blocksize)
        assert_eq(
            df.compute(scheduler="sync").reset_index(drop=True),
            expected.reset_index(drop=True),
            check_dtype=False,
        )
def test_complex_delimiter():
    longstr = "abc\ndef\n123\n$$$$\ndog\ncat\nfish\n\n\r\n$$$$hello"
    with filetexts({".test.delim.txt": longstr}):
        assert read_text(".test.delim.txt", linedelimiter="$$$$").count().compute() == 3
        assert (
            read_text(".test.delim.txt", linedelimiter="$$$$", blocksize=2)
            .count()
            .compute()
            == 3
        )
        vals = read_text(".test.delim.txt", linedelimiter="$$$$").compute()
        assert vals[-1] == "hello"
        assert vals[0].endswith("$$$$")
        vals = read_text(".test.delim.txt", linedelimiter="$$$$", blocksize=2).compute()
        assert vals[-1] == "hello"
        assert vals[0].endswith("$$$$")
def test_open_files_compression(mode, fmt):
    if fmt == "zip" and sys.version_info.minor == 5:
        pytest.skip("zipfile is read-only on py35")
    if fmt not in compress:
        pytest.skip("compression function not provided")
    files2 = valmap(compress[fmt], files)
    with filetexts(files2, mode="b"):
        myfiles = open_files(".test.accounts.*", mode=mode, compression=fmt)
        data = []
        for file in myfiles:
            with file as f:
                data.append(f.read())
        sol = [files[k] for k in sorted(files)]
        if mode == "rt":
            sol = [b.decode() for b in sol]
        assert list(data) == sol
def test_auto_blocksize_csv(monkeypatch):
    psutil = pytest.importorskip('psutil')
    try:
        from unittest import mock
    except ImportError:
        mock = pytest.importorskip('mock')
    total_memory = psutil.virtual_memory().total
    cpu_count = psutil.cpu_count()
    mock_read_bytes = mock.Mock(wraps=read_bytes)
    monkeypatch.setattr(dask.dataframe.io.csv, 'read_bytes', mock_read_bytes)

    expected_block_size = auto_blocksize(total_memory, cpu_count)
    with filetexts(csv_files, mode='b'):
        dd.read_csv('2014-01-01.csv')
        assert mock_read_bytes.called
        assert mock_read_bytes.call_args[1]['blocksize'] == expected_block_size
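# `auto_blocksize` is dask.dataframe.io.csv's heuristic for picking a default
# blocksize from available memory and core count. For reference, a sketch of
# the logic as I understand it (an approximation, not the authoritative
# implementation): divide total memory across cores, take a tenth of each
# core's share, and cap the result at 64 MB.
def auto_blocksize_sketch(total_memory, cpu_count):
    memory_factor = 10
    blocksize = int(total_memory // cpu_count / memory_factor)
    return min(blocksize, int(64e6))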
def test_names():
    with filetexts(files, mode="b"):
        _, a = read_bytes(".test.accounts.*")
        _, b = read_bytes(".test.accounts.*")
        a = list(concat(a))
        b = list(concat(b))
        assert [aa._key for aa in a] == [bb._key for bb in b]

        sleep(1)
        for fn in files:
            with open(fn, "ab") as f:
                f.write(b"x")

        _, c = read_bytes(".test.accounts.*")
        c = list(concat(c))
        assert [aa._key for aa in a] != [cc._key for cc in c]
def test_head_partial_line_fix():
    files = {'.overflow1.csv': ('a,b\n'
                                '0,"abcdefghijklmnopqrstuvwxyz"\n'
                                '1,"abcdefghijklmnopqrstuvwxyz"'),
             '.overflow2.csv': ('a,b\n'
                                '111111,-11111\n'
                                '222222,-22222\n'
                                '333333,-33333\n')}
    with filetexts(files):
        # 64 byte file, 52 characters is mid-quote; this should not cause
        # an exception in the head-handling code
        dd.read_csv('.overflow1.csv', sample=52)

        # 35 characters cuts off before the second number on the last line.
        # Should sample to the end of the line, otherwise pandas will infer
        # `b` to be a float dtype
        df = dd.read_csv('.overflow2.csv', sample=35)
        assert (df.dtypes == 'i8').all()
def test_read_text(fmt, bs, encoding):
    compress = compression.compress[fmt]
    files2 = dict((k, compress(v.encode(encoding)))
                  for k, v in files.items())
    with filetexts(files2, mode='b'):
        b = read_text('.test.accounts.*.json', compression=fmt,
                      blocksize=bs, encoding=encoding)
        L, = compute(b)
        assert ''.join(L) == expected

        blocks = read_text('.test.accounts.*.json', compression=fmt,
                           blocksize=bs, encoding=encoding, collection=False)
        L = compute(*blocks)
        assert ''.join(line for block in L for line in block) == expected
def test_read_csv_compression(fmt, blocksize):
    if fmt and fmt not in compress:
        pytest.skip("compress function not provided for %s" % fmt)
    suffix = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": ".xz"}.get(fmt, "")
    files2 = valmap(compress[fmt], csv_files) if fmt else csv_files
    renamed_files = {k + suffix: v for k, v in files2.items()}
    with filetexts(renamed_files, mode="b"):
        # This test is using `compression="infer"` (the default) for
        # read_csv.  The paths must have the appropriate extension.
        if fmt and blocksize:
            with pytest.warns(UserWarning):
                df = dd.read_csv("2014-01-*.csv" + suffix, blocksize=blocksize)
        else:
            df = dd.read_csv("2014-01-*.csv" + suffix, blocksize=blocksize)
        assert_eq(
            df.compute(scheduler="sync").reset_index(drop=True),
            expected.reset_index(drop=True),
            check_dtype=False,
        )
def test_consistent_dtypes_2():
    text1 = normalize_text("""
    name,amount
    Alice,100
    Bob,-200
    Charlie,300
    """)

    text2 = normalize_text("""
    name,amount
    1,400
    2,-500
    Frank,600
    """)

    with filetexts({'foo.1.csv': text1, 'foo.2.csv': text2}):
        df = dd.read_csv('foo.*.csv', blocksize=25)
        assert df.name.dtype == object
        assert df.name.compute().dtype == object
def test_warn_non_seekable_files():
    files2 = valmap(compress["gzip"], csv_files)
    with filetexts(files2, mode="b"):
        with pytest.warns(UserWarning) as w:
            df = dd.read_csv("2014-01-*.csv", compression="gzip")
            assert df.npartitions == 3

        assert len(w) == 1
        msg = str(w[0].message)
        assert "gzip" in msg
        assert "blocksize=None" in msg

        with pytest.warns(None) as w:
            df = dd.read_csv("2014-01-*.csv", compression="gzip",
                             blocksize=None)
        assert len(w) == 0

        with pytest.raises(NotImplementedError):
            with pytest.warns(UserWarning):  # needed for pytest
                df = dd.read_csv("2014-01-*.csv", compression="foo")
def test_read_text(fmt, bs, encoding, include_path):
    if fmt not in utils.compress:
        pytest.skip("compress function not provided for %s" % fmt)
    compress = utils.compress[fmt]
    files2 = {k: compress(v.encode(encoding)) for k, v in files.items()}
    with filetexts(files2, mode="b"):
        b = read_text(
            ".test.accounts.*.json", compression=fmt, blocksize=bs,
            encoding=encoding
        )
        (L,) = compute(b)
        assert "".join(L) == expected

        o = read_text(
            sorted(files),
            compression=fmt,
            blocksize=bs,
            encoding=encoding,
            include_path=include_path,
        )
        b = o.pluck(0) if include_path else o
        (L,) = compute(b)
        assert "".join(L) == expected

        if include_path:
            (paths,) = compute(o.pluck(1))
            expected_paths = list(
                concat([[k] * v.count("\n") for k, v in files.items()])
            )
            assert len(paths) == len(expected_paths)
            for path, expected_path in zip(paths, expected_paths):
                assert path.endswith(expected_path)

        blocks = read_text(
            ".test.accounts.*.json",
            compression=fmt,
            blocksize=bs,
            encoding=encoding,
            collection=False,
        )
        L = compute(*blocks)
        assert "".join(line for block in L for line in block) == expected
def test_read_bytes_delimited():
    with filetexts(files, mode="b"):
        for bs in [5, 15, 45, "1.5 kB"]:
            _, values = read_bytes(".test.accounts*", blocksize=bs,
                                   delimiter=b"\n")
            _, values2 = read_bytes(".test.accounts*", blocksize=bs,
                                    delimiter=b"foo")
            assert [a.key for a in concat(values)] != [
                b.key for b in concat(values2)
            ]

            results = compute(*concat(values))
            res = [r for r in results if r]
            assert all(r.endswith(b"\n") for r in res)
            ourlines = b"".join(res).split(b"\n")
            testlines = b"".join(files[k] for k in sorted(files)).split(b"\n")
            assert ourlines == testlines

            # delimiter not at the end
            d = b"}"
            _, values = read_bytes(".test.accounts*", blocksize=bs, delimiter=d)
            results = compute(*concat(values))
            res = [r for r in results if r]
            # All should end in } except EOF
            assert sum(r.endswith(b"}") for r in res) == len(res) - 2
            ours = b"".join(res)
            test = b"".join(files[v] for v in sorted(files))
            assert ours == test
def test_read_csv_sensitive_to_enforce():
    with filetexts(csv_files, mode='b'):
        a = dd.read_csv('2014-01-*.csv', enforce=True)
        b = dd.read_csv('2014-01-*.csv', enforce=False)
        assert a._name != b._name
def test_read_csv_compression(fmt, blocksize):
    files2 = valmap(compress[fmt], csv_files)
    with filetexts(files2, mode='b'):
        df = dd.read_csv('2014-01-*.csv', compression=fmt,
                         blocksize=blocksize)
        assert_eq(df.compute(scheduler='sync').reset_index(drop=True),
                  expected.reset_index(drop=True), check_dtype=False)
def test_read_csv_include_path_column_with_duplicate_name(dd_read, files):
    with filetexts(files, mode='b'):
        with pytest.raises(ValueError):
            dd_read('2014-01-*.csv', include_path_column='name')
def test_read_bytes_blocksize_float_errs():
    with filetexts(files, mode="b"):
        with pytest.raises(TypeError):
            read_bytes(".test.account*", blocksize=5.5)
def test_read_bytes_no_sample():
    with filetexts(files, mode="b"):
        sample, _ = read_bytes(".test.accounts.1.json", sample=False)
        assert sample is False
def test_parse_sample_bytes():
    with filetexts(files, mode="b"):
        sample, values = read_bytes(".test.accounts.*", sample="40 B")
        assert len(sample) == 40
def test_bad_compression():
    with filetexts(files, mode="b"):
        for func in [read_bytes, open_files]:
            with pytest.raises(ValueError):
                sample, values = func(".test.accounts.*",
                                      compression="not-found")
def test_with_urls():
    with filetexts(files, mode="b"):
        # OS-independent file:// URI with glob *
        url = to_uri(".test.accounts.") + "*"
        sample, values = read_bytes(url, blocksize=None)
        assert sum(map(len, values)) == len(files)
def test_read_bytes_include_path():
    with filetexts(files, mode="b"):
        _, _, paths = read_bytes(".test.accounts.*", include_path=True)
        assert {os.path.split(path)[1] for path in paths} == set(files.keys())
def test_read_bytes_blocksize_none():
    with filetexts(files, mode="b"):
        sample, values = read_bytes(".test.accounts.*", blocksize=None)
        assert sum(map(len, values)) == len(files)
def test_multiple_read_csv_has_deterministic_name():
    with filetexts({'_foo.1.csv': csv_text, '_foo.2.csv': csv_text}):
        a = dd.read_csv('_foo.*.csv')
        b = dd.read_csv('_foo.*.csv')

        assert sorted(a.dask.keys(), key=str) == sorted(b.dask.keys(), key=str)
def test_read_csv_no_sample():
    with filetexts(csv_files, mode="b") as fn:
        df = dd.read_csv(fn, sample=False)
        assert list(df.columns) == ["name", "amount", "id"]