def test_head_partial_line_fix():
    """Check that a sample ending inside a quoted field does not raise.

    NOTE(review): this name is defined again further down the module; in
    Python the last definition wins, so this version may never be collected
    by pytest -- confirm which definition is intended to run.
    """
    # A 52-byte sample stops partway through the quoted value on the last
    # row; reading with that sample size must not raise.
    csv_text = ('a,b\n'
                '0,"abcdefghijklmnopqrstuvwxyz"\n'
                '1,"abcdefghijklmnopqrstuvwxyz"')
    with filetexts({'.overflow.csv': csv_text}):
        read_csv('.overflow.csv', sample=52)
def test_read_csv_files():
    """Read the fixture CSVs via a glob and via a single file name.

    The glob result is compared against the module-level ``expected`` frame;
    the single-file result is compared against what pandas parses from the
    same bytes.
    """
    single_name = '2014-01-01.csv'
    with filetexts(files, mode='b'):
        # All files matched by the glob pattern.
        globbed = read_csv('2014-01-*.csv')
        eq(globbed, expected, check_dtype=False)

        # One file, checked against pandas reading the identical content.
        single = read_csv(single_name)
        single_expected = pd.read_csv(BytesIO(files[single_name]))
        eq(single, single_expected, check_dtype=False)
def test_warn_non_seekable_files(capsys):
    """Check the messages emitted when reading gzip-compressed CSV files.

    With the default blocksize a message mentioning ``gzip`` and
    ``blocksize=None`` must appear on stderr; passing ``blocksize=None``
    explicitly must produce no output at all.  An unknown compression name
    must raise ``NotImplementedError``.
    """
    pattern = '2014-01-*.csv'
    gzipped = valmap(compress['gzip'], files)
    with filetexts(gzipped, mode='b'):
        # Default blocksize: expect three partitions and a hint on stderr.
        chunked = read_csv(pattern, compression='gzip')
        assert chunked.npartitions == 3
        _, captured_err = capsys.readouterr()
        assert 'gzip' in captured_err
        assert 'blocksize=None' in captured_err

        # Explicit blocksize=None: nothing may be written to stdout/stderr.
        read_csv(pattern, compression='gzip', blocksize=None)
        captured_out, captured_err = capsys.readouterr()
        assert not (captured_err or captured_out)

        # Unsupported compression names are rejected.
        with pytest.raises(NotImplementedError):
            read_csv(pattern, compression='foo')
def test_late_dtypes():
    """Check dtype mismatches that only show up after the sampled prefix.

    The first 10 bytes of the text contain only integer values, while float
    values (``5.5`` and ``7.5``) appear in later rows.  Computing on the
    frame built from that sample must raise a ``TypeError`` whose message
    names a float dtype for column ``a`` or ``b``; passing explicit float
    dtypes must make the sums come out right.
    """
    text = 'a,b\n1,2\n2,3\n3,4\n4,5\n5.5,6\n6,7.5'
    with filetext(text) as fn:
        df = read_csv(fn, blocksize=5, sample=10)
        # pytest.raises fails the test when nothing is raised, and unlike a
        # bare ``assert False`` guard it is not stripped under ``python -O``.
        with pytest.raises(TypeError) as excinfo:
            df.b.sum().compute()
        message = str(excinfo.value)
        assert "'b': float" in message or "'a': float" in message

        # With the dtypes given up front every block parses consistently.
        df = read_csv(fn, blocksize=5, sample=10,
                      dtype={'a': float, 'b': float})

        assert df.a.sum().compute() == 1 + 2 + 3 + 4 + 5.5 + 6
        assert df.b.sum().compute() == 2 + 3 + 4 + 5 + 6 + 7.5
def test_auto_blocksize_csv(monkeypatch):
    """Check that read_csv passes the automatically chosen blocksize on.

    ``read_bytes`` in ``dask.dataframe.csv`` is replaced by a wrapping mock
    so the ``blocksize`` keyword it receives can be compared with
    ``auto_blocksize(total_memory, cpu_count)``.  Skipped when ``psutil``
    (or a ``mock`` implementation) is unavailable.
    """
    psutil = pytest.importorskip('psutil')
    try:
        from unittest import mock
    except ImportError:
        # Fall back to the standalone ``mock`` package, skipping if absent.
        mock = pytest.importorskip('mock')

    mem_total = psutil.virtual_memory().total
    n_cpus = psutil.cpu_count()

    # Wrap the real function so behaviour is unchanged but calls are recorded.
    spy = mock.Mock(wraps=read_bytes)
    monkeypatch.setattr(dask.dataframe.csv, 'read_bytes', spy)

    wanted_blocksize = auto_blocksize(mem_total, n_cpus)
    with filetexts(files, mode='b'):
        read_csv('2014-01-01.csv')
        assert spy.called
        # call_args[1] holds the keyword arguments of the most recent call.
        call_kwargs = spy.call_args[1]
        assert call_kwargs['blocksize'] == wanted_blocksize
def test_read_csv_compression(fmt, blocksize):
    """Read compressed fixture files and compare with the expected frame.

    ``fmt`` selects both the compressor applied to the fixture files and the
    ``compression`` argument given to ``read_csv``; ``blocksize`` is passed
    straight through.
    NOTE(review): both arguments are presumably supplied by parametrization
    or fixtures declared elsewhere -- confirm.
    """
    compressed = valmap(compress[fmt], files)
    with filetexts(compressed, mode='b'):
        ddf = read_csv('2014-01-*.csv', compression=fmt, blocksize=blocksize)
        # Index values are dropped on both sides before comparing.
        actual = ddf.compute(get=get_sync).reset_index(drop=True)
        wanted = expected.reset_index(drop=True)
        eq(actual, wanted, check_dtype=False)
def test_header_None():
    """Read header-less files, one of which is empty, with ``header=None``.

    The combined result must hold only the rows from the two non-empty
    files, with integer column labels 0 and 1.
    """
    sources = {
        '.tmp.1.csv': '1,2',
        '.tmp.2.csv': '',
        '.tmp.3.csv': '3,4',
    }
    wanted = pd.DataFrame({0: [1, 3], 1: [2, 4]})
    with filetexts(sources):
        ddf = read_csv('.tmp.*.csv', header=None)
        actual = ddf.compute().reset_index(drop=True)
        eq(actual, wanted)
def test_head_partial_line_fix():
    """Check sampling that ends in the middle of a line.

    Two cases are covered: a sample ending inside a quoted field must not
    raise, and a sample ending partway through the last row must still let
    both columns be typed as 64-bit integers.

    NOTE(review): this name is defined again further down the module; in
    Python the last definition wins, so this version may never be collected
    by pytest -- confirm which definition is intended to run.
    """
    quoted_csv = ('a,b\n'
                  '0,"abcdefghijklmnopqrstuvwxyz"\n'
                  '1,"abcdefghijklmnopqrstuvwxyz"')
    integer_csv = ('a,b\n'
                   '111111,-11111\n'
                   '222222,-22222\n'
                   '333333,-33333\n')
    with filetexts({'.overflow1.csv': quoted_csv,
                    '.overflow2.csv': integer_csv}):
        # A 52-byte sample stops partway through the quoted value on the
        # last row; reading with that sample size must not raise.
        read_csv('.overflow1.csv', sample=52)

        # A 35-byte sample cuts the last row off before its second number.
        # The sample should extend to the end of that line, otherwise pandas
        # will infer `b` to be a float dtype.
        ddf = read_csv('.overflow2.csv', sample=35)
        assert (ddf.dtypes == 'i8').all()
def test_windows_line_terminator():
    """Read ``\\r\\n``-terminated text in 5-byte blocks and check column sums."""
    text = 'a,b\r\n1,2\r\n2,3\r\n3,4\r\n4,5\r\n5,6\r\n6,7'
    with filetext(text) as path:
        ddf = read_csv(path, blocksize=5, lineterminator='\r\n')
        # Column b holds 2..7 and column a holds 1..6.
        assert ddf.b.sum().compute() == sum(range(2, 8))
        assert ddf.a.sum().compute() == sum(range(1, 7))
def test_read_csv_sensitive_to_enforce():
    """Check that the ``enforce`` flag changes the collection's ``_name``."""
    pattern = '2014-01-*.csv'
    with filetexts(files, mode='b'):
        enforced = read_csv(pattern, enforce=True)
        relaxed = read_csv(pattern, enforce=False)
        # Same input files, different flag: the names must not collide.
        assert enforced._name != relaxed._name
def test_head_partial_line_fix():
    """Check that a sample ending inside a quoted field does not raise."""
    header = 'a,b\n'
    first_row = '0,"abcdefghijklmnopqrstuvwxyz"\n'
    last_row = '1,"abcdefghijklmnopqrstuvwxyz"'
    with filetexts({'.overflow.csv': header + first_row + last_row}):
        # The header and first row take 35 bytes, so a 52-byte sample ends
        # inside the quoted value of the last row; this must not raise in
        # the head-handling code.
        read_csv('.overflow.csv', sample=52)