Exemplo n.º 1
0
def test_head_partial_line_fix():
    """Reading with a sample that stops inside a quoted field must not raise.

    With ``sample=52`` the sample ends in the middle of the quoted value on
    the last row; the call only has to complete without an exception.
    """
    fixture = {
        '.overflow.csv':
        'a,b\n0,"abcdefghijklmnopqrstuvwxyz"\n1,"abcdefghijklmnopqrstuvwxyz"',
    }
    with filetexts(fixture):
        # No assertion on the result: surviving the call is the test.
        read_csv('.overflow.csv', sample=52)
Exemplo n.º 2
0
def test_read_csv_files():
    """Compare ``read_csv`` on a glob and on a single file with references.

    The glob result is checked against the shared ``expected`` frame; the
    single-file result is checked against pandas reading the same bytes.
    Dtypes are not compared in either case.
    """
    with filetexts(files, mode='b'):
        # Whole glob versus the shared reference frame.
        globbed = read_csv('2014-01-*.csv')
        eq(globbed, expected, check_dtype=False)

        # One file versus pandas parsing the identical content.
        single_name = '2014-01-01.csv'
        single = read_csv(single_name)
        reference = pd.read_csv(BytesIO(files[single_name]))
        eq(single, reference, check_dtype=False)
Exemplo n.º 3
0
def test_warn_non_seekable_files(capsys):
    """Check the stderr message emitted for gzip input and how to silence it.

    * Reading gzip files without ``blocksize=None`` yields three partitions
      and writes a message mentioning ``gzip`` and ``blocksize=None`` to
      stderr.
    * Passing ``blocksize=None`` produces no output on stdout or stderr.
    * An unknown compression name raises ``NotImplementedError``.
    """
    pattern = '2014-01-*.csv'
    gzipped = valmap(compress['gzip'], files)

    with filetexts(gzipped, mode='b'):
        frame = read_csv(pattern, compression='gzip')
        assert frame.npartitions == 3
        _, stderr = capsys.readouterr()
        assert 'gzip' in stderr
        assert 'blocksize=None' in stderr

        frame = read_csv(pattern, compression='gzip', blocksize=None)
        stdout, stderr = capsys.readouterr()
        assert not (stderr or stdout)

        with pytest.raises(NotImplementedError):
            read_csv(pattern, compression='foo')
Exemplo n.º 4
0
def test_late_dtypes():
    """Float values appearing after the sampled head must be reported.

    With ``sample=10`` only the first rows are seen, so computing a column
    sum is expected to raise a ``TypeError`` whose message names a float
    dtype for column ``a`` or ``b``.  Supplying explicit float dtypes makes
    both sums computable.
    """
    text = 'a,b\n1,2\n2,3\n3,4\n4,5\n5.5,6\n6,7.5'
    with filetext(text) as fn:
        inferred = read_csv(fn, blocksize=5, sample=10)
        try:
            inferred.b.sum().compute()
        except TypeError as err:
            message = str(err)
            assert ("'b': float" in message or
                    "'a': float" in message)
        else:
            # Reaching this point means no TypeError was raised.
            assert False

        explicit = read_csv(fn, blocksize=5, sample=10,
                            dtype={'a': float, 'b': float})

        total_a = 1 + 2 + 3 + 4 + 5.5 + 6
        total_b = 2 + 3 + 4 + 5 + 6 + 7.5
        assert explicit.a.sum().compute() == total_a
        assert explicit.b.sum().compute() == total_b
Exemplo n.º 5
0
def test_auto_blocksize_csv(monkeypatch):
    """Check that ``read_csv`` passes the automatically chosen blocksize on.

    ``read_bytes`` in ``dask.dataframe.csv`` is swapped for a wrapping mock so
    the ``blocksize`` keyword it receives can be compared with the value
    ``auto_blocksize`` returns for this machine's memory and CPU count.
    The test is skipped when ``psutil`` (or a mock library) is unavailable.
    """
    psutil = pytest.importorskip('psutil')
    try:
        from unittest import mock
    except ImportError:
        # Fall back to the standalone package; skip if that is missing too.
        mock = pytest.importorskip('mock')

    wanted = auto_blocksize(psutil.virtual_memory().total, psutil.cpu_count())

    spy = mock.Mock(wraps=read_bytes)
    monkeypatch.setattr(dask.dataframe.csv, 'read_bytes', spy)

    with filetexts(files, mode='b'):
        read_csv('2014-01-01.csv')
        assert spy.called
        _, call_kwargs = spy.call_args
        assert call_kwargs['blocksize'] == wanted
Exemplo n.º 6
0
def test_read_csv_compression(fmt, blocksize):
    """Read compressed copies of ``files`` and compare with ``expected``.

    ``fmt`` selects the entry of ``compress`` applied to every file and is
    also passed as ``compression``; ``blocksize`` is forwarded unchanged.
    Indexes are reset on both sides and dtypes are not compared.
    """
    compressed = valmap(compress[fmt], files)
    with filetexts(compressed, mode='b'):
        frame = read_csv('2014-01-*.csv', compression=fmt, blocksize=blocksize)
        actual = frame.compute(get=get_sync).reset_index(drop=True)
        wanted = expected.reset_index(drop=True)
        eq(actual, wanted, check_dtype=False)
Exemplo n.º 7
0
def test_header_None():
    """Read header-less files, one of them empty, into two integer columns.

    The three matching files hold ``1,2``, nothing, and ``3,4``; the computed
    frame (index reset) must equal columns ``0`` and ``1`` with those values.
    """
    contents = {
        '.tmp.1.csv': '1,2',
        '.tmp.2.csv': '',
        '.tmp.3.csv': '3,4',
    }
    with filetexts(contents):
        frame = read_csv('.tmp.*.csv', header=None)
        wanted = pd.DataFrame({0: [1, 3], 1: [2, 4]})
        actual = frame.compute().reset_index(drop=True)
        eq(actual, wanted)
Exemplo n.º 8
0
def test_header_None():
    """Check ``header=None`` over a glob that includes an empty file.

    Only the rows ``1,2`` and ``3,4`` from the non-empty files should appear
    in the computed result, under the integer column labels ``0`` and ``1``.
    """
    sources = {'.tmp.1.csv': '1,2', '.tmp.2.csv': '', '.tmp.3.csv': '3,4'}
    with filetexts(sources):
        lazy = read_csv('.tmp.*.csv', header=None)
        reference = pd.DataFrame({0: [1, 3], 1: [2, 4]})
        # Reset the index so only the row contents are compared.
        eq(lazy.compute().reset_index(drop=True), reference)
Exemplo n.º 9
0
def test_auto_blocksize_csv(monkeypatch):
    """Verify the default blocksize handed to ``read_bytes`` by ``read_csv``.

    A mock wrapping ``read_bytes`` is patched into ``dask.dataframe.csv``;
    after reading one file, the ``blocksize`` keyword recorded by the mock
    must equal ``auto_blocksize(total_memory, cpu_count)`` as measured with
    ``psutil``.
    """
    psutil = pytest.importorskip('psutil')
    try:
        from unittest import mock
    except ImportError:
        # Use the external ``mock`` package if present, otherwise skip.
        mock = pytest.importorskip('mock')

    memory_bytes = psutil.virtual_memory().total
    n_cpus = psutil.cpu_count()

    wrapped = mock.Mock(wraps=read_bytes)
    monkeypatch.setattr(dask.dataframe.csv, 'read_bytes', wrapped)
    target_blocksize = auto_blocksize(memory_bytes, n_cpus)

    with filetexts(files, mode='b'):
        read_csv('2014-01-01.csv')
        assert wrapped.called
        recorded_kwargs = wrapped.call_args[1]
        assert recorded_kwargs['blocksize'] == target_blocksize
Exemplo n.º 10
0
def test_head_partial_line_fix():
    """Samples that end mid-line must still be handled sensibly.

    Two cases are exercised: a sample ending inside a quoted field (must not
    raise) and a sample ending partway through the last data row (both
    columns must still come out as ``i8``).
    """
    quoted_csv = ('a,b\n'
                  '0,"abcdefghijklmnopqrstuvwxyz"\n'
                  '1,"abcdefghijklmnopqrstuvwxyz"')
    numeric_csv = ('a,b\n'
                   '111111,-11111\n'
                   '222222,-22222\n'
                   '333333,-33333\n')
    samples = {'.overflow1.csv': quoted_csv,
               '.overflow2.csv': numeric_csv}

    with filetexts(samples):
        # A 52-character sample stops inside the quoted value of the last
        # row; the head-handling code must not raise on it.
        read_csv('.overflow1.csv', sample=52)

        # A 35-character sample stops before the second number on the last
        # row. The sample should extend to the end of that line, otherwise
        # pandas would infer `b` as a float dtype.
        frame = read_csv('.overflow2.csv', sample=35)
        assert (frame.dtypes == 'i8').all()
Exemplo n.º 11
0
def test_windows_line_terminator():
    """Read ``\\r\\n``-terminated CSV text in small blocks and check the sums.

    The file is read with ``blocksize=5`` and ``lineterminator='\\r\\n'``;
    the sums of columns ``b`` and ``a`` must match the values in the text.
    """
    text = 'a,b\r\n1,2\r\n2,3\r\n3,4\r\n4,5\r\n5,6\r\n6,7'
    with filetext(text) as path:
        frame = read_csv(path, blocksize=5, lineterminator='\r\n')
        total_b = frame.b.sum().compute()
        assert total_b == 2 + 3 + 4 + 5 + 6 + 7
        total_a = frame.a.sum().compute()
        assert total_a == 1 + 2 + 3 + 4 + 5 + 6
Exemplo n.º 12
0
def test_read_csv_compression(fmt, blocksize):
    """Round-trip ``files`` through ``compress[fmt]`` and ``read_csv``.

    The frame computed with the synchronous scheduler must match
    ``expected`` once both indexes are reset; dtypes are not checked.
    """
    packed_files = valmap(compress[fmt], files)
    with filetexts(packed_files, mode='b'):
        lazy = read_csv('2014-01-*.csv', compression=fmt, blocksize=blocksize)
        computed = lazy.compute(get=get_sync)
        eq(computed.reset_index(drop=True),
           expected.reset_index(drop=True),
           check_dtype=False)
Exemplo n.º 13
0
def test_read_csv_sensitive_to_enforce():
    """The ``enforce`` flag must be reflected in the collection's ``_name``."""
    pattern = '2014-01-*.csv'
    with filetexts(files, mode='b'):
        enforced = read_csv(pattern, enforce=True)
        relaxed = read_csv(pattern, enforce=False)
        assert enforced._name != relaxed._name
Exemplo n.º 14
0
def test_read_csv_sensitive_to_enforce():
    """Reading the same glob with different ``enforce`` values gives
    collections whose ``_name`` attributes differ.
    """
    with filetexts(files, mode='b'):
        with_enforce = read_csv('2014-01-*.csv', enforce=True)
        without_enforce = read_csv('2014-01-*.csv', enforce=False)
        name_on = with_enforce._name
        name_off = without_enforce._name
        assert name_on != name_off
Exemplo n.º 15
0
def test_head_partial_line_fix():
    """A sample ending inside a quoted field must not break ``read_csv``.

    ``sample=52`` stops within the quoted value on the last row; the test
    passes as long as the call completes without raising.
    """
    overflow_files = {
        '.overflow.csv': 'a,b\n0,"abcdefghijklmnopqrstuvwxyz"\n1,"abcdefghijklmnopqrstuvwxyz"',
    }
    with filetexts(overflow_files):
        # Only the absence of an exception is being checked here.
        read_csv('.overflow.csv', sample=52)