def test_enforce_dtypes():
    """Frames built by ``read_csv_from_bytes`` share the dtypes of ``head``.

    ``head`` is parsed by pandas from the first chunk of the first block; the
    computed frames are then compared against its dtype mapping.
    """
    # NOTE(review): another function named ``test_enforce_dtypes`` appears
    # later in the visible source; Python keeps the later binding, so this
    # variant would presumably not be collected by pytest -- confirm whether
    # it should be renamed or removed.
    leading_chunk = b'aa,bb\n1,1.0\n2.2.0'
    trailing_chunk = b'10,20\n30,40'
    blocks = [
        [leading_chunk, trailing_chunk],
        [leading_chunk, trailing_chunk],
    ]

    head = pd.read_csv(BytesIO(blocks[0][0]), header=0)

    lazy_frames = read_csv_from_bytes(
        blocks, b'aa,bb\n', head, {}, collection=False
    )
    frames = compute(*lazy_frames)

    assert all(
        frame.dtypes.to_dict() == head.dtypes.to_dict() for frame in frames
    )
def test_blocked():
    """Reading files split into multi-line chunks reproduces ``expected``.

    Each file's bytes are split on newlines and regrouped two lines per
    chunk. The result is compared (ignoring dtypes and the index) against
    ``expected``, first with all columns and then restricted through
    ``usecols`` to ``name`` and ``id``.
    """
    # One block per file (in sorted key order); each block is a list of
    # chunks made of at most two newline-joined lines.
    blocks = [
        [b'\n'.join(pair) for pair in partition_all(2, files[key].split(b'\n'))]
        for key in sorted(files)
    ]

    # Full-width read.
    full = read_csv_from_bytes(blocks, header, expected.head(), {})
    eq(
        full.compute().reset_index(drop=True),
        expected.reset_index(drop=True),
        check_dtype=False,
    )

    # Column-restricted read via ``usecols``.
    expected2 = expected[['name', 'id']]
    subset = read_csv_from_bytes(
        blocks, header, expected2.head(), {'usecols': ['name', 'id']}
    )
    eq(
        subset.compute().reset_index(drop=True),
        expected2.reset_index(drop=True),
        check_dtype=False,
    )
def test_read_csv_simple():
    """Exercise ``read_csv_from_bytes`` in collection and non-collection mode.

    With ``collection=True`` the result must be a ``dd.DataFrame`` with the
    columns ``name``, ``amount`` and ``id``; with ``collection=False`` it must
    be a list of three objects that each expose a ``dask`` attribute. The sum
    of the ``amount`` column is also checked.
    """
    # Every file becomes a block holding a single chunk.
    blocks = [[files[key]] for key in sorted(files)]
    kwargs = {}
    head = bytes_read_csv(files['2014-01-01.csv'], b'', {})

    df = read_csv_from_bytes(blocks, header, head, kwargs, collection=True)
    assert isinstance(df, dd.DataFrame)
    assert list(df.columns) == ['name', 'amount', 'id']

    values = read_csv_from_bytes(blocks, header, head, kwargs, collection=False)
    assert isinstance(values, list)
    assert len(values) == 3
    assert all(hasattr(value, 'dask') for value in values)

    expected_total = 100 + 200 + 300 + 400 + 500 + 600
    total = df.amount.sum().compute(get=get_sync)
    assert total == expected_total
def test_enforce_dtypes():
    """With ``enforce_dtypes=True`` every computed frame matches ``head``.

    ``head`` is parsed by pandas from the first chunk of the first block, and
    each frame's dtype mapping must equal the one taken from ``head``.
    """
    with_header = b'aa,bb\n1,1.0\n2.2.0'
    without_header = b'10,20\n30,40'
    blocks = [
        [with_header, without_header],
        [with_header, without_header],
    ]

    head = pd.read_csv(BytesIO(blocks[0][0]), header=0)

    pending = read_csv_from_bytes(
        blocks,
        b'aa,bb\n',
        head,
        {},
        enforce_dtypes=True,
        collection=False,
    )
    computed = compute(*pending)

    for frame in computed:
        assert frame.dtypes.to_dict() == head.dtypes.to_dict()
def test_enforce_columns():
    """A block whose header differs from ``head`` raises ``ValueError``.

    The second block's header line reads ``AA,bb`` rather than ``aa,bb``.
    With ``enforce=True``, building or computing the frames is expected to
    raise.
    """
    data_rows = b'10,20\n30,40'
    blocks = [
        [b'aa,bb\n1,1.0\n2.2.0', data_rows],
        [b'AA,bb\n1,1.0\n2.2.0', data_rows],
    ]
    head = pd.read_csv(BytesIO(blocks[0][0]), header=0)

    # Both calls stay inside the context: the error may surface either when
    # the frames are built or when they are computed.
    with pytest.raises(ValueError):
        pending = read_csv_from_bytes(
            blocks, b'aa,bb\n', head, {}, collection=False, enforce=True
        )
        compute(*pending)
def test_kwargs():
    """The ``usecols`` keyword is honoured both lazily and after computing.

    The same ``kwargs`` dict is passed to ``bytes_read_csv`` (to build
    ``head``) and to ``read_csv_from_bytes``. The resulting collection and
    its computed frame must expose only the ``name`` and ``id`` columns.
    """
    # Each file is wrapped as a block containing a single chunk.
    blocks = [[files[key]] for key in sorted(files)]
    kwargs = {'usecols': ['name', 'id']}

    head = bytes_read_csv(files['2014-01-01.csv'], b'', kwargs)
    df = read_csv_from_bytes(blocks, header, head, kwargs, collection=True)
    assert list(df.columns) == ['name', 'id']

    computed = df.compute()
    columns_match = computed.columns == df.columns
    assert columns_match.all()