def test_from_bcolz():
    """Round-trip a small bcolz ctable through ``dd.from_bcolz``."""
    bcolz = pytest.importorskip('bcolz')

    table = bcolz.ctable(
        [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']],
        names=['x', 'y', 'a'],
    )
    ddf = dd.from_bcolz(table, chunksize=2)

    assert ddf.npartitions == 2
    assert str(ddf.dtypes['a']) == 'category'
    assert list(ddf.x.compute(get=get_sync)) == [1, 2, 3]
    assert list(ddf.a.compute(get=get_sync)) == ['a', 'b', 'a']

    # Column-backed index: ordering across partitions is not guaranteed.
    ddf = dd.from_bcolz(table, chunksize=2, index='x')
    index_values = list(ddf.index.compute(get=get_sync))
    assert index_values in ([1, 2, 3], [1, 3, 2])
def test_from_bcolz_no_lock():
    """``lock=False`` must leave no Lock objects in the generated task graph."""
    bcolz = pytest.importorskip("bcolz")
    lock_cls = type(Lock())

    table = bcolz.ctable(
        [[1, 2, 3], [1.0, 2.0, 3.0], ["a", "b", "a"]],
        names=["x", "y", "a"],
        chunklen=2,
    )
    default = dd.from_bcolz(table, chunksize=2)
    with_lock = dd.from_bcolz(table, chunksize=2, lock=True)
    without_lock = dd.from_bcolz(table, chunksize=2, lock=False)

    assert_eq(default, with_lock)
    assert_eq(default, without_lock)
    # No task argument in the lock-free graph may be a lock instance.
    assert not any(
        isinstance(arg, lock_cls)
        for task in without_lock.dask.values()
        for arg in task
    )
def test_from_bcolz():
    """dd.from_bcolz: dtypes, contents, index handling and graph-key naming."""
    bcolz = pytest.importorskip('bcolz')

    table = bcolz.ctable(
        [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']],
        names=['x', 'y', 'a'],
    )
    ddf = dd.from_bcolz(table, chunksize=2)

    assert ddf._known_dtype
    assert ddf.npartitions == 2
    assert str(ddf.dtypes['a']) == 'category'
    assert list(ddf.x.compute(get=get_sync)) == [1, 2, 3]
    assert list(ddf.a.compute(get=get_sync)) == ['a', 'b', 'a']

    # Default index is a plain positional range.
    positional = list(ddf.index.compute(get=get_sync))
    assert positional == [0, 1, 2]

    # Column-backed index: ordering across partitions is not guaranteed.
    ddf = dd.from_bcolz(table, chunksize=2, index='x')
    by_column = list(ddf.index.compute(get=get_sync))
    assert by_column in ([1, 2, 3], [1, 3, 2])

    # Names: identical inputs give identical keys; a different chunksize
    # must produce different keys.
    assert (sorted(dd.from_bcolz(table, chunksize=2).dask) ==
            sorted(dd.from_bcolz(table, chunksize=2).dask))
    assert (sorted(dd.from_bcolz(table, chunksize=2).dask) !=
            sorted(dd.from_bcolz(table, chunksize=3).dask))

    # Mutating the source table must also invalidate the old keys.
    stale_graph = dd.from_bcolz(table, chunksize=3).dask
    table.append((4, 4., 'b'))
    table.flush()
    assert sorted(dd.from_bcolz(table, chunksize=2).dask) != sorted(stale_graph)
def test_from_bcolz():
    """Validate dd.from_bcolz: dtypes, data, index options and graph naming."""
    bcolz = pytest.importorskip("bcolz")

    table = bcolz.ctable(
        [[1, 2, 3], [1.0, 2.0, 3.0], ["a", "b", "a"]],
        names=["x", "y", "a"],
    )
    ddf = dd.from_bcolz(table, chunksize=2)

    assert ddf.npartitions == 2
    assert is_categorical_dtype(ddf.dtypes["a"])
    assert list(ddf.x.compute(scheduler="sync")) == [1, 2, 3]
    assert list(ddf.a.compute(scheduler="sync")) == ["a", "b", "a"]

    # Default index is a simple positional range.
    positional = list(ddf.index.compute(scheduler="sync"))
    assert positional == [0, 1, 2]

    # Column-backed index: ordering across partitions is not guaranteed.
    ddf = dd.from_bcolz(table, chunksize=2, index="x")
    by_column = list(ddf.index.compute(scheduler="sync"))
    assert by_column in ([1, 2, 3], [1, 3, 2])

    # Identical inputs produce identical task names; chunksize changes them.
    assert sorted(dd.from_bcolz(table, chunksize=2).dask) == sorted(
        dd.from_bcolz(table, chunksize=2).dask
    )
    assert sorted(dd.from_bcolz(table, chunksize=2).dask) != sorted(
        dd.from_bcolz(table, chunksize=3).dask
    )

    # Mutating the underlying table must invalidate the old keys.
    stale = dd.from_bcolz(table, chunksize=3).dask
    table.append((4, 4.0, "b"))
    table.flush()
    assert sorted(dd.from_bcolz(table, chunksize=2).dask) != sorted(stale)
def test_from_bcolz():
    """dd.from_bcolz: contents, column-backed index, and key naming rules."""
    bcolz = pytest.importorskip('bcolz')

    table = bcolz.ctable(
        [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']],
        names=['x', 'y', 'a'],
    )
    ddf = dd.from_bcolz(table, chunksize=2)

    assert ddf.npartitions == 2
    assert str(ddf.dtypes['a']) == 'category'
    assert list(ddf.x.compute(get=get_sync)) == [1, 2, 3]
    assert list(ddf.a.compute(get=get_sync)) == ['a', 'b', 'a']

    # Column-backed index: ordering across partitions is not guaranteed.
    ddf = dd.from_bcolz(table, chunksize=2, index='x')
    indexed = list(ddf.index.compute(get=get_sync))
    assert indexed in ([1, 2, 3], [1, 3, 2])

    # Names: same inputs -> same keys; different chunksize -> different keys.
    assert (sorted(dd.from_bcolz(table, chunksize=2).dask) ==
            sorted(dd.from_bcolz(table, chunksize=2).dask))
    assert (sorted(dd.from_bcolz(table, chunksize=2).dask) !=
            sorted(dd.from_bcolz(table, chunksize=3).dask))

    # Mutating the source table must also invalidate the old keys.
    old_graph = dd.from_bcolz(table, chunksize=3).dask
    table.append((4, 4., 'b'))
    table.flush()
    assert sorted(dd.from_bcolz(table, chunksize=2).dask) != sorted(old_graph)
def test_from_bcolz():
    """dd.from_bcolz: categorical dtype, data, both index modes, key naming."""
    bcolz = pytest.importorskip('bcolz')

    table = bcolz.ctable(
        [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']],
        names=['x', 'y', 'a'],
    )
    ddf = dd.from_bcolz(table, chunksize=2)

    assert ddf.npartitions == 2
    assert is_categorical_dtype(ddf.dtypes['a'])
    assert list(ddf.x.compute(scheduler='sync')) == [1, 2, 3]
    assert list(ddf.a.compute(scheduler='sync')) == ['a', 'b', 'a']

    # Without an explicit index, a positional range is used.
    default_index = list(ddf.index.compute(scheduler='sync'))
    assert default_index == [0, 1, 2]

    # With index='x' the partition ordering is not guaranteed.
    ddf = dd.from_bcolz(table, chunksize=2, index='x')
    x_index = list(ddf.index.compute(scheduler='sync'))
    assert x_index in ([1, 2, 3], [1, 3, 2])

    # Names: deterministic for equal inputs, distinct for different chunksize.
    assert (sorted(dd.from_bcolz(table, chunksize=2).dask) ==
            sorted(dd.from_bcolz(table, chunksize=2).dask))
    assert (sorted(dd.from_bcolz(table, chunksize=2).dask) !=
            sorted(dd.from_bcolz(table, chunksize=3).dask))

    # Appending rows to the table must change the graph keys too.
    stale = dd.from_bcolz(table, chunksize=3).dask
    table.append((4, 4., 'b'))
    table.flush()
    assert sorted(dd.from_bcolz(table, chunksize=2).dask) != sorted(stale)
def test_from_bcolz_no_lock():
    """With ``lock=False`` the task graph must contain no lock objects."""
    bcolz = pytest.importorskip('bcolz')
    lock_cls = type(Lock())

    table = bcolz.ctable(
        [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']],
        names=['x', 'y', 'a'],
        chunklen=2,
    )
    default = dd.from_bcolz(table, chunksize=2)
    with_lock = dd.from_bcolz(table, chunksize=2, lock=True)
    without_lock = dd.from_bcolz(table, chunksize=2, lock=False)

    assert_eq(default, with_lock)
    assert_eq(default, without_lock)
    # Every task in the lock-free graph must be free of lock instances.
    assert not any(
        isinstance(arg, lock_cls)
        for task in without_lock.dask.values()
        for arg in task
    )
def test_from_bcolz_no_lock():
    """``lock=False`` must keep Lock instances out of the task graph."""
    bcolz = pytest.importorskip('bcolz')
    lock_cls = type(Lock())

    table = bcolz.ctable(
        [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']],
        names=['x', 'y', 'a'],
        chunklen=2,
    )
    default = dd.from_bcolz(table, chunksize=2)
    with_lock = dd.from_bcolz(table, chunksize=2, lock=True)
    without_lock = dd.from_bcolz(table, chunksize=2, lock=False)

    eq(default, with_lock)
    eq(default, without_lock)
    # Scan every task argument for stray lock objects.
    assert not any(
        isinstance(arg, lock_cls)
        for task in without_lock.dask.values()
        for arg in task
    )
def test_from_bcolz_column_order():
    """Columns must come back in the ctable's declared order."""
    bcolz = pytest.importorskip('bcolz')

    table = bcolz.ctable(
        [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']],
        names=['x', 'y', 'a'],
    )
    ddf = dd.from_bcolz(table, chunksize=2)
    columns = list(ddf.loc[0].compute().columns)
    assert columns == ['x', 'y', 'a']
def test_from_bcolz_column_order():
    """Column order of the resulting frame matches the source ctable."""
    bcolz = pytest.importorskip("bcolz")

    table = bcolz.ctable(
        [[1, 2, 3], [1.0, 2.0, 3.0], ["a", "b", "a"]],
        names=["x", "y", "a"],
    )
    frame = dd.from_bcolz(table, chunksize=2)
    assert list(frame.loc[0].compute().columns) == ["x", "y", "a"]
def test_from_bcolz():
    """Basic dd.from_bcolz round trip: partitioning, dtypes and contents.

    Fixed: the original guarded the import with ``try/except ImportError:
    return``, which makes a missing bcolz report as a silently *passing*
    test. ``pytest.importorskip`` reports a proper skip instead, matching
    the other tests in this file.
    """
    bcolz = pytest.importorskip('bcolz')

    t = bcolz.ctable([[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']],
                     names=['x', 'y', 'a'])
    d = dd.from_bcolz(t, chunksize=2)
    assert d.npartitions == 2
    assert str(d.dtypes['a']) == 'category'
    assert list(d.x.compute(get=dask.get)) == [1, 2, 3]
    assert list(d.a.compute(get=dask.get)) == ['a', 'b', 'a']

    # Index built from column 'x'.
    d = dd.from_bcolz(t, chunksize=2, index='x')
    assert list(d.index.compute()) == [1, 2, 3]
def test_from_bcolz_filename():
    """from_bcolz accepts an on-disk rootdir path instead of a ctable."""
    bcolz = pytest.importorskip("bcolz")

    with tmpfile(".bcolz") as path:
        table = bcolz.ctable(
            [[1, 2, 3], [1.0, 2.0, 3.0], ["a", "b", "a"]],
            names=["x", "y", "a"],
            rootdir=path,
        )
        table.flush()

        frame = dd.from_bcolz(path, chunksize=2)
        assert list(frame.x.compute()) == [1, 2, 3]
def test_from_bcolz_filename():
    """A flushed bcolz rootdir path can be handed straight to from_bcolz."""
    bcolz = pytest.importorskip('bcolz')

    with tmpfile('.bcolz') as path:
        table = bcolz.ctable(
            [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']],
            names=['x', 'y', 'a'],
            rootdir=path,
        )
        table.flush()

        frame = dd.from_bcolz(path, chunksize=2)
        assert list(frame.x.compute()) == [1, 2, 3]
def test_from_bcolz_filename():
    """from_bcolz accepts an on-disk rootdir path instead of a ctable.

    Fixed: the original guarded the import with ``try/except ImportError:
    return``, which makes a missing bcolz report as a silently *passing*
    test. ``pytest.importorskip`` reports a proper skip instead, matching
    the other tests in this file.
    """
    bcolz = pytest.importorskip('bcolz')

    with tmpfile('.bcolz') as fn:
        t = bcolz.ctable([[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']],
                         names=['x', 'y', 'a'], rootdir=fn)
        t.flush()

        d = dd.from_bcolz(fn, chunksize=2)
        assert list(d.x.compute()) == [1, 2, 3]
def test_from_bcolz_filename():
    """Loading by rootdir filename yields the same data as the ctable."""
    bcolz = pytest.importorskip("bcolz")

    with tmpfile(".bcolz") as path:
        table = bcolz.ctable(
            [[1, 2, 3], [1.0, 2.0, 3.0], ["a", "b", "a"]],
            names=["x", "y", "a"],
            rootdir=path,
        )
        table.flush()

        frame = dd.from_bcolz(path, chunksize=2)
        assert list(frame.x.compute()) == [1, 2, 3]
def test_from_bcolz():
    """dd.from_bcolz: contents, column index, and graph-key naming."""
    bcolz = pytest.importorskip("bcolz")

    table = bcolz.ctable(
        [[1, 2, 3], [1.0, 2.0, 3.0], ["a", "b", "a"]],
        names=["x", "y", "a"],
    )
    ddf = dd.from_bcolz(table, chunksize=2)

    assert ddf.npartitions == 2
    assert str(ddf.dtypes["a"]) == "category"
    assert list(ddf.x.compute(get=get_sync)) == [1, 2, 3]
    assert list(ddf.a.compute(get=get_sync)) == ["a", "b", "a"]

    # Column-backed index: ordering across partitions is not guaranteed.
    ddf = dd.from_bcolz(table, chunksize=2, index="x")
    indexed = list(ddf.index.compute(get=get_sync))
    assert indexed in ([1, 2, 3], [1, 3, 2])

    # Names: deterministic for equal inputs, distinct per chunksize.
    assert sorted(dd.from_bcolz(table, chunksize=2).dask) == sorted(
        dd.from_bcolz(table, chunksize=2).dask
    )
    assert sorted(dd.from_bcolz(table, chunksize=2).dask) != sorted(
        dd.from_bcolz(table, chunksize=3).dask
    )

    # Appending rows must change the keys as well.
    stale = dd.from_bcolz(table, chunksize=3).dask
    table.append((4, 4.0, "b"))
    table.flush()
    assert sorted(dd.from_bcolz(table, chunksize=2).dask) != sorted(stale)
def check(i):
    """Exercise dd.from_bcolz on a small ctable (``i`` is unused; it exists
    so this helper can be mapped over worker indices)."""
    table = bcolz.ctable(
        [[1, 2, 3], [1.0, 2.0, 3.0], ["a", "b", "a"]],
        names=["x", "y", "a"],
    )
    ddf = dd.from_bcolz(table, chunksize=2)

    assert ddf.npartitions == 2
    assert is_categorical_dtype(ddf.dtypes["a"])
    assert list(ddf.x.compute(scheduler="sync")) == [1, 2, 3]
    assert list(ddf.a.compute(scheduler="sync")) == ["a", "b", "a"]

    # Column-backed index: ordering across partitions is not guaranteed.
    ddf = dd.from_bcolz(table, chunksize=2, index="x")
    indexed = list(ddf.index.compute(scheduler="sync"))
    assert indexed in ([1, 2, 3], [1, 3, 2])

    # Names: equal inputs share keys; a different chunksize does not.
    assert sorted(dd.from_bcolz(table, chunksize=2).dask) == sorted(
        dd.from_bcolz(table, chunksize=2).dask
    )
    assert sorted(dd.from_bcolz(table, chunksize=2).dask) != sorted(
        dd.from_bcolz(table, chunksize=3).dask
    )
def check():
    """Exercise dd.from_bcolz on a small ctable and verify its key naming."""
    table = bcolz.ctable(
        [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']],
        names=['x', 'y', 'a'],
    )
    ddf = dd.from_bcolz(table, chunksize=2)

    assert ddf.npartitions == 2
    assert str(ddf.dtypes['a']) == 'category'
    assert list(ddf.x.compute(get=get_sync)) == [1, 2, 3]
    assert list(ddf.a.compute(get=get_sync)) == ['a', 'b', 'a']

    # Column-backed index: ordering across partitions is not guaranteed.
    ddf = dd.from_bcolz(table, chunksize=2, index='x')
    indexed = list(ddf.index.compute(get=get_sync))
    assert indexed in ([1, 2, 3], [1, 3, 2])

    # Names: equal inputs share keys; a different chunksize does not.
    assert (sorted(dd.from_bcolz(table, chunksize=2).dask) ==
            sorted(dd.from_bcolz(table, chunksize=2).dask))
    assert (sorted(dd.from_bcolz(table, chunksize=2).dask) !=
            sorted(dd.from_bcolz(table, chunksize=3).dask))
def check():
    """Smoke-test dd.from_bcolz: contents, column index, key determinism."""
    table = bcolz.ctable(
        [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']],
        names=['x', 'y', 'a'],
    )
    ddf = dd.from_bcolz(table, chunksize=2)

    assert ddf.npartitions == 2
    assert str(ddf.dtypes['a']) == 'category'
    assert list(ddf.x.compute(get=get_sync)) == [1, 2, 3]
    assert list(ddf.a.compute(get=get_sync)) == ['a', 'b', 'a']

    # Index from column 'x' — partition order may differ.
    ddf = dd.from_bcolz(table, chunksize=2, index='x')
    index_values = list(ddf.index.compute(get=get_sync))
    assert index_values in ([1, 2, 3], [1, 3, 2])

    # Key naming: stable for identical inputs, new for a new chunksize.
    assert (sorted(dd.from_bcolz(table, chunksize=2).dask) ==
            sorted(dd.from_bcolz(table, chunksize=2).dask))
    assert (sorted(dd.from_bcolz(table, chunksize=2).dask) !=
            sorted(dd.from_bcolz(table, chunksize=3).dask))
def check(i):
    """Smoke-test dd.from_bcolz (``i`` is unused; kept for the caller's
    map-style invocation)."""
    table = bcolz.ctable(
        [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']],
        names=['x', 'y', 'a'],
    )
    ddf = dd.from_bcolz(table, chunksize=2)

    assert ddf.npartitions == 2
    assert is_categorical_dtype(ddf.dtypes['a'])
    assert list(ddf.x.compute(scheduler='sync')) == [1, 2, 3]
    assert list(ddf.a.compute(scheduler='sync')) == ['a', 'b', 'a']

    # Index from column 'x' — partition order may differ.
    ddf = dd.from_bcolz(table, chunksize=2, index='x')
    index_values = list(ddf.index.compute(scheduler='sync'))
    assert index_values in ([1, 2, 3], [1, 3, 2])

    # Key naming: stable for identical inputs, new for a new chunksize.
    assert (sorted(dd.from_bcolz(table, chunksize=2).dask) ==
            sorted(dd.from_bcolz(table, chunksize=2).dask))
    assert (sorted(dd.from_bcolz(table, chunksize=2).dask) !=
            sorted(dd.from_bcolz(table, chunksize=3).dask))
# Optional positional overrides: argv[5] selects the y column,
# argv[6:] supplies the category list.
if len(sys.argv) > 5:
    p.y = sys.argv[5]
if len(sys.argv) > 6:
    p.categories = sys.argv[6:]

from dask.cache import Cache
Cache(p.cachesize).register()

# Formats that persist categorical dtypes natively.
filetypes_storing_categories = {'parq', 'castra'}

# Reader dispatch table: read[filetype][library] -> callable(filepath, p).
read = odict([(f, odict())
              for f in ["parq", "bcolz", "feather", "castra", "h5", "csv"]])

read["csv"]["dask"] = lambda filepath, p: dd.read_csv(
    filepath, usecols=p.columns)
read["h5"]["dask"] = lambda filepath, p: dd.read_hdf(
    filepath, p.base, chunksize=p.chunksize, columns=p.columns)
read["castra"]["dask"] = lambda filepath, p: dd.from_castra(filepath)
read["bcolz"]["dask"] = lambda filepath, p: dd.from_bcolz(
    filepath, chunksize=1000000)
read["parq"]["dask"] = lambda filepath, p: dd.io.parquet.read_parquet(
    filepath, index=False, categories=p.categories, columns=p.columns)

read["csv"]["pandas"] = lambda filepath, p: pd.read_csv(
    filepath, usecols=p.columns)
read["h5"]["pandas"] = lambda filepath, p: pd.read_hdf(
    filepath, p.base, columns=p.columns)
read["feather"]["pandas"] = lambda filepath, p: feather.read_dataframe(filepath)
read["parq"]["pandas"] = lambda filepath, p: fp.ParquetFile(filepath).to_pandas()

# Writer dispatch table: write[filetype][library] -> callable(df, filepath, p).
write = odict([(f, odict())
               for f in ["parq", "snappy.parq", "gz.parq", "bcolz",
                         "feather", "castra", "h5", "csv"]])

write["csv"]["dask"] = lambda df, filepath, p: df.to_csv(
    filepath.replace(".csv", "*.csv"))
write["h5"]["dask"] = lambda df, filepath, p: df.to_hdf(filepath, p.base)
write["castra"]["dask"] = lambda df, filepath, p: df.to_castra(
    filepath, categories=p.categories)
write["parq"]["dask"] = lambda df, filepath, p: dd.io.parquet.to_parquet(
    filepath, df)  ## **p.parq_opts
write["snappy.parq"]["dask"] = lambda df, filepath, p: dd.io.parquet.to_parquet(
    filepath, df, compression='SNAPPY')  ## **p.parq_opts