def test_map_method():
    # Two equal-length bags plus their concrete values for reference results.
    b = db.from_sequence(range(100), npartitions=10)
    b2 = db.from_sequence(range(100, 200), npartitions=10)
    x = b.compute()
    x2 = b2.compute()

    def myadd(a, b=2, c=3):
        return a + b + c

    # Positional, bag, scalar, and keyword arguments to Bag.map.
    assert b.map(myadd).compute() == list(map(myadd, x))
    assert b.map(myadd, b2).compute() == list(map(myadd, x, x2))
    assert b.map(myadd, 10).compute() == [myadd(i, 10) for i in x]
    assert b.map(myadd, b=10).compute() == [myadd(i, b=10) for i in x]
    expected = [myadd(i, j, 10) for (i, j) in zip(x, x2)]
    assert b.map(myadd, b2, c=10).compute() == expected

    # A reduced (Item) argument is broadcast to every element.
    x_sum = sum(x)
    expected = [myadd(i, x_sum, 10) for i in x]
    assert b.map(myadd, b.sum(), c=10).compute() == expected

    # check that map works with multiarg functions. Can be removed after
    # deprecated behavior is removed
    assert b.map(add, b2).compute() == list(map(add, x, x2))

    # check that map works with vararg functions. Can be removed after
    # deprecated behavior is removed
    def vararg_inc(*args):
        return inc(*args)

    assert b.map(vararg_inc).compute(get=dask.get) == list(map(inc, x))
def test_map_method():
    bag_a = db.from_sequence(range(100), npartitions=10)
    bag_b = db.from_sequence(range(100, 200), npartitions=10)
    vals_a = bag_a.compute()
    vals_b = bag_b.compute()

    def myadd(a, b=2, c=3):
        return a + b + c

    # Exercise Bag.map with no extra args, a second bag, a scalar,
    # a keyword, and mixed positional/keyword arguments.
    assert bag_a.map(myadd).compute() == list(map(myadd, vals_a))
    assert bag_a.map(myadd, bag_b).compute() == list(map(myadd, vals_a, vals_b))
    assert bag_a.map(myadd, 10).compute() == [myadd(i, 10) for i in vals_a]
    assert bag_a.map(myadd, b=10).compute() == [myadd(i, b=10) for i in vals_a]
    assert (bag_a.map(myadd, bag_b, c=10).compute() ==
            [myadd(i, j, 10) for (i, j) in zip(vals_a, vals_b)])

    # An Item (here bag_a.sum()) is broadcast across all elements.
    total = sum(vals_a)
    assert (bag_a.map(myadd, bag_a.sum(), c=10).compute() ==
            [myadd(i, total, 10) for i in vals_a])

    # check that map works with multiarg functions. Can be removed after
    # deprecated behavior is removed
    assert bag_a.map(add, bag_b).compute() == list(map(add, vals_a, vals_b))

    # check that map works with vararg functions. Can be removed after
    # deprecated behavior is removed
    def vararg_inc(*args):
        return inc(*args)

    assert bag_a.map(vararg_inc).compute(get=dask.get) == list(map(inc, vals_a))
def test_from_filenames_large():
    # Write 100 identical lines, then read them back both chunked and whole;
    # the contents must agree regardless of chunking.
    with tmpfile() as fn:
        with open(fn, 'wb') as f:
            f.write(('Hello, world!' + os.linesep).encode() * 100)
        chunked = db.from_filenames(fn, chunkbytes=100)
        whole = db.from_filenames(fn)
        # A small chunkbytes forces the file into many partitions.
        assert len(chunked.dask) > 5
        assert list(map(str, chunked)) == list(map(str, whole))
        # Passing a list of filenames behaves the same as a single name.
        listed = db.from_filenames([fn], chunkbytes=100)
        assert list(chunked) == list(listed)
def test_read_text_large():
    # read_text with a small blocksize must yield the same stripped lines
    # as reading the whole file in one block.
    with tmpfile() as fn:
        with open(fn, 'wb') as f:
            f.write(('Hello, world!' + os.linesep).encode() * 100)
        blocked = db.read_text(fn, blocksize=100)
        unblocked = db.read_text(fn)
        # blocksize=100 splits the file into several partitions.
        assert len(blocked.dask) > 5
        assert (list(map(str, blocked.str.strip())) ==
                list(map(str, unblocked.str.strip())))
        # A list argument is equivalent to a single filename.
        listed = db.read_text([fn], blocksize=100)
        assert list(blocked) == list(listed)
def test_read_text_encoding():
    # Blocked and unblocked reads of a gb18030-encoded file must decode
    # to the same text.
    with tmpfile() as fn:
        with open(fn, 'wb') as f:
            f.write((u'你好!' + os.linesep).encode('gb18030') * 100)
        blocked = db.read_text(fn, blocksize=100, encoding='gb18030')
        unblocked = db.read_text(fn, encoding='gb18030')
        assert len(blocked.dask) > 5
        # Compare utf-8 re-encodings so the assertion is byte-exact.
        assert (list(map(lambda x: x.encode('utf-8'), blocked)) ==
                list(map(lambda x: x.encode('utf-8'), unblocked)))
        listed = db.read_text([fn], blocksize=100, encoding='gb18030')
        assert list(blocked) == list(listed)
def test_from_filenames_encoding():
    # from_filenames with chunkbytes must decode gb18030 text identically
    # to an unchunked read.
    with tmpfile() as fn:
        with open(fn, 'wb') as f:
            f.write((u'你好!' + os.linesep).encode('gb18030') * 100)
        chunked = db.from_filenames(fn, chunkbytes=100, encoding='gb18030')
        whole = db.from_filenames(fn, encoding='gb18030')
        assert len(chunked.dask) > 5
        # Re-encode to utf-8 bytes for an exact comparison.
        assert (list(map(lambda x: x.encode('utf-8'), chunked)) ==
                list(map(lambda x: x.encode('utf-8'), whole)))
        listed = db.from_filenames([fn], chunkbytes=100, encoding='gb18030')
        assert list(chunked) == list(listed)
def test_read_text_large():
    # Chunked and unchunked read_text of the same file must agree.
    with tmpfile() as fn:
        with open(fn, 'wb') as f:
            f.write(('Hello, world!' + os.linesep).encode() * 100)
        blocked = db.read_text(fn, blocksize=100)
        unblocked = db.read_text(fn)
        # Small blocksize implies multiple partitions in the graph.
        assert len(blocked.dask) > 5
        assert list(map(str, blocked)) == list(map(str, unblocked))
        # A one-element filename list is equivalent to the bare filename.
        listed = db.read_text([fn], blocksize=100)
        assert list(blocked) == list(listed)
def test_bag_map():
    # Two aligned bags plus their concrete values for reference results.
    b = db.from_sequence(range(100), npartitions=10)
    b2 = db.from_sequence(range(100, 200), npartitions=10)
    x = b.compute()
    x2 = b2.compute()

    def myadd(a=1, b=2, c=3):
        return a + b + c

    # db.map with bags passed positionally and by keyword.
    assert db.map(myadd, b).compute() == list(map(myadd, x))
    assert db.map(myadd, a=b).compute() == list(map(myadd, x))
    assert db.map(myadd, b, b2).compute() == list(map(myadd, x, x2))
    assert db.map(myadd, b, 10).compute() == [myadd(i, 10) for i in x]
    assert db.map(myadd, 10, b=b).compute() == [myadd(10, b=i) for i in x]

    sol = [myadd(i, b=j, c=100) for (i, j) in zip(x, x2)]
    assert db.map(myadd, b, b=b2, c=100).compute() == sol

    # Only one bag supplied here, so iterate x directly (x and x2 have
    # equal length; the original zip added nothing but an unused variable).
    sol = [myadd(i, c=100) for i in x]
    assert db.map(myadd, b, c=100).compute() == sol

    # Items (reductions) are broadcast to every element.
    x_sum = sum(x)
    sol = [myadd(x_sum, b=i, c=100) for i in x2]
    assert db.map(myadd, b.sum(), b=b2, c=100).compute() == sol

    sol = [myadd(i, b=x_sum, c=100) for i in x2]
    assert db.map(myadd, b2, b.sum(), c=100).compute() == sol

    sol = [myadd(a=100, b=x_sum, c=i) for i in x2]
    assert db.map(myadd, a=100, b=b.sum(), c=b2).compute() == sol

    # Delayed values are broadcast just like Items.
    a = dask.delayed(10)
    assert db.map(myadd, b, a).compute() == [myadd(i, 10) for i in x]
    assert db.map(myadd, b, b=a).compute() == [myadd(i, b=10) for i in x]

    # Mismatched npartitions
    fewer_parts = db.from_sequence(range(100), npartitions=5)
    with pytest.raises(ValueError):
        db.map(myadd, b, fewer_parts)

    # No bags
    with pytest.raises(ValueError):
        db.map(myadd, b.sum(), 1, 2)

    # Unequal partitioning
    unequal = db.from_sequence(range(110), npartitions=10)
    with pytest.raises(ValueError):
        db.map(myadd, b, unequal, c=b2).compute()
    with pytest.raises(ValueError):
        db.map(myadd, b, b=unequal, c=b2).compute()
def test_map_keynames():
    bag = db.from_sequence([1, 2, 3])
    graph = dict(bag.map(inc).__dask_graph__())
    # Keys produced by Bag.map are named after the mapped function.
    assert "inc" in map(dask.utils.key_split, graph)
    # map and map_partitions must produce distinct key namespaces.
    map_keys = set(bag.map(inc).__dask_graph__())
    part_keys = set(bag.map_partitions(inc).__dask_graph__())
    assert map_keys != part_keys
def test_map_method():
    b = db.from_sequence(range(100), npartitions=10)
    b2 = db.from_sequence(range(100, 200), npartitions=10)
    x = b.compute()
    x2 = b2.compute()

    def myadd(a, b=2, c=3):
        return a + b + c

    # No extra arguments.
    assert b.map(myadd).compute() == list(map(myadd, x))
    # Second bag supplied positionally.
    assert b.map(myadd, b2).compute() == list(map(myadd, x, x2))
    # Scalar positional and keyword arguments.
    assert b.map(myadd, 10).compute() == [myadd(i, 10) for i in x]
    assert b.map(myadd, b=10).compute() == [myadd(i, b=10) for i in x]
    # Bag positional plus keyword together.
    expected = [myadd(i, j, 10) for (i, j) in zip(x, x2)]
    assert b.map(myadd, b2, c=10).compute() == expected
    # An Item argument is broadcast to every element.
    x_sum = sum(x)
    expected = [myadd(i, x_sum, 10) for i in x]
    assert b.map(myadd, b.sum(), c=10).compute() == expected
def test_map_method():
    bag = db.from_sequence(range(100), npartitions=10)
    other = db.from_sequence(range(100, 200), npartitions=10)
    vals = bag.compute()
    other_vals = other.compute()

    def myadd(a, b=2, c=3):
        return a + b + c

    # Bag.map with no args, a bag, a scalar, and keyword arguments.
    assert bag.map(myadd).compute() == list(map(myadd, vals))
    assert bag.map(myadd, other).compute() == list(map(myadd, vals, other_vals))
    assert bag.map(myadd, 10).compute() == [myadd(i, 10) for i in vals]
    assert bag.map(myadd, b=10).compute() == [myadd(i, b=10) for i in vals]
    sol = [myadd(i, j, 10) for (i, j) in zip(vals, other_vals)]
    assert bag.map(myadd, other, c=10).compute() == sol
    # A reduction (Item) broadcasts across elements.
    total = sum(vals)
    sol = [myadd(i, total, 10) for i in vals]
    assert bag.map(myadd, bag.sum(), c=10).compute() == sol
def test_from_castra():
    # Optional dependencies: skip the test when any is unavailable.
    castra = pytest.importorskip('castra')
    pd = pytest.importorskip('pandas')
    dd = pytest.importorskip('dask.dataframe')

    df = pd.DataFrame({'x': list(range(100)),
                       'y': [str(i) for i in range(100)]})
    frame = dd.from_pandas(df, 10)
    c = frame.to_castra()

    # Exercise the column / index / npartitions options of from_castra.
    default = db.from_castra(c)
    with_columns = db.from_castra(c, 'x')
    with_index = db.from_castra(c, 'x', index=True)
    with_nparts = db.from_castra(c, 'x', npartitions=4)
    try:
        assert list(default) == list(zip(range(100), map(str, range(100))))
        assert list(with_columns) == list(range(100))
        assert list(with_index) == list(zip(range(100), range(100)))
        assert with_nparts.npartitions == 4
        assert list(with_nparts) == list(range(100))
    finally:
        # Always remove the on-disk castra, even if an assertion fails.
        c.drop()
def test_from_castra():
    # All three packages are optional; bail out gracefully if missing.
    castra = pytest.importorskip('castra')
    pd = pytest.importorskip('pandas')
    dd = pytest.importorskip('dask.dataframe')

    source = pd.DataFrame({'x': list(range(100)),
                           'y': [str(i) for i in range(100)]})
    ddf = dd.from_pandas(source, 10)
    store = ddf.to_castra()

    bag_all = db.from_castra(store)
    bag_col = db.from_castra(store, 'x')
    bag_idx = db.from_castra(store, 'x', index=True)
    bag_np = db.from_castra(store, 'x', npartitions=4)
    try:
        # Default: tuples of all columns per row.
        assert list(bag_all) == list(zip(range(100), map(str, range(100))))
        # Single column: bare values.
        assert list(bag_col) == list(range(100))
        # index=True pairs each value with its index.
        assert list(bag_idx) == list(zip(range(100), range(100)))
        # Explicit repartitioning keeps the data intact.
        assert bag_np.npartitions == 4
        assert list(bag_np) == list(range(100))
    finally:
        # Clean up the on-disk store regardless of assertion outcomes.
        store.drop()
def test_map():
    # `b` is the module-level fixture bag.
    mapped = b.map(inc)
    assert mapped.compute() == list(map(inc, b.compute()))
    # Identical map calls must be deterministic in their key names.
    assert mapped.name == b.map(inc).name
def test_bz2_stream():
    # Compress 10000 newline-joined integers and check that streaming
    # decompression reproduces the first 100 lines (newline included).
    payload = '\n'.join(map(str, range(10000)))
    compressed = bz2.compress(payload.encode())
    expected = list(map(lambda x: str(x) + '\n', range(100)))
    assert list(take(100, bz2_stream(compressed))) == expected
def test_map_is_lazy():
    # dask.bag.core.map must return a lazy iterator, not a list.
    from dask.bag.core import map
    result = map(lambda x: x, [1, 2, 3])
    assert isinstance(result, Iterator)
def test_map_keynames():
    bag = db.from_sequence([1, 2, 3])
    # Task keys from Bag.map should carry the mapped function's name.
    graph = dict(bag.map(inc).__dask_graph__())
    assert 'inc' in map(dask.utils.key_split, graph)
    # map and map_partitions generate different key sets.
    assert (set(bag.map(inc).__dask_graph__()) !=
            set(bag.map_partitions(inc).__dask_graph__()))