def test_map_method():
    """Check ``Bag.map`` against plain Python mapping for several argument styles.

    Extra arguments are passed as plain values, as another bag, and as a
    reduction result (``sum()`` of the bag), both positionally and by keyword.
    """
    left = db.from_sequence(range(100), npartitions=10)
    right = db.from_sequence(range(100, 200), npartitions=10)
    left_items = left.compute()
    right_items = right.compute()

    def myadd(a, b=2, c=3):
        return a + b + c

    # Function applied to the bag's own elements only.
    assert left.map(myadd).compute() == [myadd(i) for i in left_items]
    # A second bag is paired element-wise with the first.
    assert left.map(myadd, right).compute() == [
        myadd(i, j) for i, j in zip(left_items, right_items)
    ]
    # Plain values are handed to every call, positionally or by keyword.
    assert left.map(myadd, 10).compute() == [myadd(i, 10) for i in left_items]
    assert left.map(myadd, b=10).compute() == [myadd(i, b=10) for i in left_items]
    assert left.map(myadd, right, c=10).compute() == [
        myadd(i, j, 10) for i, j in zip(left_items, right_items)
    ]

    # A reduction result is expected to behave like its computed value.
    total = sum(left_items)
    assert left.map(myadd, left.sum(), c=10).compute() == [
        myadd(i, total, 10) for i in left_items
    ]

    # check that map works with multiarg functions. Can be removed after
    # deprecated behavior is removed
    assert left.map(add, right).compute() == [
        add(i, j) for i, j in zip(left_items, right_items)
    ]

    # check that map works with vararg functions. Can be removed after
    # deprecated behavior is removed
    def vararg_inc(*args):
        return inc(*args)

    assert_eq(left.map(vararg_inc), [inc(i) for i in left_items])
def test_join(transform):
    """Join the outer bag ``b`` with ``transform([1, 2, 3])`` and compare to ``join``.

    ``transform`` is supplied by the caller (presumably a pytest
    parametrization that is not visible here) and wraps the right-hand data.
    """
    other = transform([1, 2, 3])

    # Separate key functions for each side.
    joined = b.join(other, on_self=isodd, on_other=iseven)
    expected = list(join(iseven, [1, 2, 3], isodd, list(b)))
    assert_eq(joined, expected)

    # A single key function given positionally; the reference uses it for both sides.
    same_key = b.join(other, isodd)
    same_key_expected = list(join(isodd, [1, 2, 3], isodd, list(b)))
    assert_eq(same_key, same_key_expected)

    # Building the same join again must produce the same name.
    rebuilt = b.join(other, on_self=isodd, on_other=iseven)
    assert joined.name == rebuilt.name
def test_fold_bag():
    """Fold the outer bag ``b`` into sets and request a ``Bag`` via ``out_type``."""

    def binop(seen, item):
        # Accumulate in place and hand the same set back to the fold.
        seen.add(item)
        return seen

    folded = b.fold(binop, combine=set.union, initial=set(), out_type=Bag)

    assert isinstance(folded, Bag)
    expected = list(set(range(5)))
    assert_eq(folded, expected)
def test_repartition_npartitions(nin, nout):
    """Repartition a 100-element bag from ``nin`` to ``nout`` partitions.

    ``nin`` and ``nout`` are supplied by the caller (presumably a pytest
    parametrization that is not visible here).
    """
    source = db.from_sequence(range(100), npartitions=nin)
    repartitioned = source.repartition(npartitions=nout)

    assert repartitioned.npartitions == nout
    assert_eq(source, repartitioned)

    # Each computed partition of the result must be truthy.
    partitions = dask.get(repartitioned.dask, repartitioned.__dask_keys__())
    assert all(partitions)
def test_map_method():
    """Compare ``Bag.map`` results with the builtin ``map`` on the computed data.

    Exercises extra arguments given as constants, as a second bag, and as the
    result of ``sum()`` on the bag, positionally and by keyword.
    """
    bag = db.from_sequence(range(100), npartitions=10)
    bag2 = db.from_sequence(range(100, 200), npartitions=10)
    x = bag.compute()
    x2 = bag2.compute()

    def myadd(a, b=2, c=3):
        return a + b + c

    result = bag.map(myadd).compute()
    assert result == list(map(myadd, x))

    result = bag.map(myadd, bag2).compute()
    assert result == list(map(myadd, x, x2))

    result = bag.map(myadd, 10).compute()
    assert result == [myadd(i, 10) for i in x]

    result = bag.map(myadd, b=10).compute()
    assert result == [myadd(i, b=10) for i in x]

    result = bag.map(myadd, bag2, c=10).compute()
    assert result == [myadd(i, j, 10) for i, j in zip(x, x2)]

    x_sum = sum(x)
    result = bag.map(myadd, bag.sum(), c=10).compute()
    assert result == [myadd(i, x_sum, 10) for i in x]

    # check that map works with multiarg functions. Can be removed after
    # deprecated behavior is removed
    result = bag.map(add, bag2).compute()
    assert result == list(map(add, x, x2))

    # check that map works with vararg functions. Can be removed after
    # deprecated behavior is removed
    def vararg_inc(*args):
        return inc(*args)

    assert_eq(bag.map(vararg_inc), list(map(inc, x)))
def test_bagged_array_delayed():
    """Build a bag from the first delayed chunk of a dask array of ones."""
    da = pytest.importorskip("dask.array")

    # Ten ones in chunks of five; only the first delayed chunk is used.
    delayed_chunks = da.ones(10, chunks=5).to_delayed()
    first_chunk = delayed_chunks[0]

    bag = db.from_delayed(first_chunk)
    computed = bag.compute()
    assert_eq(computed, [1.0] * 5)
def test_non_splittable_reductions(npartitions):
    """Compare ``Bag.mean`` and ``Bag.std`` with numpy on ``range(100)``.

    ``npartitions`` is supplied by the caller (presumably a pytest
    parametrization that is not visible here).
    """
    np = pytest.importorskip("numpy")
    values = list(range(100))
    bag = db.from_sequence(values, npartitions=npartitions)

    # Pair each bag reduction with its numpy reference implementation.
    for reduction, reference in ((bag.mean, np.mean), (bag.std, np.std)):
        assert_eq(reduction(), reference(values))
def test_non_splittable_reductions(npartitions):
    """Check the bag's ``mean`` and ``std`` against numpy's for ``range(100)``.

    ``npartitions`` comes from the caller (presumably a pytest parametrization
    outside this view).
    """
    np = pytest.importorskip("numpy")

    numbers = list(range(100))
    bag = db.from_sequence(numbers, npartitions=npartitions)

    bag_mean = bag.mean()
    assert_eq(bag_mean, np.mean(numbers))

    bag_std = bag.std()
    assert_eq(bag_std, np.std(numbers))
def test_reduction_empty_aggregate(npartitions):
    """Reduce a bag whose falsy items are filtered out.

    With one truthy element left, ``min``/``max`` must both give 1; with none
    left, computing ``min`` must raise ``ValueError``.
    """
    truthy = db.from_sequence([0, 0, 0, 1], npartitions=npartitions).filter(None)
    assert_eq(truthy.min(split_every=2), 1)

    smallest = truthy.min(split_every=2)
    largest = truthy.max(split_every=2)
    vals = db.compute(smallest, largest, scheduler="sync")
    assert vals == (1, 1)

    with pytest.raises(ValueError):
        all_zero = db.from_sequence([0, 0, 0, 0], npartitions=npartitions)
        all_zero.filter(None).min(split_every=2).compute(scheduler="sync")
def test_repartition(nin, nout):
    """Change a 100-element bag from ``nin`` to ``nout`` partitions.

    ``nin`` and ``nout`` come from the caller (presumably a pytest
    parametrization outside this view).
    """
    original = db.from_sequence(range(100), npartitions=nin)
    resized = original.repartition(npartitions=nout)

    assert resized.npartitions == nout
    assert_eq(original, resized)

    # Run the graph directly and require every partition result to be truthy.
    keys = resized.__dask_keys__()
    results = dask.get(resized.dask, keys)
    assert all(results)
def test_reduction_empty_aggregate(npartitions):
    """Check ``min``/``max`` with ``split_every=2`` after filtering out falsy items.

    The first bag keeps a single element (1); the second keeps none, and
    computing its ``min`` is expected to raise ``ValueError``.
    """
    source = db.from_sequence([0, 0, 0, 1], npartitions=npartitions)
    kept = source.filter(None)
    assert_eq(kept.min(split_every=2), 1)

    vals = db.compute(
        kept.min(split_every=2),
        kept.max(split_every=2),
        scheduler="sync",
    )
    assert vals == (1, 1)

    with pytest.raises(ValueError):
        zeros = db.from_sequence([0, 0, 0, 0], npartitions=npartitions)
        nothing_left = zeros.filter(None)
        nothing_left.min(split_every=2).compute(scheduler="sync")
def test_map_partitions_arg():
    """Pass an extra argument to ``map_partitions``, plain and as ``dask.delayed``."""

    def append_str(partition, s):
        suffixed = []
        for item in partition:
            suffixed.append(item + s)
        return suffixed

    mybag = db.from_sequence(["a", "b", "c"])
    expected = ["afoo", "bfoo", "cfoo"]

    # Plain string argument.
    assert_eq(mybag.map_partitions(append_str, "foo"), expected)

    # Same argument wrapped in a delayed object.
    delayed_suffix = dask.delayed("foo")
    assert_eq(mybag.map_partitions(append_str, delayed_suffix), expected)
def test_map_partitions_arg():
    """Check ``map_partitions`` with one extra positional argument.

    The argument is given once as a plain string and once as a
    ``dask.delayed`` value; both must give the same result.
    """

    def append_str(partition, s):
        return [element + s for element in partition]

    letters = db.from_sequence(["a", "b", "c"])

    with_plain = letters.map_partitions(append_str, "foo")
    assert_eq(with_plain, ["afoo", "bfoo", "cfoo"])

    with_delayed = letters.map_partitions(append_str, dask.delayed("foo"))
    assert_eq(with_delayed, ["afoo", "bfoo", "cfoo"])
def test_repartition_partition_size_complex_dtypes():
    """Repartition a bag of numpy arrays by ``partition_size``.

    The target size is a quarter of the measured total, so at least four
    partitions are expected.
    """
    np = pytest.importorskip("numpy")

    arrays = [np.array(range(100)) for _ in range(4)]
    single = db.from_sequence(arrays, npartitions=1)

    # Sum the per-partition values reported by ``total_mem_usage``.
    per_partition = single.map_partitions(total_mem_usage).compute()
    total_mem = sum(per_partition)

    resized = single.repartition(partition_size=total_mem // 4)
    assert resized.npartitions >= 4
    assert_eq(single, resized)
def test_multiple_repartition_partition_size():
    """Repartition by ``partition_size`` twice in a row with shrinking sizes."""
    single = db.from_sequence(range(1, 100), npartitions=1)

    # Sum the per-partition values reported by ``total_mem_usage``.
    per_partition = single.map_partitions(total_mem_usage).compute()
    total_mem = sum(per_partition)

    # First pass: half the total per partition.
    halves = single.repartition(partition_size=total_mem // 2)
    assert halves.npartitions >= 2
    assert_eq(single, halves)

    # Second pass starts from the already repartitioned bag.
    fifths = halves.repartition(partition_size=total_mem // 5)
    assert fifths.npartitions >= 5
    assert_eq(halves, fifths)
def test_frequencies():
    """Check ``frequencies`` results, graph names, and empty-partition handling.

    Uses the outer bag ``b`` and expects each of 0..4 to be counted three times.
    """
    expected = {0: 3, 1: 3, 2: 3, 3: 3, 4: 3}

    default_freq = b.frequencies()
    assert dict(default_freq) == expected

    split_freq = b.frequencies(split_every=2)
    assert dict(split_freq) == expected

    # Same arguments give the same name; a different ``split_every`` does not.
    assert default_freq.name == b.frequencies().name
    assert default_freq.name != split_freq.name
    assert split_freq.name == b.frequencies(split_every=2).name

    # test bag with empty partitions
    small = db.from_sequence(range(20), partition_size=2)
    small = small.filter(lambda x: x < 10)
    small_freq = small.frequencies()
    assert dict(small_freq) == {value: 1 for value in range(10)}

    # Every element is filtered out, so no frequencies remain.
    zeros = db.from_sequence([0, 0, 0, 0], npartitions=4)
    no_counts = zeros.filter(None).frequencies(split_every=2)
    assert_eq(no_counts, [])
def test_reductions_are_lazy():
    """Check how far a generator partition has advanced when the reducer runs.

    The per-partition function asserts that the last value yielded so far is
    0, i.e. the generator has not been consumed beyond its first element.
    """
    latest = [None]  # one-slot list so the nested generator can update it

    def part():
        for value in range(10):
            latest[0] = value
            yield value

    def func(part):
        assert latest[0] == 0
        return sum(part)

    # Hand-built bag: one partition whose data is the live generator.
    graph = {("foo", 0): part()}
    lazy_bag = Bag(graph, "foo", 1)

    total = lazy_bag.reduction(func, sum)
    assert_eq(total, sum(range(10)))
def test_reductions_are_lazy():
    """Reduce over a generator-backed partition and check it is not pre-consumed.

    When the per-partition function is called, the generator must have
    yielded only its first value (0).
    """
    last_yielded = None

    def part():
        nonlocal last_yielded
        for i in range(10):
            last_yielded = i
            yield i

    def func(part):
        assert last_yielded == 0
        return sum(part)

    # Single-partition bag built directly from a graph holding the generator.
    bag = Bag({("foo", 0): part()}, "foo", 1)
    reduced = bag.reduction(func, sum)

    assert_eq(reduced, sum(range(10)))
def test_var():
    """Check that ``var()`` of the outer bag ``b`` is 2.0, lazily and via ``float``."""
    expected = 2.0
    assert_eq(b.var(), expected)
    # Converting the lazy result with ``float`` must give the same number.
    as_float = float(b.var())
    assert as_float == expected
def test_empty_bag():
    """Check ``all``, ``any``, ``sum`` and ``count`` on a mapped empty bag."""
    empty = db.from_sequence([])

    # Boolean reductions over no elements.
    assert_eq(empty.map(inc).all(), True)
    assert_eq(empty.map(inc).any(), False)

    # sum/count are numeric results: compare with 0 directly rather than with
    # ``False``, which only matched because ``False == 0`` in Python.
    assert_eq(empty.map(inc).sum(), 0)
    assert_eq(empty.map(inc).count(), 0)
def test_bag_with_single_callable():
    """A bag whose only element is a callable must return that callable as data."""

    def f():
        return None

    single = db.from_sequence([f])
    assert_eq(single, [f])
def test_reduction_with_non_comparable_objects():
    """Fold ``StrictReal`` items with ``max`` and expect ``StrictReal(9)``."""
    items = list(map(StrictReal, range(10)))
    bag = db.from_sequence(items, partition_size=2)

    largest = bag.fold(max, max)
    assert_eq(largest, StrictReal(9))
def test_std():
    """Check that ``std()`` of the outer bag ``b`` is ``sqrt(2.0)``."""
    expected = math.sqrt(2.0)
    assert_eq(b.std(), expected)
    # Converting the lazy result with ``float`` must give the same number.
    as_float = float(b.std())
    assert as_float == math.sqrt(2.0)
def test_repartition_partition_size(nin, nout):
    """Repartition by ``partition_size`` set to ``1/nout`` of the measured total.

    ``nin`` and ``nout`` come from the caller (presumably a pytest
    parametrization outside this view).
    """
    source = db.from_sequence(range(1, 100), npartitions=nin)

    # Sum the per-partition values reported by ``total_mem_usage``.
    per_partition = source.map_partitions(total_mem_usage).compute()
    total_mem = sum(per_partition)

    target_size = total_mem // nout
    resized = source.repartition(partition_size=target_size)

    assert resized.npartitions >= nout
    assert_eq(source, resized)
def test_reduction_empty():
    """Take ``max``/``min`` of even numbers when there are more partitions than items.

    Ten elements are requested in 100 partitions and then filtered to evens.
    """

    def is_even(x):
        return x % 2 == 0

    sparse = db.from_sequence(range(10), npartitions=100)

    assert_eq(sparse.filter(is_even).max(), 8)
    assert_eq(sparse.filter(is_even).min(), 0)
def test_map_partitions_args_kwargs():
    """Check ``map_partitions`` with a second argument of several kinds.

    The second argument is a constant, another bag, a reduction result, and a
    ``dask.delayed`` wrapper of that result, each positionally and by keyword.
    """
    x = [random.randint(-100, 100) for _ in range(100)]
    y = [random.randint(-100, 100) for _ in range(100)]

    dx = db.from_sequence(x, npartitions=10)
    dy = db.from_sequence(y, npartitions=10)

    def maximum(x, y=0):
        # An int ``y`` is repeated so it can be zipped against every element.
        if isinstance(y, int):
            y = repeat(y)
        return [max(pair) for pair in zip(x, y)]

    # Constant second argument.
    sol = maximum(x, y=10)
    assert_eq(db.map_partitions(maximum, dx, y=10), sol)
    assert_eq(dx.map_partitions(maximum, y=10), sol)
    assert_eq(dx.map_partitions(maximum, 10), sol)

    # Second argument is another bag.
    sol = maximum(x, y)
    assert_eq(db.map_partitions(maximum, dx, dy), sol)
    assert_eq(dx.map_partitions(maximum, y=dy), sol)
    assert_eq(dx.map_partitions(maximum, dy), sol)

    # Second argument is the result of a reduction.
    dy_mean = dy.mean().apply(int)
    sol = maximum(x, int(sum(y) / len(y)))
    assert_eq(dx.map_partitions(maximum, y=dy_mean), sol)
    assert_eq(dx.map_partitions(maximum, dy_mean), sol)

    # Same reduction result, wrapped in ``dask.delayed``.
    dy_mean = dask.delayed(dy_mean)
    assert_eq(dx.map_partitions(maximum, y=dy_mean), sol)
    assert_eq(dx.map_partitions(maximum, dy_mean), sol)
def test_distinct_with_key():
    """Check ``distinct`` with ``key`` given as a string and as a callable."""

    def get_a(record):
        return record["a"]

    values = [0, 1, 2, 1, 2, 3, 2, 3, 4, 5]
    seq = [{"a": value} for value in values]
    bag = db.from_sequence(seq, npartitions=3)

    # Reference result from ``unique`` using the same key.
    expected = list(unique(seq, key=get_a))

    assert_eq(bag.distinct(key="a"), expected)
    assert_eq(bag.distinct(key=get_a), expected)
def test_aggregation(npartitions):
    """Check ``mean``, ``sum`` and ``count`` of ``db.range(15)`` against a list.

    ``npartitions`` comes from the caller (presumably a pytest parametrization
    outside this view).
    """
    reference = list(range(15))
    bag = db.range(15, npartitions=npartitions)

    total = sum(reference)
    size = len(reference)

    assert_eq(bag.mean(), total / size)
    assert_eq(bag.sum(), total)
    assert_eq(bag.count(), size)
def test_bag_map():
    """Check the module-level ``db.map`` with bags, constants and lazy values.

    Arguments are passed positionally and by keyword. The final cases expect
    ``ValueError`` for unusable combinations of arguments.
    """
    first = db.from_sequence(range(100), npartitions=10)
    second = db.from_sequence(range(100, 200), npartitions=10)
    xs = first.compute()
    ys = second.compute()

    def myadd(a=1, b=2, c=3):
        return a + b + c

    # Bags as positional or keyword arguments, mixed with constants.
    assert_eq(db.map(myadd, first), [myadd(i) for i in xs])
    assert_eq(db.map(myadd, a=first), [myadd(i) for i in xs])
    assert_eq(db.map(myadd, first, second), [myadd(i, j) for i, j in zip(xs, ys)])
    assert_eq(db.map(myadd, first, 10), [myadd(i, 10) for i in xs])
    assert_eq(db.map(myadd, 10, b=first), [myadd(10, b=i) for i in xs])

    expected = [myadd(i, b=j, c=100) for i, j in zip(xs, ys)]
    assert_eq(db.map(myadd, first, b=second, c=100), expected)

    expected = [myadd(i, c=100) for i in xs]
    assert_eq(db.map(myadd, first, c=100), expected)

    # A reduction result used in each argument position.
    total = sum(xs)
    expected = [myadd(total, b=j, c=100) for j in ys]
    assert_eq(db.map(myadd, first.sum(), b=second, c=100), expected)

    expected = [myadd(j, b=total, c=100) for j in ys]
    assert_eq(db.map(myadd, second, first.sum(), c=100), expected)

    expected = [myadd(a=100, b=total, c=j) for j in ys]
    assert_eq(db.map(myadd, a=100, b=first.sum(), c=second), expected)

    # A ``dask.delayed`` constant, positionally and by keyword.
    ten = dask.delayed(10)
    assert_eq(db.map(myadd, first, ten), [myadd(i, 10) for i in xs])
    assert_eq(db.map(myadd, first, b=ten), [myadd(i, b=10) for i in xs])

    # Mismatched npartitions
    fewer_parts = db.from_sequence(range(100), npartitions=5)
    with pytest.raises(ValueError):
        db.map(myadd, first, fewer_parts)

    # No bags
    with pytest.raises(ValueError):
        db.map(myadd, first.sum(), 1, 2)

    # Unequal partitioning
    unequal = db.from_sequence(range(110), npartitions=10)
    with pytest.raises(ValueError):
        db.map(myadd, first, unequal, c=second).compute()
    with pytest.raises(ValueError):
        db.map(myadd, first, b=unequal, c=second).compute()