Пример #1
0
def test_array(setup_cluster):
    import dask.array as da
    from numpy.core.numeric import array_equal

    x = da.random.random((10000, 10000), chunks=(1000, 1000))
    y = x + x.T
    z = y[::2, 5000:].mean(axis=1)

    dask_res = z.compute()
    assert array_equal(dask_res, z.compute(scheduler=mars_scheduler))
    assert array_equal(dask_res, convert_dask_collection(z).execute().fetch())
Пример #2
0
def test_bag(setup_cluster):
    import dask

    b = dask.datasets.make_people()  # Make records of people
    result = (b.filter(lambda record: record["age"] > 30).map(
        lambda record: record["occupation"]).frequencies(sort=True).topk(
            10, key=1))

    dask_res = result.compute()
    assert dask_res == result.compute(scheduler=mars_scheduler)
    assert dask_res == convert_dask_collection(result).execute().fetch()
Пример #3
0
def test_unpartitioned_dataframe(setup_cluster):
    from dask import dataframe as dd
    from pandas._testing import assert_frame_equal
    import pandas as pd
    from sklearn.datasets import load_boston

    boston = load_boston()
    pd.DataFrame(
        boston.data,
        columns=boston['feature_names']).to_csv("./boston_housing_data.csv")

    df = dd.read_csv(r"./boston_housing_data.csv")
    df["CRIM"] = df["CRIM"] / 2

    dask_res = df.compute()
    assert_frame_equal(dask_res, df.compute(scheduler=mars_scheduler))
    assert_frame_equal(dask_res, convert_dask_collection(df).execute().fetch())
Пример #4
0
def test_partitioned_dataframe(setup_cluster):
    import numpy as np
    import pandas as pd
    from dask import dataframe as dd
    from pandas._testing import assert_frame_equal

    data = np.random.randn(10000, 100)
    df = dd.from_pandas(pd.DataFrame(data,
                                     columns=[f"col{i}" for i in range(100)]),
                        npartitions=4)
    df["col0"] = df["col0"] + df["col1"] / 2
    col2_mean = df["col2"].mean()
    df = df[df["col2"] > col2_mean]

    dask_res = df.compute()
    assert_frame_equal(dask_res,
                       df.compute(scheduler=mars_scheduler),
                       check_index_type=False)
    assert_frame_equal(dask_res,
                       convert_dask_collection(df).execute().fetch(),
                       check_index_type=False)
Пример #5
0
def test_delayed(setup_cluster):
    from dask import delayed
    import numpy as np

    def calc_chunk(n: int, i: int):
        rs = np.random.RandomState(i)
        a = rs.uniform(-1, 1, size=(n, 2))
        d = np.linalg.norm(a, axis=1)
        return (d < 1).sum()

    def calc_pi(fs, N):
        return sum(fs) * 4 / N

    N = 200_000_000
    n = 10_000_000

    fs = [delayed(calc_chunk)(n, i) for i in range(N // n)]
    pi = delayed(calc_pi)(fs, N)

    dask_res = pi.compute()
    assert dask_res == pi.compute(scheduler=mars_scheduler)
    assert dask_res == convert_dask_collection(pi).execute().fetch()
Пример #6
0
def test_dask_errors():
    with pytest.raises(TypeError):
        convert_dask_collection({"foo": 0, "bar": 1})