Example #1
def test_set_index_does_not_repeat_work_due_to_optimizations(npartitions):
    # Atomic counter
    count = itertools.count()

    def increment():
        next(count)

    def make_part(dummy, n):
        return pd.DataFrame({
            "x": np.random.random(n),
            "y": np.random.random(n)
        })

    nbytes = 1e6
    nparts = 50
    n = int(nbytes / (nparts * 8))

    dsk = {("inc", i): (increment, ) for i in range(nparts)}
    dsk.update({("x", i): (make_part, ("inc", i), n) for i in range(nparts)})
    ddf = dd.DataFrame(dsk, "x", make_part(None, 1), [None] * (nparts + 1))

    ddf.set_index("x", npartitions=npartitions)
    ntimes = next(count)
    assert ntimes == nparts
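
All of the snippets on this page feed the low-level dd.DataFrame(dsk, name, meta, divisions) constructor, where dsk maps (name, i) keys to per-partition frames, meta describes the column dtypes, and divisions gives the index boundaries (or None when unknown). A minimal, self-contained sketch of that pattern, assuming a dask version that still exposes this legacy constructor (pre dask-expr):

import pandas as pd
import dask.dataframe as dd

parts = [pd.DataFrame({"x": [1.0, 2.0], "y": [3.0, 4.0]}) for _ in range(3)]
dsk = {("demo", i): part for i, part in enumerate(parts)}   # one key per partition
meta = parts[0].iloc[:0]                                    # empty frame carrying the dtypes
ddf = dd.DataFrame(dsk, "demo", meta, [None] * (len(parts) + 1))  # unknown divisions
print(ddf.npartitions)  # 3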
Example #2
def test_set_index():
    dsk = {
        ('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 2, 6]}, index=[0, 1, 3]),
        ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 5, 8]}, index=[5, 6, 8]),
        ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [9, 1, 8]}, index=[9, 9, 9])
    }
    d = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
    full = d.compute()

    d2 = d.set_index('b', npartitions=3)
    assert d2.npartitions == 3
    # assert eq(d2, full.set_index('b').sort())
    assert str(d2.compute().sort(['a'])) == str(full.set_index('b').sort(['a']))

    d3 = d.set_index(d.b, npartitions=3)
    assert d3.npartitions == 3
    # assert eq(d3, full.set_index(full.b).sort())
    assert str(d3.compute().sort(['a'])) == str(full.set_index(full.b).sort(['a']))

    d2 = d.set_index('b')
    assert str(d2.compute().sort(['a'])) == str(full.set_index('b').sort(['a']))
Example #3
def test_set_index(engine):
    if engine == "cudf":
        # NOTE: engine == "cudf" requires cudf/dask_cudf,
        # will be skipped by non-GPU CI.

        dask_cudf = pytest.importorskip("dask_cudf")

    dsk = {
        ("x", 0): pd.DataFrame({"a": [1, 2, 3], "b": [4, 2, 6]}, index=[0, 1, 3]),
        ("x", 1): pd.DataFrame({"a": [4, 5, 6], "b": [3, 5, 8]}, index=[5, 6, 8]),
        ("x", 2): pd.DataFrame({"a": [7, 8, 9], "b": [9, 1, 8]}, index=[9, 9, 9]),
    }
    d = dd.DataFrame(dsk, "x", meta, [0, 4, 9, 9])

    if engine == "cudf":
        d = dask_cudf.from_dask_dataframe(d)

    full = d.compute()

    d2 = d.set_index("b", npartitions=3)
    assert d2.npartitions == 3
    assert d2.index.name == "b"
    assert_eq(d2, full.set_index("b"))

    d3 = d.set_index(d.b, npartitions=3)
    assert d3.npartitions == 3
    assert d3.index.name == "b"
    assert_eq(d3, full.set_index(full.b))

    d4 = d.set_index("b")
    assert d4.index.name == "b"
    assert_eq(d4, full.set_index("b"))

    d5 = d.set_index(["b"])
    assert d5.index.name == "b"
    assert_eq(d5, full.set_index(["b"]))
Example #4
def _futures_to_dask_dataframe(futures, divisions=None, client=None):
    import dask.dataframe as dd

    client = default_client(client)
    f = yield _first_completed(futures)
    empty = client.submit(get_empty, f)
    if divisions is True:
        divisions = client.map(index_min, futures)
        divisions.append(client.submit(index_max, futures[-1]))
        divisions2 = yield client._gather(divisions)
        if sorted(divisions2) != divisions2:
            divisions2 = [None] * (len(futures) + 1)
    elif divisions in (None, False):
        divisions2 = [None] * (len(futures) + 1)
    else:
        raise NotImplementedError()
    empty = yield empty

    name = 'distributed-pandas-to-dask-' + tokenize(*futures)
    dsk = {(name, i): f for i, f in enumerate(futures)}

    ensure_default_get(client)

    raise gen.Return(dd.DataFrame(dsk, name, empty, divisions2))
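
The divisions handling above amounts to: collect each partition's index minimum plus the last partition's index maximum, and fall back to unknown divisions if the result is not sorted. A small sketch of that logic on plain pandas partitions (the helper name here is illustrative, not part of the distributed API):

import pandas as pd

def infer_divisions(parts):
    # index lower bound of every partition, plus the upper bound of the last one
    divisions = [p.index.min() for p in parts] + [parts[-1].index.max()]
    if sorted(divisions) != divisions:
        # boundaries are not monotonically increasing -> divisions are unknown
        divisions = [None] * (len(parts) + 1)
    return divisions

parts = [pd.DataFrame({"a": [1, 2]}, index=[0, 1]),
         pd.DataFrame({"a": [3, 4]}, index=[5, 6])]
print(infer_divisions(parts))  # [0, 5, 6]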
Example #5
def test_reductions(split_every):
    dsk = {
        ('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=[0, 1, 3]),
        ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]}, index=[5, 6, 8]),
        ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]}, index=[9, 9, 9])
    }
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3)
    nands1 = dd.from_pandas(nans1, 2)
    nans2 = pd.Series([1] + [np.nan] * 8)
    nands2 = dd.from_pandas(nans2, 2)
    nans3 = pd.Series([np.nan] * 9)
    nands3 = dd.from_pandas(nans3, 2)

    bools = pd.Series([True, False, True, False, True], dtype=bool)
    boolds = dd.from_pandas(bools, 2)

    for dds, pds in [(ddf1.b, pdf1.b), (ddf1.a, pdf1.a),
                     (ddf1['a'], pdf1['a']), (ddf1['b'], pdf1['b']),
                     (nands1, nans1), (nands2, nans2), (nands3, nans3),
                     (boolds, bools)]:
        assert isinstance(dds, dd.Series)
        assert isinstance(pds, pd.Series)

        assert_eq(dds.sum(split_every=split_every), pds.sum())
        assert_eq(dds.prod(split_every=split_every), pds.prod())
        assert_eq(dds.min(split_every=split_every), pds.min())
        assert_eq(dds.max(split_every=split_every), pds.max())
        assert_eq(dds.count(split_every=split_every), pds.count())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.std(split_every=split_every), pds.std())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.var(split_every=split_every), pds.var())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.sem(split_every=split_every), pds.sem())
        assert_eq(dds.std(ddof=0, split_every=split_every), pds.std(ddof=0))
        assert_eq(dds.var(ddof=0, split_every=split_every), pds.var(ddof=0))
        assert_eq(dds.sem(ddof=0, split_every=split_every), pds.sem(ddof=0))
        assert_eq(dds.mean(split_every=split_every), pds.mean())
        assert_eq(dds.nunique(split_every=split_every), pds.nunique())

        assert_eq(dds.sum(skipna=False, split_every=split_every),
                  pds.sum(skipna=False))
        assert_eq(dds.prod(skipna=False, split_every=split_every),
                  pds.prod(skipna=False))
        assert_eq(dds.min(skipna=False, split_every=split_every),
                  pds.min(skipna=False))
        assert_eq(dds.max(skipna=False, split_every=split_every),
                  pds.max(skipna=False))
        assert_eq(dds.std(skipna=False, split_every=split_every),
                  pds.std(skipna=False))
        assert_eq(dds.var(skipna=False, split_every=split_every),
                  pds.var(skipna=False))
        assert_eq(dds.sem(skipna=False, split_every=split_every),
                  pds.sem(skipna=False))
        assert_eq(dds.std(skipna=False, ddof=0, split_every=split_every),
                  pds.std(skipna=False, ddof=0))
        assert_eq(dds.var(skipna=False, ddof=0, split_every=split_every),
                  pds.var(skipna=False, ddof=0))
        assert_eq(dds.sem(skipna=False, ddof=0, split_every=split_every),
                  pds.sem(skipna=False, ddof=0))
        assert_eq(dds.mean(skipna=False, split_every=split_every),
                  pds.mean(skipna=False))

    assert_dask_graph(ddf1.b.sum(split_every=split_every), 'series-sum')
    assert_dask_graph(ddf1.b.prod(split_every=split_every), 'series-prod')
    assert_dask_graph(ddf1.b.min(split_every=split_every), 'series-min')
    assert_dask_graph(ddf1.b.max(split_every=split_every), 'series-max')
    assert_dask_graph(ddf1.b.count(split_every=split_every), 'series-count')
    assert_dask_graph(ddf1.b.std(split_every=split_every), 'series-std')
    assert_dask_graph(ddf1.b.var(split_every=split_every), 'series-var')
    assert_dask_graph(ddf1.b.sem(split_every=split_every), 'series-sem')
    assert_dask_graph(ddf1.b.std(ddof=0, split_every=split_every),
                      'series-std')
    assert_dask_graph(ddf1.b.var(ddof=0, split_every=split_every),
                      'series-var')
    assert_dask_graph(ddf1.b.sem(ddof=0, split_every=split_every),
                      'series-sem')
    assert_dask_graph(ddf1.b.mean(split_every=split_every), 'series-mean')
    # nunique is performed using drop-duplicates
    assert_dask_graph(ddf1.b.nunique(split_every=split_every),
                      'drop-duplicates')

    # testing index
    assert_eq(ddf1.index.min(split_every=split_every), pdf1.index.min())
    assert_eq(ddf1.index.max(split_every=split_every), pdf1.index.max())
    assert_eq(ddf1.index.count(split_every=split_every),
              pd.notnull(pdf1.index).sum())
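
split_every, threaded through every reduction here, bounds how many intermediate results are combined per step of the tree reduction, so smaller values mean a deeper tree with more tasks. A quick way to see the effect (exact task counts are version-dependent, so this only prints the graph sizes):

import pandas as pd
import dask.dataframe as dd

s = dd.from_pandas(pd.Series(range(100)), npartitions=16)
print(len(s.sum(split_every=2).__dask_graph__()),    # deep tree: many small combine steps
      len(s.sum(split_every=16).__dask_graph__()))   # flat tree: one big combine step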
Example #6
def test_reductions_frame(split_every):
    dsk = {
        ('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=[0, 1, 3]),
        ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]}, index=[5, 6, 8]),
        ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]}, index=[9, 9, 9])
    }
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    assert_eq(ddf1.sum(split_every=split_every), pdf1.sum())
    assert_eq(ddf1.prod(split_every=split_every), pdf1.prod())
    assert_eq(ddf1.min(split_every=split_every), pdf1.min())
    assert_eq(ddf1.max(split_every=split_every), pdf1.max())
    assert_eq(ddf1.count(split_every=split_every), pdf1.count())
    assert_eq(ddf1.std(split_every=split_every), pdf1.std())
    assert_eq(ddf1.var(split_every=split_every), pdf1.var())
    assert_eq(ddf1.sem(split_every=split_every), pdf1.sem())
    assert_eq(ddf1.std(ddof=0, split_every=split_every), pdf1.std(ddof=0))
    assert_eq(ddf1.var(ddof=0, split_every=split_every), pdf1.var(ddof=0))
    assert_eq(ddf1.sem(ddof=0, split_every=split_every), pdf1.sem(ddof=0))
    assert_eq(ddf1.mean(split_every=split_every), pdf1.mean())

    for axis in [0, 1, 'index', 'columns']:
        assert_eq(ddf1.sum(axis=axis, split_every=split_every),
                  pdf1.sum(axis=axis))
        assert_eq(ddf1.prod(axis=axis, split_every=split_every),
                  pdf1.prod(axis=axis))
        assert_eq(ddf1.min(axis=axis, split_every=split_every),
                  pdf1.min(axis=axis))
        assert_eq(ddf1.max(axis=axis, split_every=split_every),
                  pdf1.max(axis=axis))
        assert_eq(ddf1.count(axis=axis, split_every=split_every),
                  pdf1.count(axis=axis))
        assert_eq(ddf1.std(axis=axis, split_every=split_every),
                  pdf1.std(axis=axis))
        assert_eq(ddf1.var(axis=axis, split_every=split_every),
                  pdf1.var(axis=axis))
        assert_eq(ddf1.sem(axis=axis, split_every=split_every),
                  pdf1.sem(axis=axis))
        assert_eq(ddf1.std(axis=axis, ddof=0, split_every=split_every),
                  pdf1.std(axis=axis, ddof=0))
        assert_eq(ddf1.var(axis=axis, ddof=0, split_every=split_every),
                  pdf1.var(axis=axis, ddof=0))
        assert_eq(ddf1.sem(axis=axis, ddof=0, split_every=split_every),
                  pdf1.sem(axis=axis, ddof=0))
        assert_eq(ddf1.mean(axis=axis, split_every=split_every),
                  pdf1.mean(axis=axis))

    pytest.raises(ValueError, lambda: ddf1.sum(axis='incorrect').compute())

    # axis=0
    assert_dask_graph(ddf1.sum(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.prod(split_every=split_every), 'dataframe-prod')
    assert_dask_graph(ddf1.min(split_every=split_every), 'dataframe-min')
    assert_dask_graph(ddf1.max(split_every=split_every), 'dataframe-max')
    assert_dask_graph(ddf1.count(split_every=split_every), 'dataframe-count')
    # std, var, sem, and mean consist of sum and count operations
    assert_dask_graph(ddf1.std(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.std(split_every=split_every), 'dataframe-count')
    assert_dask_graph(ddf1.var(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.var(split_every=split_every), 'dataframe-count')
    assert_dask_graph(ddf1.sem(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.sem(split_every=split_every), 'dataframe-count')
    assert_dask_graph(ddf1.mean(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.mean(split_every=split_every), 'dataframe-count')

    # axis=1
    assert_dask_graph(ddf1.sum(axis=1, split_every=split_every),
                      'dataframe-sum')
    assert_dask_graph(ddf1.prod(axis=1, split_every=split_every),
                      'dataframe-prod')
    assert_dask_graph(ddf1.min(axis=1, split_every=split_every),
                      'dataframe-min')
    assert_dask_graph(ddf1.max(axis=1, split_every=split_every),
                      'dataframe-max')
    assert_dask_graph(ddf1.count(axis=1, split_every=split_every),
                      'dataframe-count')
    assert_dask_graph(ddf1.std(axis=1, split_every=split_every),
                      'dataframe-std')
    assert_dask_graph(ddf1.var(axis=1, split_every=split_every),
                      'dataframe-var')
    assert_dask_graph(ddf1.sem(axis=1, split_every=split_every),
                      'dataframe-sem')
    assert_dask_graph(ddf1.mean(axis=1, split_every=split_every),
                      'dataframe-mean')
Example #7
dsk = {
    ("x", 0): pd.DataFrame({
        "a": [1, 2, 3],
        "b": [4, 5, 6]
    }, index=[0, 1, 3]),
    ("x", 1): pd.DataFrame({
        "a": [4, 5, 6],
        "b": [3, 2, 1]
    }, index=[5, 6, 8]),
    ("x", 2): pd.DataFrame({
        "a": [7, 8, 9],
        "b": [0, 0, 0]
    }, index=[9, 9, 9]),
}
meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8"))
d = dd.DataFrame(dsk, "x", meta, [0, 5, 9, 9])
full = d.compute()
CHECK_FREQ = {}
if dd._compat.PANDAS_GT_110:
    CHECK_FREQ["check_freq"] = False


def test_loc():
    assert d.loc[3:8].divisions[0] == 3
    assert d.loc[3:8].divisions[-1] == 8

    assert d.loc[5].divisions == (5, 5)

    assert_eq(d.loc[5], full.loc[5:5])
    assert_eq(d.loc[3:8], full.loc[3:8])
    assert_eq(d.loc[:8], full.loc[:8])
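
With known divisions, .loc can prune partitions and narrow the divisions without computing anything, which is what the assertions above rely on. A small sketch using from_pandas (the values in the comments depend on how from_pandas splits the index):

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"a": range(10)}, index=range(10))
ddf = dd.from_pandas(pdf, npartitions=5)
sub = ddf.loc[3:7]
print(ddf.divisions)                   # e.g. (0, 2, 4, 6, 8, 9)
print(sub.npartitions, sub.divisions)  # fewer partitions, divisions clipped to [3, 7]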
Example #8
def test_get_numeric_data_unknown_part():
    df = pd.DataFrame({'a': range(5), 'b': range(5), 'c': list('abcde')})
    ddf = dd.from_pandas(df, 3)
    # Drop dtype information
    ddf = dd.DataFrame(ddf.dask, ddf._name, ['a', 'b', 'c'], ddf.divisions)
    assert eq(ddf._get_numeric_data(), df._get_numeric_data())
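
Several older snippets on this page pass a bare list of column names as meta, which is how the dtype information gets dropped here; newer dask expects meta to be an empty pandas object describing the dtypes. A minimal sketch of the usual ways to build it:

import pandas as pd
from dask.dataframe.utils import make_meta

pdf = pd.DataFrame({"a": range(5), "b": range(5), "c": list("abcde")})
meta = pdf.iloc[:0]                                       # empty frame, dtypes preserved
meta2 = make_meta({"a": "i8", "b": "i8", "c": "object"})  # or build it from a dtype mapping
print(meta.dtypes.to_dict(), meta2.dtypes.to_dict())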
Example #9
    def to_dask(self, pages=None, persist=False, progress=True):
        try:
            import dask

        except ImportError:
            raise RuntimeError("Dask is not installed.")
        if progress:
            from dask.diagnostics import ProgressBar
            ProgressBar().register()
        if pages is None:
            pages = self.page_numbers
        columns = [(k, DASK_TYPE_MAPPING[v.get("type", 'string')])
                   for k, v in self.schema.items()
                   if k in self.fields and not k.startswith("_")]
        column_types = dict(columns)

        url = self._url
        client_kwargs = self.session.get_client_kwargs()
        if client_kwargs["app"] is not None:
            client_kwargs["app"] = dict(client_kwargs["app"].config)

        def get_data(params):
            import httpx
            if client_kwargs["app"] is not None:
                from eve import Eve
                client_kwargs["app"] = Eve(settings=client_kwargs["app"])
            items = []
            with httpx.Client(**client_kwargs) as client:
                try:
                    resp = client.get(
                        url,
                        params=params,
                    )
                    items = resp.json().get("_items", [])
                except Exception:
                    # ignore failed requests; this page simply yields no rows
                    pass
            data = [{
                k: column_types[k](v)
                for k, v in item.items() if k in column_types
            } for item in items]
            return data

        if not self.is_tabular:
            import dask.bag as db
            return db.from_sequence([self.get_page_kwargs(i)
                                     for i in pages]).map(get_data).flatten()

        import dask.dataframe as dd
        import pandas as pd

        def get_df(params):
            data = get_data(params)
            return pd.DataFrame(data, columns=list(column_types))

        dask_name = str(
            hash((self.name, ) + tuple(self.get_page_kwargs(1).values())))
        dsk = {(dask_name, i - 1): (get_df, self.get_page_kwargs(i))
               for i in pages}

        nitems = self.nitems
        divisions = list(range(0, nitems, self.items_per_page))
        if nitems not in divisions:
            divisions = divisions + [nitems]

        df = dd.DataFrame(dsk, dask_name, columns, divisions)
        if persist:
            return df.persist()
        return df
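
The hand-built dsk of page-fetching tasks above can also be expressed with the higher-level dd.from_delayed API; a rough sketch with a placeholder fetch_page function (hypothetical, standing in for the HTTP call):

import pandas as pd
import dask
import dask.dataframe as dd

@dask.delayed
def fetch_page(page):
    # placeholder for one paged request returning a DataFrame
    return pd.DataFrame({"value": range(page * 10, (page + 1) * 10)})

meta = pd.DataFrame({"value": pd.Series([], dtype="int64")})
df = dd.from_delayed([fetch_page(p) for p in range(4)], meta=meta)
print(df.npartitions)  # 4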
Example #10
def test_reductions(split_every):
    dsk = {
        ("x", 0): pd.DataFrame(
            {"a": [1, 2, 3], "b": [4, 5, 6], "c": [True, True, False]}, index=[0, 1, 3]
        ),
        ("x", 1): pd.DataFrame(
            {"a": [4, 5, 6], "b": [3, 2, 1], "c": [False, False, False]},
            index=[5, 6, 8],
        ),
        ("x", 2): pd.DataFrame(
            {
                "a": [13094304034, 3489385935, 100006774],
                "b": [0, 0, 0],
                "c": [True, True, True],
            },
            index=[9, 9, 9],
        ),
    }
    meta = make_meta({"a": "i8", "b": "i8", "c": "bool"}, index=pd.Index([], "i8"))
    ddf1 = dd.DataFrame(dsk, "x", meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3)
    nands1 = dd.from_pandas(nans1, 2)
    nans2 = pd.Series([1] + [np.nan] * 8)
    nands2 = dd.from_pandas(nans2, 2)
    nans3 = pd.Series([np.nan] * 9)
    nands3 = dd.from_pandas(nans3, 2)

    bools = pd.Series([True, False, True, False, True], dtype=bool)
    boolds = dd.from_pandas(bools, 2)

    for dds, pds in [
        (ddf1.a, pdf1.a),
        (ddf1.b, pdf1.b),
        (ddf1.c, pdf1.c),
        (ddf1["a"], pdf1["a"]),
        (ddf1["b"], pdf1["b"]),
        (nands1, nans1),
        (nands2, nans2),
        (nands3, nans3),
        (boolds, bools),
    ]:
        assert isinstance(dds, dd.Series)
        assert isinstance(pds, pd.Series)

        assert_eq(dds.sum(split_every=split_every), pds.sum())
        assert_eq(dds.prod(split_every=split_every), pds.prod())
        assert_eq(dds.min(split_every=split_every), pds.min())
        assert_eq(dds.max(split_every=split_every), pds.max())
        assert_eq(dds.count(split_every=split_every), pds.count())

        if scipy:
            # pandas uses unbiased skew, need to correct for that
            n = pds.shape[0]
            bias_factor = (n * (n - 1)) ** 0.5 / (n - 2)
            assert_eq(dds.skew(), pds.skew() / bias_factor)

        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.std(split_every=split_every), pds.std())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.var(split_every=split_every), pds.var())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.sem(split_every=split_every), pds.sem())

        with warnings.catch_warnings():
            # dask.dataframe should probably filter this, to match pandas, but
            # it seems quite difficult.
            warnings.simplefilter("ignore", RuntimeWarning)
            assert_eq(dds.std(ddof=0, split_every=split_every), pds.std(ddof=0))
            assert_eq(dds.var(ddof=0, split_every=split_every), pds.var(ddof=0))
            assert_eq(dds.sem(ddof=0, split_every=split_every), pds.sem(ddof=0))
        assert_eq(dds.mean(split_every=split_every), pds.mean())
        assert_eq(dds.nunique(split_every=split_every), pds.nunique())

        assert_eq(dds.sum(skipna=False, split_every=split_every), pds.sum(skipna=False))
        assert_eq(
            dds.prod(skipna=False, split_every=split_every), pds.prod(skipna=False)
        )
        assert_eq(dds.min(skipna=False, split_every=split_every), pds.min(skipna=False))
        assert_eq(dds.max(skipna=False, split_every=split_every), pds.max(skipna=False))
        assert_eq(dds.std(skipna=False, split_every=split_every), pds.std(skipna=False))
        assert_eq(dds.var(skipna=False, split_every=split_every), pds.var(skipna=False))
        assert_eq(dds.sem(skipna=False, split_every=split_every), pds.sem(skipna=False))
        assert_eq(
            dds.std(skipna=False, ddof=0, split_every=split_every),
            pds.std(skipna=False, ddof=0),
        )
        assert_eq(
            dds.var(skipna=False, ddof=0, split_every=split_every),
            pds.var(skipna=False, ddof=0),
        )
        assert_eq(
            dds.sem(skipna=False, ddof=0, split_every=split_every),
            pds.sem(skipna=False, ddof=0),
        )
        assert_eq(
            dds.mean(skipna=False, split_every=split_every), pds.mean(skipna=False)
        )

    assert_dask_graph(ddf1.b.sum(split_every=split_every), "series-sum")
    assert_dask_graph(ddf1.b.prod(split_every=split_every), "series-prod")
    assert_dask_graph(ddf1.b.min(split_every=split_every), "series-min")
    assert_dask_graph(ddf1.b.max(split_every=split_every), "series-max")
    assert_dask_graph(ddf1.b.count(split_every=split_every), "series-count")
    assert_dask_graph(ddf1.b.std(split_every=split_every), "series-std")
    assert_dask_graph(ddf1.b.var(split_every=split_every), "series-var")
    assert_dask_graph(ddf1.b.sem(split_every=split_every), "series-sem")
    assert_dask_graph(ddf1.b.std(ddof=0, split_every=split_every), "series-std")
    assert_dask_graph(ddf1.b.var(ddof=0, split_every=split_every), "series-var")
    assert_dask_graph(ddf1.b.sem(ddof=0, split_every=split_every), "series-sem")
    assert_dask_graph(ddf1.b.mean(split_every=split_every), "series-mean")
    # nunique is performed using drop-duplicates
    assert_dask_graph(ddf1.b.nunique(split_every=split_every), "drop-duplicates")

    # testing index
    assert_eq(ddf1.index.min(split_every=split_every), pdf1.index.min())
    assert_eq(ddf1.index.max(split_every=split_every), pdf1.index.max())
    assert_eq(ddf1.index.count(split_every=split_every), pd.notnull(pdf1.index).sum())
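
The skew branch above reconciles pandas' bias-corrected sample skewness with the uncorrected (population) estimate by dividing out sqrt(n(n-1))/(n-2). A standalone check of that identity against scipy's default, which is also uncorrected, assuming scipy is available:

import numpy as np
import pandas as pd
from scipy import stats

s = pd.Series([1.0, 2.0, 4.0, 8.0, 16.0])
n = len(s)
bias_factor = (n * (n - 1)) ** 0.5 / (n - 2)
# pandas .skew() applies the sample-size correction; scipy's default does not
print(np.isclose(s.skew() / bias_factor, stats.skew(s)))  # True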
Example #11
def test_concat2():
    dsk = {
        ('x', 0): pd.DataFrame({
            'a': [1, 2, 3],
            'b': [4, 5, 6]
        }),
        ('x', 1): pd.DataFrame({
            'a': [4, 5, 6],
            'b': [3, 2, 1]
        }),
        ('x', 2): pd.DataFrame({
            'a': [7, 8, 9],
            'b': [0, 0, 0]
        })
    }
    a = dd.DataFrame(dsk, 'x', ['a', 'b'], [None, None])
    dsk = {
        ('y', 0): pd.DataFrame({
            'a': [10, 20, 30],
            'b': [40, 50, 60]
        }),
        ('y', 1): pd.DataFrame({
            'a': [40, 50, 60],
            'b': [30, 20, 10]
        }),
        ('y', 2): pd.DataFrame({
            'a': [70, 80, 90],
            'b': [0, 0, 0]
        })
    }
    b = dd.DataFrame(dsk, 'y', ['a', 'b'], [None, None])

    dsk = {
        ('y', 0): pd.DataFrame({
            'b': [10, 20, 30],
            'c': [40, 50, 60]
        }),
        ('y', 1): pd.DataFrame({
            'b': [40, 50, 60],
            'c': [30, 20, 10]
        })
    }
    c = dd.DataFrame(dsk, 'y', ['b', 'c'], [None, None])

    dsk = {
        ('y', 0): pd.DataFrame({'b': [10, 20, 30], 'c': [40, 50, 60],
                                'd': [70, 80, 90]}),
        ('y', 1): pd.DataFrame({'b': [40, 50, 60], 'c': [30, 20, 10],
                                'd': [90, 80, 70]}, index=[3, 4, 5])
    }
    d = dd.DataFrame(dsk, 'y', ['b', 'c', 'd'], [0, 3, 5])

    cases = [[a, b], [a, c], [a, d]]
    assert dd.concat([a]) is a
    for case in cases:
        result = dd.concat(case)
        pdcase = [c.compute() for c in case]

        assert result.npartitions == case[0].npartitions + case[1].npartitions
        assert result.divisions == (None, ) * (result.npartitions + 1)
        assert eq(pd.concat(pdcase), result)
        assert result.dask == dd.concat(case).dask

        result = dd.concat(case, join='inner')
        assert result.npartitions == case[0].npartitions + case[1].npartitions
        assert result.divisions == (None, ) * (result.npartitions + 1)
        assert eq(pd.concat(pdcase, join='inner'), result)
        assert result.dask == dd.concat(case, join='inner').dask

        msg = ('Unable to concatenate DataFrame with unknown division '
               'specifying axis=1')
        with tm.assertRaisesRegexp(ValueError, msg):
            dd.concat(case, axis=1)
Example #12
def test_append2():
    dsk = {
        ('x', 0): pd.DataFrame({
            'a': [1, 2, 3],
            'b': [4, 5, 6]
        }),
        ('x', 1): pd.DataFrame({
            'a': [4, 5, 6],
            'b': [3, 2, 1]
        }),
        ('x', 2): pd.DataFrame({
            'a': [7, 8, 9],
            'b': [0, 0, 0]
        })
    }
    ddf1 = dd.DataFrame(dsk, 'x', ['a', 'b'], [None, None])

    dsk = {
        ('y', 0): pd.DataFrame({
            'a': [10, 20, 30],
            'b': [40, 50, 60]
        }),
        ('y', 1): pd.DataFrame({
            'a': [40, 50, 60],
            'b': [30, 20, 10]
        }),
        ('y', 2): pd.DataFrame({
            'a': [70, 80, 90],
            'b': [0, 0, 0]
        })
    }
    ddf2 = dd.DataFrame(dsk, 'y', ['a', 'b'], [None, None])

    dsk = {
        ('y', 0): pd.DataFrame({
            'b': [10, 20, 30],
            'c': [40, 50, 60]
        }),
        ('y', 1): pd.DataFrame({
            'b': [40, 50, 60],
            'c': [30, 20, 10]
        })
    }
    ddf3 = dd.DataFrame(dsk, 'y', ['b', 'c'], [None, None])

    assert eq(ddf1.append(ddf2), ddf1.compute().append(ddf2.compute()))
    assert eq(ddf2.append(ddf1), ddf2.compute().append(ddf1.compute()))
    # Series + DataFrame
    assert eq(ddf1.a.append(ddf2), ddf1.a.compute().append(ddf2.compute()))
    assert eq(ddf2.a.append(ddf1), ddf2.a.compute().append(ddf1.compute()))

    # different columns
    assert eq(ddf1.append(ddf3), ddf1.compute().append(ddf3.compute()))
    assert eq(ddf3.append(ddf1), ddf3.compute().append(ddf1.compute()))
    # Series + DataFrame
    assert eq(ddf1.a.append(ddf3), ddf1.a.compute().append(ddf3.compute()))
    assert eq(ddf3.b.append(ddf1), ddf3.b.compute().append(ddf1.compute()))

    # Dask + pandas
    assert eq(ddf1.append(ddf2.compute()),
              ddf1.compute().append(ddf2.compute()))
    assert eq(ddf2.append(ddf1.compute()),
              ddf2.compute().append(ddf1.compute()))
    # Series + DataFrame
    assert eq(ddf1.a.append(ddf2.compute()),
              ddf1.a.compute().append(ddf2.compute()))
    assert eq(ddf2.a.append(ddf1.compute()),
              ddf2.a.compute().append(ddf1.compute()))

    # different columns
    assert eq(ddf1.append(ddf3.compute()),
              ddf1.compute().append(ddf3.compute()))
    assert eq(ddf3.append(ddf1.compute()),
              ddf3.compute().append(ddf1.compute()))
    # Series + DataFrame
    assert eq(ddf1.a.append(ddf3.compute()),
              ddf1.a.compute().append(ddf3.compute()))
    assert eq(ddf3.b.append(ddf1.compute()),
              ddf3.b.compute().append(ddf1.compute()))
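
DataFrame.append, used throughout this example, was removed in pandas 2.0 (and deprecated in dask), so on current versions the same checks would be phrased with concat; a rough equivalent sketch:

import pandas as pd
import dask.dataframe as dd

pdf1 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
pdf2 = pd.DataFrame({"a": [10, 20, 30], "b": [40, 50, 60]})
ddf1 = dd.from_pandas(pdf1, npartitions=1)
ddf2 = dd.from_pandas(pdf2, npartitions=1)

# concatenation along the index replaces the removed append method
result = dd.concat([ddf1, ddf2])
print(result.compute().equals(pd.concat([pdf1, pdf2])))  # True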
Example #13
def dataframe_factory(out_ind, *arginds, columns=None):
    """
    Creates a dask Dataframe by broadcasting  *arginds
    against each other and then ravelling them.

    .. code-block:: python

        df = dataframe_factory(("row", "chan"),
                               x, ("row",),
                               y, ("chan",))

    Parameters
    ----------
    out_ind : sequence
        Output dimensions.
        e.g. :code:`(row, chan)`
    *arginds : Sequence of (:class:`dask.array.Array`, index)
        document me
    columns : sequence, optional
        Dataframe column names.
        Defaults to :code:`[x, y]`
    """
    if not len(arginds) % 2 == 0:
        raise ValueError("Must supply an index for each argument")

    args = arginds[::2]
    inds = arginds[1::2]

    if columns is None:
        columns = ['x', 'y'] + ["c%d" % i for i in range(len(args) - 2)]
    else:
        if (not isinstance(columns, (tuple, list))
                or len(columns) != len(args)):
            raise ValueError("Columns must be a tuple/list of columns "
                             "matching the number of arrays")

    have_nan_chunks = False

    new_args = []

    for a, (arg, ind) in enumerate(zip(args, inds)):
        if not all(i in out_ind for i in ind):
            raise ValueError("Argument %d dimensions not in out_ind" % a)

        if not len(ind) == arg.ndim:
            raise ValueError("Argument %d len(ind) != arg.ndim" % a)

        have_nan_chunks = (any(np.isnan(c) for dc in arg.chunks for c in dc)
                           or have_nan_chunks)

        # Generate slicing tuple that will expand arg up to full resolution
        expand = tuple(slice(None) if i in ind else None for i in out_ind)
        new_args.append(arg[expand])

    # Create meta data so that blockwise doesn't call
    # np.broadcast_arrays and fall over on the tuple
    # of arrays that it returns
    dtype = np.result_type(*args)
    meta = np.empty((0, ) * len(out_ind), dtype=dtype)

    blockargs = (v for pair in ((a, out_ind) for a in new_args) for v in pair)

    bcast = da.blockwise(np.broadcast_arrays,
                         out_ind,
                         *blockargs,
                         subok=True,
                         align_arrays=not have_nan_chunks,
                         meta=meta,
                         dtype=dtype)

    # Now create a dataframe from the broadcasted arrays
    # with lower-level dask graph API

    # Flattened list of broadcast array keys
    # We'll use this to generate a 1D (ravelled) dataframe
    keys = product((bcast.name, ), *(range(b) for b in bcast.numblocks))
    name = "dataframe-" + tokenize(bcast)

    # dictionary defining the graph for this part of the operation
    layers = {}

    if have_nan_chunks:
        # We can't create proper indices if we don't know our chunk sizes
        divisions = [None]

        for i, key in enumerate(keys):
            layers[(name, i)] = (_create_dataframe, key, None, None, columns)
            divisions.append(None)
    else:
        # We do know all our chunk sizes, create reasonable dataframe indices
        start_idx = 0
        divisions = [0]

        expr = ((e - s for s, e in start_ends(dim_chunks))
                for dim_chunks in bcast.chunks)
        chunk_sizes = (reduce(mul, shape, 1) for shape in product(*expr))
        chunk_ranges = start_ends(chunk_sizes)

        for i, (key, (start, end)) in enumerate(zip(keys, chunk_ranges)):
            layers[(name, i)] = (_create_dataframe, key, start, end, columns)
            start_idx += end - start
            divisions.append(start_idx)

    assert len(layers) == bcast.npartitions
    assert len(divisions) == bcast.npartitions + 1

    # Create the HighLevelGraph
    graph = HighLevelGraph.from_collections(name, layers, [bcast])
    # Metadata representing the broadcasted and ravelled data

    meta = pd.DataFrame(data={
        k: np.empty((0, ), dtype=a.dtype)
        for k, a in zip(columns, args)
    },
                        columns=columns)

    # Create the actual Dataframe
    return dd.DataFrame(graph, name, meta=meta, divisions=divisions)
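
start_ends is not included in this excerpt; judging from how it is consumed (pairs whose differences are the chunk sizes, and row ranges per block), it turns an iterable of sizes into cumulative (start, end) offsets. A hypothetical reconstruction for reading the code above (the real helper may differ in details):

from itertools import accumulate, chain

def start_ends(sizes):
    # (3, 2, 4) -> [(0, 3), (3, 5), (5, 9)]
    ends = list(accumulate(sizes))
    starts = chain([0], ends[:-1])
    return list(zip(starts, ends))

print(start_ends((3, 2, 4)))  # [(0, 3), (3, 5), (5, 9)]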
Example #14
def multicol_dataframe_factory(out_ind, arrays, array_dims):
    """
    Creates a dask Dataframe by broadcasting arrays (given by the arrays dict-like object)
    against each other and then ravelling them. The array_indices mapping specifies which indices
    the arrays have

    .. code-block:: python

        df = dataframe_factory(("row", "chan"), {'x': x, 'y': y}, {x: ("row",), y: ("chan",)})

    Parameters
    ----------
    out_ind : sequence
        Output dimensions.
        e.g. :code:`(row, chan)`
    """
    columns = list(arrays.keys())

    have_nan_chunks = None
    expand = {}
    barr = {}
    # build up list of arguments for blockwise call below
    blockwise_args = [np.broadcast_arrays, out_ind]

    for col, arr in arrays.items():
        if col not in array_dims:
            raise ValueError(f"{col} dimensions not specified")
        arr_ind = array_dims[col]
        if not all(i in out_ind for i in arr_ind):
            raise ValueError(f"{col} dimensions not in out_ind")
        if not len(arr_ind) == arr.ndim:
            raise ValueError(f"len({col}_ind) != {col}.ndim")
        have_nan_chunks = have_nan_chunks or any(
            np.isnan(c) for dc in arr.chunks for c in dc)

        # Generate slicing tuples that will expand arr up to the full
        # resolution
        expand[col] = tuple(
            slice(None) if i in arr_ind else None for i in out_ind)
        # broadcast version of the array
        barr[col] = arr[expand[col]]

        blockwise_args += [barr[col], out_ind]

    # Create meta data so that blockwise doesn't call
    # np.broadcast_arrays and fall over on the tuple
    # of arrays that it returns
    dtype = np.result_type(*arrays.values())
    meta = np.empty((0, ) * len(out_ind), dtype=dtype)

    bcast = da.blockwise(*blockwise_args,
                         align_arrays=not have_nan_chunks,
                         meta=meta,
                         dtype=dtype)

    # Now create a dataframe from the broadcasted arrays
    # with lower-level dask graph API

    # Flattened list of broadcast array keys
    # We'll use this to generate a 1D (ravelled) dataframe
    keys = product((bcast.name, ), *(range(b) for b in bcast.numblocks))
    name = "dataframe-" + tokenize(bcast)

    # dictionary defining the graph for this part of the operation
    layers = {}

    if have_nan_chunks:
        # We can't create proper indices if we don't know our chunk sizes
        divisions = [None]

        for i, key in enumerate(keys):
            layers[(name, i)] = (_create_dataframe, key, None, None)
            divisions.append(None)
    else:
        # We do know all our chunk sizes, create reasonable dataframe indices
        start_idx = 0
        divisions = [0]

        expr = ((e - s for s, e in start_ends(dim_chunks))
                for dim_chunks in bcast.chunks)
        chunk_sizes = (reduce(mul, shape, 1) for shape in product(*expr))
        chunk_ranges = start_ends(chunk_sizes)

        for i, (key, (start, end)) in enumerate(zip(keys, chunk_ranges)):
            layers[(name, i)] = (_create_dataframe, key, start, end)
            start_idx += end - start
            divisions.append(start_idx)

    assert len(layers) == bcast.npartitions
    assert len(divisions) == bcast.npartitions + 1

    # Create the HighLevelGraph
    graph = HighLevelGraph.from_collections(name, layers, [bcast])
    # Metadata representing the broadcasted and ravelled data
    meta = pd.DataFrame(data={
        col: np.empty((0, ), dtype=arr.dtype)
        for col, arr in arrays.items()
    },
                        columns=columns)

    # Create the actual Dataframe
    return dd.DataFrame(graph, name, meta=meta, divisions=divisions)
Example #15
def test_reductions_frame(split_every):
    dsk = {
        ("x", 0): pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[0, 1, 3]),
        ("x", 1): pd.DataFrame({"a": [4, 5, 6], "b": [3, 2, 1]}, index=[5, 6, 8]),
        ("x", 2): pd.DataFrame({"a": [7, 8, 9], "b": [0, 0, 0]}, index=[9, 9, 9]),
    }
    meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8"))
    ddf1 = dd.DataFrame(dsk, "x", meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    assert_eq(ddf1.sum(split_every=split_every), pdf1.sum())
    assert_eq(ddf1.prod(split_every=split_every), pdf1.prod())
    assert_eq(ddf1.min(split_every=split_every), pdf1.min())
    assert_eq(ddf1.max(split_every=split_every), pdf1.max())
    assert_eq(ddf1.count(split_every=split_every), pdf1.count())
    assert_eq(ddf1.std(split_every=split_every), pdf1.std())
    assert_eq(ddf1.var(split_every=split_every), pdf1.var())
    assert_eq(ddf1.sem(split_every=split_every), pdf1.sem())
    assert_eq(ddf1.std(ddof=0, split_every=split_every), pdf1.std(ddof=0))
    assert_eq(ddf1.var(ddof=0, split_every=split_every), pdf1.var(ddof=0))
    assert_eq(ddf1.sem(ddof=0, split_every=split_every), pdf1.sem(ddof=0))
    assert_eq(ddf1.mean(split_every=split_every), pdf1.mean())

    for axis in [0, 1, "index", "columns"]:
        assert_eq(ddf1.sum(axis=axis, split_every=split_every), pdf1.sum(axis=axis))
        assert_eq(ddf1.prod(axis=axis, split_every=split_every), pdf1.prod(axis=axis))
        assert_eq(ddf1.min(axis=axis, split_every=split_every), pdf1.min(axis=axis))
        assert_eq(ddf1.max(axis=axis, split_every=split_every), pdf1.max(axis=axis))
        assert_eq(ddf1.count(axis=axis, split_every=split_every), pdf1.count(axis=axis))
        assert_eq(ddf1.std(axis=axis, split_every=split_every), pdf1.std(axis=axis))
        assert_eq(ddf1.var(axis=axis, split_every=split_every), pdf1.var(axis=axis))
        assert_eq(ddf1.sem(axis=axis, split_every=split_every), pdf1.sem(axis=axis))
        assert_eq(
            ddf1.std(axis=axis, ddof=0, split_every=split_every),
            pdf1.std(axis=axis, ddof=0),
        )
        assert_eq(
            ddf1.var(axis=axis, ddof=0, split_every=split_every),
            pdf1.var(axis=axis, ddof=0),
        )
        assert_eq(
            ddf1.sem(axis=axis, ddof=0, split_every=split_every),
            pdf1.sem(axis=axis, ddof=0),
        )
        assert_eq(ddf1.mean(axis=axis, split_every=split_every), pdf1.mean(axis=axis))

    pytest.raises(ValueError, lambda: ddf1.sum(axis="incorrect").compute())

    # axis=0
    assert_dask_graph(ddf1.sum(split_every=split_every), "dataframe-sum")
    assert_dask_graph(ddf1.prod(split_every=split_every), "dataframe-prod")
    assert_dask_graph(ddf1.min(split_every=split_every), "dataframe-min")
    assert_dask_graph(ddf1.max(split_every=split_every), "dataframe-max")
    assert_dask_graph(ddf1.count(split_every=split_every), "dataframe-count")

    # std, var, sem, and mean consist of moment_* operations
    assert_dask_graph(ddf1.std(split_every=split_every), "dataframe-var")
    assert_dask_graph(ddf1.std(split_every=split_every), "moment_chunk")
    assert_dask_graph(ddf1.std(split_every=split_every), "moment_agg")
    assert_dask_graph(ddf1.std(split_every=split_every), "values")

    assert_dask_graph(ddf1.var(split_every=split_every), "moment_chunk")
    assert_dask_graph(ddf1.var(split_every=split_every), "moment_agg")
    assert_dask_graph(ddf1.var(split_every=split_every), "values")

    assert_dask_graph(ddf1.sem(split_every=split_every), "dataframe-var")
    assert_dask_graph(ddf1.sem(split_every=split_every), "moment_chunk")
    assert_dask_graph(ddf1.sem(split_every=split_every), "moment_agg")
    assert_dask_graph(ddf1.sem(split_every=split_every), "values")

    assert_dask_graph(ddf1.mean(split_every=split_every), "dataframe-sum")
    assert_dask_graph(ddf1.mean(split_every=split_every), "dataframe-count")

    # axis=1
    assert_dask_graph(ddf1.sum(axis=1, split_every=split_every), "dataframe-sum")
    assert_dask_graph(ddf1.prod(axis=1, split_every=split_every), "dataframe-prod")
    assert_dask_graph(ddf1.min(axis=1, split_every=split_every), "dataframe-min")
    assert_dask_graph(ddf1.max(axis=1, split_every=split_every), "dataframe-max")
    assert_dask_graph(ddf1.count(axis=1, split_every=split_every), "dataframe-count")
    assert_dask_graph(ddf1.std(axis=1, split_every=split_every), "dataframe-std")
    assert_dask_graph(ddf1.var(axis=1, split_every=split_every), "dataframe-var")
    assert_dask_graph(ddf1.sem(axis=1, split_every=split_every), "dataframe-sem")
    assert_dask_graph(ddf1.mean(axis=1, split_every=split_every), "dataframe-mean")
Example #16
def test_arithmetics():
    dsk = {
        ("x", 0): pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[0, 1, 3]),
        ("x", 1): pd.DataFrame({"a": [4, 5, 6], "b": [3, 2, 1]}, index=[5, 6, 8]),
        ("x", 2): pd.DataFrame({"a": [7, 8, 9], "b": [0, 0, 0]}, index=[9, 9, 9]),
    }
    meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8"))
    ddf1 = dd.DataFrame(dsk, "x", meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    pdf2 = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7, 8], "b": [5, 6, 7, 8, 1, 2, 3, 4]})
    pdf3 = pd.DataFrame({"a": [5, 6, 7, 8, 4, 3, 2, 1], "b": [2, 4, 5, 3, 4, 2, 1, 0]})
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)

    dsk4 = {
        ("y", 0): pd.DataFrame({"a": [3, 2, 1], "b": [7, 8, 9]}, index=[0, 1, 3]),
        ("y", 1): pd.DataFrame({"a": [5, 2, 8], "b": [4, 2, 3]}, index=[5, 6, 8]),
        ("y", 2): pd.DataFrame({"a": [1, 4, 10], "b": [1, 0, 5]}, index=[9, 9, 9]),
    }
    ddf4 = dd.DataFrame(dsk4, "y", meta, [0, 4, 9, 9])
    pdf4 = ddf4.compute()

    # Arithmetics
    cases = [
        (ddf1, ddf1, pdf1, pdf1),
        (ddf1, ddf1.repartition([0, 1, 3, 6, 9]), pdf1, pdf1),
        (ddf2, ddf3, pdf2, pdf3),
        (ddf2.repartition([0, 3, 6, 7]), ddf3.repartition([0, 7]), pdf2, pdf3),
        (ddf2.repartition([0, 7]), ddf3.repartition([0, 2, 4, 5, 7]), pdf2, pdf3),
        (ddf1, ddf4, pdf1, pdf4),
        (ddf1, ddf4.repartition([0, 9]), pdf1, pdf4),
        (ddf1.repartition([0, 3, 9]), ddf4.repartition([0, 5, 9]), pdf1, pdf4),
        # dask + pandas
        (ddf1, pdf4, pdf1, pdf4),
        (ddf2, pdf3, pdf2, pdf3),
    ]

    for (l, r, el, er) in cases:
        check_series_arithmetics(l.a, r.b, el.a, er.b)
        check_frame_arithmetics(l, r, el, er)

    # different index, pandas raises ValueError in comparison ops

    pdf5 = pd.DataFrame(
        {"a": [3, 2, 1, 5, 2, 8, 1, 4, 10], "b": [7, 8, 9, 4, 2, 3, 1, 0, 5]},
        index=[0, 1, 3, 5, 6, 8, 9, 9, 9],
    )
    ddf5 = dd.from_pandas(pdf5, 2)

    pdf6 = pd.DataFrame(
        {"a": [3, 2, 1, 5, 2, 8, 1, 4, 10], "b": [7, 8, 9, 5, 7, 8, 4, 2, 5]},
        index=[0, 1, 2, 3, 4, 5, 6, 7, 9],
    )
    ddf6 = dd.from_pandas(pdf6, 4)

    pdf7 = pd.DataFrame(
        {"a": [1, 2, 3, 4, 5, 6, 7, 8], "b": [5, 6, 7, 8, 1, 2, 3, 4]},
        index=list("aaabcdeh"),
    )
    pdf8 = pd.DataFrame(
        {"a": [5, 6, 7, 8, 4, 3, 2, 1], "b": [2, 4, 5, 3, 4, 2, 1, 0]},
        index=list("abcdefgh"),
    )
    ddf7 = dd.from_pandas(pdf7, 3)
    ddf8 = dd.from_pandas(pdf8, 4)

    pdf9 = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5, 6, 7, 8],
            "b": [5, 6, 7, 8, 1, 2, 3, 4],
            "c": [5, 6, 7, 8, 1, 2, 3, 4],
        },
        index=list("aaabcdeh"),
    )
    pdf10 = pd.DataFrame(
        {
            "b": [5, 6, 7, 8, 4, 3, 2, 1],
            "c": [2, 4, 5, 3, 4, 2, 1, 0],
            "d": [2, 4, 5, 3, 4, 2, 1, 0],
        },
        index=list("abcdefgh"),
    )
    ddf9 = dd.from_pandas(pdf9, 3)
    ddf10 = dd.from_pandas(pdf10, 4)

    # Arithmetics with different index
    cases = [
        (ddf5, ddf6, pdf5, pdf6),
        (ddf5.repartition([0, 9]), ddf6, pdf5, pdf6),
        (ddf5.repartition([0, 5, 9]), ddf6.repartition([0, 7, 9]), pdf5, pdf6),
        (ddf7, ddf8, pdf7, pdf8),
        (ddf7.repartition(["a", "c", "h"]), ddf8.repartition(["a", "h"]), pdf7, pdf8),
        (
            ddf7.repartition(["a", "b", "e", "h"]),
            ddf8.repartition(["a", "e", "h"]),
            pdf7,
            pdf8,
        ),
        (ddf9, ddf10, pdf9, pdf10),
        (ddf9.repartition(["a", "c", "h"]), ddf10.repartition(["a", "h"]), pdf9, pdf10),
        # dask + pandas
        (ddf5, pdf6, pdf5, pdf6),
        (ddf7, pdf8, pdf7, pdf8),
        (ddf9, pdf10, pdf9, pdf10),
    ]

    for (l, r, el, er) in cases:
        check_series_arithmetics(l.a, r.b, el.a, er.b, allow_comparison_ops=False)
        check_frame_arithmetics(l, r, el, er, allow_comparison_ops=False)
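
The cases above lean heavily on repartition with an explicit divisions list, which re-slices a frame with a known index at the given boundaries. A small sketch:

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"a": range(9)}, index=range(9))
ddf = dd.from_pandas(pdf, npartitions=3)
re = ddf.repartition(divisions=[0, 5, 8])  # endpoints must match the existing divisions
print(re.npartitions, re.divisions)        # 2 (0, 5, 8)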
Example #17
# -*- coding:utf-8 -*-
"""
  Author  : 'longguangbin'
  Contact : [email protected]
  Date    : 2018/11/23
  Usage   :
"""

import pandas as pd
import dask.dataframe as dd

# dd.DataFrame is the low-level constructor (graph, name, meta, divisions);
# an in-memory table is normally wrapped with dd.from_pandas instead.
df = dd.from_pandas(pd.DataFrame([[1, 2, 3], [2, 3, 4], [3, 4, 5], [5, 5, 6]]),
                    npartitions=2)
df.head()
Example #18
File: test_shuffle.py  Project: qwshy/dask
dsk = {
    ('x', 0): pd.DataFrame({
        'a': [1, 2, 3],
        'b': [1, 4, 7]
    }, index=[0, 1, 3]),
    ('x', 1): pd.DataFrame({
        'a': [4, 5, 6],
        'b': [2, 5, 8]
    }, index=[5, 6, 8]),
    ('x', 2): pd.DataFrame({
        'a': [7, 8, 9],
        'b': [3, 6, 9]
    }, index=[9, 9, 9])
}
d = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
full = d.compute()


def test_shuffle():
    s = shuffle(d, d.b, npartitions=2)
    assert isinstance(s, dd.DataFrame)
    assert s.npartitions == 2

    x = get_sync(s.dask, (s._name, 0))
    y = get_sync(s.dask, (s._name, 1))

    assert not (set(x.b) & set(y.b))  # disjoint

    assert shuffle(d, d.b, npartitions=2)._name == shuffle(d, d.b,
                                                           npartitions=2)._name
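
The final assertion checks that shuffle produces a deterministic name for identical inputs; dask builds such names with tokenize, which hashes the arguments. A quick illustration of that property on its own:

from dask.base import tokenize

print(tokenize([1, 2, 3], "b") == tokenize([1, 2, 3], "b"))  # True: same inputs, same token
print(tokenize([1, 2, 3], "b") == tokenize([1, 2, 3], "c"))  # False: different inputs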
Example #19
def test_fast_functions():
    df = dd.DataFrame(dsk, 'x', ['a', 'b'], [None, None, None, None])
    e = df.a + df.b
    assert len(e.dask) > 6

    assert len(dd.optimize(e.dask, e._keys())) == 6
Example #20
File: join.py  Project: nils-braun/dask-sql
    def convert(self, rel: "org.apache.calcite.rel.RelNode",
                context: "dask_sql.Context") -> DataContainer:
        # Joining is a bit more complicated, so lets do it in steps:

        # 1. We now have two inputs (from left and right), so we fetch them both
        dc_lhs, dc_rhs = self.assert_inputs(rel, 2, context)
        cc_lhs = dc_lhs.column_container
        cc_rhs = dc_rhs.column_container

        # 2. dask's merge does some smart things with columns that have the same
        # name on lhs and rhs (which also includes reordering).
        # However, that would confuse our column numbering in SQL.
        # So we make our life easier by converting the column names into unique names.
        # We will convert back in the end.
        cc_lhs_renamed = cc_lhs.make_unique("lhs")
        cc_rhs_renamed = cc_rhs.make_unique("rhs")

        dc_lhs_renamed = DataContainer(dc_lhs.df, cc_lhs_renamed)
        dc_rhs_renamed = DataContainer(dc_rhs.df, cc_rhs_renamed)

        df_lhs_renamed = dc_lhs_renamed.assign()
        df_rhs_renamed = dc_rhs_renamed.assign()

        join_type = rel.getJoinType()
        join_type = self.JOIN_TYPE_MAPPING[str(join_type)]

        # 3. The join condition can come in two forms that we can understand:
        # (a) a = b
        # (b) X AND Y AND a = b AND Z ... (can also contain multiple a = b)
        # The first case is very simple and we do not need any additional filter.
        # In the second case we do a merge on all the a = b conditions
        # and then apply a filter using the other expressions.
        # In all other cases, we have to do a full cross join and filter afterwards;
        # that is probably nonsensical for large tables, but there is no other
        # known solution so far.
        join_condition = rel.getCondition()
        lhs_on, rhs_on, filter_condition = self._split_join_condition(
            join_condition)

        logger.debug(
            f"Joining with type {join_type} on columns {lhs_on}, {rhs_on}.")

        # lhs_on and rhs_on are the indices of the columns to merge on.
        # The given column indices are for the full, merged table which consists
        # of lhs and rhs put side-by-side (in this order)
        # We therefore need to normalize the rhs indices relative to the rhs table.
        rhs_on = [index - len(df_lhs_renamed.columns) for index in rhs_on]

        # 4. dask can only merge on the same column names.
        # We therefore create new columns on purpose, which have a distinct name.
        assert len(lhs_on) == len(rhs_on)
        if lhs_on:
            # 5. Now we can finally merge on these columns
            # The resulting dataframe will contain all (renamed) columns from the lhs and rhs
            # plus the added columns
            df = self._join_on_columns(
                df_lhs_renamed,
                df_rhs_renamed,
                lhs_on,
                rhs_on,
                join_type,
            )
        else:
            # 5. We are in the complex join case
            # where we have no column to merge on
            # This means we have no other chance than to merge
            # everything with everything...

            # TODO: we should implement a shortcut
            # for filter conditions that are always false

            def merge_single_partitions(lhs_partition, rhs_partition):
                # Do a cross join with the two partitions
                # TODO: it would be nice to apply the filter already here
                # problem: this would mean we need to ship the rex to the
                # workers (as this is executed on the workers),
                # which is definitely not possible (java dependency, JVM start...)
                lhs_partition = lhs_partition.assign(common=1)
                rhs_partition = rhs_partition.assign(common=1)

                return lhs_partition.merge(rhs_partition,
                                           on="common").drop(columns="common")

            # Iterate nested over all partitions from lhs and rhs and merge them
            name = "cross-join-" + tokenize(df_lhs_renamed, df_rhs_renamed)
            dsk = {(name, i * df_rhs_renamed.npartitions + j): (
                merge_single_partitions,
                (df_lhs_renamed._name, i),
                (df_rhs_renamed._name, j),
            )
                   for i in range(df_lhs_renamed.npartitions)
                   for j in range(df_rhs_renamed.npartitions)}

            graph = HighLevelGraph.from_collections(
                name, dsk, dependencies=[df_lhs_renamed, df_rhs_renamed])

            meta = dd.dispatch.concat(
                [df_lhs_renamed._meta_nonempty, df_rhs_renamed._meta_nonempty],
                axis=1)
            # TODO: Do we know the divisions in any way here?
            divisions = [None] * (len(dsk) + 1)
            df = dd.DataFrame(graph, name, meta=meta, divisions=divisions)

            warnings.warn(
                "Need to do a cross-join, which is typically very resource heavy",
                ResourceWarning,
            )

        # 6. So the next step is to make sure
        # we have the correct column order (and to remove the temporary join columns)
        correct_column_order = list(df_lhs_renamed.columns) + list(
            df_rhs_renamed.columns)
        cc = ColumnContainer(df.columns).limit_to(correct_column_order)

        # and to rename them like the rel specifies
        row_type = rel.getRowType()
        field_specifications = [str(f) for f in row_type.getFieldNames()]
        cc = cc.rename({
            from_col: to_col
            for from_col, to_col in zip(cc.columns, field_specifications)
        })
        cc = self.fix_column_to_row_type(cc, row_type)
        dc = DataContainer(df, cc)

        # 7. Last but not least we apply any filters by and-chaining together the filters
        if filter_condition:
            # This is a bit of code duplication with RexCallPlugin, but it is
            # probably worth keeping it separate.
            filter_condition = reduce(
                operator.and_,
                [
                    RexConverter.convert(rex, dc, context=context)
                    for rex in filter_condition
                ],
            )
            logger.debug(f"Additionally applying filter {filter_condition}")
            df = filter_or_scalar(df, filter_condition)
            dc = DataContainer(df, cc)

        dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
        return dc
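
The cross-join fallback above boils down to giving every row on both sides the same constant key and merging on it, partition pair by partition pair. The core trick at the pandas level, for reference:

import pandas as pd

lhs = pd.DataFrame({"a": [1, 2]})
rhs = pd.DataFrame({"b": ["x", "y", "z"]})

# a constant key on both sides turns an inner merge into a Cartesian product
cross = (lhs.assign(common=1)
            .merge(rhs.assign(common=1), on="common")
            .drop(columns="common"))
print(len(cross))  # 2 * 3 = 6 rows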
Example #21
    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            # dd.DataFrame cannot be built directly from a pandas frame;
            # dd.from_pandas is the supported conversion
            X = dd.from_pandas(X, npartitions=1)
        return self
Example #22
def test_categorize():
    # rename y to y_ to avoid pandas future warning about ambiguous
    # levels
    meta = clear_known_categories(frames4[0]).rename(columns={"y": "y_"})
    ddf = dd.DataFrame(
        {("unknown", i): df for (i, df) in enumerate(frames3)},
        "unknown",
        meta,
        [None] * 4,
    ).rename(columns={"y": "y_"})
    ddf = ddf.assign(w=ddf.w.cat.set_categories(["x", "y", "z"]))
    assert ddf.w.cat.known
    assert not ddf.y_.cat.known
    assert not ddf.index.cat.known
    df = ddf.compute()

    for index in [None, True, False]:
        known_index = index is not False
        # By default categorize object and unknown cat columns
        ddf2 = ddf.categorize(index=index)
        assert ddf2.y_.cat.known
        assert ddf2.v.cat.known
        assert ddf2.index.cat.known == known_index
        assert_eq(ddf2, df.astype({"v": "category"}), check_categorical=False)

        # Specifying split_every works
        ddf2 = ddf.categorize(index=index, split_every=2)
        assert ddf2.y_.cat.known
        assert ddf2.v.cat.known
        assert ddf2.index.cat.known == known_index
        assert_eq(ddf2, df.astype({"v": "category"}), check_categorical=False)

        # Specifying one column doesn't affect others
        ddf2 = ddf.categorize("v", index=index)
        assert not ddf2.y_.cat.known
        assert ddf2.v.cat.known
        assert ddf2.index.cat.known == known_index
        assert_eq(ddf2, df.astype({"v": "category"}), check_categorical=False)

        ddf2 = ddf.categorize("y_", index=index)
        assert ddf2.y_.cat.known
        assert ddf2.v.dtype == "object"
        assert ddf2.index.cat.known == known_index
        assert_eq(ddf2, df)

    ddf_known_index = ddf.categorize(columns=[], index=True)
    assert ddf_known_index.index.cat.known
    assert_eq(ddf_known_index, df)

    # Specifying known categorical or no columns is a no-op:
    assert ddf.categorize(["w"], index=False) is ddf
    assert ddf.categorize([], index=False) is ddf
    assert ddf_known_index.categorize(["w"]) is ddf_known_index
    assert ddf_known_index.categorize([]) is ddf_known_index

    # Bad split_every fails
    with pytest.raises(ValueError):
        ddf.categorize(split_every=1)

    with pytest.raises(ValueError):
        ddf.categorize(split_every="foo")
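
The test above revolves around whether a categorical column's categories are known to dask without computing; categorize scans the data once to learn them. A minimal sketch with public APIs:

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"v": ["x", "y", "x", "z"]})
ddf = dd.from_pandas(pdf, npartitions=2)

ddf2 = ddf.categorize("v")             # one pass over the data to collect categories
print(ddf2.v.cat.known)                # True
print(sorted(ddf2.v.cat.categories))   # ['x', 'y', 'z']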
Example #23
def read_pairix_block(
    filepath,
    block,
    names=None,
    dtypes=None,
    usecols=None,
    chromsizes=None,
    chunk_level=0,
):
    if chromsizes is None:
        f = pypairix.open(filepath)
        cs = f.get_chromsize()
        if not len(cs):
            raise ValueError(
                "No chromsize headers found in file. "
                "They must be provided explicitly."
            )
        chromsizes = pd.Series(dict([(c, int(s)) for c, s in cs]))
        del f

    chrom1, chrom2 = block
    nrows = chromsizes[chrom1]

    meta = (
        pd.read_csv(
            filepath,
            sep="\t",
            comment="#",
            header=None,
            names=names,
            dtype=dtypes,
            usecols=usecols,
            iterator=True,
        )
        .read(1024)
        .iloc[0:0]
    )

    # Make a unique task name
    token = tokenize(filepath, chromsizes, block, names, dtypes, usecols, chunk_level)
    task_name = "read-pairix-block-" + token

    # Build the task graph
    divisions = []
    dsk = {}
    edges = LEVEL[chunk_level]
    edges = edges[: np.searchsorted(edges, nrows)]
    if edges[-1] != nrows:
        edges = np.r_[edges, nrows]
    spans = zip(edges[:-1], edges[1:])
    for i, (lo, hi) in enumerate(spans):
        if i == 0:
            divisions.append(lo)
        divisions.append(hi - 1)
        slc = slice(lo, hi)
        dsk[task_name, i] = (
            _fetch_region,
            filepath,
            chromsizes,
            slc,
            block,
            names,
            usecols,
            meta,
        )

    # Generate ddf from dask graph
    return dd.DataFrame(dsk, task_name, meta, tuple(divisions))
Example #24
from dask.dataframe.shuffle import (shuffle,
                                    partitioning_index,
                                    rearrange_by_column,
                                    rearrange_by_divisions,
                                    maybe_buffered_partd,
                                    remove_nans)
from dask.dataframe.utils import assert_eq, make_meta


dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [1, 4, 7]},
                              index=[0, 1, 3]),
       ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [2, 5, 8]},
                              index=[5, 6, 8]),
       ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [3, 6, 9]},
                              index=[9, 9, 9])}
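# make_meta builds an empty frame with the given column dtypes and index dtype,
# which dask uses as the collection's metadata.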
meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
d = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
full = d.compute()


shuffle_func = shuffle  # conflicts with keyword argument


@pytest.mark.parametrize('shuffle', ['disk', 'tasks'])
def test_shuffle(shuffle):
    s = shuffle_func(d, d.b, shuffle=shuffle)
    assert isinstance(s, dd.DataFrame)
    assert s.npartitions == d.npartitions

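    # Materialize individual output partitions directly from the task graph.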
    x = dask.get(s.dask, (s._name, 0))
    y = dask.get(s.dask, (s._name, 1))
Example #25
def test_split_apply_combine_on_series():
    dsk = {
        ('x', 0): pd.DataFrame({
            'a': [1, 2, 6],
            'b': [4, 2, 7]
        },
                               index=[0, 1, 3]),
        ('x', 1): pd.DataFrame({
            'a': [4, 4, 6],
            'b': [3, 3, 1]
        },
                               index=[5, 6, 8]),
        ('x', 2): pd.DataFrame({
            'a': [4, 3, 7],
            'b': [1, 1, 3]
        },
                               index=[9, 9, 9])
    }
    ddf1 = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    for ddkey, pdkey in [('b', 'b'), (ddf1.b, pdf1.b),
                         (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.groupby(ddkey).a.min(), pdf1.groupby(pdkey).a.min())
        assert eq(ddf1.groupby(ddkey).a.max(), pdf1.groupby(pdkey).a.max())
        assert eq(ddf1.groupby(ddkey).a.count(), pdf1.groupby(pdkey).a.count())
        assert eq(ddf1.groupby(ddkey).a.mean(), pdf1.groupby(pdkey).a.mean())
        assert eq(
            ddf1.groupby(ddkey).a.nunique(),
            pdf1.groupby(pdkey).a.nunique())

        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(ddf1.groupby(ddkey).mean(), pdf1.groupby(pdkey).mean())

    for ddkey, pdkey in [(ddf1.b, pdf1.b), (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.a.groupby(ddkey).sum(),
                  pdf1.a.groupby(pdkey).sum(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).max(),
                  pdf1.a.groupby(pdkey).max(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).count(),
                  pdf1.a.groupby(pdkey).count(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).mean(),
                  pdf1.a.groupby(pdkey).mean(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).nunique(),
                  pdf1.a.groupby(pdkey).nunique(),
                  check_names=False)

    for i in range(8):
        assert eq(
            ddf1.groupby(ddf1.b > i).a.sum(),
            pdf1.groupby(pdf1.b > i).a.sum())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.min(),
            pdf1.groupby(pdf1.b > i).a.min())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.max(),
            pdf1.groupby(pdf1.b > i).a.max())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.count(),
            pdf1.groupby(pdf1.b > i).a.count())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.mean(),
            pdf1.groupby(pdf1.b > i).a.mean())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.nunique(),
            pdf1.groupby(pdf1.b > i).a.nunique())

        assert eq(
            ddf1.groupby(ddf1.a > i).b.sum(),
            pdf1.groupby(pdf1.a > i).b.sum())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.min(),
            pdf1.groupby(pdf1.a > i).b.min())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.max(),
            pdf1.groupby(pdf1.a > i).b.max())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.count(),
            pdf1.groupby(pdf1.a > i).b.count())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.mean(),
            pdf1.groupby(pdf1.a > i).b.mean())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.nunique(),
            pdf1.groupby(pdf1.a > i).b.nunique())

        assert eq(
            ddf1.groupby(ddf1.b > i).sum(),
            pdf1.groupby(pdf1.b > i).sum())
        assert eq(
            ddf1.groupby(ddf1.b > i).min(),
            pdf1.groupby(pdf1.b > i).min())
        assert eq(
            ddf1.groupby(ddf1.b > i).max(),
            pdf1.groupby(pdf1.b > i).max())
        assert eq(
            ddf1.groupby(ddf1.b > i).count(),
            pdf1.groupby(pdf1.b > i).count())
        assert eq(
            ddf1.groupby(ddf1.b > i).mean(),
            pdf1.groupby(pdf1.b > i).mean())

        assert eq(
            ddf1.groupby(ddf1.a > i).sum(),
            pdf1.groupby(pdf1.a > i).sum())
        assert eq(
            ddf1.groupby(ddf1.a > i).min(),
            pdf1.groupby(pdf1.a > i).min())
        assert eq(
            ddf1.groupby(ddf1.a > i).max(),
            pdf1.groupby(pdf1.a > i).max())
        assert eq(
            ddf1.groupby(ddf1.a > i).count(),
            pdf1.groupby(pdf1.a > i).count())
        assert eq(
            ddf1.groupby(ddf1.a > i).mean(),
            pdf1.groupby(pdf1.a > i).mean())

    for ddkey, pdkey in [('a', 'a'), (ddf1.a, pdf1.a),
                         (ddf1.a + 1, pdf1.a + 1), (ddf1.a > 3, pdf1.a > 3)]:
        assert eq(ddf1.groupby(ddkey).b.sum(), pdf1.groupby(pdkey).b.sum())
        assert eq(ddf1.groupby(ddkey).b.min(), pdf1.groupby(pdkey).b.min())
        assert eq(ddf1.groupby(ddkey).b.max(), pdf1.groupby(pdkey).b.max())
        assert eq(ddf1.groupby(ddkey).b.count(), pdf1.groupby(pdkey).b.count())
        assert eq(ddf1.groupby(ddkey).b.mean(), pdf1.groupby(pdkey).b.mean())
        assert eq(
            ddf1.groupby(ddkey).b.nunique(),
            pdf1.groupby(pdkey).b.nunique())

        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(
            ddf1.groupby(ddkey).mean(),
            pdf1.groupby(pdkey).mean().astype(float))

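    # Graph construction should be deterministic: building the same expression
    # twice must yield identical task keys.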
    assert sorted(ddf1.groupby('b').a.sum().dask) == \
           sorted(ddf1.groupby('b').a.sum().dask)
    assert sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask) == \
           sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask)

    # test raises with incorrect key
    assert raises(KeyError, lambda: ddf1.groupby('x'))
    assert raises(KeyError, lambda: ddf1.groupby(['a', 'x']))
    assert raises(KeyError, lambda: ddf1.groupby('a')['x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')['b', 'x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')[['b', 'x']])

    # test graph node labels
    assert_dask_graph(ddf1.groupby('b').a.sum(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.min(), 'series-groupby-min')
    assert_dask_graph(ddf1.groupby('b').a.max(), 'series-groupby-max')
    assert_dask_graph(ddf1.groupby('b').a.count(), 'series-groupby-count')
    # mean is computed from sum and count operations
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-count')
    assert_dask_graph(ddf1.groupby('b').a.nunique(), 'series-groupby-nunique')

    assert_dask_graph(ddf1.groupby('b').sum(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').min(), 'dataframe-groupby-min')
    assert_dask_graph(ddf1.groupby('b').max(), 'dataframe-groupby-max')
    assert_dask_graph(ddf1.groupby('b').count(), 'dataframe-groupby-count')
    # mean is computed from sum and count operations
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-count')
Example #26
def test_arithmetics():
    dsk = {
        ('x', 0): pd.DataFrame({
            'a': [1, 2, 3],
            'b': [4, 5, 6]
        },
                               index=[0, 1, 3]),
        ('x', 1): pd.DataFrame({
            'a': [4, 5, 6],
            'b': [3, 2, 1]
        },
                               index=[5, 6, 8]),
        ('x', 2): pd.DataFrame({
            'a': [7, 8, 9],
            'b': [0, 0, 0]
        },
                               index=[9, 9, 9])
    }
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    pdf2 = pd.DataFrame({
        'a': [1, 2, 3, 4, 5, 6, 7, 8],
        'b': [5, 6, 7, 8, 1, 2, 3, 4]
    })
    pdf3 = pd.DataFrame({
        'a': [5, 6, 7, 8, 4, 3, 2, 1],
        'b': [2, 4, 5, 3, 4, 2, 1, 0]
    })
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)

    dsk4 = {
        ('y', 0): pd.DataFrame({
            'a': [3, 2, 1],
            'b': [7, 8, 9]
        },
                               index=[0, 1, 3]),
        ('y', 1): pd.DataFrame({
            'a': [5, 2, 8],
            'b': [4, 2, 3]
        },
                               index=[5, 6, 8]),
        ('y', 2): pd.DataFrame({
            'a': [1, 4, 10],
            'b': [1, 0, 5]
        },
                               index=[9, 9, 9])
    }
    ddf4 = dd.DataFrame(dsk4, 'y', meta, [0, 4, 9, 9])
    pdf4 = ddf4.compute()

    # Arithmetics
    cases = [
        (ddf1, ddf1, pdf1, pdf1),
        (ddf1, ddf1.repartition([0, 1, 3, 6, 9]), pdf1, pdf1),
        (ddf2, ddf3, pdf2, pdf3),
        (ddf2.repartition([0, 3, 6, 7]), ddf3.repartition([0, 7]), pdf2, pdf3),
        (ddf2.repartition([0, 7]),
         ddf3.repartition([0, 2, 4, 5, 7]), pdf2, pdf3),
        (ddf1, ddf4, pdf1, pdf4),
        (ddf1, ddf4.repartition([0, 9]), pdf1, pdf4),
        (ddf1.repartition([0, 3, 9]), ddf4.repartition([0, 5, 9]), pdf1, pdf4),
        # dask + pandas
        (ddf1, pdf4, pdf1, pdf4),
        (ddf2, pdf3, pdf2, pdf3)
    ]

    for (l, r, el, er) in cases:
        check_series_arithmetics(l.a, r.b, el.a, er.b)
        check_frame_arithmetics(l, r, el, er)

    # With differing indexes, pandas raises ValueError in comparison ops

    pdf5 = pd.DataFrame(
        {
            'a': [3, 2, 1, 5, 2, 8, 1, 4, 10],
            'b': [7, 8, 9, 4, 2, 3, 1, 0, 5]
        },
        index=[0, 1, 3, 5, 6, 8, 9, 9, 9])
    ddf5 = dd.from_pandas(pdf5, 2)

    pdf6 = pd.DataFrame(
        {
            'a': [3, 2, 1, 5, 2, 8, 1, 4, 10],
            'b': [7, 8, 9, 5, 7, 8, 4, 2, 5]
        },
        index=[0, 1, 2, 3, 4, 5, 6, 7, 9])
    ddf6 = dd.from_pandas(pdf6, 4)

    pdf7 = pd.DataFrame(
        {
            'a': [1, 2, 3, 4, 5, 6, 7, 8],
            'b': [5, 6, 7, 8, 1, 2, 3, 4]
        },
        index=list('aaabcdeh'))
    pdf8 = pd.DataFrame(
        {
            'a': [5, 6, 7, 8, 4, 3, 2, 1],
            'b': [2, 4, 5, 3, 4, 2, 1, 0]
        },
        index=list('abcdefgh'))
    ddf7 = dd.from_pandas(pdf7, 3)
    ddf8 = dd.from_pandas(pdf8, 4)

    pdf9 = pd.DataFrame(
        {
            'a': [1, 2, 3, 4, 5, 6, 7, 8],
            'b': [5, 6, 7, 8, 1, 2, 3, 4],
            'c': [5, 6, 7, 8, 1, 2, 3, 4]
        },
        index=list('aaabcdeh'))
    pdf10 = pd.DataFrame(
        {
            'b': [5, 6, 7, 8, 4, 3, 2, 1],
            'c': [2, 4, 5, 3, 4, 2, 1, 0],
            'd': [2, 4, 5, 3, 4, 2, 1, 0]
        },
        index=list('abcdefgh'))
    ddf9 = dd.from_pandas(pdf9, 3)
    ddf10 = dd.from_pandas(pdf10, 4)

    # Arithmetics with different index
    cases = [
        (ddf5, ddf6, pdf5, pdf6),
        (ddf5.repartition([0, 9]), ddf6, pdf5, pdf6),
        (ddf5.repartition([0, 5, 9]), ddf6.repartition([0, 7, 9]), pdf5, pdf6),
        (ddf7, ddf8, pdf7, pdf8),
        (ddf7.repartition(['a', 'c', 'h']),
         ddf8.repartition(['a', 'h']), pdf7, pdf8),
        (ddf7.repartition(['a', 'b', 'e', 'h']),
         ddf8.repartition(['a', 'e', 'h']), pdf7, pdf8),
        (ddf9, ddf10, pdf9, pdf10),
        (ddf9.repartition(['a', 'c', 'h']),
         ddf10.repartition(['a', 'h']), pdf9, pdf10),
        # dask + pandas
        (ddf5, pdf6, pdf5, pdf6),
        (ddf7, pdf8, pdf7, pdf8),
        (ddf9, pdf10, pdf9, pdf10)
    ]

    for (l, r, el, er) in cases:
        check_series_arithmetics(l.a,
                                 r.b,
                                 el.a,
                                 er.b,
                                 allow_comparison_ops=False)
        check_frame_arithmetics(l, r, el, er, allow_comparison_ops=False)
Example #27
def test_reductions_frame():
    dsk = {
        ('x', 0): pd.DataFrame({
            'a': [1, 2, 3],
            'b': [4, 5, 6]
        },
                               index=[0, 1, 3]),
        ('x', 1): pd.DataFrame({
            'a': [4, 5, 6],
            'b': [3, 2, 1]
        },
                               index=[5, 6, 8]),
        ('x', 2): pd.DataFrame({
            'a': [7, 8, 9],
            'b': [0, 0, 0]
        },
                               index=[9, 9, 9])
    }
    ddf1 = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    assert eq(ddf1.sum(), pdf1.sum())
    assert eq(ddf1.min(), pdf1.min())
    assert eq(ddf1.max(), pdf1.max())
    assert eq(ddf1.count(), pdf1.count())
    assert eq(ddf1.std(), pdf1.std())
    assert eq(ddf1.var(), pdf1.var())
    assert eq(ddf1.std(ddof=0), pdf1.std(ddof=0))
    assert eq(ddf1.var(ddof=0), pdf1.var(ddof=0))
    assert eq(ddf1.mean(), pdf1.mean())

    for axis in [0, 1, 'index', 'columns']:
        assert eq(ddf1.sum(axis=axis), pdf1.sum(axis=axis))
        assert eq(ddf1.min(axis=axis), pdf1.min(axis=axis))
        assert eq(ddf1.max(axis=axis), pdf1.max(axis=axis))
        assert eq(ddf1.count(axis=axis), pdf1.count(axis=axis))
        assert eq(ddf1.std(axis=axis), pdf1.std(axis=axis))
        assert eq(ddf1.var(axis=axis), pdf1.var(axis=axis))
        assert eq(ddf1.std(axis=axis, ddof=0), pdf1.std(axis=axis, ddof=0))
        assert eq(ddf1.var(axis=axis, ddof=0), pdf1.var(axis=axis, ddof=0))
        assert eq(ddf1.mean(axis=axis), pdf1.mean(axis=axis))

    assert raises(ValueError, lambda: ddf1.sum(axis='incorrect').compute())

    # axis=0
    assert_dask_graph(ddf1.sum(), 'dataframe-sum')
    assert_dask_graph(ddf1.min(), 'dataframe-min')
    assert_dask_graph(ddf1.max(), 'dataframe-max')
    assert_dask_graph(ddf1.count(), 'dataframe-count')
    # std, var, and mean are computed from sum and count operations
    assert_dask_graph(ddf1.std(), 'dataframe-sum')
    assert_dask_graph(ddf1.std(), 'dataframe-count')
    assert_dask_graph(ddf1.var(), 'dataframe-sum')
    assert_dask_graph(ddf1.var(), 'dataframe-count')
    assert_dask_graph(ddf1.mean(), 'dataframe-sum')
    assert_dask_graph(ddf1.mean(), 'dataframe-count')

    # axis=1
    assert_dask_graph(ddf1.sum(axis=1), 'dataframe-sum')
    assert_dask_graph(ddf1.min(axis=1), 'dataframe-min')
    assert_dask_graph(ddf1.max(axis=1), 'dataframe-max')
    assert_dask_graph(ddf1.count(axis=1), 'dataframe-count')
    assert_dask_graph(ddf1.std(axis=1), 'dataframe-std')
    assert_dask_graph(ddf1.var(axis=1), 'dataframe-var')
    assert_dask_graph(ddf1.mean(axis=1), 'dataframe-mean')
Example #28
sns.barplot(data=hour_no, x="mnth", y="cnt", ax=ax1)
ax1.set(xlabel="Month", ylabel="Average Count", title="Average Count By Month")


# Looking at the distribution of rentals over the course of a day, there are clearly periods of higher demand, i.e. the times when people commute to and from work or school, compared with the rest of the day.
# There is of course a strong seasonal effect, as well as an effect of whether or not it is a working day.

# In[69]:


# Plot hourly distributions by season, day of week, and working day
fig, (ax1, ax2, ax3) = plt.subplots(nrows=3)
fig.set_size_inches(15, 18)

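# NOTE: dask's dd.DataFrame constructor expects (dsk, name, meta, divisions);
# for wrapping an in-memory groupby result like this, pd.DataFrame is presumably intended.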
hourAggregated = dd.DataFrame(
    hour_no.groupby(["hr", "season"], sort=True)["cnt"].mean()
).reset_index()
sns.pointplot(
    x=hourAggregated["hr"],
    y=hourAggregated["cnt"],
    hue=hourAggregated["season"],
    data=hourAggregated,
    join=True,
    ax=ax1,
)
ax1.set(
    xlabel="Hour Of The Day",
    ylabel="Users Count",
    title="Average Users Count By Hour Of The Day Across Season",
    label="big",
)
Example #29
def test_reductions():
    dsk = {
        ('x', 0): pd.DataFrame({
            'a': [1, 2, 3],
            'b': [4, 5, 6]
        },
                               index=[0, 1, 3]),
        ('x', 1): pd.DataFrame({
            'a': [4, 5, 6],
            'b': [3, 2, 1]
        },
                               index=[5, 6, 8]),
        ('x', 2): pd.DataFrame({
            'a': [7, 8, 9],
            'b': [0, 0, 0]
        },
                               index=[9, 9, 9])
    }
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3)
    nands1 = dd.from_pandas(nans1, 2)
    nans2 = pd.Series([1] + [np.nan] * 8)
    nands2 = dd.from_pandas(nans2, 2)
    nans3 = pd.Series([np.nan] * 9)
    nands3 = dd.from_pandas(nans3, 2)

    bools = pd.Series([True, False, True, False, True], dtype=bool)
    boolds = dd.from_pandas(bools, 2)

    for dds, pds in [(ddf1.b, pdf1.b), (ddf1.a, pdf1.a),
                     (ddf1['a'], pdf1['a']), (ddf1['b'], pdf1['b']),
                     (nands1, nans1), (nands2, nans2), (nands3, nans3),
                     (boolds, bools)]:
        assert isinstance(dds, dd.Series)
        assert isinstance(pds, pd.Series)
        assert eq(dds.sum(), pds.sum())
        assert eq(dds.min(), pds.min())
        assert eq(dds.max(), pds.max())
        assert eq(dds.count(), pds.count())
        assert eq(dds.std(), pds.std())
        assert eq(dds.var(), pds.var())
        assert eq(dds.std(ddof=0), pds.std(ddof=0))
        assert eq(dds.var(ddof=0), pds.var(ddof=0))
        assert eq(dds.mean(), pds.mean())
        assert eq(dds.nunique(), pds.nunique())
        assert eq(dds.nbytes, pds.nbytes)

        assert eq(dds.sum(skipna=False), pds.sum(skipna=False))
        assert eq(dds.min(skipna=False), pds.min(skipna=False))
        assert eq(dds.max(skipna=False), pds.max(skipna=False))
        assert eq(dds.std(skipna=False), pds.std(skipna=False))
        assert eq(dds.var(skipna=False), pds.var(skipna=False))
        assert eq(dds.std(skipna=False, ddof=0), pds.std(skipna=False, ddof=0))
        assert eq(dds.var(skipna=False, ddof=0), pds.var(skipna=False, ddof=0))
        assert eq(dds.mean(skipna=False), pds.mean(skipna=False))

    assert_dask_graph(ddf1.b.sum(), 'series-sum')
    assert_dask_graph(ddf1.b.min(), 'series-min')
    assert_dask_graph(ddf1.b.max(), 'series-max')
    assert_dask_graph(ddf1.b.count(), 'series-count')
    assert_dask_graph(ddf1.b.std(), 'series-std')
    assert_dask_graph(ddf1.b.var(), 'series-var')
    assert_dask_graph(ddf1.b.std(ddof=0), 'series-std')
    assert_dask_graph(ddf1.b.var(ddof=0), 'series-var')
    assert_dask_graph(ddf1.b.mean(), 'series-mean')
    # nunique is performed using drop-duplicates
    assert_dask_graph(ddf1.b.nunique(), 'drop-duplicates')

    eq(ddf1.index.min(), pdf1.index.min())
    eq(ddf1.index.max(), pdf1.index.max())