Example #1
def test_parquet(s3, engine):
    dd = pytest.importorskip("dask.dataframe")
    from dask.dataframe._compat import tm

    lib = pytest.importorskip(engine)
    if engine == "pyarrow" and LooseVersion(lib.__version__) < "0.13.1":
        pytest.skip("pyarrow < 0.13.1 not supported for parquet")
    import pandas as pd
    import numpy as np

    url = "s3://%s/test.parquet" % test_bucket_name

    data = pd.DataFrame(
        {
            "i32": np.arange(1000, dtype=np.int32),
            "i64": np.arange(1000, dtype=np.int64),
            "f": np.arange(1000, dtype=np.float64),
            "bhello": np.random.choice([u"hello", u"you", u"people"], size=1000).astype(
                "O"
            ),
        },
        index=pd.Index(np.arange(1000), name="foo"),
    )
    df = dd.from_pandas(data, chunksize=500)
    df.to_parquet(url, engine=engine)

    files = [f.split("/")[-1] for f in s3.ls(url)]
    assert "_common_metadata" in files
    assert "part.0.parquet" in files

    df2 = dd.read_parquet(url, index="foo", engine=engine)
    assert len(df2.divisions) > 1

    tm.assert_frame_equal(data, df2.compute())
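For readers without the S3 fixture, the same round trip can be exercised against local disk; a minimal sketch (the relative output path and the pyarrow engine are illustrative assumptions, not part of the test above):

import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask.dataframe.utils import assert_eq

data = pd.DataFrame(
    {"i32": np.arange(10, dtype=np.int32)},
    index=pd.Index(np.arange(10), name="foo"),
)
ddf = dd.from_pandas(data, chunksize=5)
ddf.to_parquet("./test.parquet", engine="pyarrow")  # any writable path works
back = dd.read_parquet("./test.parquet", index="foo", engine="pyarrow")
assert_eq(data, back)  # compares after computing the dask side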
Example #2
def assert_eq(
    a,
    b,
    check_names=True,
    check_dtype=True,
    check_divisions=True,
    check_index=True,
    scheduler="sync",
    **kwargs,
):
    if check_divisions:
        assert_divisions(a, scheduler=scheduler)
        assert_divisions(b, scheduler=scheduler)
        if hasattr(a, "divisions") and hasattr(b, "divisions"):
            at = type(np.asarray(a.divisions).tolist()[0])  # numpy to python
            bt = type(np.asarray(b.divisions).tolist()[0])  # scalar conversion
            assert at == bt, (at, bt)
    assert_sane_keynames(a)
    assert_sane_keynames(b)
    a = _check_dask(a,
                    check_names=check_names,
                    check_dtypes=check_dtype,
                    scheduler=scheduler)
    b = _check_dask(b,
                    check_names=check_names,
                    check_dtypes=check_dtype,
                    scheduler=scheduler)
    if hasattr(a, "to_pandas"):
        a = a.to_pandas()
    if hasattr(b, "to_pandas"):
        b = b.to_pandas()
    if isinstance(a, (pd.DataFrame, pd.Series)):
        a = _maybe_sort(a, check_index)
        b = _maybe_sort(b, check_index)
    if not check_index:
        a = a.reset_index(drop=True)
        b = b.reset_index(drop=True)
    if isinstance(a, pd.DataFrame):
        tm.assert_frame_equal(a,
                              b,
                              check_names=check_names,
                              check_dtype=check_dtype,
                              **kwargs)
    elif isinstance(a, pd.Series):
        tm.assert_series_equal(a,
                               b,
                               check_names=check_names,
                               check_dtype=check_dtype,
                               **kwargs)
    elif isinstance(a, pd.Index):
        tm.assert_index_equal(a, b, exact=check_dtype, **kwargs)
    else:
        if a == b:
            return True
        else:
            if np.isnan(a):
                assert np.isnan(b)
            else:
                assert np.allclose(a, b)
    return True
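assert_eq is dask's workhorse comparison helper; a minimal usage sketch, assuming the dask.dataframe.utils import path used throughout the dask test suite:

import pandas as pd
import dask.dataframe as dd
from dask.dataframe.utils import assert_eq

pdf = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0]})
ddf = dd.from_pandas(pdf, npartitions=2)
assert_eq(ddf, pdf)                   # frames/series: computed, then tm-compared
assert_eq(ddf.a.sum(), pdf.a.sum())   # scalars fall through to ==/isnan/allclose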
Example #3
def test_getitem():
    df = pd.DataFrame(
        {
            "A": [1, 2, 3, 4, 5, 6, 7, 8, 9],
            "B": [9, 8, 7, 6, 5, 4, 3, 2, 1],
            "C": [True, False, True] * 3,
        },
        columns=list("ABC"),
    )
    ddf = dd.from_pandas(df, 2)
    assert_eq(ddf["A"], df["A"])
    # check cache consistency
    tm.assert_series_equal(ddf["A"]._meta, ddf._meta["A"])

    assert_eq(ddf[["A", "B"]], df[["A", "B"]])
    tm.assert_frame_equal(ddf[["A", "B"]]._meta, ddf._meta[["A", "B"]])

    assert_eq(ddf[ddf.C], df[df.C])
    tm.assert_series_equal(ddf.C._meta, ddf._meta.C)

    assert_eq(ddf[ddf.C.repartition([0, 2, 5, 8])], df[df.C])

    pytest.raises(KeyError, lambda: df["X"])
    pytest.raises(KeyError, lambda: df[["A", "X"]])
    pytest.raises(AttributeError, lambda: df.X)

    # not str/unicode
    df = pd.DataFrame(np.random.randn(10, 5))
    ddf = dd.from_pandas(df, 2)
    assert_eq(ddf[0], df[0])
    assert_eq(ddf[[1, 2]], df[[1, 2]])

    pytest.raises(KeyError, lambda: df[8])
    pytest.raises(KeyError, lambda: df[[1, 8]])
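The `_meta` attribute checked above is the empty pandas object dask carries to track schema; a quick sketch of what it holds:

import pandas as pd
import dask.dataframe as dd

df = pd.DataFrame({"A": [1, 2, 3], "B": [4.0, 5.0, 6.0]})
ddf = dd.from_pandas(df, npartitions=2)
print(ddf._meta)        # zero-row DataFrame with columns A (int64), B (float64)
print(ddf["A"]._meta)   # zero-row Series named "A"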
Example #4
def test_hdf_globbing():
    pytest.importorskip("tables")
    df = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}, index=[1.0, 2.0, 3.0, 4.0]
    )

    with tmpdir() as tdir:
        df.to_hdf(os.path.join(tdir, "one.h5"), "/foo/data", format="table")
        df.to_hdf(os.path.join(tdir, "two.h5"), "/bar/data", format="table")
        df.to_hdf(os.path.join(tdir, "two.h5"), "/foo/data", format="table")

        with dask.config.set(scheduler="sync"):
            res = dd.read_hdf(os.path.join(tdir, "one.h5"), "/*/data", chunksize=2)
            assert res.npartitions == 2
            tm.assert_frame_equal(res.compute(), df)

            res = dd.read_hdf(
                os.path.join(tdir, "one.h5"), "/*/data", chunksize=2, start=1, stop=3
            )
            expected = pd.read_hdf(
                os.path.join(tdir, "one.h5"), "/foo/data", start=1, stop=3
            )
            tm.assert_frame_equal(res.compute(), expected)

            res = dd.read_hdf(os.path.join(tdir, "two.h5"), "/*/data", chunksize=2)
            assert res.npartitions == 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

            res = dd.read_hdf(os.path.join(tdir, "*.h5"), "/foo/data", chunksize=2)
            assert res.npartitions == 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

            res = dd.read_hdf(os.path.join(tdir, "*.h5"), "/*/data", chunksize=2)
            assert res.npartitions == 2 + 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 3))
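The partition counts follow from the four-row frame and chunksize=2: every node matched by the glob contributes two partitions, so one.h5 with "/*/data" (one node) gives 2, two.h5 gives 4, "*.h5" with "/foo/data" matches two nodes for 4, and "*.h5" with "/*/data" matches three nodes for 6.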
Example #5
def test_meta_duplicated():
    df = pd.DataFrame(columns=["A", "A", "B"])
    res = meta_nonempty(df)

    exp = pd.DataFrame(
        [["foo", "foo", "foo"], ["foo", "foo", "foo"]],
        index=["a", "b"],
        columns=["A", "A", "B"],
    )
    tm.assert_frame_equal(res, exp)
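meta_nonempty turns an empty meta frame into a small frame of placeholder values (for example "foo" for object columns, as the expected frame shows) so dtype-preserving operations can be probed; a minimal sketch, assuming the dask.dataframe.utils import path:

import pandas as pd
from dask.dataframe.utils import meta_nonempty

meta = pd.DataFrame({"A": pd.Series(dtype="int64"), "B": pd.Series(dtype="object")})
print(meta_nonempty(meta))  # two rows of dummy values, dtypes preserved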
Example #6
def test_to_csv_gzip():
    df = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}, index=[1.0, 2.0, 3.0, 4.0]
    )

    for npartitions in [1, 2]:
        a = dd.from_pandas(df, npartitions)
        with tmpfile("csv") as fn:
            a.to_csv(fn, compression="gzip")
            result = pd.read_csv(fn, index_col=0, compression="gzip")
            tm.assert_frame_equal(result, df)
Example #7
File: test_s3.py Project: trentwatt/dask
def test_parquet(s3, engine, s3so, metadata_file):
    import s3fs

    dd = pytest.importorskip("dask.dataframe")
    pd = pytest.importorskip("pandas")
    np = pytest.importorskip("numpy")
    from dask.dataframe._compat import tm

    lib = pytest.importorskip(engine)
    if engine == "pyarrow" and LooseVersion(lib.__version__) < "0.13.1":
        pytest.skip("pyarrow < 0.13.1 not supported for parquet")
    if (engine == "pyarrow" and LooseVersion(lib.__version__) >= "2.0"
            and LooseVersion(lib.__version__) < "3.0"
            and LooseVersion(s3fs.__version__) > "0.5.0"):
        pytest.skip("#7056 - new s3fs not supported before pyarrow 3.0")

    url = "s3://%s/test.parquet" % test_bucket_name

    data = pd.DataFrame(
        {
            "i32": np.arange(1000, dtype=np.int32),
            "i64": np.arange(1000, dtype=np.int64),
            "f": np.arange(1000, dtype=np.float64),
            "bhello": np.random.choice(["hello", "you", "people"], size=1000).astype("O"),
        },
        index=pd.Index(np.arange(1000), name="foo"),
    )
    df = dd.from_pandas(data, chunksize=500)
    df.to_parquet(url,
                  engine=engine,
                  storage_options=s3so,
                  write_metadata_file=metadata_file)

    files = [f.split("/")[-1] for f in s3.ls(url)]
    if metadata_file:
        assert "_common_metadata" in files
        assert "_metadata" in files
    assert "part.0.parquet" in files

    df2 = dd.read_parquet(url,
                          index="foo",
                          gather_statistics=True,
                          engine=engine,
                          storage_options=s3so)
    assert len(df2.divisions) > 1

    tm.assert_frame_equal(data, df2.compute())
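Compared with Example #1, this later revision threads storage_options=s3so through to s3fs, only expects the _metadata/_common_metadata sidecar files when write_metadata_file is set, requests gather_statistics on read, and skips pyarrow 2.x with newer s3fs per dask#7056.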
Example #8
def test_to_hdf_kwargs():
    pytest.importorskip("tables")
    df = pd.DataFrame({"A": ["a", "aaaa"]})
    ddf = dd.from_pandas(df, npartitions=2)
    with tmpfile("h5") as fn:
        ddf.to_hdf(fn, "foo4", format="table", min_itemsize=4)
        df2 = pd.read_hdf(fn, "foo4")
        tm.assert_frame_equal(df, df2)

    # test shorthand 't' for table
    with tmpfile("h5") as fn:
        ddf.to_hdf(fn, "foo4", format="t", min_itemsize=4)
        df2 = pd.read_hdf(fn, "foo4")
        tm.assert_frame_equal(df, df2)
Example #9
def test_gh_2730():
    large = pd.DataFrame({"KEY": np.arange(0, 50000)})
    small = pd.DataFrame({"KEY": np.arange(25, 500)})

    dd_left = dd.from_pandas(small, npartitions=3)
    dd_right = dd.from_pandas(large, npartitions=257)

    with dask.config.set(shuffle="tasks", scheduler="sync"):
        dd_merged = dd_left.merge(dd_right, how="inner", on="KEY")
        result = dd_merged.compute()

    expected = large.merge(small, how="inner", on="KEY")

    tm.assert_frame_equal(result.sort_values("KEY").reset_index(drop=True), expected)
Example #10
def test_hdf_file_list():
    pytest.importorskip("tables")
    df = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}, index=[1.0, 2.0, 3.0, 4.0]
    )

    with tmpdir() as tdir:
        df.iloc[:2].to_hdf(os.path.join(tdir, "one.h5"), "dataframe", format="table")
        df.iloc[2:].to_hdf(os.path.join(tdir, "two.h5"), "dataframe", format="table")

        with dask.config.set(scheduler="sync"):
            input_files = [os.path.join(tdir, "one.h5"), os.path.join(tdir, "two.h5")]
            res = dd.read_hdf(input_files, "dataframe")
            tm.assert_frame_equal(res.compute(), df)
Example #11
def test_concat_unions_categoricals():
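    # `frames` through `frames6` are lists of pandas objects built in the
    # test's setup, which this excerpt omits.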
    # Categorical DataFrame, regular index
    tm.assert_frame_equal(_concat(frames), pd.concat(frames2))

    # Categorical Series, regular index
    tm.assert_series_equal(_concat([i.y for i in frames]),
                           pd.concat([i.y for i in frames2]))

    # Categorical Index
    tm.assert_index_equal(_concat([i.index for i in frames3]),
                          pd.concat([i for i in frames4]).index)

    # Categorical DataFrame, Categorical Index
    tm.assert_frame_equal(_concat(frames3), pd.concat(frames4))

    # Non-categorical DataFrame, Categorical Index
    tm.assert_frame_equal(
        _concat([i[["x", "z"]] for i in frames3]),
        pd.concat([i[["x", "z"]] for i in frames4]),
    )

    # Categorical Series, Categorical Index
    tm.assert_series_equal(_concat([i.z for i in frames3]),
                           pd.concat([i.z for i in frames4]))

    # Non-categorical Series, Categorical Index
    tm.assert_series_equal(_concat([i.x for i in frames3]),
                           pd.concat([i.x for i in frames4]))

    # MultiIndex with Categorical Index
    tm.assert_index_equal(_concat([i.index for i in frames5]),
                          pd.concat([i for i in frames6]).index)

    # DataFrame, MultiIndex with CategoricalIndex
    tm.assert_frame_equal(_concat(frames5), pd.concat(frames6))
Example #12
def test_to_hdf():
    pytest.importorskip("tables")
    df = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}, index=[1.0, 2.0, 3.0, 4.0]
    )
    a = dd.from_pandas(df, 2)

    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data")
        out = pd.read_hdf(fn, "/data")
        tm.assert_frame_equal(df, out[:])

    with tmpfile("h5") as fn:
        a.x.to_hdf(fn, "/data")
        out = pd.read_hdf(fn, "/data")
        tm.assert_series_equal(df.x, out[:])

    a = dd.from_pandas(df, 1)
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data")
        out = pd.read_hdf(fn, "/data")
        tm.assert_frame_equal(df, out[:])

    # test compute = False
    with tmpfile("h5") as fn:
        r = a.to_hdf(fn, "/data", compute=False)
        r.compute()
        out = pd.read_hdf(fn, "/data")
        tm.assert_frame_equal(df, out[:])
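With compute=False, to_hdf returns a lazy result instead of writing eagerly; nothing lands in the file until r.compute() runs, which the final read-back verifies.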
Example #13
File: test_io.py Project: aparwal7/6242
def test_from_pandas_dataframe():
    a = list("aaaaaaabbbbbbbbccccccc")
    df = pd.DataFrame(
        dict(a=a, b=np.random.randn(len(a))),
        index=pd.date_range(start="20120101", periods=len(a)),
    )
    ddf = dd.from_pandas(df, 3)
    assert len(ddf.dask) == 3
    assert len(ddf.divisions) == len(ddf.dask) + 1
    assert isinstance(ddf.divisions[0], type(df.index[0]))
    tm.assert_frame_equal(df, ddf.compute())
    ddf = dd.from_pandas(df, chunksize=8)
    msg = "Exactly one of npartitions and chunksize must be specified."
    with pytest.raises(ValueError) as err:
        dd.from_pandas(df, npartitions=2, chunksize=2)
    assert msg in str(err.value)
    with pytest.raises((ValueError, AssertionError)) as err:
        dd.from_pandas(df)
    assert msg in str(err.value)
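    # chunksize=8 over the 22-row frame also yields 3 partitions,
    # so the same invariants hold for the rebuilt ddf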
    assert len(ddf.dask) == 3
    assert len(ddf.divisions) == len(ddf.dask) + 1
    assert isinstance(ddf.divisions[0], type(df.index[0]))
    tm.assert_frame_equal(df, ddf.compute())
Example #14
def test_to_hdf_multiple_nodes():
    pytest.importorskip("tables")
    df = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}, index=[1.0, 2.0, 3.0, 4.0]
    )
    a = dd.from_pandas(df, 2)
    df16 = pd.DataFrame(
        {
            "x": ["a", "b", "c", "d", "e", "f", "g", "h",
                  "i", "j", "k", "l", "m", "n", "o", "p"],
            "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
        },
        index=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
               9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0],
    )
    b = dd.from_pandas(df16, 16)

    # saving to multiple nodes
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df, out)

    # saving to multiple nodes making sure order is kept
    with tmpfile("h5") as fn:
        b.to_hdf(fn, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df16, out)

    # saving to multiple datasets with custom name_function
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data_*", name_function=lambda i: "a" * (i + 1))
        out = dd.read_hdf(fn, "/data_*")
        assert_eq(df, out)

        out = pd.read_hdf(fn, "/data_a")
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(fn, "/data_aa")
        tm.assert_frame_equal(out, df.iloc[2:])

    # test multiple nodes with hdf object
    with tmpfile("h5") as fn:
        with pd.HDFStore(fn) as hdf:
            b.to_hdf(hdf, "/data*")
            out = dd.read_hdf(fn, "/data*")
            assert_eq(df16, out)
Example #15
File: test_demo.py Project: DWesl/dask
def test_make_timeseries():
    df = dd.demo.make_timeseries("2000",
                                 "2015", {
                                     "A": float,
                                     "B": int,
                                     "C": str
                                 },
                                 freq="2D",
                                 partition_freq="6M")

    assert df.divisions[0] == pd.Timestamp("2000-01-31", freq="6M")
    assert df.divisions[-1] == pd.Timestamp("2014-07-31", freq="6M")
    tm.assert_index_equal(df.columns, pd.Index(["A", "B", "C"]))
    assert df["A"].head().dtype == float
    assert df["B"].head().dtype == int
    assert df["C"].head().dtype == object
    assert df.index.name == "timestamp"
    assert df.head().index.name == df.index.name
    assert df.divisions == tuple(
        pd.date_range(start="2000", end="2015", freq="6M"))

    tm.assert_frame_equal(df.head(), df.head())

    a = dd.demo.make_timeseries(
        "2000", "2015", {"A": float, "B": int, "C": str},
        freq="2D", partition_freq="6M", seed=123,
    )
    b = dd.demo.make_timeseries(
        "2000", "2015", {"A": float, "B": int, "C": str},
        freq="2D", partition_freq="6M", seed=123,
    )
    c = dd.demo.make_timeseries(
        "2000", "2015", {"A": float, "B": int, "C": str},
        freq="2D", partition_freq="6M", seed=456,
    )
    d = dd.demo.make_timeseries(
        "2000", "2015", {"A": float, "B": int, "C": str},
        freq="2D", partition_freq="3M", seed=123,
    )
    e = dd.demo.make_timeseries(
        "2000", "2015", {"A": float, "B": int, "C": str},
        freq="1D", partition_freq="6M", seed=123,
    )
    tm.assert_frame_equal(a.head(), b.head())
    assert not (a.head(10) == c.head(10)).all().all()
    assert a._name == b._name
    assert a._name != c._name
    assert a._name != d._name
    assert a._name != e._name
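The closing assertions pin down make_timeseries determinism: the same seed and layout reproduce both the data and the _name token, while changing the seed, partition frequency, or sampling frequency produces a distinct token.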
Example #16
File: test_hdf.py Project: m-rossi/dask
def test_to_hdf_multiple_files():
    pytest.importorskip("tables")
    df = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}, index=[1.0, 2.0, 3.0, 4.0]
    )
    a = dd.from_pandas(df, 2)
    df16 = pd.DataFrame(
        {
            "x": ["a", "b", "c", "d", "e", "f", "g", "h",
                  "i", "j", "k", "l", "m", "n", "o", "p"],
            "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
        },
        index=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
               9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0],
    )
    b = dd.from_pandas(df16, 16)

    # saving to multiple files
    with tmpdir() as dn:
        fn = os.path.join(dn, "data_*.h5")
        a.to_hdf(fn, "/data")
        out = dd.read_hdf(fn, "/data")
        assert_eq(df, out)

    # saving to multiple files making sure order is kept
    with tmpdir() as dn:
        fn = os.path.join(dn, "data_*.h5")
        b.to_hdf(fn, "/data")
        out = dd.read_hdf(fn, "/data")
        assert_eq(df16, out)

    # saving to multiple files where first file is longer
    # https://github.com/dask/dask/issues/8023
    with tmpdir() as dn:
        fn1 = os.path.join(dn, "data_1.h5")
        fn2 = os.path.join(dn, "data_2.h5")
        b.to_hdf(fn1, "/data")
        a.to_hdf(fn2, "/data")
        out = dd.read_hdf([fn1, fn2], "/data")
        assert_eq(pd.concat([df16, df]), out)

    # saving to multiple files with custom name_function
    with tmpdir() as dn:
        fn = os.path.join(dn, "data_*.h5")
        a.to_hdf(fn, "/data", name_function=lambda i: "a" * (i + 1))
        out = dd.read_hdf(fn, "/data")
        assert_eq(df, out)

        out = pd.read_hdf(os.path.join(dn, "data_a.h5"), "/data")
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(os.path.join(dn, "data_aa.h5"), "/data")
        tm.assert_frame_equal(out, df.iloc[2:])

    # test hdf object
    with tmpfile("h5") as fn:
        with pd.HDFStore(fn) as hdf:
            a.to_hdf(hdf, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df, out)
Example #17
def test_to_hdf_multiple_nodes():
    pytest.importorskip("tables")
    df = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}, index=[1.0, 2.0, 3.0, 4.0]
    )
    a = dd.from_pandas(df, 2)
    df16 = pd.DataFrame(
        {
            "x": ["a", "b", "c", "d", "e", "f", "g", "h",
                  "i", "j", "k", "l", "m", "n", "o", "p"],
            "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
        },
        index=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
               9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0],
    )
    b = dd.from_pandas(df16, 16)

    # saving to multiple nodes
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df, out)

    # saving to multiple nodes making sure order is kept
    with tmpfile("h5") as fn:
        b.to_hdf(fn, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df16, out)

    # saving to multiple datasets with custom name_function
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data_*", name_function=lambda i: "a" * (i + 1))
        out = dd.read_hdf(fn, "/data_*")
        assert_eq(df, out)

        out = pd.read_hdf(fn, "/data_a")
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(fn, "/data_aa")
        tm.assert_frame_equal(out, df.iloc[2:])

    # test multiple nodes with hdf object
    with tmpfile("h5") as fn:
        with pd.HDFStore(fn) as hdf:
            b.to_hdf(hdf, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df16, out)

    # Test getitem optimization
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data*")
        out = dd.read_hdf(fn, "/data*")[["x"]]
        dsk = optimize_dataframe_getitem(out.dask, keys=out.__dask_keys__())
        read = [key for key in dsk.layers if key.startswith("read-hdf")][0]
        subgraph = dsk.layers[read]
        assert isinstance(subgraph, DataFrameIOLayer)
        assert subgraph.columns == ["x"]