예제 #1
0
def test_unknown_categoricals():
    """Check reductions on columns declared as ``category`` only in the meta.

    A dask DataFrame is assembled by hand from the module-level ``frames``
    (one graph key per frame, all divisions ``None``).  Columns ``w`` and
    ``y`` are typed as ``"category"`` through ``make_meta_util``.  Each dask
    result is compared against the same operation on the computed frame.
    """
    # One task-graph entry per input frame, keyed as ("unknown", position).
    graph = {("unknown", pos): part for pos, part in enumerate(frames)}
    column_dtypes = {
        "v": "object",
        "w": "category",
        "x": "i8",
        "y": "category",
        "z": "f8",
    }
    meta = make_meta_util(column_dtypes, parent_meta=frames[0])
    ddf = dd.DataFrame(graph, "unknown", meta, [None] * 4)

    # Materialise once; this is the reference for every comparison below.
    expected = ddf.compute()

    # Series-level reductions on the categorical column.
    assert_eq(ddf.w.value_counts(), expected.w.value_counts())
    assert_eq(ddf.w.nunique(), expected.w.nunique())

    # Group-by operations keyed on the categorical column.
    assert_eq(ddf.groupby(ddf.w).sum(), expected.groupby(expected.w).sum())
    assert_eq(
        ddf.groupby(ddf.w).y.nunique(),
        expected.groupby(expected.w).y.nunique(),
    )
    assert_eq(
        ddf.y.groupby(ddf.w).count(),
        expected.y.groupby(expected.w).count(),
    )
예제 #2
0
def test_pivot_table_errors():
    """Verify that ``dd.pivot_table`` rejects invalid arguments.

    Each case calls ``dd.pivot_table`` with one bad argument and checks that
    a ``ValueError`` is raised whose text contains the expected fragment.
    """

    def check_rejected(fragment, frame, **pivot_kwargs):
        # Run the pivot and confirm the ValueError text contains ``fragment``.
        with pytest.raises(ValueError) as excinfo:
            dd.pivot_table(frame, **pivot_kwargs)
        assert fragment in str(excinfo.value)

    letters = list("abc")
    pdf = pd.DataFrame({
        "A": np.random.choice(letters, size=10),
        "B": np.random.randn(10),
        "C": pd.Categorical(np.random.choice(letters, size=10)),
    })
    ddf = dd.from_pandas(pdf, 2)

    # ``index`` passed as a list.
    check_rejected(
        "'index' must be the name of an existing column",
        ddf,
        index=["A"],
        columns="C",
        values="B",
    )
    # ``columns`` passed as a list.
    check_rejected(
        "'columns' must be the name of an existing column",
        ddf,
        index="A",
        columns=["C"],
        values="B",
    )
    # ``values`` passed as a nested list.
    check_rejected(
        "'values' must refer to an existing column or columns",
        ddf,
        index="A",
        columns="C",
        values=[["B"]],
    )

    # ``aggfunc`` given as a list, then as an unrecognised name.
    bad_aggfunc = "aggfunc must be either 'mean', 'sum' or 'count'"
    for aggfunc in (["sum"], "xx"):
        check_rejected(
            bad_aggfunc,
            ddf,
            index="A",
            columns="C",
            values="B",
            aggfunc=aggfunc,
        )

    # unknown categories: swap in a meta where "C" is only declared as
    # "category", without any concrete categories attached.
    ddf._meta = make_meta_util(
        {"A": object, "B": float, "C": "category"},
        parent_meta=pd.DataFrame(),
    )
    check_rejected(
        "'columns' must have known categories",
        ddf,
        index="A",
        columns="C",
        values=["B"],
    )

    # "C" holds plain strings here rather than a Categorical.
    pdf = pd.DataFrame({
        "A": np.random.choice(letters, size=10),
        "B": np.random.randn(10),
        "C": np.random.choice(letters, size=10),
    })
    ddf = dd.from_pandas(pdf, 2)
    check_rejected(
        "'columns' must be category dtype",
        ddf,
        index="A",
        columns="C",
        values="B",
    )
예제 #3
0
def test_get_dummies_errors():
    """Verify that ``dd.get_dummies`` raises ``NotImplementedError``.

    Two situations are exercised: a non-categorical series, and a frame or
    series whose columns are declared as ``"category"`` only through a
    replaced ``_meta``.
    """
    # not Categorical
    # Fixtures are built outside the ``raises`` block so that only the
    # ``get_dummies`` call itself can satisfy the expectation.
    s = pd.Series([1, 1, 1, 2, 2, 1, 3, 4])
    ds = dd.from_pandas(s, 2)
    with pytest.raises(NotImplementedError):
        dd.get_dummies(ds)

    # unknown categories
    df = pd.DataFrame({"x": list("abcbc"), "y": list("bcbcb")})
    ddf = dd.from_pandas(df, npartitions=2)
    ddf._meta = make_meta_util(
        {"x": "category", "y": "category"},
        parent_meta=pd.DataFrame(),
    )

    # Whole frame, explicit column list, and a single series.
    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf)

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf, columns=["x", "y"])

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf.x)
예제 #4
0
    ("x", 0): pd.DataFrame({
        "a": [1, 2, 3],
        "b": [1, 4, 7]
    }, index=[0, 1, 3]),
    ("x", 1): pd.DataFrame({
        "a": [4, 5, 6],
        "b": [2, 5, 8]
    }, index=[5, 6, 8]),
    ("x", 2): pd.DataFrame({
        "a": [7, 8, 9],
        "b": [3, 6, 9]
    }, index=[9, 9, 9]),
}
# Empty metadata: columns "a" and "b" typed "i8", over an "i8" index.
meta = make_meta_util(
    {"a": "i8", "b": "i8"},
    index=pd.Index([], "i8"),
    parent_meta=pd.DataFrame(),
)
# Shared dask DataFrame built from the ``dsk`` graph defined above, plus its
# fully computed counterpart.
d = dd.DataFrame(dsk, "x", meta, [0, 4, 9, 9])
full = d.compute()
# Holds ``check_freq=False`` only when ``dd._compat.PANDAS_GT_110`` is truthy.
# NOTE(review): presumably unpacked into frame-comparison helpers elsewhere
# in the module; confirm against its users.
CHECK_FREQ = {"check_freq": False} if dd._compat.PANDAS_GT_110 else {}

shuffle_func = shuffle  # conflicts with keyword argument


@pytest.mark.parametrize("shuffle", ["disk", "tasks"])
def test_shuffle(shuffle):
    """Shuffle ``d`` on column ``b`` and check a dask DataFrame comes back.

    The parameter is named ``shuffle``, so the shuffle function is reached
    through the module-level alias ``shuffle_func``.
    """
    shuffled = shuffle_func(d, d.b, shuffle=shuffle)
    assert isinstance(shuffled, dd.DataFrame)
예제 #5
0
def test_make_meta():
    """Exercise ``make_meta_util`` across the input kinds it accepts.

    Covers pandas objects, an existing dask collection, dict / iterable /
    tuple dtype specifications (with and without an explicit index),
    categorical dtypes, scalars, timestamps, bare dtype expressions, and the
    ``TypeError`` raised for ``None``.
    """
    df = pd.DataFrame(
        {"a": [1, 2, 3], "b": list("abc"), "c": [1.0, 2.0, 3.0]},
        index=[10, 20, 30],
    )

    # Pandas dataframe
    meta = make_meta_util(df)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, type(df.index))

    # Pandas series
    meta = make_meta_util(df.a)
    assert len(meta) == 0
    assert meta.dtype == df.a.dtype
    assert isinstance(meta.index, type(df.index))

    # Pandas index
    meta = make_meta_util(df.index)
    assert isinstance(meta, type(df.index))
    assert len(meta) == 0

    # Dask object: the collection's own meta is returned, not a copy.
    ddf = dd.from_pandas(df, npartitions=2)
    assert make_meta_util(ddf) is ddf._meta

    # Dict
    meta = make_meta_util(
        {"a": "i8", "b": "O", "c": "f8"},
        parent_meta=pd.DataFrame(),
    )
    assert isinstance(meta, pd.DataFrame)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # Iterable: column order follows the order of the pairs.
    meta = make_meta_util(
        [("a", "i8"), ("c", "f8"), ("b", "O")],
        parent_meta=pd.DataFrame(),
    )
    assert (meta.columns == ["a", "c", "b"]).all()
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes[meta.dtypes.index]).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # Tuple: a single (name, dtype) pair yields a Series.
    meta = make_meta_util(("a", "i8"), parent_meta=pd.DataFrame())
    assert isinstance(meta, pd.Series)
    assert len(meta) == 0
    assert meta.dtype == "i8"
    assert meta.name == "a"

    # With index
    # NOTE(review): pd.Int64Index was deprecated in pandas 1.4 and removed in
    # pandas 2.0; confirm the pandas range this test is expected to run on.
    meta = make_meta_util(
        {"a": "i8", "b": "i4"},
        index=pd.Int64Index([1, 2], name="foo"),
        parent_meta=pd.DataFrame(),
    )
    assert isinstance(meta.index, pd.Int64Index)
    assert len(meta.index) == 0
    meta = make_meta_util(
        ("a", "i8"),
        index=pd.Int64Index([1, 2], name="foo"),
        parent_meta=pd.DataFrame(),
    )
    assert isinstance(meta.index, pd.Int64Index)
    assert len(meta.index) == 0

    # Categoricals: the only category is the UNKNOWN_CATEGORIES placeholder.
    meta = make_meta_util({"a": "category"}, parent_meta=df)
    assert len(meta.a.cat.categories) == 1
    assert meta.a.cat.categories[0] == UNKNOWN_CATEGORIES
    meta = make_meta_util(("a", "category"), parent_meta=df)
    assert len(meta.cat.categories) == 1
    assert meta.cat.categories[0] == UNKNOWN_CATEGORIES

    # Numpy scalar
    meta = make_meta_util(np.float64(1.0), parent_meta=df)
    assert isinstance(meta, np.float64)

    # Python scalar
    meta = make_meta_util(1.0, parent_meta=df)
    assert isinstance(meta, np.float64)

    # Timestamp: returned as the very same object.
    x = pd.Timestamp(2000, 1, 1)
    meta = make_meta_util(x, parent_meta=df)
    assert meta is x

    # Dtype expressions
    meta = make_meta_util("i8", parent_meta=df)
    assert isinstance(meta, np.int64)
    meta = make_meta_util(float, parent_meta=df)
    assert isinstance(meta, np.dtype(float).type)
    meta = make_meta_util(np.dtype("bool"), parent_meta=df)
    assert isinstance(meta, np.bool_)

    # Unsupported input.  The context-manager form is used because asserting
    # on the return value of ``pytest.raises(...)`` checks nothing extra.
    with pytest.raises(TypeError):
        make_meta_util(None)