def test_concat_unions_categoricals():
    """dask's ``_concat`` unions categoricals the same way ``pd.concat`` does.

    NOTE(review): ``frames`` .. ``frames6`` are not defined in this view —
    presumably module-level fixtures pairing dask-side inputs with their
    pandas equivalents; confirm against the full file.
    """
    # Categorical DataFrame, regular index
    tm.assert_frame_equal(_concat(frames), pd.concat(frames2))

    # Categorical Series, regular index
    tm.assert_series_equal(_concat([i.y for i in frames]), pd.concat([i.y for i in frames2]))

    # Categorical Index
    tm.assert_index_equal(_concat([i.index for i in frames3]), pd.concat([i for i in frames4]).index)

    # Categorical DataFrame, Categorical Index
    tm.assert_frame_equal(_concat(frames3), pd.concat(frames4))

    # Non-categorical DataFrame, Categorical Index
    tm.assert_frame_equal(
        _concat([i[["x", "z"]] for i in frames3]),
        pd.concat([i[["x", "z"]] for i in frames4]),
    )

    # Categorical Series, Categorical Index
    tm.assert_series_equal(_concat([i.z for i in frames3]), pd.concat([i.z for i in frames4]))

    # Non-categorical Series, Categorical Index
    tm.assert_series_equal(_concat([i.x for i in frames3]), pd.concat([i.x for i in frames4]))

    # MultiIndex with Categorical Index
    tm.assert_index_equal(_concat([i.index for i in frames5]), pd.concat([i for i in frames6]).index)

    # DataFrame, MultiIndex with CategoricalIndex
    tm.assert_frame_equal(_concat(frames5), pd.concat(frames6))
def test_get_dummies(data):
    """``dd.get_dummies`` matches the pandas result, columns included."""
    expected = pd.get_dummies(data)

    ddata = dd.from_pandas(data, 2)
    result = dd.get_dummies(ddata)

    assert_eq(result, expected)
    tm.assert_index_equal(result.columns, expected.columns)
def assert_eq(
    a,
    b,
    check_names=True,
    check_dtype=True,
    check_divisions=True,
    check_index=True,
    scheduler="sync",
    **kwargs,
):
    """Assert that two dask/pandas objects are equivalent.

    Validates dask-specific invariants (divisions, key names), computes any
    dask collections via ``_check_dask``, converts non-pandas results to
    pandas where possible, and finally dispatches to the matching
    ``tm.assert_*_equal`` helper (or a numeric comparison for scalars).
    Extra ``kwargs`` are forwarded to the pandas assertion.  Returns True
    on success.
    """
    if check_divisions:
        assert_divisions(a, scheduler=scheduler)
        assert_divisions(b, scheduler=scheduler)
        if hasattr(a, "divisions") and hasattr(b, "divisions"):
            # Divisions must agree in *python* scalar type as well as value;
            # round-trip through numpy to normalize numpy scalars first.
            at = type(np.asarray(a.divisions).tolist()[0])  # numpy to python
            bt = type(np.asarray(b.divisions).tolist()[0])  # scalar conversion
            assert at == bt, (at, bt)
    assert_sane_keynames(a)
    assert_sane_keynames(b)
    # _check_dask computes dask collections (returning the concrete result)
    # and passes plain pandas objects straight through.
    a = _check_dask(a, check_names=check_names, check_dtypes=check_dtype, scheduler=scheduler)
    b = _check_dask(b, check_names=check_names, check_dtypes=check_dtype, scheduler=scheduler)
    # Convert GPU/other backends exposing to_pandas() — presumably cudf;
    # confirm against the backends this suite runs with.
    if hasattr(a, "to_pandas"):
        a = a.to_pandas()
    if hasattr(b, "to_pandas"):
        b = b.to_pandas()
    if isinstance(a, (pd.DataFrame, pd.Series)):
        a = _maybe_sort(a, check_index)
        b = _maybe_sort(b, check_index)
    if not check_index:
        # Index is irrelevant to the caller: compare positionally.
        a = a.reset_index(drop=True)
        b = b.reset_index(drop=True)
    if isinstance(a, pd.DataFrame):
        tm.assert_frame_equal(a, b, check_names=check_names, check_dtype=check_dtype, **kwargs)
    elif isinstance(a, pd.Series):
        tm.assert_series_equal(a, b, check_names=check_names, check_dtype=check_dtype, **kwargs)
    elif isinstance(a, pd.Index):
        tm.assert_index_equal(a, b, exact=check_dtype, **kwargs)
    else:
        # Scalar results: exact equality first, then NaN-aware / approximate
        # float comparison.
        if a == b:
            return True
        else:
            if np.isnan(a):
                assert np.isnan(b)
            else:
                assert np.allclose(a, b)
    return True
def test_categorical_known():
    """Categorical dtype handling in ``dd.read_csv``.

    ``dtype="category"`` yields per-partition *unknown* categories, while an
    explicit ``CategoricalDtype`` yields *known* (and optionally ordered)
    categories up front.
    """
    # NOTE(review): string bodies reconstructed from a collapsed source line;
    # normalize_text presumably strips the leading indentation — confirm.
    text1 = normalize_text(
        """
    A,B
    a,a
    b,b
    a,a
    """
    )
    text2 = normalize_text(
        """
    A,B
    a,a
    b,b
    c,c
    """
    )
    dtype = pd.api.types.CategoricalDtype(["a", "b", "c"])
    with filetexts({"foo.1.csv": text1, "foo.2.csv": text2}):
        result = dd.read_csv("foo.*.csv", dtype={"A": "category", "B": "category"})
        # Plain "category" cannot know the full category set before compute.
        assert result.A.cat.known is False
        assert result.B.cat.known is False
        expected = pd.DataFrame(
            {
                "A": pd.Categorical(["a", "b", "a", "a", "b", "c"], categories=dtype.categories),
                "B": pd.Categorical(["a", "b", "a", "a", "b", "c"], categories=dtype.categories),
            },
            index=[0, 1, 2, 0, 1, 2],
        )
        assert_eq(result, expected)

        # Specify a dtype
        result = dd.read_csv("foo.*.csv", dtype={"A": dtype, "B": "category"})
        assert result.A.cat.known is True
        assert result.B.cat.known is False
        tm.assert_index_equal(result.A.cat.categories, dtype.categories)
        assert result.A.cat.ordered is False
        assert_eq(result, expected)

        # ordered
        dtype = pd.api.types.CategoricalDtype(["a", "b", "c"], ordered=True)
        result = dd.read_csv("foo.*.csv", dtype={"A": dtype, "B": "category"})
        expected["A"] = expected["A"].cat.as_ordered()
        assert result.A.cat.known is True
        assert result.B.cat.known is False
        assert result.A.cat.ordered is True
        assert_eq(result, expected)

        # Specify "unknown" categories
        result = dd.read_csv("foo.*.csv", dtype=pd.api.types.CategoricalDtype())
        assert result.A.cat.known is False

        result = dd.read_csv("foo.*.csv", dtype="category")
        assert result.A.cat.known is False
def test_from_dask_array_compat_numpy_array_1d():
    """``from_dask_array``/``from_array`` on 1d input.

    A bare 1d array becomes an unnamed Series; a scalar ``columns`` names the
    Series; a one-element list yields a single-column DataFrame instead.

    BUG FIX: the ``isinstance`` assertions after each ``d2 = dd.from_array``
    previously re-checked ``d1`` (copy-paste), so the numpy path's type was
    never actually asserted.
    """
    x = da.ones(10, chunks=3)

    d1 = dd.from_dask_array(x)  # dask
    assert isinstance(d1, dd.Series)
    assert (d1.compute().values == x.compute()).all()
    assert d1.name is None

    d2 = dd.from_array(x.compute())  # numpy
    assert isinstance(d2, dd.Series)
    assert (d2.compute().values == x.compute()).all()
    assert d2.name is None

    d1 = dd.from_dask_array(x, columns="name")  # dask
    assert isinstance(d1, dd.Series)
    assert (d1.compute().values == x.compute()).all()
    assert d1.name == "name"

    d2 = dd.from_array(x.compute(), columns="name")  # numpy
    assert isinstance(d2, dd.Series)
    assert (d2.compute().values == x.compute()).all()
    assert d2.name == "name"

    # passing list via columns results in DataFrame
    d1 = dd.from_dask_array(x, columns=["name"])  # dask
    assert isinstance(d1, dd.DataFrame)
    assert (d1.compute().values == x.compute()).all()
    tm.assert_index_equal(d1.columns, pd.Index(["name"]))

    d2 = dd.from_array(x.compute(), columns=["name"])  # numpy
    assert isinstance(d2, dd.DataFrame)
    assert (d2.compute().values == x.compute()).all()
    tm.assert_index_equal(d2.columns, pd.Index(["name"]))
def test_unknown_categories(self, series):
    """Round-trip through ``as_unknown``/``as_known`` and ``set_categories``."""
    expected, dcol = series
    assert dcol.cat.known

    dcol = dcol.cat.as_unknown()
    assert not dcol.cat.known

    # While categories are unknown, categories/codes raise. Both
    # NotImplementedError and AttributeError are asserted — the latter keeps
    # IPython attribute globbing such as `da.cat.*?` working.
    for exc in (NotImplementedError, AttributeError):
        with pytest.raises(exc, match="with unknown categories"):
            dcol.cat.categories
        with pytest.raises(exc, match="with unknown categories"):
            dcol.cat.codes

    known = dcol.cat.set_categories(["a", "b", "c"])
    assert known.cat.known
    tm.assert_index_equal(known.cat.categories, get_cat(expected).categories)
    assert_array_index_eq(known.cat.codes, get_cat(expected).codes)

    known = dcol.cat.as_known()
    assert known.cat.known
    computed = known.compute()
    tm.assert_index_equal(known.cat.categories, get_cat(computed).categories)
    assert_array_index_eq(known.cat.codes, get_cat(computed).codes)
def test_from_dask_array_struct_dtype():
    """A structured-dtype array becomes a DataFrame with one column per field."""
    records = np.array([(1, "a"), (2, "b")], dtype=[("a", "i4"), ("b", "object")])
    darr = da.from_array(records, chunks=(1, ))

    ddf = dd.from_dask_array(darr)
    tm.assert_index_equal(ddf.columns, pd.Index(["a", "b"]))
    assert_eq(ddf, pd.DataFrame(records))

    # Field order can be overridden via ``columns``.
    assert_eq(
        dd.from_dask_array(darr, columns=["b", "a"]),
        pd.DataFrame(records, columns=["b", "a"]),
    )
def test_from_dask_array_compat_numpy_array():
    """``from_dask_array``/``from_array`` on 2d input.

    3d input is rejected with ValueError; 2d input becomes a DataFrame, and
    a ``columns`` list must match the number of array columns.

    BUG FIX: the ``isinstance`` assertions after each ``d2 = dd.from_array``
    previously re-checked ``d1`` (copy-paste), so the numpy path's type was
    never actually asserted.
    """
    x = da.ones((3, 3, 3), chunks=2)
    with pytest.raises(ValueError):
        dd.from_dask_array(x)  # dask
    with pytest.raises(ValueError):
        dd.from_array(x.compute())  # numpy

    x = da.ones((10, 3), chunks=(3, 3))
    d1 = dd.from_dask_array(x)  # dask
    assert isinstance(d1, dd.DataFrame)
    assert (d1.compute().values == x.compute()).all()
    tm.assert_index_equal(d1.columns, pd.Index([0, 1, 2]))

    d2 = dd.from_array(x.compute())  # numpy
    assert isinstance(d2, dd.DataFrame)
    assert (d2.compute().values == x.compute()).all()
    tm.assert_index_equal(d2.columns, pd.Index([0, 1, 2]))

    # one name for three columns is an error
    with pytest.raises(ValueError):
        dd.from_dask_array(x, columns=["a"])  # dask
    with pytest.raises(ValueError):
        dd.from_array(x.compute(), columns=["a"])  # numpy

    d1 = dd.from_dask_array(x, columns=["a", "b", "c"])  # dask
    assert isinstance(d1, dd.DataFrame)
    assert (d1.compute().values == x.compute()).all()
    tm.assert_index_equal(d1.columns, pd.Index(["a", "b", "c"]))

    d2 = dd.from_array(x.compute(), columns=["a", "b", "c"])  # numpy
    assert isinstance(d2, dd.DataFrame)
    assert (d2.compute().values == x.compute()).all()
    tm.assert_index_equal(d2.columns, pd.Index(["a", "b", "c"]))
def test_get_dummies_kwargs():
    """Keyword arguments (prefix, drop_first, dummy_na) are forwarded to pandas."""
    s = pd.Series([1, 1, 1, 2, 2, 1, 3, 4], dtype="category")

    expected = pd.get_dummies(s, prefix="X", prefix_sep="-")
    ds = dd.from_pandas(s, 2)
    result = dd.get_dummies(ds, prefix="X", prefix_sep="-")
    assert_eq(result, expected)
    tm.assert_index_equal(result.columns, pd.Index(["X-1", "X-2", "X-3", "X-4"]))

    expected = pd.get_dummies(s, drop_first=True)
    ds = dd.from_pandas(s, 2)
    result = dd.get_dummies(ds, drop_first=True)
    assert_eq(result, expected)
    tm.assert_index_equal(result.columns, expected.columns)

    # nan
    s = pd.Series([1, 1, 1, 2, np.nan, 3, np.nan, 5], dtype="category")
    expected = pd.get_dummies(s)
    ds = dd.from_pandas(s, 2)
    result = dd.get_dummies(ds)
    assert_eq(result, expected)
    tm.assert_index_equal(result.columns, expected.columns)

    # dummy_na
    expected = pd.get_dummies(s, dummy_na=True)
    ds = dd.from_pandas(s, 2)
    result = dd.get_dummies(ds, dummy_na=True)
    assert_eq(result, expected)
    tm.assert_index_equal(result.columns, pd.Index([1, 2, 3, 5, np.nan]))
def test_DataFrame_from_dask_array():
    """``from_dask_array`` with names; ``from_array`` re-routes for dask input.

    BUG FIX: the second ``isinstance`` assertion previously re-checked ``df``
    instead of the freshly built ``df2`` (copy-paste).
    """
    x = da.ones((10, 3), chunks=(4, 2))

    df = dd.from_dask_array(x, ["a", "b", "c"])
    assert isinstance(df, dd.DataFrame)
    tm.assert_index_equal(df.columns, pd.Index(["a", "b", "c"]))
    assert list(df.divisions) == [0, 4, 8, 9]
    assert (df.compute(scheduler="sync").values == x.compute(scheduler="sync")).all()

    # dd.from_array should re-route to from_dask_array
    df2 = dd.from_array(x, columns=["a", "b", "c"])
    assert isinstance(df2, dd.DataFrame)
    tm.assert_index_equal(df2.columns, df.columns)
    assert df2.divisions == df.divisions
def test_from_array():
    """``from_array`` on a 2d numpy array yields a DataFrame split by chunksize."""
    arr = np.arange(10 * 3).reshape(10, 3)

    # default integer column labels
    unnamed = dd.from_array(arr, chunksize=4)
    assert isinstance(unnamed, dd.DataFrame)
    tm.assert_index_equal(unnamed.columns, pd.Index([0, 1, 2]))
    assert unnamed.divisions == (0, 4, 8, 9)
    assert (unnamed.compute().values == arr).all()

    # explicit column names
    named = dd.from_array(arr, chunksize=4, columns=list("abc"))
    assert isinstance(named, dd.DataFrame)
    tm.assert_index_equal(named.columns, pd.Index(["a", "b", "c"]))
    assert named.divisions == (0, 4, 8, 9)
    assert (named.compute().values == arr).all()

    # more than two dimensions is rejected
    with pytest.raises(ValueError):
        dd.from_array(np.ones(shape=(10, 10, 10)))
def test_meta_from_recarray():
    """``_meta_from_array`` on a record array: field dtypes and column order."""
    records = np.array(
        [(i, i * 10) for i in range(10)],
        dtype=[("a", np.float64), ("b", np.int64)],
    )

    meta = _meta_from_array(records)
    assert isinstance(meta, pd.DataFrame)
    assert meta["a"].dtype == np.float64
    assert meta["b"].dtype == np.int64
    tm.assert_index_equal(meta.columns, pd.Index(["a", "b"]))

    # an explicit column list reorders the fields
    meta = _meta_from_array(records, columns=["b", "a"])
    assert isinstance(meta, pd.DataFrame)
    assert meta["a"].dtype == np.float64
    assert meta["b"].dtype == np.int64
    tm.assert_index_equal(meta.columns, pd.Index(["b", "a"]))

    # naming a field that does not exist is an error
    with pytest.raises(ValueError):
        _meta_from_array(records, columns=["a", "b", "c"])
def test_meta_from_1darray():
    """``_meta_from_array`` on 1d input: Series unless ``columns`` is a list."""
    floats = np.array([1.0, 2.0, 3.0], dtype=np.float64)
    meta = _meta_from_array(floats)
    assert isinstance(meta, pd.Series)
    assert meta.dtype == np.float64

    # scalar ``columns`` names the Series
    objs = np.array([1, 2, 3], dtype=np.object_)
    meta = _meta_from_array(objs, columns="x")
    assert isinstance(meta, pd.Series)
    assert meta.name == "x"
    assert meta.dtype == np.object_

    # list ``columns`` promotes to a one-column DataFrame
    objs = np.array([1, 2, 3], dtype=np.object_)
    meta = _meta_from_array(objs, columns=["x"])
    assert isinstance(meta, pd.DataFrame)
    assert meta["x"].dtype == np.object_
    tm.assert_index_equal(meta.columns, pd.Index(["x"]))

    # a 1d array cannot supply two columns
    with pytest.raises(ValueError):
        _meta_from_array(objs, columns=["a", "b"])
def test_unknown_categories(self, series):
    """``as_unknown`` hides categories; ``set_categories``/``as_known`` restore them.

    NOTE(review): near-duplicate of the other ``test_unknown_categories`` in
    this file, which additionally checks the AttributeError behaviour —
    confirm whether both variants are intended to coexist.
    """
    a, da = series  # `da` shadows the usual dask.array alias within this test
    assert da.cat.known
    da = da.cat.as_unknown()
    assert not da.cat.known

    # categories/codes are unavailable while categories are unknown
    with pytest.raises(NotImplementedError):
        da.cat.categories
    with pytest.raises(NotImplementedError):
        da.cat.codes

    # providing the categories explicitly makes them known again
    db = da.cat.set_categories(["a", "b", "c"])
    assert db.cat.known
    tm.assert_index_equal(db.cat.categories, get_cat(a).categories)
    assert_array_index_eq(db.cat.codes, get_cat(a).codes)

    # as_known() discovers the categories by computing
    db = da.cat.as_known()
    assert db.cat.known
    res = db.compute()
    tm.assert_index_equal(db.cat.categories, get_cat(res).categories)
    assert_array_index_eq(db.cat.codes, get_cat(res).codes)
def test_meta_from_array():
    """``_meta_from_array`` on 2d input: default integer columns or named ones."""
    ints = np.array([[1, 2], [3, 4]], dtype=np.int64)
    meta = _meta_from_array(ints)
    assert isinstance(meta, pd.DataFrame)
    assert meta[0].dtype == np.int64
    assert meta[1].dtype == np.int64
    tm.assert_index_equal(meta.columns, pd.Index([0, 1]))

    floats = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float64)
    meta = _meta_from_array(floats, columns=["a", "b"])
    assert isinstance(meta, pd.DataFrame)
    assert meta["a"].dtype == np.float64
    assert meta["b"].dtype == np.float64
    tm.assert_index_equal(meta.columns, pd.Index(["a", "b"]))

    # more names than array columns is an error
    with pytest.raises(ValueError):
        _meta_from_array(floats, columns=["a", "b", "c"])

    np.random.seed(42)
    rand = np.random.rand(201, 2)
    ddf = dd.from_array(rand, chunksize=50, columns=["a", "b"])
    assert len(ddf.divisions) == 6  # Should be 5 partitions and the end
def test_get_dummies_object():
    """Object-dtype columns are refused unless excluded via ``columns=``."""
    df = pd.DataFrame(
        {
            "a": pd.Categorical([1, 2, 3, 4, 4, 3, 2, 1]),
            "b": list("abcdabcd"),
            "c": pd.Categorical(list("abcdabcd")),
        }
    )
    ddf = dd.from_pandas(df, 2)

    # Explicitly exclude object columns
    expected = pd.get_dummies(df, columns=["a", "c"])
    result = dd.get_dummies(ddf, columns=["a", "c"])
    assert_eq(result, expected)
    tm.assert_index_equal(result.columns, expected.columns)

    # any path that touches the object column "b" raises
    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf)
    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf.b)
    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf, columns=["b"])
def test_make_timeseries():
    """``make_timeseries``: dtypes, divisions, index naming, and seed behaviour."""
    df = dd.demo.make_timeseries(
        "2000", "2015", {"A": float, "B": int, "C": str}, freq="2D", partition_freq="6M"
    )

    # NOTE(review): passing ``freq=`` to pd.Timestamp is deprecated in newer
    # pandas — confirm the pandas version this suite pins.
    assert df.divisions[0] == pd.Timestamp("2000-01-31", freq="6M")
    assert df.divisions[-1] == pd.Timestamp("2014-07-31", freq="6M")
    tm.assert_index_equal(df.columns, pd.Index(["A", "B", "C"]))
    assert df["A"].head().dtype == float
    assert df["B"].head().dtype == int
    assert df["C"].head().dtype == object
    assert df.index.name == "timestamp"
    assert df.head().index.name == df.index.name
    assert df.divisions == tuple(
        pd.date_range(start="2000", end="2015", freq="6M"))

    # repeated head() calls on the same collection are deterministic
    tm.assert_frame_equal(df.head(), df.head())

    a = dd.demo.make_timeseries(
        "2000",
        "2015",
        {"A": float, "B": int, "C": str},
        freq="2D",
        partition_freq="6M",
        seed=123,
    )
    b = dd.demo.make_timeseries(
        "2000",
        "2015",
        {"A": float, "B": int, "C": str},
        freq="2D",
        partition_freq="6M",
        seed=123,
    )
    c = dd.demo.make_timeseries(
        "2000",
        "2015",
        {"A": float, "B": int, "C": str},
        freq="2D",
        partition_freq="6M",
        seed=456,
    )
    d = dd.demo.make_timeseries(
        "2000",
        "2015",
        {"A": float, "B": int, "C": str},
        freq="2D",
        partition_freq="3M",
        seed=123,
    )
    e = dd.demo.make_timeseries(
        "2000",
        "2015",
        {"A": float, "B": int, "C": str},
        freq="1D",
        partition_freq="6M",
        seed=123,
    )
    # same seed + same parameters -> identical data and identical graph name
    tm.assert_frame_equal(a.head(), b.head())
    assert not (a.head(10) == c.head(10)).all().all()
    assert a._name == b._name
    # any change in seed, partition_freq, or freq -> distinct graph name
    assert a._name != c._name
    assert a._name != d._name
    assert a._name != e._name
def _check_dask(dsk, check_names=True, check_dtypes=True, result=None, scheduler=None):
    """Validate a dask collection and return its computed result.

    For dask Index/Series/DataFrame/Scalar objects: validates the task graph,
    computes the result (unless ``result`` is supplied, used for the
    recursive index checks), and asserts that names, metadata types, and
    dtypes agree between the lazy collection and the concrete result.
    Non-dask inputs (no ``__dask_graph__``) are returned unchanged.
    """
    import dask.dataframe as dd

    if hasattr(dsk, "__dask_graph__"):
        graph = dsk.__dask_graph__()
        if hasattr(graph, "validate"):
            graph.validate()
        if result is None:
            result = dsk.compute(scheduler=scheduler)
        if isinstance(dsk, dd.Index):
            # Name-based check rather than isinstance — presumably to accept
            # backend-specific Index subclasses; confirm.
            assert "Index" in type(result).__name__, type(result)
            # assert type(dsk._meta) == type(result), type(dsk._meta)
            if check_names:
                assert dsk.name == result.name
                assert dsk._meta.name == result.name
                if isinstance(result, pd.MultiIndex):
                    assert result.names == dsk._meta.names
            if check_dtypes:
                assert_dask_dtypes(dsk, result)
        elif isinstance(dsk, dd.Series):
            assert "Series" in type(result).__name__, type(result)
            assert type(dsk._meta) == type(result), type(dsk._meta)
            if check_names:
                assert dsk.name == result.name, (dsk.name, result.name)
                assert dsk._meta.name == result.name
            if check_dtypes:
                assert_dask_dtypes(dsk, result)
            # recurse into the index of the already-computed result
            _check_dask(
                dsk.index,
                check_names=check_names,
                check_dtypes=check_dtypes,
                result=result.index,
            )
        elif isinstance(dsk, dd.DataFrame):
            assert "DataFrame" in type(result).__name__, type(result)
            assert isinstance(dsk.columns, pd.Index), type(dsk.columns)
            assert type(dsk._meta) == type(result), type(dsk._meta)
            if check_names:
                tm.assert_index_equal(dsk.columns, result.columns)
                tm.assert_index_equal(dsk._meta.columns, result.columns)
            if check_dtypes:
                assert_dask_dtypes(dsk, result)
            # recurse into the index of the already-computed result
            _check_dask(
                dsk.index,
                check_names=check_names,
                check_dtypes=check_dtypes,
                result=result.index,
            )
        elif isinstance(dsk, dd.core.Scalar):
            # scalar results may also be pandas time scalars
            assert np.isscalar(result) or isinstance(
                result, (pd.Timestamp, pd.Timedelta))
            if check_dtypes:
                assert_dask_dtypes(dsk, result)
        else:
            msg = f"Unsupported dask instance {type(dsk)} found"
            raise AssertionError(msg)
        return result
    return dsk