def test_handle_zeros_in_scale():
    """Exercise ``handle_zeros_in_scale`` on each container type used below.

    The module-level collections ``s`` and ``a`` are checked first, then the
    same four-element input (whose last entry is zero) is passed as a NumPy
    array, a pandas Series, a dask array and a dask Series.  In every case
    the zero entry is expected to come back as one.
    """
    fixture_expected = [1, 1, 2, 3, 1]
    for collection in (s, a):
        scaled = handle_zeros_in_scale(collection)
        assert list(scaled.compute()) == fixture_expected

    # NumPy input
    raw = np.array([1, 2, 3, 0], dtype="f8")
    want = np.array([1, 2, 3, 1], dtype="f8")
    np.testing.assert_array_equal(handle_zeros_in_scale(raw), want)

    # pandas input
    raw_series = pd.Series(raw)
    want_series = pd.Series(want)
    tm.assert_series_equal(handle_zeros_in_scale(raw_series), want_series)

    # dask array input, compared against the plain ndarray
    lazy_array = da.from_array(raw_series.values, chunks=2)
    want_values = want_series.values
    assert_eq_ar(handle_zeros_in_scale(lazy_array), want_values)

    # dask dataframe-collection input, compared against a pandas Series
    lazy_frame = dd.from_dask_array(lazy_array)
    assert_eq_df(handle_zeros_in_scale(lazy_frame), pd.Series(want_values))
def test_basic(self, daskify, values):
    """Round-trip columns "A" and "D" of ``dummy`` through ``dpp.OrdinalEncoder``.

    ``daskify`` switches the input to a two-partition dask DataFrame;
    ``values`` feeds ``inverse_transform`` the ``.values`` of the encoded
    frame instead of the frame itself.
    """
    frame = dummy[["A", "D"]]
    if daskify:
        frame = dd.from_pandas(frame, 2)

    encoder = dpp.OrdinalEncoder().fit(frame)
    encoded = encoder.transform(frame)

    expected_a = np.array([0, 1, 2, 0], dtype="int8")
    expected_d = np.array([1, 2, 3, 4])
    expected = pd.DataFrame({"A": expected_a, "D": expected_d}, columns=["A", "D"])
    assert_eq_df(encoded, expected)

    decoder_input = encoded.values if values else encoded
    result = encoder.inverse_transform(decoder_input)

    if daskify:
        # Materialize both sides so the pandas comparison below applies.
        frame = frame.compute()
        result = result.compute()
    tm.assert_frame_equal(result, frame)
def test_to_dataframe_optimize_graph():
    """Compare ``Bag.to_dataframe`` with and without ``optimize_graph``.

    The optimized conversion must yield a smaller graph than the bag's own,
    while the unoptimized one keeps every original task, adds one task per
    partition, and retains the ``foo`` annotation.  Both frames must be equal.
    """
    pytest.importorskip("dask.dataframe")
    from dask.dataframe.utils import assert_eq as assert_eq_df

    records = [{"name": "test1", "v1": 1}, {"name": "test2", "v1": 2}]
    bag = db.from_sequence(records, npartitions=2)

    # A linear chain of `map` tasks, which graph optimization will fuse.
    with dask.annotate(foo=True):
        mapped = bag.map(lambda a: dict(**a, v2=a["v1"] + 1))
        mapped = mapped.map(lambda a: dict(**a, v3=a["v2"] + 1))
        mapped = mapped.map(lambda a: dict(**a, v4=a["v3"] + 1))

    # Nothing has been fused yet: four tasks per partition.
    unfused_size = len(mapped.dask)
    assert unfused_size == mapped.npartitions * 4

    # Default conversion: the `map` tasks get fused, shrinking the graph.
    optimized = mapped.to_dataframe()
    assert len(optimized.dask) < unfused_size

    # Unoptimized conversion: all original tasks survive, plus one extra
    # layer that converts each partition to a DataFrame.
    unoptimized = mapped.to_dataframe(optimize_graph=False)
    assert len(unoptimized.dask) == unfused_size + optimized.npartitions

    # The annotation set inside the `with` block is still present.
    annotated_layer = hlg_layer_topological(unoptimized.dask, 1)
    assert annotated_layer.annotations == {"foo": True}

    assert_eq_df(optimized, unoptimized)
def test_encode_subset_of_columns(self, daskify):
    """Dummy-encode only column "B" of ``dummy[["A", "B"]]`` and round-trip it.

    Column "A" is expected to pass through as an ordered categorical while
    "B" expands into the uint8 indicator columns ``B_a``, ``B_b`` and ``B_c``.
    ``daskify`` switches the input to a two-partition dask DataFrame.
    """
    frame = dummy[["A", "B"]]
    if daskify:
        frame = dd.from_pandas(frame, 2)

    encoder = dpp.DummyEncoder(columns=["B"]).fit(frame)
    encoded = encoder.transform(frame)

    indicator_bits = {
        "B_a": [1, 0, 0, 1],
        "B_b": [0, 1, 0, 0],
        "B_c": [0, 0, 1, 0],
    }
    expected_data = {"A": pd.Categorical(["a", "b", "c", "a"], ordered=True)}
    for name, bits in indicator_bits.items():
        expected_data[name] = np.array(bits, dtype="uint8")
    expected = pd.DataFrame(expected_data, columns=["A", "B_a", "B_b", "B_c"])
    assert_eq_df(encoded, expected)

    result = encoder.inverse_transform(encoded)

    if daskify:
        # Materialize both sides so the pandas comparison below applies.
        frame = frame.compute()
        result = result.compute()
    tm.assert_frame_equal(result, frame)
def test_slice_columns():
    """Check ``slice_columns`` on the module-level frame ``df`` and array ``X``.

    The sliced frame must carry exactly the requested columns and match
    ``df[columns]``.  The array result is compared against ``X`` itself.
    """
    wanted = [2, 3]
    sliced_frame = slice_columns(df, wanted)
    sliced_array = slice_columns(X, wanted)

    assert list(sliced_frame.columns) == wanted

    reference_frame = df[wanted].compute()
    assert_eq_df(reference_frame, sliced_frame.compute())

    # Note: the array is checked against the full, unsliced X.
    assert_eq_ar(X.compute(), sliced_array.compute())
def test_df_transform_index(self, daskify):
    """Check degree-1 ``PolynomialFeatures`` output against a shuffled input frame.

    A copy of the module-level ``df`` (computed first when ``daskify`` is
    false) is resampled with ``frac=1.0``.  Every output column after the
    first must then equal the input frame, ignoring dtypes.
    """
    base = copy(df)
    shuffled = (base if daskify else base.compute()).sample(frac=1.0)

    poly = dpp.PolynomialFeatures(preserve_dataframe=True, degree=1)
    transformed = poly.fit_transform(shuffled)

    # Skip the first output column; the remainder is compared to the input.
    remainder = transformed.iloc[:, 1:]
    assert_eq_df(remainder, shuffled, check_dtype=False)
def test_inverse_transform(self):
    """``DummyEncoder.inverse_transform`` must restore the original dask frame.

    The round-trip is checked twice: once feeding the transformed frame and
    once feeding its ``.values``.
    """
    source = pd.DataFrame(
        {"A": np.arange(10), "B": pd.Categorical(["a"] * 4 + ["b"] * 6)}
    )
    ddf = dd.from_pandas(source, npartitions=2)

    encoder = dpp.DummyEncoder()
    encoder.fit(ddf)

    from_frame = encoder.inverse_transform(encoder.transform(ddf))
    assert_eq_df(ddf, from_frame)

    from_array = encoder.inverse_transform(encoder.transform(ddf).values)
    assert_eq_df(ddf, from_array)
def test_df_column_slice(self):
    """``dpp.MinMaxScaler(columns=...)`` must scale only the selected columns.

    The selected columns of the dask-ml result are compared with the same
    columns of a scikit-learn ``MinMaxScaler`` fitted on the whole frame, and
    the remaining columns must be left identical to the input ``df2``.
    """
    mask = ["3", "4"]
    # Locate the masked columns inside the frame itself.  Indexing into
    # ``mask`` would always give 0..len(mask)-1 and so would compare against
    # the wrong columns of the scikit-learn output.
    all_columns = list(df2.columns)
    mask_ix = [all_columns.index(name) for name in mask]
    a = dpp.MinMaxScaler(columns=mask)
    b = spp.MinMaxScaler()

    dfa = a.fit_transform(df2).compute()
    mxb = b.fit_transform(df2.compute())

    assert isinstance(dfa, pd.DataFrame)
    assert_eq_ar(dfa[mask].values, mxb[:, mask_ix])
    assert_eq_df(dfa.drop(mask, axis=1), df2.drop(mask, axis=1).compute())
def _assert_eq(l, r, **kwargs):
    """Assert that ``l`` and ``r`` are equal, dispatching on the type of ``l``.

    Arrays (NumPy or dask) go to ``assert_eq_ar``, pandas/dask frames go to
    ``assert_eq_df``, and sequences containing any array or frame are
    compared element by element, recursively.  Anything else is compared
    with ``==``.  ``kwargs`` are forwarded to the array/frame comparison
    helpers.
    """
    array_types = (np.ndarray, da.Array)
    frame_types = (pd.core.generic.NDFrame, dd._Frame)
    if isinstance(l, array_types):
        assert_eq_ar(l, r, **kwargs)
    elif isinstance(l, frame_types):
        assert_eq_df(l, r, **kwargs)
    elif isinstance(l, Sequence) and any(
        isinstance(x, array_types + frame_types) for x in l
    ):
        # zip() stops at the shorter input, so without this check sequences
        # of different lengths would silently compare as equal.
        assert len(l) == len(r)
        for a, b in zip(l, r):
            _assert_eq(a, b, **kwargs)
    else:
        assert l == r
def test_df_transform(self, daskify):
    """Compare ``dpp.PolynomialFeatures`` output with scikit-learn's.

    The frame-preserving and default dask-ml transformers and the
    scikit-learn transformer are all fitted on the module-level ``df``
    (computed first when ``daskify`` is false).  Under ``daskify`` the
    results must additionally be dask collections and agree with the
    transform of the computed frame.
    """
    frame = df if daskify else df.compute()

    keep_frame = dpp.PolynomialFeatures(preserve_dataframe=True)
    to_array = dpp.PolynomialFeatures()
    reference = spp.PolynomialFeatures()

    res_df = keep_frame.fit_transform(frame)
    res_arr = to_array.fit_transform(frame)
    res_ref = reference.fit_transform(frame)

    if daskify:
        res_pandas = keep_frame.fit_transform(frame.compute())
        for lazy_result in (res_df, res_arr):
            assert dask.is_dask_collection(lazy_result)
        assert_eq_df(res_df.compute().reset_index(drop=True), res_pandas)

    for other in (res_ref, res_arr):
        assert_eq_ar(res_df.values, other)
def test_inverse_transform(self):
    """``OrdinalEncoder.inverse_transform`` must restore the original frame.

    The encoded data is fed back in four forms: the lazy frame, the computed
    frame, the lazy ``.values`` array and the computed ``.values`` array.
    """
    source = pd.DataFrame({
        "A": np.arange(10),
        "B": pd.Categorical(['a'] * 4 + ['b'] * 6),
    })
    ddf = dd.from_pandas(source, npartitions=2)

    encoder = dpp.OrdinalEncoder()
    encoder.fit(ddf)

    # Each adapter turns a freshly transformed frame into one input form.
    input_forms = (
        lambda encoded: encoded,
        lambda encoded: encoded.compute(),
        lambda encoded: encoded.values,
        lambda encoded: encoded.values.compute(),
    )
    for adapt in input_forms:
        restored = encoder.inverse_transform(adapt(encoder.transform(ddf)))
        assert_eq_df(ddf, restored)
def test_inverse_transform(self):
    """``OrdinalEncoder.inverse_transform`` must stay lazy and restore the input.

    The round-trip is checked for both the transformed dask frame and its
    ``.values`` array.  Each result must be a dask collection and equal the
    original frame.
    """
    enc = dpp.OrdinalEncoder()
    df = dd.from_pandas(
        pd.DataFrame(
            {"A": np.arange(10), "B": pd.Categorical(["a"] * 4 + ["b"] * 6)}
        ),
        npartitions=2,
    )
    enc.fit(df)

    # Build each round-trip once; the original repeated both equality
    # assertions verbatim, which added run time but no extra coverage.
    from_frame = enc.inverse_transform(enc.transform(df))
    from_array = enc.inverse_transform(enc.transform(df).values)

    assert dask.is_dask_collection(from_array)
    assert dask.is_dask_collection(from_frame)

    assert_eq_df(df, from_frame)
    assert_eq_df(df, from_array)
def test_df_inverse_transform(self):
    """Scaling columns "3" and "4" of ``df2`` and inverting must give back ``df2``."""
    mask = ["3", "4"]
    scaler = dpp.MinMaxScaler(columns=mask)

    scaled = scaler.fit_transform(df2)
    restored = scaler.inverse_transform(scaled)

    assert_eq_df(restored.compute(), df2.compute())
def test_df_inverse_transform(self):
    """Scaling columns "3" and "4" of ``df2`` and inverting must give back ``df2``.

    The inverted result must also still be a dask collection.
    """
    mask = ["3", "4"]
    a = dpp.MinMaxScaler(columns=mask)
    result = a.inverse_transform(a.fit_transform(df2))
    # Fixed typo: this was `is_dask_colelction`, which raised AttributeError
    # before any assertion could run.
    assert dask.is_dask_collection(result)
    assert_eq_df(result, df2)