def test_handle_zeros_in_scale():
    """handle_zeros_in_scale maps zero scale entries to 1 for every container type."""
    # Module-level dask fixtures ``s`` (series) and ``a`` (array).
    scaled_s = handle_zeros_in_scale(s)
    scaled_a = handle_zeros_in_scale(a)
    assert list(scaled_s.compute()) == [1, 1, 2, 3, 1]
    assert list(scaled_a.compute()) == [1, 1, 2, 3, 1]

    # numpy array
    raw = np.array([1, 2, 3, 0], dtype="f8")
    fixed = np.array([1, 2, 3, 1], dtype="f8")
    np.testing.assert_array_equal(handle_zeros_in_scale(raw), fixed)

    # pandas Series
    ser = pd.Series(raw)
    tm.assert_series_equal(handle_zeros_in_scale(ser), pd.Series(fixed))

    # dask array
    darr = da.from_array(ser.values, chunks=2)
    assert_eq_ar(handle_zeros_in_scale(darr), fixed)

    # dask series
    dser = dd.from_dask_array(darr)
    assert_eq_df(handle_zeros_in_scale(dser), pd.Series(fixed))
def test_slice_columns():
    """slice_columns restricts a dask DataFrame to the given columns and
    passes dask arrays through unchanged."""
    cols = [2, 3]
    sliced_frame = slice_columns(df, cols)
    sliced_array = slice_columns(X, cols)
    assert list(sliced_frame.columns) == cols
    assert_eq_df(df[cols].compute(), sliced_frame.compute())
    assert_eq_ar(X.compute(), sliced_array.compute())
def test_transform(self, array):
    """dask-ml LabelEncoder.transform agrees with scikit-learn's on the same data."""
    dask_enc = dpp.LabelEncoder()
    sk_enc = spp.LabelEncoder()
    dask_enc.fit(array)
    sk_enc.fit(array.compute())
    assert_eq_ar(dask_enc.transform(array).compute(),
                 sk_enc.transform(array.compute()))
def test_array_transform(self):
    """PolynomialFeatures on a dask array stays lazy and matches scikit-learn."""
    dask_pf = dpp.PolynomialFeatures()
    sk_pf = spp.PolynomialFeatures()
    dask_res = dask_pf.fit_transform(X)
    sk_res = sk_pf.fit_transform(X.compute())
    assert_estimator_equal(dask_pf, sk_pf)
    # The dask estimator must not eagerly materialize the result.
    assert dask.is_dask_collection(dask_res)
    assert_eq_ar(dask_res, sk_res)
def test_fit_transform_frame(self):
    """QuantileTransformer on a dask DataFrame approximates scikit-learn's output."""
    pdf = pd.DataFrame(np.random.randn(1000, 3))
    ddf = dd.from_pandas(pdf, 2)
    sk_qt = spp.QuantileTransformer()
    dask_qt = dpp.QuantileTransformer()
    expected = sk_qt.fit_transform(pdf)
    result = dask_qt.fit_transform(ddf)
    # Quantiles are estimated from chunks, so only approximate agreement holds.
    assert_eq_ar(result, expected, rtol=1e-3, atol=1e-3)
def test_transform_array(self):
    """fit_transform accepts both numpy and dask arrays and yields equal results."""
    dask_pf = dpp.PolynomialFeatures()
    sk_pf = spp.PolynomialFeatures()
    # numpy array through the dask-ml estimator
    res_numpy = dask_pf.fit_transform(X.compute())
    # dask array through the dask-ml estimator
    res_dask = dask_pf.fit_transform(X).compute()
    # reference: scikit-learn on the materialized data
    res_expected = sk_pf.fit_transform(X.compute())
    assert_eq_ar(res_numpy, res_expected)
    assert_eq_ar(res_dask, res_expected)
def test_df_column_slice(self):
    """MinMaxScaler(columns=...) scales only the named columns of a frame."""
    mask = ["3", "4"]
    mask_ix = [mask.index(col) for col in mask]
    dask_scaler = dpp.MinMaxScaler(columns=mask)
    sk_scaler = spp.MinMaxScaler()
    scaled_frame = dask_scaler.fit_transform(df2).compute()
    scaled_matrix = sk_scaler.fit_transform(df2.compute())
    assert isinstance(scaled_frame, pd.DataFrame)
    # Selected columns match scikit-learn's scaling...
    assert_eq_ar(scaled_frame[mask].values, scaled_matrix[:, mask_ix])
    # ...and the remaining columns pass through untouched.
    assert_eq_df(scaled_frame.drop(mask, axis=1),
                 df2.drop(mask, axis=1).compute())
def _assert_eq(l, r, **kwargs):
    """Assert equality of ``l`` and ``r``, dispatching on the type of ``l``:
    array comparison, frame comparison, element-wise recursion for sequences
    of arrays/frames, and plain ``==`` for everything else."""
    arrays = (np.ndarray, da.Array)
    frames = (pd.core.generic.NDFrame, dd._Frame)
    if isinstance(l, arrays):
        assert_eq_ar(l, r, **kwargs)
        return
    if isinstance(l, frames):
        assert_eq_df(l, r, **kwargs)
        return
    if isinstance(l, Sequence) and any(
        isinstance(item, arrays + frames) for item in l
    ):
        for left, right in zip(l, r):
            _assert_eq(left, right, **kwargs)
        return
    assert l == r
def test_transform(self):
    """RobustScaler.transform matches scikit-learn once the statistics agree."""
    dask_rs = dpp.RobustScaler()
    sk_rs = spp.RobustScaler()
    dask_rs.fit(X)
    sk_rs.fit(X.compute())
    # Overwrite dask-ml's fitted attributes so both transforms use exactly the
    # same statistics (their approximate equality is covered elsewhere).
    dask_rs.scale_ = sk_rs.scale_
    dask_rs.center_ = sk_rs.center_
    assert_eq_ar(dask_rs.transform(X).compute(),
                 sk_rs.transform(X.compute()))
def test_basic(self):
    """QuantileTransformer fit/transform round-trips against scikit-learn."""
    rng = da.random.RandomState(0)
    dask_qt = dpp.QuantileTransformer()
    sk_qt = spp.QuantileTransformer()
    data = rng.uniform(size=(100, 3), chunks=50)
    dask_qt.fit(data)
    sk_qt.fit(data)
    assert_estimator_equal(dask_qt, sk_qt, atol=.02)
    # Copy the reference quantiles so everything below compares exactly.
    dask_qt.quantiles_ = sk_qt.quantiles_
    assert_eq_ar(dask_qt.transform(data), sk_qt.transform(data))
    assert_eq_ar(data, dask_qt.inverse_transform(dask_qt.transform(data)))
def test_df_values(self):
    """MinMaxScaler learns identical statistics from an array and a DataFrame.

    Fix: the final comparison unconditionally read ``result_df.values``, which
    raises ``AttributeError`` when ``fit_transform`` on a frame returns an
    ndarray (version dependent). Guard with ``hasattr``, matching the other
    ``test_df_values`` variant in this file.
    """
    est1 = dpp.MinMaxScaler()
    est2 = dpp.MinMaxScaler()
    result_ar = est1.fit_transform(X)
    result_df = est2.fit_transform(df)
    for attr in [
        'data_min_', 'data_max_', 'data_range_', 'scale_', 'min_'
    ]:
        assert_eq_ar(getattr(est1, attr), getattr(est2, attr).values)
    assert_eq_ar(est1.transform(X), est2.transform(X))
    assert_eq_ar(est1.transform(df).values, est2.transform(X))
    assert_eq_ar(est1.transform(X), est2.transform(df).values)
    # fit_transform on a frame may return a frame or an array; normalize
    # to raw values before comparing.
    if hasattr(result_df, "values"):
        result_df = result_df.values
    assert_eq_ar(result_ar, result_df)
def test_df_values(self):
    """MinMaxScaler fits identically on an array and its DataFrame twin."""
    from_array = dpp.MinMaxScaler()
    from_frame = dpp.MinMaxScaler()
    result_ar = from_array.fit_transform(X)
    result_df = from_frame.fit_transform(df)
    attrs = ["data_min_", "data_max_", "data_range_", "scale_", "min_"]
    for attr in attrs:
        assert_eq_ar(getattr(from_array, attr), getattr(from_frame, attr).values)
    assert_eq_ar(from_array.transform(X), from_frame.transform(X))
    assert_eq_ar(from_array.transform(df).values, from_frame.transform(X))
    assert_eq_ar(from_array.transform(X), from_frame.transform(df).values)
    # The frame result may be a frame or an array; compare raw values.
    if hasattr(result_df, "values"):
        result_df = result_df.values
    assert_eq_ar(result_ar, result_df)
def test_df_transform(self, daskify):
    """PolynomialFeatures on frames: ``preserve_dataframe=True`` keeps a
    DataFrame while the default returns array-backed output; both agree
    with scikit-learn."""
    frame = df if daskify else df.compute()
    keep_frame = dpp.PolynomialFeatures(preserve_dataframe=True)
    as_array = dpp.PolynomialFeatures()
    sk_pf = spp.PolynomialFeatures()
    res_df = keep_frame.fit_transform(frame)
    res_arr = as_array.fit_transform(frame)
    res_c = sk_pf.fit_transform(frame)
    if daskify:
        # Re-fitting on the materialized frame gives the pandas reference.
        res_pandas = keep_frame.fit_transform(frame.compute())
        assert dask.is_dask_collection(res_df)
        assert dask.is_dask_collection(res_arr)
        assert_eq_df(res_df.compute().reset_index(drop=True), res_pandas)
    assert_eq_ar(res_df.values, res_c)
    assert_eq_ar(res_df.values, res_arr)
def test_inverse_transform(self):
    """MinMaxScaler.inverse_transform undoes fit_transform exactly."""
    scaler = dpp.MinMaxScaler()
    round_trip = scaler.inverse_transform(scaler.fit_transform(X))
    assert_eq_ar(round_trip.compute(), X.compute())
def test_df_values(self):
    """RobustScaler learns identical statistics from an array and a DataFrame.

    Fix: both comparisons unconditionally read ``result_df.values``, which
    raises ``AttributeError`` when ``fit_transform`` on a frame returns an
    ndarray (version dependent). Guard with ``hasattr``, matching the other
    RobustScaler ``test_df_values`` variant in this file.
    """
    est1 = dpp.RobustScaler()
    est2 = dpp.RobustScaler()
    result_ar = est1.fit_transform(X)
    result_df = est2.fit_transform(df)
    # fit_transform on a frame may return a frame or an array; normalize
    # to raw values before comparing.
    if hasattr(result_df, "values"):
        result_df = result_df.values
    assert_eq_ar(result_ar, result_df)
    for attr in ['scale_', 'center_']:
        assert_eq_ar(getattr(est1, attr), getattr(est2, attr))
    assert_eq_ar(est1.transform(X), est2.transform(X))
    assert_eq_ar(est1.transform(df).values, est2.transform(X))
    assert_eq_ar(est1.transform(X), est2.transform(df).values)
    # different data types
    df['0'] = df['0'].astype('float32')
    result_ar = est1.fit_transform(X)
    result_df = est2.fit_transform(df)
    if hasattr(result_df, "values"):
        result_df = result_df.values
    assert_eq_ar(result_ar, result_df)
def test_inverse_transform(self, array):
    """LabelEncoder round-trips labels through transform/inverse_transform."""
    enc = dpp.LabelEncoder()
    round_trip = enc.inverse_transform(enc.fit_transform(array))
    assert_eq_ar(round_trip, da.asarray(array))
def test_df_values(self):
    """MinMaxScaler gives the same result for an array and its DataFrame form.

    Fix: ``DataFrame.as_matrix()`` was deprecated in pandas 0.23 and removed
    in pandas 1.0; use ``.values`` (the form every other test here uses).
    """
    a = dpp.MinMaxScaler()
    assert_eq_ar(
        a.fit_transform(X).compute(),
        a.fit_transform(df).compute().values)
def test_df_values(self):
    """RobustScaler learns the same statistics from array and frame input."""
    arr_est = dpp.RobustScaler()
    frame_est = dpp.RobustScaler()

    def raw(result):
        # fit_transform may hand back a frame or an array; compare raw values.
        return result.values if hasattr(result, "values") else result

    assert_eq_ar(arr_est.fit_transform(X), raw(frame_est.fit_transform(df)))
    for attr in ("scale_", "center_"):
        assert_eq_ar(getattr(arr_est, attr), getattr(frame_est, attr))
    assert_eq_ar(arr_est.transform(X), frame_est.transform(X))
    assert_eq_ar(arr_est.transform(df).values, frame_est.transform(X))
    assert_eq_ar(arr_est.transform(X), frame_est.transform(df).values)

    # Re-fit after changing one column's dtype.
    df["0"] = df["0"].astype("float32")
    assert_eq_ar(arr_est.fit_transform(X), raw(frame_est.fit_transform(df)))
def test_inverse_transform(self):
    """StandardScaler's round-trip stays lazy and recovers the input."""
    scaler = dpp.StandardScaler()
    restored = scaler.inverse_transform(scaler.fit_transform(X))
    assert dask.is_dask_collection(restored)
    assert_eq_ar(restored, X)