예제 #1
0
def test_handle_zeros_in_scale():
    """handle_zeros_in_scale replaces zeros with ones across container types."""
    s2 = handle_zeros_in_scale(s)
    a2 = handle_zeros_in_scale(a)

    expected_list = [1, 1, 2, 3, 1]
    assert list(s2.compute()) == expected_list
    assert list(a2.compute()) == expected_list

    # numpy array: the zero entry becomes 1, everything else is untouched
    arr = np.array([1, 2, 3, 0], dtype="f8")
    want = np.array([1, 2, 3, 1], dtype="f8")
    np.testing.assert_array_equal(handle_zeros_in_scale(arr), want)

    # pandas Series built from the same data
    ser = pd.Series(arr)
    want_ser = pd.Series(want)
    tm.assert_series_equal(handle_zeros_in_scale(ser), want_ser)

    # dask array
    darr = da.from_array(ser.values, chunks=2)
    want_vals = want_ser.values
    assert_eq_ar(handle_zeros_in_scale(darr), want_vals)

    # dask series
    dser = dd.from_dask_array(darr)
    assert_eq_df(handle_zeros_in_scale(dser), pd.Series(want_vals))
예제 #2
0
def test_slice_columns():
    """slice_columns selects the requested columns from frames and arrays."""
    wanted = [2, 3]
    sliced_df = slice_columns(df, wanted)
    sliced_arr = slice_columns(X, wanted)

    assert list(sliced_df.columns) == wanted
    assert_eq_df(df[wanted].compute(), sliced_df.compute())
    assert_eq_ar(X.compute(), sliced_arr.compute())
예제 #3
0
    def test_transform(self, array):
        """dask-ml and sklearn LabelEncoder transforms agree on fitted data."""
        dask_enc = dpp.LabelEncoder()
        sk_enc = spp.LabelEncoder()

        dask_enc.fit(array)
        sk_enc.fit(array.compute())

        result = dask_enc.transform(array).compute()
        expected = sk_enc.transform(array.compute())
        assert_eq_ar(result, expected)
예제 #4
0
    def test_array_transform(self):
        """fit_transform on a dask array stays lazy and matches sklearn."""
        dask_pf = dpp.PolynomialFeatures()
        sk_pf = spp.PolynomialFeatures()

        dask_res = dask_pf.fit_transform(X)
        sk_res = sk_pf.fit_transform(X.compute())

        assert_estimator_equal(dask_pf, sk_pf)
        assert dask.is_dask_collection(dask_res)
        assert_eq_ar(dask_res, sk_res)
예제 #5
0
    def test_fit_transform_frame(self):
        """QuantileTransformer on a dask frame approximates sklearn's output."""
        pdf = pd.DataFrame(np.random.randn(1000, 3))
        dframe = dd.from_pandas(pdf, 2)

        sk_qt = spp.QuantileTransformer()
        dask_qt = dpp.QuantileTransformer()

        want = sk_qt.fit_transform(pdf)
        got = dask_qt.fit_transform(dframe)
        # quantile estimation on partitions is approximate, hence tolerances
        assert_eq_ar(got, want, rtol=1e-3, atol=1e-3)
예제 #6
0
    def test_transform_array(self):
        """fit_transform accepts numpy or dask arrays with identical results."""
        dask_pf = dpp.PolynomialFeatures()
        sk_pf = spp.PolynomialFeatures()

        from_numpy = dask_pf.fit_transform(X.compute())  # numpy input
        from_dask = dask_pf.fit_transform(X).compute()   # dask input
        expected = sk_pf.fit_transform(X.compute())

        assert_eq_ar(from_numpy, expected)
        assert_eq_ar(from_dask, expected)
예제 #7
0
    def test_df_column_slice(self):
        """MinMaxScaler with a column mask scales only the masked columns."""
        mask = ["3", "4"]
        mask_ix = [mask.index(col) for col in mask]
        masked_scaler = dpp.MinMaxScaler(columns=mask)
        full_scaler = spp.MinMaxScaler()

        scaled_df = masked_scaler.fit_transform(df2).compute()
        scaled_mx = full_scaler.fit_transform(df2.compute())

        assert isinstance(scaled_df, pd.DataFrame)
        assert_eq_ar(scaled_df[mask].values, scaled_mx[:, mask_ix])
        # columns outside the mask must pass through untouched
        assert_eq_df(scaled_df.drop(mask, axis=1), df2.drop(mask, axis=1).compute())
예제 #8
0
def _assert_eq(l, r, **kwargs):
    """Dispatch an equality assertion on the type of ``l``.

    Arrays go through ``assert_eq_ar``, frames through ``assert_eq_df``,
    sequences containing arrays/frames are compared element-wise
    (recursively), and anything else falls back to plain ``==``.
    ``kwargs`` (e.g. tolerances) are forwarded to the array/frame helpers.
    """
    array_types = (np.ndarray, da.Array)
    frame_types = (pd.core.generic.NDFrame, dd._Frame)
    if isinstance(l, array_types):
        assert_eq_ar(l, r, **kwargs)
    elif isinstance(l, frame_types):
        assert_eq_df(l, r, **kwargs)
    elif isinstance(l, Sequence) and any(
            isinstance(x, array_types + frame_types) for x in l):
        # zip() would silently truncate on a length mismatch, hiding a
        # real difference in a test helper; fail loudly instead.
        assert len(l) == len(r)
        for a, b in zip(l, r):
            _assert_eq(a, b, **kwargs)
    else:
        assert l == r
예제 #9
0
    def test_transform(self):
        """With identical fitted stats, RobustScaler transforms match sklearn."""
        dask_rs = dpp.RobustScaler()
        sk_rs = spp.RobustScaler()

        dask_rs.fit(X)
        sk_rs.fit(X.compute())

        # copy sklearn's fitted attributes so the comparison below is exact
        # (approximate equality of the fits is tested above)
        dask_rs.scale_ = sk_rs.scale_
        dask_rs.center_ = sk_rs.center_

        assert_eq_ar(dask_rs.transform(X).compute(), sk_rs.transform(X.compute()))
예제 #10
0
    def test_basic(self):
        """QuantileTransformer fits, transforms, and round-trips on dask input."""
        rng = da.random.RandomState(0)
        dask_qt = dpp.QuantileTransformer()
        sk_qt = spp.QuantileTransformer()

        data = rng.uniform(size=(100, 3), chunks=50)
        dask_qt.fit(data)
        sk_qt.fit(data)
        assert_estimator_equal(dask_qt, sk_qt, atol=.02)

        # share sklearn's quantiles so everything from here on is exact
        dask_qt.quantiles_ = sk_qt.quantiles_
        assert_eq_ar(dask_qt.transform(data), sk_qt.transform(data))
        assert_eq_ar(data, dask_qt.inverse_transform(dask_qt.transform(data)))
예제 #11
0
    def test_df_values(self):
        """MinMaxScaler treats a dask frame and its array form identically."""
        arr_scaler = dpp.MinMaxScaler()
        df_scaler = dpp.MinMaxScaler()

        from_arr = arr_scaler.fit_transform(X)
        from_df = df_scaler.fit_transform(df)

        # fitted attributes agree (frame fit produces pandas-backed attrs)
        fitted_attrs = [
            'data_min_', 'data_max_', 'data_range_', 'scale_', 'min_'
        ]
        for attr in fitted_attrs:
            assert_eq_ar(getattr(arr_scaler, attr), getattr(df_scaler, attr).values)

        # transforms agree regardless of which input type each scaler saw
        assert_eq_ar(arr_scaler.transform(X), df_scaler.transform(X))
        assert_eq_ar(arr_scaler.transform(df).values, df_scaler.transform(X))
        assert_eq_ar(arr_scaler.transform(X), df_scaler.transform(df).values)

        assert_eq_ar(from_arr, from_df.values)
예제 #12
0
    def test_df_values(self):
        """MinMaxScaler gives matching results for array and frame input."""
        arr_scaler = dpp.MinMaxScaler()
        df_scaler = dpp.MinMaxScaler()

        from_arr = arr_scaler.fit_transform(X)
        from_df = df_scaler.fit_transform(df)

        # fitted attributes agree once the pandas-backed ones are unwrapped
        for attr in ("data_min_", "data_max_", "data_range_", "scale_", "min_"):
            assert_eq_ar(getattr(arr_scaler, attr), getattr(df_scaler, attr).values)

        # transforms agree regardless of which input type each scaler saw
        assert_eq_ar(arr_scaler.transform(X), df_scaler.transform(X))
        assert_eq_ar(arr_scaler.transform(df).values, df_scaler.transform(X))
        assert_eq_ar(arr_scaler.transform(X), df_scaler.transform(df).values)

        # unwrap a frame result before the array comparison
        from_df = getattr(from_df, "values", from_df)
        assert_eq_ar(from_arr, from_df)
예제 #13
0
    def test_df_transform(self, daskify):
        """preserve_dataframe keeps frame output; results match sklearn."""
        frame = df if daskify else df.compute()
        keep_df = dpp.PolynomialFeatures(preserve_dataframe=True)
        to_arr = dpp.PolynomialFeatures()
        sk_pf = spp.PolynomialFeatures()

        res_df = keep_df.fit_transform(frame)
        res_arr = to_arr.fit_transform(frame)
        res_sk = sk_pf.fit_transform(frame)
        if daskify:
            # dask input: output stays lazy and matches the eager result
            res_pandas = keep_df.fit_transform(frame.compute())
            assert dask.is_dask_collection(res_df)
            assert dask.is_dask_collection(res_arr)
            assert_eq_df(res_df.compute().reset_index(drop=True), res_pandas)
        assert_eq_ar(res_df.values, res_sk)
        assert_eq_ar(res_df.values, res_arr)
예제 #14
0
 def test_inverse_transform(self):
     """MinMaxScaler inverse_transform undoes fit_transform."""
     scaler = dpp.MinMaxScaler()
     roundtrip = scaler.inverse_transform(scaler.fit_transform(X))
     assert_eq_ar(roundtrip.compute(), X.compute())
예제 #15
0
    def test_df_values(self):
        """RobustScaler handles arrays, frames, and mixed dtypes alike."""
        arr_scaler = dpp.RobustScaler()
        df_scaler = dpp.RobustScaler()

        from_arr = arr_scaler.fit_transform(X)
        from_df = df_scaler.fit_transform(df)
        assert_eq_ar(from_arr, from_df.values)

        for attr in ('scale_', 'center_'):
            assert_eq_ar(getattr(arr_scaler, attr), getattr(df_scaler, attr))

        assert_eq_ar(arr_scaler.transform(X), df_scaler.transform(X))
        assert_eq_ar(arr_scaler.transform(df).values, df_scaler.transform(X))
        assert_eq_ar(arr_scaler.transform(X), df_scaler.transform(df).values)

        # refit after narrowing one column's dtype: results still agree
        df['0'] = df['0'].astype('float32')
        from_arr = arr_scaler.fit_transform(X)
        from_df = df_scaler.fit_transform(df)
        assert_eq_ar(from_arr, from_df.values)
예제 #16
0
    def test_inverse_transform(self, array):
        """LabelEncoder inverse_transform recovers the original labels."""
        encoder = dpp.LabelEncoder()
        roundtrip = encoder.inverse_transform(encoder.fit_transform(array))
        assert_eq_ar(roundtrip, da.asarray(array))
예제 #17
0
 def test_df_values(self):
     """MinMaxScaler yields the same values for array and frame input."""
     a = dpp.MinMaxScaler()
     # ``DataFrame.as_matrix`` was deprecated in pandas 0.23 and removed
     # in pandas 1.0; ``.values`` is the supported equivalent (and matches
     # the sibling tests in this file).
     assert_eq_ar(
         a.fit_transform(X).compute(),
         a.fit_transform(df).compute().values)
예제 #18
0
    def test_df_values(self):
        """RobustScaler agrees across array/frame input and mixed dtypes."""
        arr_scaler = dpp.RobustScaler()
        df_scaler = dpp.RobustScaler()

        from_arr = arr_scaler.fit_transform(X)
        from_df = df_scaler.fit_transform(df)
        # unwrap a frame result before the array comparison
        from_df = getattr(from_df, "values", from_df)
        assert_eq_ar(from_arr, from_df)

        for attr in ("scale_", "center_"):
            assert_eq_ar(getattr(arr_scaler, attr), getattr(df_scaler, attr))

        assert_eq_ar(arr_scaler.transform(X), df_scaler.transform(X))
        assert_eq_ar(arr_scaler.transform(df).values, df_scaler.transform(X))
        assert_eq_ar(arr_scaler.transform(X), df_scaler.transform(df).values)

        # refit after narrowing one column to float32: results still agree
        df["0"] = df["0"].astype("float32")
        from_arr = arr_scaler.fit_transform(X)
        from_df = df_scaler.fit_transform(df)
        from_df = getattr(from_df, "values", from_df)
        assert_eq_ar(from_arr, from_df)
예제 #19
0
 def test_inverse_transform(self):
     """StandardScaler inverse_transform stays lazy and round-trips."""
     scaler = dpp.StandardScaler()
     roundtrip = scaler.inverse_transform(scaler.fit_transform(X))
     assert dask.is_dask_collection(roundtrip)
     assert_eq_ar(roundtrip, X)