Exemplo n.º 1
0
def test_internal_target_encoder_fit_and_transform():
    """Check `_TargetEncoder.fit()` followed by `transform()` on NumPy input.

    Only the first feature column (categories 2 and 8) is encoded. The
    encoder is fitted with a 2-split, unshuffled `KFold` and the encoded
    values are compared against hard-coded expectations.
    """
    features = np.array(
        [[2, 2], [2, 4], [2, 6], [8, 7], [8, 8], [8, 9], [8, 10]])
    target = np.array([1, 1, 0, 1, 1, 1, 0])

    kfold = KFold(n_splits=2, shuffle=False)
    # Sanity check on the splitter: the first held-out fold must be rows 0..3,
    # because the expected values below depend on that split.
    _, first_holdout = next(kfold.split(features))
    assert np.array_equal(first_holdout, np.array([0, 1, 2, 3]))

    encoder = _TargetEncoder(fold=kfold)
    category = features[:, 0]

    # `fit()` then `transform()` on the very same column.
    encoder.fit(category, target)
    encoded_train = encoder.transform(category)
    expected_train = np.array([0.33333334] * 3 + [0.83333334] * 4)
    assert np.allclose(encoded_train, expected_train)

    # Category 0 never appears in the fitted column and is expected to map
    # to 0.
    encoded_new = encoder.transform(np.array([8, 0, 2]))
    expected_new = np.array([0.83333334, 0., 0.33333334])
    assert np.allclose(encoded_new, expected_new)
Exemplo n.º 2
0
def test_internal_target_encoder_with_cudf():
    """Check `_TargetEncoder` on cuDF input.

    The test is a no-op (early return) when cuDF or CuPy is not usable.
    Expected values mirror the NumPy-based test of the same encoder.
    """
    # Run only when the GPU stack is actually importable. Every statement
    # below dereferences `cudf`/`cupy`, so both must be real modules here.
    if not cudf_is_available() or cudf is None or cupy is None:
        # Skip test.
        return

    data = np.array([[2, 2], [2, 4], [2, 6], [8, 7], [8, 8], [8, 9], [8, 10]])
    # A cuDF Series is one-dimensional, so wrap only the column under test
    # instead of the whole 2-D array.
    X = cudf.Series(data[:, 0])
    y = cudf.Series(np.array([1, 1, 0, 1, 1, 1, 0]))

    fold = KFold(n_splits=2, shuffle=False)
    # The split indices depend only on the number of rows, so the host array
    # is enough for this sanity check.
    _, tst_idx = next(fold.split(data))
    assert np.array_equal(tst_idx, np.array([0, 1, 2, 3]))

    encoder = _TargetEncoder(fold=fold)

    # Test `fit_transform()`.
    # NOTE(review): assumes the encoder returns an object exposing `.values`
    # for cuDF input (as the original assertions did) -- confirm on a GPU box.
    y_trn = encoder.fit_transform(X, y)
    # Copy device data to the host before comparing with a NumPy array.
    assert np.allclose(
        cupy.asnumpy(y_trn.values),
        np.array([
            0.0,
            0.0,
            0.0,
            0.66666667,
            1.0,
            1.0,
            1.0,
        ]))

    # Transform cuDF input so the result type matches the `.values` access.
    X_tst = cudf.Series(np.array([8, 0, 2]))
    y_tst = encoder.transform(X_tst)
    assert np.allclose(
        cupy.asnumpy(y_tst.values),
        np.array([0.83333334, 0., 0.33333334]),
    )
Exemplo n.º 3
0
    def fit(self, input_df: XDataFrame, y: XSeries = None) -> None:
        """Fit one `_TargetEncoder` per input column.

        When no input columns are configured, every column of `input_df` is
        used. The target column is dropped from the column list, and each
        fitted encoder is stored in `self._target_encoders` under its column
        name.

        Args:
            input_df: Frame holding the columns to encode.
            y: Target values. When `None`, `input_df[self._target_col]` is
                used instead.
        """
        # Fall back to all columns of the frame if none were configured.
        if not self._input_cols:
            self._input_cols = input_df.columns.tolist()

        columns = self._input_cols
        target_name = self._target_col

        # The target itself must never be encoded.
        if target_name in columns:
            columns.remove(target_name)

        for name in columns:
            encoder = self._target_encoders[name] = _TargetEncoder(self.fold)
            # Resolve the target lazily so that nothing is read from the
            # frame when there is no column to encode.
            if y is None:
                y = input_df[target_name]
            encoder.fit(input_df[name], y)
Exemplo n.º 4
0
    def fit_transform(self, input_df: XDataFrame, y: XSeries = None) -> XDataFrame:
        """Fit a target encoder per input column and return the encoded frame.

        When no input columns are configured, every column of `input_df` is
        used. The target column is dropped from the column list, and each
        fitted encoder is stored in `self._target_encoders` under its column
        name.

        Args:
            input_df: Frame holding the columns to encode (and the target
                column when `y` is omitted).
            y: Target values. When `None`, `input_df[self._target_col]` is
                used instead.

        Returns:
            A copy of `input_df` with one encoded column per input column,
            named `self._output_prefix + col + self._output_suffix`. When
            `self.noise_level > 0`, Gaussian noise with that standard
            deviation is added to the encoded columns only.

        Raises:
            TypeError: If a column is neither a pandas nor a cuDF series.
        """
        out_df = input_df.copy()

        # Fall back to all columns of the frame if none were configured.
        if not self._input_cols:
            self._input_cols = input_df.columns.tolist()

        # Remove `target_col` from `self._input_cols`.
        if self._target_col in self._input_cols:
            self._input_cols.remove(self._target_col)

        # Use a private generator so fitting does not reseed NumPy's global
        # RNG as a side effect; results stay reproducible via `random_state`.
        rng = None
        if self.noise_level > 0:
            rng = np.random.RandomState(self.random_state)

        for col in self._input_cols:
            out_col = self._output_prefix + col + self._output_suffix
            target_encoder = _TargetEncoder(self.fold)
            self._target_encoders[col] = target_encoder

            column = input_df[col]
            is_pandas = isinstance(column, pd.Series)
            if is_pandas:
                X = column_or_1d(column, warn=True)
                y = column_or_1d(
                    input_df[self._target_col] if y is None else y,
                    warn=True,
                )
            elif cudf and isinstance(column, cudf.Series):
                X = column
                if y is None:
                    y = input_df[self._target_col]
            else:
                raise TypeError(
                    "Column {!r} has unsupported type {}; expected a pandas "
                    "or cuDF Series.".format(col, type(column).__name__)
                )

            out_df[out_col] = target_encoder.fit_transform(X, y).copy()

            if rng is not None:
                # Perturb only the freshly encoded column. Adding noise to the
                # whole frame would also corrupt the raw inputs and the target
                # and fails outright on non-numeric categorical columns.
                noise = rng.normal(0, self.noise_level, len(out_df))
                if not is_pandas:
                    # Move the noise into a cuDF series aligned with the frame.
                    noise = cudf.Series(noise, index=out_df.index)
                out_df[out_col] = out_df[out_col] + noise

        return out_df