예제 #1
0
    def test_all_missing(self):
        """
        If every value is missing or unknown, the encoder is expected (with
        its default handling) to fall back to the global mean of the target.
        """
        parent_values = ["z", "z", np.nan, np.nan]
        new_x = pd.DataFrame({
            self.col: ["x", np.nan, "x", np.nan],
            self.parent_col: parent_values,
        })

        encoder = sktools.NestedTargetEncoder(
            cols=self.col,
            feature_mapping={"col_1": self.parent_col},
            m_prior=0,
        )
        encoder.fit(self.X, self.y)

        # The scalar mean is broadcast by pandas over all four rows; the
        # parent column is expected to come back unchanged.
        global_mean = self.y.mean()
        expected_output_df = pd.DataFrame({
            self.col: global_mean,
            self.parent_col: parent_values,
        })

        encoded = encoder.transform(new_x)
        pd.testing.assert_frame_equal(encoded, expected_output_df)
예제 #2
0
    def test_unknown_missing_imputation(self):
        """
        When new categories and missing values are given, we expect the
        encoder to give the parent means (at least with default configuration).
        """

        # First two rows are new categories
        # Last two rows are missing values
        # Parents are e, f, e, f
        # NOTE: use np.nan; the np.NaN alias was removed in NumPy 2.0 and
        # raises AttributeError there.
        new_x = pd.DataFrame({
            self.col: ["x", "y", np.nan, np.nan],
            self.parent_col: self.parents + self.parents,
        })

        # We expect to get parent means
        expected_output_df = pd.DataFrame({
            self.col:
            self.parent_means + self.parent_means,
            self.parent_col:
            self.parents + self.parents,
        })

        te = sktools.NestedTargetEncoder(
            cols=self.col,
            feature_mapping=dict(col_1=self.parent_col),
            m_prior=0,
        )

        te.fit(self.X, self.y)

        pd.testing.assert_frame_equal(te.transform(new_x), expected_output_df)
예제 #3
0
    def test_no_parent(self):
        """
        With both m_prior and m_parent set to 0, the nested encoder should
        produce the same output as MEstimateEncoder with m=0.
        """
        nested_encoder = sktools.NestedTargetEncoder(
            cols=self.col,
            feature_mapping={"col_1": self.parent_col},
            m_prior=0,
            m_parent=0,
        )
        reference_encoder = MEstimateEncoder(cols=self.col, m=0)

        # Fit and transform each encoder on the same data, nested one first.
        nested_result = nested_encoder.fit_transform(self.X, self.y)
        reference_result = reference_encoder.fit_transform(self.X, self.y)

        pd.testing.assert_frame_equal(nested_result, reference_result)
예제 #4
0
    def test_numpy_array(self):
        """
        Check that the nested target encoder also accepts numpy arrays, with
        columns addressed by integer position instead of by name.
        """
        # Build the expectation as a frame first, then drop to a raw array so
        # it can be compared column by column with the encoder output.
        expected_frame = pd.DataFrame({
            "col_1": [1.6, 1.6, 1.95, 1.95, 1.95, 4.1, 4.1, 4.45, 4.45, 4.45],
            "parent_col_1": self.X[self.parent_col],
        })
        expected_output = expected_frame.values

        # Column 0 is encoded and column 1 is mapped as its parent.
        encoder = sktools.NestedTargetEncoder(
            cols=0,
            feature_mapping={0: 1},
            m_prior=0,
        )
        encoder.fit(self.X_array, self.y)
        transformed = encoder.transform(self.X_array)
        output = transformed.values

        # Encoded column: compared with floating-point tolerance.
        encoded_actual = output[:, 0]
        encoded_expected = expected_output[:, 0]
        np.testing.assert_almost_equal(encoded_actual, encoded_expected)

        # Parent column: compared for exact equality.
        parent_actual = output[:, 1]
        parent_expected = expected_output[:, 1]
        np.testing.assert_equal(parent_actual, parent_expected)
예제 #5
0
    def test_parent_prior(self):
        """
        Simple case: only the parent group acts as a prior.

        m_prior = 0, so the global mean does not pull on the parent group
        mean. m_parent is not passed, so the encoder default applies (the
        arithmetic below assumes it is 1). Taking the mean of group e to be
        1.8, the encoding for category a is:
        (1 + 2 + mean_group_e) / 3 = (1 + 2 + 1.8) / 3 = 1.6
        The same reasoning applies to b, c and d.
        """
        encoded_values = [
            1.6, 1.6, 1.95, 1.95, 1.95, 4.1, 4.1, 4.45, 4.45, 4.45,
        ]
        expected_output = pd.DataFrame({
            "col_1": encoded_values,
            "parent_col_1": self.X[self.parent_col],
        })

        encoder = sktools.NestedTargetEncoder(
            cols=self.col,
            feature_mapping={"col_1": self.parent_col},
            m_prior=0,
        )
        result = encoder.fit_transform(self.X, self.y)

        pd.testing.assert_frame_equal(result, expected_output)
예제 #6
0
    def test_missing_na(self):
        """
        With handle_unknown="return_nan" and handle_missing="value", unseen
        categories are expected to encode as NaN while missing values are
        still expected to receive the parent means.
        """
        # Parents are e, f, e, f (the fixture's parents repeated twice).
        repeated_parents = self.parents + self.parents

        # Rows 0-1 carry new categories; rows 2-3 carry missing values.
        new_x = pd.DataFrame({
            self.col: ["x", "y", np.nan, np.nan],
            self.parent_col: repeated_parents,
        })

        # Expected encoding:
        # - NaN for the two unknown categories
        # - parent means for the two missing values
        unknown_encoding = [np.nan, np.nan]
        expected_output_df = pd.DataFrame({
            self.col: unknown_encoding + self.parent_means,
            self.parent_col: repeated_parents,
        })

        encoder = sktools.NestedTargetEncoder(
            cols=self.col,
            feature_mapping={"col_1": self.parent_col},
            m_prior=0,
            handle_missing="value",
            handle_unknown="return_nan",
        )
        encoder.fit(self.X, self.y)

        encoded = encoder.transform(new_x)
        pd.testing.assert_frame_equal(encoded, expected_output_df)