def test_quantile_bins():
    """Exercise QuantileBucketer fitting and its ``quantile_bins`` helper.

    Covers, in order: the NotFittedError raised by ``compute`` before
    fitting, the fitted ``counts_`` and ``boundaries_`` for a seeded normal
    sample, agreement between the fitted attributes and a direct
    ``quantile_bins`` call, the ``inf_edges=True`` option, and the repr
    prefix.
    """
    n_bins = 4
    rng = np.random.RandomState(0)
    sample = rng.normal(0, 1, size=1000)
    bucketer = QuantileBucketer(bin_count=n_bins)

    # An unfitted bucketer must refuse to compute.
    with pytest.raises(NotFittedError):
        bucketer.compute([1, 2])

    bucketer.fit(sample)

    # 1000 points over 4 bins are expected to split into 250 per bin.
    expected_counts = np.array([250, 250, 250, 250])
    assert len(bucketer.counts_) == n_bins
    assert np.array_equal(bucketer.counts_, expected_counts)

    # Boundaries are only compared to one decimal place.
    expected_edges = np.array([-3.0, -0.7, -0.1, 0.6, 2.8])
    assert len(bucketer.boundaries_) == n_bins + 1
    np.testing.assert_array_almost_equal(bucketer.boundaries_,
                                         expected_edges,
                                         decimal=1)

    # Calling quantile_bins directly must agree with the fitted attributes.
    direct_counts, direct_edges = QuantileBucketer(
        bin_count=n_bins).quantile_bins(sample, n_bins)
    assert np.array_equal(bucketer.counts_, direct_counts)
    np.testing.assert_array_almost_equal(bucketer.boundaries_, direct_edges)

    # With inf_edges=True the outermost boundaries become -inf / +inf.
    _, open_edges = QuantileBucketer(bin_count=n_bins).quantile_bins(
        sample, n_bins, inf_edges=True)
    assert open_edges[0] == -np.inf
    assert open_edges[-1] == np.inf

    assert repr(bucketer).startswith('QuantileBucketer')
def test_quantile_with_unique_values():
    """Check the bin counts ``quantile_bins`` yields for heavily tied data.

    The input is 20 uniform draws stacked with 20 zeros, with 10 bins
    requested; the expected counts contain only 6 bins.
    """
    np.random.seed(42)
    uniform_part = np.random.uniform(size=20)
    zero_part = np.zeros(shape=20)
    skewed_dist = np.hstack((uniform_part, zero_part))

    actual_out = QuantileBucketer(10).quantile_bins(skewed_dist, 10)

    expected_counts = np.array([20, 4, 4, 4, 4, 4])
    # NOTE(review): these boundaries are never asserted below; confirm they
    # still match the implementation before adding a comparison.
    expected_boundaries = np.array([
        0., 0.01894458, 0.23632033, 0.42214475, 0.60977678, 0.67440958,
        0.99940487
    ])
    expected_out = (expected_counts, expected_boundaries)

    # Only the counts (first element of the returned pair) are compared.
    assert (actual_out[0] == expected_out[0]).all()
Example #3
0
    def _target_rate_plot(self,
                          feature,
                          bins=10,
                          type_binning="simple",
                          ax=None):
        """
        Plot the distribution of a feature together with the target rate per bin.

        A histogram of the feature values is drawn on ``ax``; the mean of the
        target within each bin is drawn as a red line on a twin y-axis.

        Args:
            feature (str or int):
                Feature for which to create target rate plot.

            bins (int or list[float]), optional:
                Number of bins or boundaries of desired bins in list. A
                supplied list is copied and never modified.

            type_binning ({'simple', 'agglomerative', 'quantile'}, optional):
                Type of binning strategy used to create bins. Only consulted
                when ``bins`` is an integer.

            ax (matplotlib.pyplot.axes, optional):
                Axis on which to draw plot.
                NOTE(review): despite the ``None`` default the axis is used
                unconditionally, so callers currently have to pass one.

        Returns:
            (numpy.ndarray, matplotlib.pyplot.axes, pandas.Series):
                Tuple of the bin boundaries used (with the last boundary
                increased by 1, see below), the axis on which the histogram
                is drawn, and the mean of the target per bin, indexed by bin
                number.

        Raises:
            ValueError:
                If ``bins`` is an integer and ``type_binning`` is not one of
                the supported strategies.
        """
        x, y, _ = self._get_X_y_shap_with_q_cut(feature=feature)

        # Create bins if not explicitly supplied
        if isinstance(bins, (int, np.integer)):
            if type_binning == "simple":
                _, bins = SimpleBucketer.simple_bins(x, bins)
            elif type_binning == "agglomerative":
                _, bins = AgglomerativeBucketer.agglomerative_clustering_binning(
                    x, bins)
            elif type_binning == "quantile":
                _, bins = QuantileBucketer.quantile_bins(x, bins)
            else:
                raise ValueError(
                    f"Unknown type_binning {type_binning!r}; expected "
                    f"'simple', 'agglomerative' or 'quantile'.")

        # Work on a copy so that boundaries passed in by the caller are not
        # modified by the adjustment below.
        bins = np.array(bins)

        # np.digitize uses half-open intervals [left, right), so a value equal
        # to the last boundary would land outside the last bin. Widening the
        # last boundary keeps such values inside it.
        bins[-1] = bins[-1] + 1
        indices = np.digitize(x, bins)

        # Create dataframe with binned data
        dfs = pd.DataFrame({
            feature: x,
            "y": y,
            "bin_index": pd.Series(indices, index=x.index)
        }).groupby("bin_index", as_index=True)

        # Extract target ratio and mean feature value
        target_ratio = dfs["y"].mean()
        x_vals = dfs[feature].mean()

        # Plot target rate
        ax.hist(x, bins=bins, lw=2, alpha=0.4)
        ax.set_ylabel("Counts")
        ax2 = ax.twinx()
        ax2.plot(x_vals, target_ratio, color="red")
        ax2.set_ylabel("Target rate", color="red", fontsize=12)
        ax2.set_xlim(x.min(), x.max())
        ax.set_xlabel(f'{feature} feature values')

        return bins, ax, target_ratio
def test_compute():
    """Check ``QuantileBucketer.compute`` before and after fitting.

    Before fitting, ``compute`` must raise NotFittedError. After fitting on
    ``np.arange(10)`` with 5 bins, ``compute`` and ``fit_compute`` on the
    same data must reproduce ``counts_``, and new data is counted against
    the fitted bins as listed in the case table below.
    """
    n_bins = 5
    x = np.arange(10)
    bucketer = QuantileBucketer(n_bins)

    # Call compute() bare: wrapping it in `assert` adds nothing inside
    # pytest.raises and would turn a missing exception into an unrelated
    # truth-value error instead of pytest's "DID NOT RAISE".
    with pytest.raises(NotFittedError):
        bucketer.compute(x)

    bucketer.fit(x)
    assert len(bucketer.compute(x)) == n_bins
    np.testing.assert_array_equal(bucketer.counts_, bucketer.compute(x))
    np.testing.assert_array_equal(bucketer.counts_, bucketer.fit_compute(x))

    # (new data, expected count per fitted bin)
    cases = [
        (x + 100, [0, 0, 0, 0, 0]),
        (x - 100, [10, 0, 0, 0, 0]),
        ([1, 1, 1, 4, 4, 7], [3, 0, 2, 1, 0]),
    ]
    for new_data, expected_counts in cases:
        np.testing.assert_array_equal(np.array(expected_counts),
                                      bucketer.compute(new_data))