def test_quantile_bins():
    """Exercise QuantileBucketer on 1000 normal samples split into 4 quantile bins."""
    n_bins = 4
    rng = np.random.RandomState(0)
    sample = rng.normal(0, 1, size=1000)
    bucketer = QuantileBucketer(bin_count=n_bins)

    # Calling compute() before fit() is expected to raise.
    with pytest.raises(NotFittedError):
        bucketer.compute([1, 2])

    bucketer.fit(sample)

    # Fitted counts: one entry per bin, 250 samples each.
    expected_counts = np.array([250, 250, 250, 250])
    assert len(bucketer.counts_) == n_bins
    assert np.array_equal(bucketer.counts_, expected_counts)

    # Fitted boundaries: one more edge than bins, compared to one decimal.
    expected_boundaries = np.array([-3.0, -0.7, -0.1, 0.6, 2.8])
    assert len(bucketer.boundaries_) == n_bins + 1
    np.testing.assert_array_almost_equal(
        bucketer.boundaries_, expected_boundaries, decimal=1
    )

    # test static method: quantile_bins must reproduce the fitted attributes
    direct_counts, direct_boundaries = QuantileBucketer(
        bin_count=n_bins
    ).quantile_bins(sample, n_bins)
    assert np.array_equal(bucketer.counts_, direct_counts)
    np.testing.assert_array_almost_equal(bucketer.boundaries_, direct_boundaries)

    # test inf edges: the outermost boundaries become -inf / +inf
    _, inf_boundaries = QuantileBucketer(bin_count=n_bins).quantile_bins(
        sample, n_bins, inf_edges=True
    )
    assert inf_boundaries[0] == -np.inf
    assert inf_boundaries[-1] == np.inf

    assert repr(bucketer).startswith('QuantileBucketer')
def test_quantile_with_unique_values():
    """Check quantile_bins counts on data where half of the values are exactly 0.

    Ten bins are requested on 20 uniform draws stacked with 20 zeros; the
    expected result has 6 bin counts.
    """
    np.random.seed(42)
    dist_0_1 = np.random.uniform(size=20)
    dist_peak_at_0 = np.zeros(shape=20)
    skewed_dist = np.hstack((dist_0_1, dist_peak_at_0))

    actual_out = QuantileBucketer(10).quantile_bins(skewed_dist, 10)

    expected_out = (
        np.array([20, 4, 4, 4, 4, 4]),
        np.array([
            0., 0.01894458, 0.23632033, 0.42214475, 0.60977678, 0.67440958,
            0.99940487
        ]),
    )

    # assert_array_equal also checks the shape and prints both arrays on
    # failure, unlike a bare `(a == b).all()`.
    np.testing.assert_array_equal(actual_out[0], expected_out[0])
    # NOTE(review): the reference boundaries in expected_out[1] are not
    # asserted (same as before this change). Confirm they match this seed
    # before adding a check on actual_out[1].
def _target_rate_plot(self, feature, bins=10, type_binning="simple", ax=None):
    """
    Plot the distribution of a feature together with the target rate per bin.

    Args:
        feature (str or int):
            Feature for which to create target rate plot.

        bins (int or list[float], optional):
            Number of bins or boundaries of desired bins in list. Explicit
            boundaries are copied, so the caller's object is not modified.

        type_binning ({'simple', 'agglomerative', 'quantile'}, optional):
            Type of binning strategy used to create bins. Only used when
            `bins` is a number of bins.

        ax (matplotlib.pyplot.axes, optional):
            Optional axis on which to draw plot. When omitted, a new figure
            and axis are created.

    Returns:
        (list[float] or np.ndarray, matplotlib.pyplot.axes, pd.Series):
            Tuple of boundaries of bins used (with the last boundary
            increased by 1), axis on which plot is drawn, and the mean of the
            target for each bin index.

    Raises:
        ValueError: If `bins` is a number and `type_binning` is not one of
            the supported strategies.
    """
    # The third returned element (shap values) is not needed for this plot.
    x, y, _ = self._get_X_y_shap_with_q_cut(feature=feature)

    # Create bins if not explicitly supplied
    if isinstance(bins, (int, np.integer)):
        if type_binning == "simple":
            _, bins = SimpleBucketer.simple_bins(x, bins)
        elif type_binning == "agglomerative":
            _, bins = AgglomerativeBucketer.agglomerative_clustering_binning(
                x, bins)
        elif type_binning == "quantile":
            _, bins = QuantileBucketer.quantile_bins(x, bins)
        else:
            raise ValueError(
                f"Unsupported type_binning {type_binning!r}; expected one of "
                "'simple', 'agglomerative', 'quantile'."
            )
    elif isinstance(bins, np.ndarray):
        # Copy: the last boundary is modified below and the caller's array
        # must stay untouched.
        bins = bins.copy()
    else:
        bins = list(bins)

    # Determine bin for datapoints. np.digitize uses half-open intervals by
    # default, so the last boundary is widened to keep values equal to it
    # inside the last bin.
    bins[-1] = bins[-1] + 1
    indices = np.digitize(x, bins)

    # Create dataframe with binned data (x must expose a pandas index)
    dfs = pd.DataFrame({
        feature: x,
        "y": y,
        "bin_index": pd.Series(indices, index=x.index)
    }).groupby("bin_index", as_index=True)

    # Extract target ratio and mean feature value per bin
    target_ratio = dfs["y"].mean()
    x_vals = dfs[feature].mean()

    if ax is None:
        # Imported lazily: only needed when the caller did not supply an axis.
        import matplotlib.pyplot as plt

        _, ax = plt.subplots()

    # Plot target rate on a secondary y-axis over the histogram of the feature
    ax.hist(x, bins=bins, lw=2, alpha=0.4)
    ax.set_ylabel("Counts")
    ax2 = ax.twinx()
    ax2.plot(x_vals, target_ratio, color="red")
    ax2.set_ylabel("Target rate", color="red", fontsize=12)
    ax2.set_xlim(x.min(), x.max())
    ax.set_xlabel(f'{feature} feature values')

    return bins, ax, target_ratio
def test_compute():
    """Check QuantileBucketer.compute before and after fitting np.arange(10) into 5 bins."""
    x = np.arange(10)
    bins = 5
    bucketer = QuantileBucketer(bins)

    # compute() on an unfitted bucketer must raise. No `assert` around the
    # call: it would never be evaluated when the exception is raised.
    with pytest.raises(NotFittedError):
        bucketer.compute(x)

    bucketer.fit(x)

    # On the fitted data, compute() and fit_compute() reproduce counts_.
    assert len(bucketer.compute(x)) == bins
    np.testing.assert_array_equal(bucketer.counts_, bucketer.compute(x))
    np.testing.assert_array_equal(bucketer.counts_, bucketer.fit_compute(x))

    # Data shifted far above the fitted range: no bin is expected to count it.
    np.testing.assert_array_equal(
        np.array([0, 0, 0, 0, 0]), bucketer.compute(x + 100)
    )

    # Data shifted far below the fitted range: all 10 values are expected in
    # the first bin.
    np.testing.assert_array_equal(
        np.array([10, 0, 0, 0, 0]), bucketer.compute(x - 100)
    )

    # Data with repeated values inside the fitted range.
    np.testing.assert_array_equal(
        np.array([3, 0, 2, 1, 0]), bucketer.compute([1, 1, 1, 4, 4, 7])
    )