def test_invalid_strategy_option():
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], strategy='invalid-strategy')
    err_msg = (r"Valid options for 'strategy' are "
               r"\('uniform', 'quantile', 'kmeans'\). "
               r"Got strategy='invalid-strategy' instead.")
    with pytest.raises(ValueError, match=err_msg):
        est.fit(X)

def test_invalid_encode_option():
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode='invalid-encode')
    err_msg = (r"Valid options for 'encode' are "
               r"\('onehot', 'onehot-dense', 'ordinal'\). "
               r"Got encode='invalid-encode' instead.")
    with pytest.raises(ValueError, match=err_msg):
        est.fit(X)

@pytest.mark.parametrize('strategy', ['uniform', 'quantile', 'kmeans'])
def test_transform_outside_fit_range(strategy):
    X = np.array([0, 1, 2, 3])[:, None]
    kbd = KBinsDiscretizer(n_bins=4, strategy=strategy, encode='ordinal')
    kbd.fit(X)

    X2 = np.array([-2, 5])[:, None]
    X2t = kbd.transform(X2)
    assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_)
    assert_array_equal(X2t.min(axis=0), [0])

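# Hedged usage sketch (not a test): the assertions above rely on
# out-of-range values being clipped into the closest edge bin. Assumes
# `mrex` mirrors scikit-learn's KBinsDiscretizer API; the helper name
# `_demo_clipping` is hypothetical.
def _demo_clipping():
    import numpy as np
    from mrex.preprocessing import KBinsDiscretizer

    kbd = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='uniform')
    kbd.fit(np.array([[0.], [1.], [2.], [3.]]))
    # -2 falls below the first edge -> bin 0; 5 above the last -> bin 3
    return kbd.transform(np.array([[-2.], [5.]]))  # array([[0.], [3.]])
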
def test_transform_1d_behavior():
    X = np.arange(4)
    est = KBinsDiscretizer(n_bins=2)
    with pytest.raises(ValueError):
        est.fit(X)

    est = KBinsDiscretizer(n_bins=2)
    est.fit(X.reshape(-1, 1))
    with pytest.raises(ValueError):
        est.transform(X)

def test_percentile_numeric_stability():
    X = np.array([0.05, 0.05, 0.95]).reshape(-1, 1)
    bin_edges = np.array([0.05, 0.23, 0.41, 0.59, 0.77, 0.95])
    Xt = np.array([0, 0, 4]).reshape(-1, 1)

    kbd = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
    msg = ("Bins whose width are too small (i.e., <= 1e-8) in feature 0 "
           "are removed. Consider decreasing the number of bins.")
    assert_warns_message(UserWarning, msg, kbd.fit, X)
    assert_array_almost_equal(kbd.bin_edges_[0], bin_edges)
    assert_array_almost_equal(kbd.transform(X), Xt)

def test_fit_transform_n_bins_array(strategy, expected):
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode='ordinal',
                           strategy=strategy).fit(X)
    assert_array_equal(expected, est.transform(X))

    # test the shape of bin_edges_
    n_features = np.array(X).shape[1]
    assert est.bin_edges_.shape == (n_features, )
    for bin_edges, n_bins in zip(est.bin_edges_, est.n_bins_):
        assert bin_edges.shape == (n_bins + 1, )

@pytest.mark.parametrize('strategy', ['uniform', 'quantile', 'kmeans'])
def test_same_min_max(strategy):
    warnings.simplefilter("always")
    X = np.array([[1, -2],
                  [1, -1],
                  [1, 0],
                  [1, 1]])
    est = KBinsDiscretizer(strategy=strategy, n_bins=3, encode='ordinal')
    assert_warns_message(UserWarning,
                         "Feature 0 is constant and will be replaced "
                         "with 0.", est.fit, X)
    assert est.n_bins_[0] == 1

    # replace the feature with zeros
    Xt = est.transform(X)
    assert_array_equal(Xt[:, 0], np.zeros(X.shape[0]))

def test_overwrite():
    X = np.array([0, 1, 2, 3])[:, None]
    X_before = X.copy()

    est = KBinsDiscretizer(n_bins=3, encode="ordinal")
    Xt = est.fit_transform(X)
    assert_array_equal(X, X_before)

    Xt_before = Xt.copy()
    Xinv = est.inverse_transform(Xt)
    assert_array_equal(Xt, Xt_before)
    assert_array_equal(Xinv, np.array([[0.5], [1.5], [2.5], [2.5]]))

def test_redundant_bins(strategy, expected_bin_edges):
    X = [[0], [0], [0], [0], [3], [3]]
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy)
    msg = ("Bins whose width are too small (i.e., <= 1e-8) in feature 0 "
           "are removed. Consider decreasing the number of bins.")
    assert_warns_message(UserWarning, msg, kbd.fit, X)
    assert_array_almost_equal(kbd.bin_edges_[0], expected_bin_edges)

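# Hedged sketch (not a test): with only two distinct values, requesting
# three bins produces near-duplicate edges; the discretizer drops bins
# narrower than 1e-8, warns, and leaves n_bins_ smaller than requested.
# Assumes `mrex` mirrors scikit-learn's behavior; `_demo_redundant_bins`
# is a hypothetical helper name.
def _demo_redundant_bins():
    import warnings
    from mrex.preprocessing import KBinsDiscretizer

    X = [[0], [0], [0], [0], [3], [3]]
    kbd = KBinsDiscretizer(n_bins=3, strategy='quantile', encode='ordinal')
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        kbd.fit(X)
    # One UserWarning about removed bins; fewer than 3 bins survive.
    return kbd.n_bins_, [str(w.message) for w in caught]
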
def test_numeric_stability(i):
    X_init = np.array([2., 4., 6., 8., 10.]).reshape(-1, 1)
    Xt_expected = np.array([0, 0, 1, 1, 1]).reshape(-1, 1)

    # Test up to discretizing nano units
    X = X_init / 10**i
    Xt = KBinsDiscretizer(n_bins=2, encode='ordinal').fit_transform(X)
    assert_array_equal(Xt_expected, Xt)

def test_invalid_n_bins():
    est = KBinsDiscretizer(n_bins=1)
    err_msg = ("KBinsDiscretizer received an invalid "
               "number of bins. Received 1, expected at least 2.")
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)

    est = KBinsDiscretizer(n_bins=1.1)
    err_msg = ("KBinsDiscretizer received an invalid "
               "n_bins type. Received float, expected int.")
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)

def test_encode_options():
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='ordinal').fit(X)
    Xt_1 = est.transform(X)
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='onehot-dense').fit(X)
    Xt_2 = est.transform(X)
    assert not sp.issparse(Xt_2)
    assert_array_equal(OneHotEncoder(
        categories=[np.arange(i) for i in [2, 3, 3, 3]],
        sparse=False).fit_transform(Xt_1), Xt_2)
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='onehot').fit(X)
    Xt_3 = est.transform(X)
    assert sp.issparse(Xt_3)
    assert_array_equal(OneHotEncoder(
        categories=[np.arange(i) for i in [2, 3, 3, 3]],
        sparse=True).fit_transform(Xt_1).toarray(),
        Xt_3.toarray())

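# Hedged sketch (not a test): the three `encode` options on a toy column.
# 'ordinal' yields bin ids, 'onehot-dense' a dense indicator matrix, and
# 'onehot' the same matrix in sparse form. Assumes `mrex` mirrors
# scikit-learn's API; `_demo_encode_options` is a hypothetical helper.
def _demo_encode_options():
    import numpy as np
    from mrex.preprocessing import KBinsDiscretizer

    X = np.array([[0.], [1.], [2.], [3.]])
    ordinal = KBinsDiscretizer(n_bins=2, encode='ordinal').fit_transform(X)
    dense = KBinsDiscretizer(n_bins=2, encode='onehot-dense').fit_transform(X)
    sparse = KBinsDiscretizer(n_bins=2, encode='onehot').fit_transform(X)
    # ordinal: [[0], [0], [1], [1]]
    # dense:   [[1, 0], [1, 0], [0, 1], [0, 1]]
    return ordinal, dense, sparse.toarray()
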
def test_nonuniform_strategies(strategy, expected_2bins, expected_3bins,
                               expected_5bins):
    X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1)

    # with 2 bins
    est = KBinsDiscretizer(n_bins=2, strategy=strategy, encode='ordinal')
    Xt = est.fit_transform(X)
    assert_array_equal(expected_2bins, Xt.ravel())

    # with 3 bins
    est = KBinsDiscretizer(n_bins=3, strategy=strategy, encode='ordinal')
    Xt = est.fit_transform(X)
    assert_array_equal(expected_3bins, Xt.ravel())

    # with 5 bins
    est = KBinsDiscretizer(n_bins=5, strategy=strategy, encode='ordinal')
    Xt = est.fit_transform(X)
    assert_array_equal(expected_5bins, Xt.ravel())

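# Hedged sketch (not a test): how the strategies place edges on skewed
# data. 'uniform' spaces edges evenly over the range, 'quantile' balances
# the number of samples per bin, and 'kmeans' puts edges between 1D
# k-means cluster centers. Assumes `mrex` mirrors scikit-learn's API;
# `_demo_strategies` is a hypothetical helper name.
def _demo_strategies():
    import numpy as np
    from mrex.preprocessing import KBinsDiscretizer

    X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1)
    edges = {}
    for strategy in ('uniform', 'quantile', 'kmeans'):
        est = KBinsDiscretizer(n_bins=2, strategy=strategy,
                               encode='ordinal').fit(X)
        edges[strategy] = est.bin_edges_[0]
    # e.g. uniform -> [0, 5, 10]; quantile -> [0, 2.5, 10]
    return edges
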
def make_missing_value_data(n_samples=int(1e4), seed=0):
    rng = np.random.RandomState(seed)
    X, y = make_regression(n_samples=n_samples, n_features=4,
                           random_state=rng)

    # Pre-bin the data to ensure a deterministic handling by the 2
    # strategies and also make it easier to insert np.nan in a structured
    # way:
    X = KBinsDiscretizer(n_bins=42, encode="ordinal").fit_transform(X)

    # First feature has missing values completely at random:
    rnd_mask = rng.rand(X.shape[0]) > 0.9
    X[rnd_mask, 0] = np.nan

    # Second and third features have missing values for extreme values
    # (censoring missingness):
    low_mask = X[:, 1] == 0
    X[low_mask, 1] = np.nan

    high_mask = X[:, 2] == X[:, 2].max()
    X[high_mask, 2] = np.nan

    # Make the last feature nan pattern very informative:
    y_max = np.percentile(y, 70)
    y_max_mask = y >= y_max
    y[y_max_mask] = y_max
    X[y_max_mask, 3] = np.nan

    # Check that there is at least one missing value in each feature:
    for feature_idx in range(X.shape[1]):
        assert any(np.isnan(X[:, feature_idx]))

    # Let's use a test set to check that the learned decision function is
    # the same as evaluated on unseen data. Otherwise it could just be the
    # case that we find two independent ways to overfit the training set.
    return train_test_split(X, y, random_state=rng)

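# Hedged usage sketch (not a test): the split returned above is intended
# for estimators that handle np.nan natively. The import paths below are
# an assumption: they mirror scikit-learn's experimental API of the same
# era, transplanted onto `mrex`; `_demo_missing_values` is hypothetical.
def _demo_missing_values():
    from mrex.experimental import enable_hist_gradient_boosting  # noqa
    from mrex.ensemble import HistGradientBoostingRegressor

    X_train, X_test, y_train, y_test = make_missing_value_data(seed=0)
    gbdt = HistGradientBoostingRegressor(random_state=0).fit(X_train, y_train)
    # Score on held-out data so we measure the learned function, not overfit.
    return gbdt.score(X_test, y_test)
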
if ds_cnt == 0:
    ax.set_title("Input data", size=14)

xx, yy = np.meshgrid(
    np.linspace(X[:, 0].min(), X[:, 0].max(), 300),
    np.linspace(X[:, 1].min(), X[:, 1].max(), 300))
grid = np.c_[xx.ravel(), yy.ravel()]

ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())

i += 1

# transform the dataset with KBinsDiscretizer
for strategy in strategies:
    enc = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy=strategy)
    enc.fit(X)
    grid_encoded = enc.transform(grid)

    ax = plt.subplot(len(X_list), len(strategies) + 1, i)

    # horizontal stripes
    horizontal = grid_encoded[:, 0].reshape(xx.shape)
    ax.contourf(xx, yy, horizontal, alpha=.5)
    # vertical stripes
    vertical = grid_encoded[:, 1].reshape(xx.shape)
    ax.contourf(xx, yy, vertical, alpha=.5)

    ax.scatter(X[:, 0], X[:, 1], edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())

def get_name(estimator):
    name = estimator.__class__.__name__
    if name == 'Pipeline':
        name = [get_name(est[1]) for est in estimator.steps]
        name = ' + '.join(name)
    return name


# list of (estimator, param_grid), where param_grid is used in GridSearchCV
classifiers = [
    (LogisticRegression(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
    (LinearSVC(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
    (make_pipeline(
        KBinsDiscretizer(encode='onehot'),
        LogisticRegression(random_state=0)), {
            'kbinsdiscretizer__n_bins': np.arange(2, 10),
            'logisticregression__C': np.logspace(-2, 7, 10),
        }),
    (make_pipeline(
        KBinsDiscretizer(encode='onehot'), LinearSVC(random_state=0)), {
            'kbinsdiscretizer__n_bins': np.arange(2, 10),
            'linearsvc__C': np.logspace(-2, 7, 10),
        }),
    (GradientBoostingClassifier(n_estimators=50, random_state=0), {
        'learning_rate': np.logspace(-4, 0, 10)
    }),
    (SVC(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
]

import matplotlib.pyplot as plt
import numpy as np

from mrex.linear_model import LinearRegression
from mrex.preprocessing import KBinsDiscretizer
from mrex.tree import DecisionTreeRegressor

print(__doc__)

# construct the dataset
rnd = np.random.RandomState(42)
X = rnd.uniform(-3, 3, size=100)
y = np.sin(X) + rnd.normal(size=len(X)) / 3
X = X.reshape(-1, 1)

# transform the dataset with KBinsDiscretizer
enc = KBinsDiscretizer(n_bins=10, encode='onehot')
X_binned = enc.fit_transform(X)

# predict with original dataset
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize=(10, 4))
line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)
reg = LinearRegression().fit(X, y)
ax1.plot(line, reg.predict(line), linewidth=2, color='green',
         label="linear regression")
reg = DecisionTreeRegressor(min_samples_split=3, random_state=0).fit(X, y)
ax1.plot(line, reg.predict(line), linewidth=2, color='red',
         label="decision tree")

def test_invalid_n_features():
    est = KBinsDiscretizer(n_bins=3).fit(X)
    bad_X = np.arange(25).reshape(5, -1)
    err_msg = "Incorrect number of features. Expecting 4, received 5"
    with pytest.raises(ValueError, match=err_msg):
        est.transform(bad_X)

def test_invalid_n_bins_array():
    # Bad shape
    n_bins = np.full((2, 4), 2.)
    est = KBinsDiscretizer(n_bins=n_bins)
    err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)

    # Incorrect number of features
    n_bins = [1, 2, 2]
    est = KBinsDiscretizer(n_bins=n_bins)
    err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)

    # Bad bin values
    n_bins = [1, 2, 2, 1]
    est = KBinsDiscretizer(n_bins=n_bins)
    err_msg = ("KBinsDiscretizer received an invalid number of bins "
               "at indices 0, 3. Number of bins must be at least 2, "
               "and must be an int.")
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)

    # Float bin values
    n_bins = [2.1, 2, 2.1, 2]
    est = KBinsDiscretizer(n_bins=n_bins)
    err_msg = ("KBinsDiscretizer received an invalid number of bins "
               "at indices 0, 2. Number of bins must be at least 2, "
               "and must be an int.")
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)

def test_valid_n_bins():
    KBinsDiscretizer(n_bins=2).fit_transform(X)
    KBinsDiscretizer(n_bins=np.array([2])[0]).fit_transform(X)
    assert KBinsDiscretizer(n_bins=2).fit(X).n_bins_.dtype == np.dtype(np.int)

def test_fit_transform(strategy, expected):
    est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy=strategy)
    est.fit(X)
    assert_array_equal(expected, est.transform(X))

def test_inverse_transform(strategy, encode, expected_inv):
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
    Xt = kbd.fit_transform(X)
    Xinv = kbd.inverse_transform(Xt)
    assert_array_almost_equal(expected_inv, Xinv)

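# Hedged sketch (not a test): inverse_transform is lossy. It maps each bin
# id back to the center of its bin, not to the original value. Assumes
# `mrex` mirrors scikit-learn's API; `_demo_inverse_transform` is a
# hypothetical helper name.
def _demo_inverse_transform():
    import numpy as np
    from mrex.preprocessing import KBinsDiscretizer

    X = np.array([[0.], [1.], [2.], [3.]])
    kbd = KBinsDiscretizer(n_bins=3, strategy='uniform', encode='ordinal')
    Xt = kbd.fit_transform(X)
    # uniform edges are [0, 1, 2, 3]; bin centers are [0.5, 1.5, 2.5]
    return kbd.inverse_transform(Xt)  # [[0.5], [1.5], [2.5], [2.5]]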