Example #1
def test_select_percentile_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the percentile heuristic
    X, y = make_regression(n_samples=200,
                           n_features=20,
                           n_informative=5,
                           shuffle=False,
                           random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(f_regression, mode='percentile',
                                   param=25).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
    X_2 = X.copy()
    X_2[:, np.logical_not(support)] = 0
    assert_array_equal(X_2, univariate_filter.inverse_transform(X_r))
    # Check inverse_transform respects dtype
    assert_array_equal(X_2.astype(bool),
                       univariate_filter.inverse_transform(X_r.astype(bool)))
Example #2
def test_check_gcv_mode_error(mode):
    X, y = make_regression(n_samples=5, n_features=2)
    gcv = RidgeCV(gcv_mode=mode)
    with pytest.raises(ValueError, match="Unknown value for 'gcv_mode'"):
        gcv.fit(X, y)
    with pytest.raises(ValueError, match="Unknown value for 'gcv_mode'"):
        _check_gcv_mode(X, mode)
Example #3
def test_check_gcv_mode_choice(sparse, mode, mode_n_greater_than_p,
                               mode_p_greater_than_n):
    X, _ = make_regression(n_samples=5, n_features=2)
    if sparse:
        X = sp.csr_matrix(X)
    assert _check_gcv_mode(X, mode) == mode_n_greater_than_p
    assert _check_gcv_mode(X.T, mode) == mode_p_greater_than_n
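For context, a self-contained sketch using only the public API; it assumes a scikit-learn version matching these tests, where gcv_mode must be one of None, 'auto', 'svd' or 'eigen', and anything else raises the ValueError exercised above:

from sklearn.datasets import make_regression
from sklearn.linear_model import RidgeCV

X, y = make_regression(n_samples=5, n_features=2, random_state=0)
RidgeCV(gcv_mode='svd').fit(X, y)    # force the SVD-based decomposition
RidgeCV(gcv_mode='eigen').fit(X, y)  # force the Gram-matrix eigendecomposition
RidgeCV(gcv_mode='auto').fit(X, y)   # let the shape of X pick the mode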
Example #4
def test_f_regression():
    # Test whether the F test yields meaningful results
    # on a simple simulated regression problem
    X, y = make_regression(n_samples=200,
                           n_features=20,
                           n_informative=5,
                           shuffle=False,
                           random_state=0)

    F, pv = f_regression(X, y)
    assert (F > 0).all()
    assert (pv > 0).all()
    assert (pv < 1).all()
    assert (pv[:5] < 0.05).all()
    assert (pv[5:] > 1.e-4).all()

    # with centering, compare with sparse
    F, pv = f_regression(X, y, center=True)
    F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=True)
    assert_array_almost_equal(F_sparse, F)
    assert_array_almost_equal(pv_sparse, pv)

    # again without centering, compare with sparse
    F, pv = f_regression(X, y, center=False)
    F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=False)
    assert_array_almost_equal(F_sparse, F)
    assert_array_almost_equal(pv_sparse, pv)
Example #5
def _make_sparse_offset_regression(n_samples=100,
                                   n_features=100,
                                   proportion_nonzero=.5,
                                   n_informative=10,
                                   n_targets=1,
                                   bias=13.,
                                   X_offset=30.,
                                   noise=30.,
                                   shuffle=True,
                                   coef=False,
                                   random_state=None):
    X, y, c = make_regression(n_samples=n_samples,
                              n_features=n_features,
                              n_informative=n_informative,
                              n_targets=n_targets,
                              bias=bias,
                              noise=noise,
                              shuffle=shuffle,
                              coef=True,
                              random_state=random_state)
    if n_features == 1:
        c = np.asarray([c])
    X += X_offset
    mask = np.random.RandomState(random_state).binomial(
        1, proportion_nonzero, X.shape) > 0
    removed_X = X.copy()
    X[~mask] = 0.
    removed_X[mask] = 0.
    y -= removed_X.dot(c)
    if n_features == 1:
        c = c[0]
    if coef:
        return X, y, c
    return X, y
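A quick usage sketch of this helper; the argument values are arbitrary:

X, y, coef = _make_sparse_offset_regression(n_samples=20, n_features=5,
                                            proportion_nonzero=.3,
                                            random_state=0, coef=True)
assert X.shape == (20, 5)
assert (X == 0).mean() > .5  # roughly 70% of the entries were zeroed out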
Example #6
def test_csr_preprocess_data():
    # Test output format of _preprocess_data when the input is CSR
    X, y = make_regression()
    X[X < 2.5] = 0.0
    csr = sparse.csr_matrix(X)
    csr_, y, _, _, _ = _preprocess_data(csr, y, True)
    assert csr_.getformat() == 'csr'
Example #7
def test_missing_values_resilience(problem, missing_proportion,
                                   expected_min_score_classification,
                                   expected_min_score_regression):
    # Make sure the estimators can deal with missing values and still yield
    # decent predictions

    rng = np.random.RandomState(0)
    n_samples = 1000
    n_features = 2
    if problem == 'regression':
        X, y = make_regression(n_samples=n_samples,
                               n_features=n_features,
                               n_informative=n_features,
                               random_state=rng)
        gb = HistGradientBoostingRegressor()
        expected_min_score = expected_min_score_regression
    else:
        X, y = make_classification(n_samples=n_samples,
                                   n_features=n_features,
                                   n_informative=n_features,
                                   n_redundant=0,
                                   n_repeated=0,
                                   random_state=rng)
        gb = HistGradientBoostingClassifier()
        expected_min_score = expected_min_score_classification

    mask = rng.binomial(1, missing_proportion, size=X.shape).astype(bool)
    X[mask] = np.nan

    gb.fit(X, y)

    assert gb.score(X, y) > expected_min_score
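The problem, missing_proportion and expected-score arguments come from a pytest parametrization that this listing strips; a plausible reconstruction (the score thresholds are illustrative, not scikit-learn's exact values):

import pytest

@pytest.mark.parametrize('problem', ['classification', 'regression'])
@pytest.mark.parametrize(
    'missing_proportion, expected_min_score_classification, '
    'expected_min_score_regression',
    [(.1, .97, .89), (.5, .79, .5)])
def test_missing_values_resilience(problem, missing_proportion,
                                   expected_min_score_classification,
                                   expected_min_score_regression):
    ...  # body as above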
Example #8
def test_least_absolute_deviation():
    # For coverage only.
    X, y = make_regression(n_samples=500, random_state=0)
    gbdt = HistGradientBoostingRegressor(loss='least_absolute_deviation',
                                         random_state=0)
    gbdt.fit(X, y)
    assert gbdt.score(X, y) > .9
Example #9
    def single_fdr(alpha, n_informative, random_state):
        X, y = make_regression(n_samples=150,
                               n_features=20,
                               n_informative=n_informative,
                               shuffle=False,
                               random_state=random_state,
                               noise=10)

        with warnings.catch_warnings(record=True):
            # Warnings can be raised when no features are selected
            # (low alpha or very noisy data)
            univariate_filter = SelectFdr(f_regression, alpha=alpha)
            X_r = univariate_filter.fit(X, y).transform(X)
            X_r2 = GenericUnivariateSelect(f_regression,
                                           mode='fdr',
                                           param=alpha).fit(X, y).transform(X)

        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        num_false_positives = np.sum(support[n_informative:] == 1)
        num_true_positives = np.sum(support[:n_informative] == 1)

        if num_false_positives == 0:
            return 0.
        false_discovery_rate = (num_false_positives /
                                (num_true_positives + num_false_positives))
        return false_discovery_rate
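In the full test, single_fdr is averaged over many random seeds, since the false discovery rate is only controlled in expectation; a plausible sketch of the outer loop:

for alpha in (.001, .01, .1):
    for n_informative in (1, 5, 10):
        # Average the empirical FDR over many seeds before comparing
        # against the nominal level alpha.
        fdr = np.mean([single_fdr(alpha, n_informative, seed)
                       for seed in range(30)])
        assert fdr <= alpha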
Example #10
def test_mutual_info_regression():
    X, y = make_regression(n_samples=100,
                           n_features=10,
                           n_informative=2,
                           shuffle=False,
                           random_state=0,
                           noise=10)

    # Test in KBest mode.
    univariate_filter = SelectKBest(mutual_info_regression, k=2)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(mutual_info_regression,
                                   mode='k_best',
                                   param=2).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(10)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)

    # Test in Percentile mode.
    univariate_filter = SelectPercentile(mutual_info_regression, percentile=20)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(mutual_info_regression,
                                   mode='percentile',
                                   param=20).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(10)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)
Example #11
def test_permutation_importance_equivalence_array_dataframe(n_jobs):
    # This test checks that the column shuffling logic has the same
    # behavior on both a dataframe and a plain numpy array.
    pd = pytest.importorskip('pandas')

    # regression test to make sure that sequential and parallel calls will
    # output the same results.
    X, y = make_regression(n_samples=100, n_features=5, random_state=0)
    X_df = pd.DataFrame(X)

    # Add a categorical feature that is statistically linked to y:
    binner = KBinsDiscretizer(n_bins=3, encode="ordinal")
    cat_column = binner.fit_transform(y.reshape(-1, 1))

    # Concatenate the extra column to the numpy array: integers will be
    # cast to float values
    X = np.hstack([X, cat_column])
    assert X.dtype.kind == "f"

    # Insert extra column as a non-numpy-native dtype (while keeping backward
    # compat for old pandas versions):
    if hasattr(pd, "Categorical"):
        cat_column = pd.Categorical(cat_column.ravel())
    else:
        cat_column = cat_column.ravel()
    new_col_idx = len(X_df.columns)
    X_df[new_col_idx] = cat_column
    assert X_df[new_col_idx].dtype == cat_column.dtype

    # Stitch an arbitrary index onto the dataframe:
    X_df.index = np.arange(len(X_df)).astype(str)

    rf = RandomForestRegressor(n_estimators=5, max_depth=3, random_state=0)
    rf.fit(X, y)

    n_repeats = 3
    importance_array = permutation_importance(rf,
                                              X,
                                              y,
                                              n_repeats=n_repeats,
                                              random_state=0,
                                              n_jobs=n_jobs)

    # First check that the problem is structured enough and that the model is
    # complex enough to not yield trivial, constant importances:
    imp_min = importance_array['importances'].min()
    imp_max = importance_array['importances'].max()
    assert imp_max - imp_min > 0.3

    # Now check that the importances computed on the dataframe match
    # the values of those computed on the array with the same data.
    importance_dataframe = permutation_importance(rf,
                                                  X_df,
                                                  y,
                                                  n_repeats=n_repeats,
                                                  random_state=0,
                                                  n_jobs=n_jobs)
    assert_allclose(importance_array['importances'],
                    importance_dataframe['importances'])
Example #12
def test_ridge_sag_with_X_fortran():
    # check that Fortran arrays are converted when using the SAG solver
    X, y = make_regression(random_state=42)
    # make X Fortran-ordered, then slice so X and y are not C-ordered
    X = np.asfortranarray(X)
    X = X[::2, :]
    y = y[::2]
    Ridge(solver='sag').fit(X, y)
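The slicing above leaves X neither C- nor Fortran-contiguous, which is what forces the conversion inside the SAG solver; a small numpy demonstration of that effect:

import numpy as np

A = np.asfortranarray(np.random.rand(10, 4))
assert A.flags['F_CONTIGUOUS'] and not A.flags['C_CONTIGUOUS']
B = A[::2, :]  # strided view: neither C- nor Fortran-contiguous
assert not B.flags['C_CONTIGUOUS'] and not B.flags['F_CONTIGUOUS']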
Example #13
def test_multioutput_regression():
    # Test that multi-output regression works as expected
    X, y = make_regression(n_samples=200, n_targets=5)
    mlp = MLPRegressor(solver='lbfgs',
                       hidden_layer_sizes=50,
                       max_iter=200,
                       random_state=1)
    mlp.fit(X, y)
    assert mlp.score(X, y) > 0.9
Example #14
def test_make_regression():
    X, y, c = make_regression(n_samples=100,
                              n_features=10,
                              n_informative=3,
                              effective_rank=5,
                              coef=True,
                              bias=0.0,
                              noise=1.0,
                              random_state=0)

    assert X.shape == (100, 10), "X shape mismatch"
    assert y.shape == (100, ), "y shape mismatch"
    assert c.shape == (10, ), "coef shape mismatch"
    assert sum(c != 0.0) == 3, "Unexpected number of informative features"

    # Test that y ~= np.dot(X, c) + bias + N(0, 1.0).
    assert_almost_equal(np.std(y - np.dot(X, c)), 1.0, decimal=1)

    # Test with small number of features.
    X, y = make_regression(n_samples=100, n_features=1)  # n_informative is clipped to n_features
    assert X.shape == (100, 1)
Example #15
def test_linear_regression_multiple_outcome(random_state=0):
    # Test multiple-outcome linear regressions
    X, y = make_regression(random_state=random_state)

    Y = np.vstack((y, y)).T
    n_features = X.shape[1]

    reg = LinearRegression()
    reg.fit(X, Y)
    assert reg.coef_.shape == (2, n_features)
    Y_pred = reg.predict(X)
    reg.fit(X, y)
    y_pred = reg.predict(X)
    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)
Example #16
def test_invalid_percentile():
    X, y = make_regression(n_samples=10,
                           n_features=20,
                           n_informative=2,
                           shuffle=False,
                           random_state=0)

    with pytest.raises(ValueError):
        SelectPercentile(percentile=-1).fit(X, y)
    with pytest.raises(ValueError):
        SelectPercentile(percentile=101).fit(X, y)
    with pytest.raises(ValueError):
        GenericUnivariateSelect(mode='percentile', param=-1).fit(X, y)
    with pytest.raises(ValueError):
        GenericUnivariateSelect(mode='percentile', param=101).fit(X, y)
Example #17
def test_plot_partial_dependence_fig_deprecated(pyplot):
    # Make sure fig object is correctly used if not None
    X, y = make_regression(n_samples=50, random_state=0)
    clf = LinearRegression()
    clf.fit(X, y)

    fig = pyplot.figure()
    grid_resolution = 25

    msg = ("The fig parameter is deprecated in version 0.22 and will be "
           "removed in version 0.24")
    with pytest.warns(FutureWarning, match=msg):
        plot_partial_dependence(
            clf, X, [0, 1], target=0, grid_resolution=grid_resolution, fig=fig)

    assert pyplot.gcf() is fig
Example #18
def test_ensemble_heterogeneous_estimators_type(Ensemble):
    # check that ensemble will fail during validation if the underlying
    # estimators are not of the same type (i.e. classifier or regressor)
    if issubclass(Ensemble, ClassifierMixin):
        X, y = make_classification(n_samples=10)
        estimators = [('lr', LinearRegression())]
        ensemble_type = 'classifier'
    else:
        X, y = make_regression(n_samples=10)
        estimators = [('lr', LogisticRegression())]
        ensemble_type = 'regressor'
    ensemble = Ensemble(estimators=estimators)

    err_msg = "should be a {}".format(ensemble_type)
    with pytest.raises(ValueError, match=err_msg):
        ensemble.fit(X, y)
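The Ensemble argument is parametrized over the heterogeneous ensemble classes; a plausible reconstruction of the stripped decorator:

import pytest
from sklearn.ensemble import (StackingClassifier, StackingRegressor,
                              VotingClassifier, VotingRegressor)

@pytest.mark.parametrize('Ensemble', [StackingClassifier, StackingRegressor,
                                      VotingClassifier, VotingRegressor])
def test_ensemble_heterogeneous_estimators_type(Ensemble):
    ...  # body as above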
Example #19
def test_sparse_regression():
    # Check regression with sparse input.

    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set."""
        def fit(self, X, y, sample_weight=None):
            """Modification on fit caries data type for later verification."""
            super().fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_regression(n_samples=15,
                                    n_features=50,
                                    n_targets=1,
                                    random_state=42)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [
            csc_matrix, csr_matrix, lil_matrix, coo_matrix, dok_matrix
    ]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_classifier = AdaBoostRegressor(base_estimator=CustomSVR(),
                                              random_state=1).fit(
                                                  X_train_sparse, y_train)

        # Trained on dense format
        dense_classifier = AdaBoostRegressor(
            base_estimator=CustomSVR(), random_state=1).fit(X_train, y_train)

        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_almost_equal(sparse_res, dense_res)

        types = [i.data_type_ for i in sparse_classifier.estimators_]

        assert all([(t == csc_matrix or t == csr_matrix) for t in types])
Example #20
def test_make_regression_multitarget():
    X, y, c = make_regression(n_samples=100,
                              n_features=10,
                              n_informative=3,
                              n_targets=3,
                              coef=True,
                              noise=1.,
                              random_state=0)

    assert X.shape == (100, 10), "X shape mismatch"
    assert y.shape == (100, 3), "y shape mismatch"
    assert c.shape == (10, 3), "coef shape mismatch"
    assert_array_equal(sum(c != 0.0), 3,
                       "Unexpected number of informative features")

    # Test that y ~= np.dot(X, c) + bias + N(0, 1.0)
    assert_almost_equal(np.std(y - np.dot(X, c)), 1.0, decimal=1)
Example #21
def test_select_percentile_regression_full():
    # Test whether the relative univariate feature selection
    # selects all features when '100%' is asked.
    X, y = make_regression(n_samples=200,
                           n_features=20,
                           n_informative=5,
                           shuffle=False,
                           random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=100)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(f_regression, mode='percentile',
                                   param=100).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.ones(20)
    assert_array_equal(support, gtruth)
Example #22
def test_permutation_importance_equivalence_sequential_parallel():
    # regression test to make sure that sequential and parallel calls will
    # output the same results.
    X, y = make_regression(n_samples=500, n_features=10, random_state=0)
    lr = LinearRegression().fit(X, y)

    importance_sequential = permutation_importance(lr,
                                                   X,
                                                   y,
                                                   n_repeats=5,
                                                   random_state=0,
                                                   n_jobs=1)

    # First check that the problem is structured enough and that the model is
    # complex enough to not yield trivial, constant importances:
    imp_min = importance_sequential['importances'].min()
    imp_max = importance_sequential['importances'].max()
    assert imp_max - imp_min > 0.3

    # Then actually check that parallelism does not impact the results,
    # either with shared memory (threading) or with isolated memory via
    # process-based parallelism using the default backend ('loky' or
    # 'multiprocessing') depending on the joblib version:

    # process-based parallelism (by default):
    importance_processes = permutation_importance(lr,
                                                  X,
                                                  y,
                                                  n_repeats=5,
                                                  random_state=0,
                                                  n_jobs=2)
    assert_allclose(importance_processes['importances'],
                    importance_sequential['importances'])

    # thread-based parallelism:
    with parallel_backend("threading"):
        importance_threading = permutation_importance(lr,
                                                      X,
                                                      y,
                                                      n_repeats=5,
                                                      random_state=0,
                                                      n_jobs=2)
    assert_allclose(importance_threading['importances'],
                    importance_sequential['importances'])
Example #23
def test_permutation_importance_linear_regression():
    X, y = make_regression(n_samples=500, n_features=10, random_state=0)

    X = scale(X)
    y = scale(y)

    lr = LinearRegression().fit(X, y)

    # this relationship can be computed in closed form
    expected_importances = 2 * lr.coef_**2
    results = permutation_importance(lr,
                                     X,
                                     y,
                                     n_repeats=50,
                                     scoring='neg_mean_squared_error')
    assert_allclose(expected_importances,
                    results.importances_mean,
                    rtol=1e-1,
                    atol=1e-6)
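Why 2 * lr.coef_**2 is the closed form (a short derivation, assuming the standardized, roughly independent features produced above): permuting column j replaces x_j with an independent copy x'_j of the same distribution, so the squared error grows by E[(beta_j*x_j - beta_j*x'_j)**2] = 2 * beta_j**2 * Var(x_j) = 2 * beta_j**2 after scaling, which is exactly the drop measured by permutation importance under neg_mean_squared_error.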
Example #24
def test_select_fwe_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fwe heuristic
    X, y = make_regression(n_samples=200,
                           n_features=20,
                           n_informative=5,
                           shuffle=False,
                           random_state=0)

    univariate_filter = SelectFwe(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode='fwe',
                                   param=0.01).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support[:5], np.ones((5, ), dtype=bool))
    assert np.sum(support[5:] == 1) < 2
Example #25
def test_preprocess_copy_data_no_checks(is_sparse, to_copy):
    X, y = make_regression()
    X[X < 2.5] = 0.0

    if is_sparse:
        X = sparse.csr_matrix(X)

    X_, y_, _, _, _ = _preprocess_data(X,
                                       y,
                                       True,
                                       copy=to_copy,
                                       check_input=False)

    if to_copy and is_sparse:
        assert not np.may_share_memory(X_.data, X.data)
    elif to_copy:
        assert not np.may_share_memory(X_, X)
    elif is_sparse:
        assert np.may_share_memory(X_.data, X.data)
    else:
        assert np.may_share_memory(X_, X)
Example #26
def test_select_kbest_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the k best heuristic
    X, y = make_regression(n_samples=200,
                           n_features=20,
                           n_informative=5,
                           shuffle=False,
                           random_state=0,
                           noise=10)

    univariate_filter = SelectKBest(f_regression, k=5)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(f_regression, mode='k_best',
                                   param=5).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
Example #27
def test_shuffle():
    # Test that the shuffle parameter affects the training process (it should)
    X, y = make_regression(n_samples=50,
                           n_features=5,
                           n_targets=1,
                           random_state=0)

    # The coefficients will be identical when both models use the same shuffle setting
    for shuffle in [True, False]:
        mlp1 = MLPRegressor(hidden_layer_sizes=1,
                            max_iter=1,
                            batch_size=1,
                            random_state=0,
                            shuffle=shuffle)
        mlp2 = MLPRegressor(hidden_layer_sizes=1,
                            max_iter=1,
                            batch_size=1,
                            random_state=0,
                            shuffle=shuffle)
        mlp1.fit(X, y)
        mlp2.fit(X, y)

        assert np.array_equal(mlp1.coefs_[0], mlp2.coefs_[0])

    # The coefficients will differ when one model shuffles and the other does not
    mlp1 = MLPRegressor(hidden_layer_sizes=1,
                        max_iter=1,
                        batch_size=1,
                        random_state=0,
                        shuffle=True)
    mlp2 = MLPRegressor(hidden_layer_sizes=1,
                        max_iter=1,
                        batch_size=1,
                        random_state=0,
                        shuffle=False)
    mlp1.fit(X, y)
    mlp2.fit(X, y)

    assert not np.array_equal(mlp1.coefs_[0], mlp2.coefs_[0])
Example #28
def test_partial_dependence_helpers(est, method, target_feature):
    # Check that what is returned by _partial_dependence_brute or
    # _partial_dependence_recursion is equivalent to manually setting a target
    # feature to a given value, and computing the average prediction over all
    # samples.
    # This also checks that the brute and recursion methods give the same
    # output.

    X, y = make_regression(random_state=0, n_features=5, n_informative=5)
    # The 'init' estimator for GBDT (here the average prediction) isn't taken
    # into account with the recursion method, for technical reasons. We set
    # the mean to 0 so that this 'bug' doesn't have any effect.
    y = y - y.mean()
    est.fit(X, y)

    # target feature will be set to .5 and then to 123
    features = np.array([target_feature], dtype=np.int32)
    grid = np.array([[.5], [123]])

    if method == 'brute':
        pdp = _partial_dependence_brute(est,
                                        grid,
                                        features,
                                        X,
                                        response_method='auto')
    else:
        pdp = _partial_dependence_recursion(est, grid, features)

    mean_predictions = []
    for val in (.5, 123):
        X_ = X.copy()
        X_[:, target_feature] = val
        mean_predictions.append(est.predict(X_).mean())

    pdp = pdp[0]  # (shape is (1, 2) so make it (2,))

    # allow for greater margin for error with recursion method
    rtol = 1e-1 if method == 'recursion' else 1e-3
    assert np.allclose(pdp, mean_predictions, rtol=rtol)
Example #29
def test_early_stopping_regression(scoring, validation_fraction,
                                   n_iter_no_change, tol):

    max_iter = 200

    X, y = make_regression(n_samples=50, random_state=0)

    gb = HistGradientBoostingRegressor(
        verbose=1,  # just for coverage
        min_samples_leaf=5,  # easier to overfit fast
        scoring=scoring,
        tol=tol,
        validation_fraction=validation_fraction,
        max_iter=max_iter,
        n_iter_no_change=n_iter_no_change,
        random_state=0)
    gb.fit(X, y)

    if n_iter_no_change is not None:
        assert n_iter_no_change <= gb.n_iter_ < max_iter
    else:
        assert gb.n_iter_ == max_iter
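The scoring, validation_fraction, n_iter_no_change and tol arguments come from a stripped pytest parametrization; a plausible reconstruction with illustrative values:

import pytest

@pytest.mark.parametrize(
    'scoring, validation_fraction, n_iter_no_change, tol',
    [('loss', .1, 5, 1e-7),                     # early stop on the loss
     ('neg_mean_absolute_error', .1, 5, 1e-7),  # early stop on a scorer
     (None, None, None, None)])                 # no early stopping
def test_early_stopping_regression(scoring, validation_fraction,
                                   n_iter_no_change, tol):
    ...  # body as above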
Example #30
    def make_missing_value_data(n_samples=int(1e4), seed=0):
        rng = np.random.RandomState(seed)
        X, y = make_regression(n_samples=n_samples,
                               n_features=4,
                               random_state=rng)

        # Pre-bin the data to ensure a deterministic handling by the 2
        # strategies and also make it easier to insert np.nan in a structured
        # way:
        X = KBinsDiscretizer(n_bins=42, encode="ordinal").fit_transform(X)

        # First feature has missing values completely at random:
        rnd_mask = rng.rand(X.shape[0]) > 0.9
        X[rnd_mask, 0] = np.nan

        # Second and third features have missing values for extreme values
        # (censoring missingness):
        low_mask = X[:, 1] == 0
        X[low_mask, 1] = np.nan

        high_mask = X[:, 2] == X[:, 2].max()
        X[high_mask, 2] = np.nan

        # Make the last feature nan pattern very informative:
        y_max = np.percentile(y, 70)
        y_max_mask = y >= y_max
        y[y_max_mask] = y_max
        X[y_max_mask, 3] = np.nan

        # Check that there is at least one missing value in each feature:
        for feature_idx in range(X.shape[1]):
            assert any(np.isnan(X[:, feature_idx]))

        # Let's use a test set to check that the learned decision function is
        # the same as evaluated on unseen data. Otherwise it could just be the
        # case that we find two independent ways to overfit the training set.
        return train_test_split(X, y, random_state=rng)
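A usage sketch for this nested helper (it returns the output of train_test_split directly):

X_train, X_test, y_train, y_test = make_missing_value_data(seed=0)
print(np.isnan(X_train).mean(axis=0))  # per-feature fraction of missing values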