Example #1
def test_boston_dataset(n_bins):
    X, y = load_boston(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    mapper = _BinMapper(n_bins=n_bins, random_state=42)
    X_train_binned = mapper.fit_transform(X_train)

    # Initialize gradients and hessians to those of the least squares loss
    gradients = -y_train.astype(G_H_DTYPE)
    hessians = np.ones(1, dtype=G_H_DTYPE)

    min_samples_leaf = 8
    max_leaf_nodes = 31
    grower = TreeGrower(X_train_binned,
                        gradients,
                        hessians,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=max_leaf_nodes,
                        n_bins=n_bins,
                        n_bins_non_missing=mapper.n_bins_non_missing_)
    grower.grow()

    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    assert r2_score(y_train, predictor.predict(X_train)) > 0.85
    assert r2_score(y_test, predictor.predict(X_test)) > 0.70
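
A sketch of the imports and parametrization the snippet above assumes; the private `_hist_gradient_boosting` paths are an assumption, mirroring scikit-learn's internal layout under the `sklearn_lib` naming used in this file.

import numpy as np
import pytest

from sklearn_lib.datasets import load_boston
from sklearn_lib.metrics import r2_score
from sklearn_lib.model_selection import train_test_split
# Assumed private module paths, mirroring scikit-learn's internal package layout:
from sklearn_lib.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn_lib.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn_lib.ensemble._hist_gradient_boosting.common import G_H_DTYPE

# n_bins is expected to come from pytest parametrization, e.g.:
# @pytest.mark.parametrize('n_bins', [255])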
Example #2
def test_permutation_importance_correlated_feature_regression(n_jobs):
    # Make sure that a feature highly correlated with the target gets a
    # higher importance
    rng = np.random.RandomState(42)
    n_repeats = 5

    X, y = load_boston(return_X_y=True)
    y_with_little_noise = (
        y + rng.normal(scale=0.001, size=y.shape[0])
    ).reshape(-1, 1)

    X = np.hstack([X, y_with_little_noise])

    clf = RandomForestRegressor(n_estimators=10, random_state=42)
    clf.fit(X, y)

    result = permutation_importance(clf,
                                    X,
                                    y,
                                    n_repeats=n_repeats,
                                    random_state=rng,
                                    n_jobs=n_jobs)

    assert result.importances.shape == (X.shape[1], n_repeats)

    # the feature correlated with y was added as the last column and should
    # have the highest importance
    assert np.all(result.importances_mean[-1] > result.importances_mean[:-1])
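
Assumed imports and parametrization for the snippet above; `permutation_importance` sits in the inspection module in scikit-learn, and the path below carries that over to the `sklearn_lib` naming.

import numpy as np
import pytest

from sklearn_lib.datasets import load_boston
from sklearn_lib.ensemble import RandomForestRegressor
from sklearn_lib.inspection import permutation_importance

# n_jobs is expected to come from pytest parametrization, e.g.:
# @pytest.mark.parametrize('n_jobs', [1, 2])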
Example #3
def test_load_boston():
    res = load_boston()
    assert res.data.shape == (506, 13)
    assert res.target.size == 506
    assert res.feature_names.size == 13
    assert res.DESCR
    assert os.path.exists(res.filename)

    # test return_X_y option
    check_return_X_y(res, partial(load_boston))
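
Assumed imports for the snippet above. `check_return_X_y` is a shared helper from the datasets test suite whose module path is not shown in this excerpt, so it is left out of the sketch.

import os
from functools import partial

from sklearn_lib.datasets import load_boston
# check_return_X_y: shared datasets test helper; exact module path not shown here.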
Example #4
def test_score_sample_weight():

    rng = np.random.RandomState(0)

    # test both ClassifierMixin and RegressorMixin
    estimators = [
        DecisionTreeClassifier(max_depth=2),
        DecisionTreeRegressor(max_depth=2)
    ]
    sets = [datasets.load_iris(), datasets.load_boston()]

    for est, ds in zip(estimators, sets):
        est.fit(ds.data, ds.target)
        # generate random sample weights
        sample_weight = rng.randint(1, 10, size=len(ds.target))
        # check that the scores with and without sample weights differ
        score_unweighted = est.score(ds.data, ds.target)
        score_weighted = est.score(ds.data, ds.target,
                                   sample_weight=sample_weight)
        assert score_unweighted != score_weighted, (
            "Unweighted and weighted scores are unexpectedly equal")
Example #5
def test_warm_start_convergence_with_regularizer_decrement():
    X, y = load_boston(return_X_y=True)

    # Train a model to converge on a lightly regularized problem
    final_alpha = 1e-5
    low_reg_model = ElasticNet(alpha=final_alpha).fit(X, y)

    # Fit a new model on a more regularized version of the same problem.
    # Fitting with high regularization is easier, so it should converge
    # faster in general.
    high_reg_model = ElasticNet(alpha=final_alpha * 10).fit(X, y)
    assert low_reg_model.n_iter_ > high_reg_model.n_iter_

    # Fit the original, less regularized version of the problem again, but
    # warm-start from the solution of the highly regularized variant as a
    # better starting point. This should also converge faster than the
    # original model that starts from zero.
    warm_low_reg_model = deepcopy(high_reg_model)
    warm_low_reg_model.set_params(warm_start=True, alpha=final_alpha)
    warm_low_reg_model.fit(X, y)
    assert low_reg_model.n_iter_ > warm_low_reg_model.n_iter_
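
Assumed imports for the snippet above:

from copy import deepcopy

from sklearn_lib.datasets import load_boston
from sklearn_lib.linear_model import ElasticNet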
Example #6
def test_iterative_imputer_catch_warning():
    # check that no RuntimeWarning (division by zero when a feature is
    # constant in the dataset) leaks out of the imputer
    X, y = load_boston(return_X_y=True)
    n_samples, n_features = X.shape

    # simulate a feature that contains only one category during fit
    X[:, 3] = 1

    # add some missing values
    rng = np.random.RandomState(0)
    missing_rate = 0.15
    for feat in range(n_features):
        sample_idx = rng.choice(
            np.arange(n_samples), size=int(n_samples * missing_rate),
            replace=False
        )
        X[sample_idx, feat] = np.nan

    imputer = IterativeImputer(n_nearest_features=5, sample_posterior=True)
    with pytest.warns(None) as record:
        X_fill = imputer.fit_transform(X, y)
    assert not record.list
    assert not np.any(np.isnan(X_fill))
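
Assumed imports for the snippet above; note that IterativeImputer is experimental in scikit-learn and must be enabled explicitly before it can be imported.

import numpy as np
import pytest

from sklearn_lib.datasets import load_boston
# Enabling the experimental API is required before importing IterativeImputer.
from sklearn_lib.experimental import enable_iterative_imputer  # noqa: F401
from sklearn_lib.impute import IterativeImputer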
Example #7
import pytest

from sklearn_lib.tree import DecisionTreeRegressor
from sklearn_lib.model_selection import GridSearchCV
from sklearn_lib import datasets
from sklearn_lib.model_selection import cross_val_score, train_test_split
from sklearn_lib.datasets import make_multilabel_classification
from sklearn_lib.svm import SVC
from sklearn_lib.multiclass import OneVsRestClassifier
from sklearn_lib.neighbors import KNeighborsClassifier
from sklearn_lib.base import BaseEstimator, ClassifierMixin, clone
from sklearn_lib.dummy import DummyRegressor
# LogisticRegression and pytest are referenced by the parametrize block below.
from sklearn_lib.linear_model import LogisticRegression

# Load datasets
iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target

X_r, y_r = datasets.load_boston(return_X_y=True)


@pytest.mark.parametrize(
    "params, err_msg",
    [({
        'estimators': []
    }, "Invalid 'estimators' attribute, 'estimators' should be a list of"),
     ({
         'estimators': [('lr', LogisticRegression())],
         'voting': 'error'
     }, r"Voting must be 'soft' or 'hard'; got \(voting='error'\)"),
     ({
         'estimators': [('lr', LogisticRegression())],
         'weights': [1, 2]
     }, "Number of `estimators` and weights must be equal")])
Example #8
ACTIVATION_TYPES = ["identity", "logistic", "tanh", "relu"]

X_digits, y_digits = load_digits(n_class=3, return_X_y=True)

X_digits_multi = MinMaxScaler().fit_transform(X_digits[:200])
y_digits_multi = y_digits[:200]

X_digits, y_digits = load_digits(n_class=2, return_X_y=True)

X_digits_binary = MinMaxScaler().fit_transform(X_digits[:200])
y_digits_binary = y_digits[:200]

classification_datasets = [(X_digits_multi, y_digits_multi),
                           (X_digits_binary, y_digits_binary)]

boston = load_boston()

Xboston = StandardScaler().fit_transform(boston.data)[:200]
yboston = boston.target[:200]

regression_datasets = [(Xboston, yboston)]

iris = load_iris()

X_iris = iris.data
y_iris = iris.target
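
The module-level setup above assumes roughly these imports (standard scikit-learn locations, under the `sklearn_lib` naming):

from sklearn_lib.datasets import load_boston, load_digits, load_iris
from sklearn_lib.preprocessing import MinMaxScaler, StandardScaler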


def test_alpha():
    # Test that larger alpha yields weights closer to zero
    X = X_digits_binary[:100]
from sklearn_lib.utils._testing import ignore_warnings


# TODO: Remove when https://github.com/numpy/numpy/issues/14397 is resolved
pytestmark = pytest.mark.filterwarnings(
    "ignore:In future, it will be an error for 'np.bool_':DeprecationWarning:"
    "matplotlib.*")


# toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y = [-1, -1, -1, 1, 1, 1]
sample_weight = [1, 1, 1, 2, 2, 2]

# also load the boston dataset
boston = datasets.load_boston()

# also load the iris dataset
iris = datasets.load_iris()


@ignore_warnings(category=FutureWarning)
def test_partial_dependence_classifier():
    # Test partial dependence for classifier
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(X, y)

    pdp, axes = partial_dependence(clf, [0], X=X, grid_resolution=5)

    # only 4 grid points instead of 5 because only 4 unique X[:,0] vals
    assert pdp.shape == (1, 4)
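
Assumed context for the partial dependence fragment above: the call signature `partial_dependence(clf, [0], X=X, grid_resolution=5)` returning `(pdp, axes)` matches the old, deprecated ensemble-level API rather than the newer inspection API, so the import path below is an assumption.

import pytest

from sklearn_lib import datasets
from sklearn_lib.ensemble import GradientBoostingClassifier
# Assumed: the deprecated ensemble-level partial_dependence helper that
# returns (pdp, axes) and accepts X and grid_resolution keyword arguments.
from sklearn_lib.ensemble.partial_dependence import partial_dependence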
def boston():
    return load_boston()