Example #1
import math

import numpy as np

from sklearn_lib.linear_model._base import make_dataset
from sklearn_lib.linear_model._logistic import _multinomial_loss_grad

from sklearn_lib.utils.fixes import logsumexp
from sklearn_lib.utils.extmath import row_norms
from sklearn_lib.utils._testing import assert_almost_equal
from sklearn_lib.utils._testing import assert_array_almost_equal
from sklearn_lib.utils._testing import assert_allclose
from sklearn_lib.utils._testing import assert_raise_message
from sklearn_lib.utils import compute_class_weight
from sklearn_lib.utils import check_random_state
from sklearn_lib.preprocessing import LabelEncoder, LabelBinarizer
from sklearn_lib.datasets import make_blobs, load_iris, make_classification
from sklearn_lib.base import clone

iris = load_iris()


# Derivative of the log loss; used for SAG classification.
def log_dloss(p, y):
    z = p * y
    # approximately equal and saves the computation of the log
    if z > 18.0:
        return math.exp(-z) * -y
    if z < -18.0:
        return -y
    return -y / (math.exp(z) + 1.0)
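

# Numeric illustration (not in the original file): for large z, the
# shortcut exp(-z) used above agrees with the exact 1 / (exp(z) + 1)
# far below float precision, which justifies skipping the division.
assert abs(math.exp(-20.0) - 1.0 / (math.exp(20.0) + 1.0)) < 1e-16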


def log_loss(p, y):
    return np.mean(np.log(1. + np.exp(-y * p)))
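

# Hedged sanity check (not in the original file): log_dloss should match a
# central finite-difference derivative of log_loss in the prediction p.
# The sample values p, y and the step eps are arbitrary illustrative picks.
p, y, eps = 0.5, 1.0, 1e-6
numeric = (log_loss(p + eps, y) - log_loss(p - eps, y)) / (2 * eps)
assert abs(log_dloss(p, y) - numeric) < 1e-6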
Example #2
from sklearn_lib.datasets import load_iris


def test_bunch_dir():
    # check that dir (important for autocomplete) shows attributes
    data = load_iris()
    assert "data" in dir(data)
Example #3
from sklearn_lib import datasets
from sklearn_lib.metrics import silhouette_score


def test_non_numpy_labels():
    dataset = datasets.load_iris()
    X = dataset.data
    y = dataset.target
    assert silhouette_score(list(X), list(y)) == silhouette_score(X, y)
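

# Illustrative follow-up (not from the source): whatever the container
# type, the silhouette score is bounded in [-1, 1].
def test_silhouette_score_bounds():
    dataset = datasets.load_iris()
    score = silhouette_score(dataset.data, dataset.target)
    assert -1.0 <= score <= 1.0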
Example #4
import pytest

from sklearn_lib.linear_model import LogisticRegression
from sklearn_lib.ensemble import RandomForestRegressor
from sklearn_lib.ensemble import VotingClassifier, VotingRegressor
from sklearn_lib.tree import DecisionTreeClassifier
from sklearn_lib.tree import DecisionTreeRegressor
from sklearn_lib.model_selection import GridSearchCV
from sklearn_lib import datasets
from sklearn_lib.model_selection import cross_val_score, train_test_split
from sklearn_lib.datasets import make_multilabel_classification
from sklearn_lib.svm import SVC
from sklearn_lib.multiclass import OneVsRestClassifier
from sklearn_lib.neighbors import KNeighborsClassifier
from sklearn_lib.base import BaseEstimator, ClassifierMixin, clone
from sklearn_lib.dummy import DummyRegressor

# Load datasets
iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target

# Note: load_boston is deprecated upstream (removed in scikit-learn 1.2);
# kept here as in the original snippet.
X_r, y_r = datasets.load_boston(return_X_y=True)


@pytest.mark.parametrize(
    "params, err_msg",
    [({
        'estimators': []
    }, "Invalid 'estimators' attribute, 'estimators' should be a list of"),
     ({
         'estimators': [('lr', LogisticRegression())],
         'voting': 'error'
     }, r"Voting must be 'soft' or 'hard'; got \(voting='error'\)"),
     ({
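# Illustrative sketch (not the original test body, which is truncated
# above): the parametrization feeds invalid constructor arguments to
# VotingClassifier and checks that fit raises the matching message.
eclf = VotingClassifier(estimators=[])
with pytest.raises(ValueError, match="Invalid 'estimators' attribute"):
    eclf.fit(X, y)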
Example #5
import numpy as np
from scipy import sparse

from sklearn_lib.datasets import load_iris
from sklearn_lib.feature_selection import RFECV
from sklearn_lib.metrics import get_scorer, make_scorer, zero_one_loss
from sklearn_lib.svm import SVC
from sklearn_lib.utils import check_random_state
from sklearn_lib.utils._testing import assert_array_equal, ignore_warnings


def test_rfecv():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)  # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert len(rfecv.grid_scores_) == X.shape[1]
    assert len(rfecv.ranking_) == X.shape[1]
    X_r = rfecv.transform(X)

    # All the noisy variables were filtered out
    assert_array_equal(X_r, iris.data)

    # same in sparse
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Test using a customized loss function
    scoring = make_scorer(zero_one_loss, greater_is_better=False)
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=scoring)
    ignore_warnings(rfecv.fit)(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test using a scorer
    scorer = get_scorer('accuracy')
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=scorer)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test with a constant scorer: every grid score should be 1.0
    def test_scorer(estimator, X, y):
        return 1.0

    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=test_scorer)
    rfecv.fit(X, y)
    assert_array_equal(rfecv.grid_scores_, np.ones(len(rfecv.grid_scores_)))
    # In the event of cross validation score ties, the expected behavior of
    # RFECV is to return the FEWEST features that maximize the CV score.
    # Because test_scorer always returns 1.0 in this example, RFECV should
    # reduce the dimensionality to a single feature (i.e. n_features_ = 1)
    assert rfecv.n_features_ == 1

    # Same as the first two tests, but with step=2: from 10 features the
    # candidate subset sizes are 1, 2, 4, 6, 8 and 10, hence 6 grid scores.
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=2)
    rfecv.fit(X, y)
    assert len(rfecv.grid_scores_) == 6
    assert len(rfecv.ranking_) == X.shape[1]
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=2)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Verify that a fractional step (0 < step < 1) does not blow up.
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=0.2)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)
Example #6
import pytest

from sklearn_lib.datasets import load_iris


@pytest.fixture
def data():
    # pytest fixture providing the iris data as an (X, y) tuple
    return load_iris(return_X_y=True)
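

# Hedged sketch (not from the source) of how pytest consumes the fixture:
# a test that names `data` as a parameter receives its return value.
def test_data_shapes(data):
    X, y = data
    assert X.shape[0] == y.shape[0]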
Example #7
import pytest

from sklearn_lib.datasets import load_diabetes, load_iris
from sklearn_lib.ensemble import RandomForestClassifier
from sklearn_lib.linear_model import LogisticRegression
from sklearn_lib.preprocessing import scale
from sklearn_lib.svm import LinearSVC

from sklearn_lib.ensemble import StackingClassifier
from sklearn_lib.ensemble import StackingRegressor

from sklearn_lib.model_selection import train_test_split
from sklearn_lib.model_selection import StratifiedKFold
from sklearn_lib.model_selection import KFold

from sklearn_lib.utils._testing import assert_allclose
from sklearn_lib.utils._testing import assert_allclose_dense_sparse
from sklearn_lib.utils._testing import ignore_warnings
from sklearn_lib.utils.estimator_checks import check_estimator
from sklearn_lib.utils.estimator_checks import check_no_attributes_set_in_init

X_diabetes, y_diabetes = load_diabetes(return_X_y=True)
X_iris, y_iris = load_iris(return_X_y=True)


@pytest.mark.parametrize(
    "cv", [3, StratifiedKFold(n_splits=3, shuffle=True, random_state=42)])
@pytest.mark.parametrize("final_estimator",
                         [None, RandomForestClassifier(random_state=42)])
@pytest.mark.parametrize("passthrough", [False, True])
def test_stacking_classifier_iris(cv, final_estimator, passthrough):
    # Pre-scale the data to avoid convergence warnings without wrapping the
    # estimators in a pipeline, which would complicate the later assertions.
    X_train, X_test, y_train, y_test = train_test_split(scale(X_iris),
                                                        y_iris,
                                                        stratify=y_iris,
                                                        random_state=42)
    estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
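    # Hedged continuation (the original body is truncated here): fit the
    # stacking classifier and score it; the 0.8 threshold is an
    # illustrative choice, not taken from the source.
    clf = StackingClassifier(estimators=estimators,
                             final_estimator=final_estimator,
                             cv=cv,
                             passthrough=passthrough)
    clf.fit(X_train, y_train)
    assert clf.score(X_test, y_test) > 0.8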
Example #8
# License: BSD 3 clause

import itertools

import numpy as np

from sklearn_lib.utils._testing import assert_array_almost_equal
from sklearn_lib.utils._testing import assert_raise_message
from sklearn_lib.utils._testing import assert_warns_message

from sklearn_lib import datasets
from sklearn_lib.covariance import empirical_covariance, MinCovDet
from sklearn_lib.covariance import fast_mcd

X = datasets.load_iris().data
X_1d = X[:, 0]
n_samples, n_features = X.shape


def test_mcd():
    # Tests the FastMCD algorithm implementation.
    # launch_mcd_on_dataset is a helper defined elsewhere in the original
    # test module (not included in this snippet).
    # Small data set
    # test without outliers (random independent normal data)
    launch_mcd_on_dataset(100, 5, 0, 0.01, 0.1, 80)
    # test with a contaminated data set (medium contamination)
    launch_mcd_on_dataset(100, 5, 20, 0.01, 0.01, 70)
    # test with a contaminated data set (strong contamination)
    launch_mcd_on_dataset(100, 5, 40, 0.1, 0.1, 50)

    # Medium data set
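    # Hedged continuation (the original body is truncated here): a direct
    # MinCovDet sanity check on contaminated data; the sizes, contamination
    # level and assertion are illustrative, not taken from the source.
    rng = np.random.RandomState(0)
    data_clean = rng.randn(80, 5)
    data_outliers = rng.uniform(low=5, high=10, size=(20, 5))
    data_mixed = np.vstack([data_clean, data_outliers])
    mcd = MinCovDet(random_state=0).fit(data_mixed)
    # The robust location should sit far closer to the clean mean (zero)
    # than the contaminated empirical mean does.
    assert np.linalg.norm(mcd.location_) < np.linalg.norm(data_mixed.mean(axis=0))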