Пример #1
0
def test_bagging_classifier_with_missing_inputs():
    # Check that BaggingClassifier can accept X with missing/infinite data
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        [2, np.NINF, 6],
    ])
    y = np.array([3, 6, 6, 6, 6])
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(FunctionTransformer(replace), classifier)
    pipeline.fit(X, y).predict(X)
    bagging_classifier = BaggingClassifier(pipeline)
    bagging_classifier.fit(X, y)
    y_hat = bagging_classifier.predict(X)
    assert y.shape == y_hat.shape
    bagging_classifier.predict_log_proba(X)
    bagging_classifier.predict_proba(X)

    # Verify that exceptions can be raised by wrapper classifier
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(classifier)
    assert_raises(ValueError, pipeline.fit, X, y)
    bagging_classifier = BaggingClassifier(pipeline)
    assert_raises(ValueError, bagging_classifier.fit, X, y)
Пример #2
0
def test_plot_roc_curve_error_non_binary(pyplot, data):
    X, y = data
    clf = DecisionTreeClassifier()
    clf.fit(X, y)

    msg = "DecisionTreeClassifier should be a binary classifier"
    with pytest.raises(ValueError, match=msg):
        plot_roc_curve(clf, X, y)
Пример #3
0
def test_deprecated_scorer():
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    deprecated_scorer = get_scorer('brier_score_loss')
    with pytest.warns(FutureWarning):
        deprecated_scorer(clf, X_test, y_test)
Пример #4
0
def test_ovr_pipeline():
    # Test with pipeline of length one
    # This test is needed because the multiclass estimators may fail to detect
    # the presence of predict_proba or decision_function.
    clf = Pipeline([("tree", DecisionTreeClassifier())])
    ovr_pipe = OneVsRestClassifier(clf)
    ovr_pipe.fit(iris.data, iris.target)
    ovr = OneVsRestClassifier(DecisionTreeClassifier())
    ovr.fit(iris.data, iris.target)
    assert_array_equal(ovr.predict(iris.data), ovr_pipe.predict(iris.data))
Пример #5
0
def test_pickle_version_warning_is_not_raised_with_matching_version():
    iris = datasets.load_iris()
    tree = DecisionTreeClassifier().fit(iris.data, iris.target)
    tree_pickle = pickle.dumps(tree)
    assert b"version" in tree_pickle
    tree_restored = assert_no_warnings(pickle.loads, tree_pickle)

    # test that we can predict with the restored decision tree classifier
    score_of_original = tree.score(iris.data, iris.target)
    score_of_restored = tree_restored.score(iris.data, iris.target)
    assert score_of_original == score_of_restored
Пример #6
0
def test_parallel_classification():
    # Check parallel classification.
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                 n_jobs=3,
                                 random_state=0).fit(X_train, y_train)

    # predict_proba
    ensemble.set_params(n_jobs=1)
    y1 = ensemble.predict_proba(X_test)
    ensemble.set_params(n_jobs=2)
    y2 = ensemble.predict_proba(X_test)
    assert_array_almost_equal(y1, y2)

    ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                 n_jobs=1,
                                 random_state=0).fit(X_train, y_train)

    y3 = ensemble.predict_proba(X_test)
    assert_array_almost_equal(y1, y3)

    # decision_function
    ensemble = BaggingClassifier(SVC(decision_function_shape='ovr'),
                                 n_jobs=3,
                                 random_state=0).fit(X_train, y_train)

    ensemble.set_params(n_jobs=1)
    decisions1 = ensemble.decision_function(X_test)
    ensemble.set_params(n_jobs=2)
    decisions2 = ensemble.decision_function(X_test)
    assert_array_almost_equal(decisions1, decisions2)

    X_err = np.hstack((X_test, np.zeros((X_test.shape[0], 1))))
    assert_raise_message(
        ValueError, "Number of features of the model "
        "must match the input. Model n_features is {0} "
        "and input n_features is {1} "
        "".format(X_test.shape[1], X_err.shape[1]), ensemble.decision_function,
        X_err)

    ensemble = BaggingClassifier(SVC(decision_function_shape='ovr'),
                                 n_jobs=1,
                                 random_state=0).fit(X_train, y_train)

    decisions3 = ensemble.decision_function(X_test)
    assert_array_almost_equal(decisions1, decisions3)
Пример #7
0
def test_thresholded_scorers():
    # Test scorers that take thresholds.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    logscore = get_scorer('neg_log_loss')(clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # test with a regressor (no decision_function)
    reg = DecisionTreeRegressor()
    reg.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(reg, X_test, y_test)
    score2 = roc_auc_score(y_test, reg.predict(X_test))
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    with pytest.raises(ValueError, match="multiclass format is not supported"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # test error is raised with a single class present in model
    # (predict_proba shape is not suitable for binary auc)
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = DecisionTreeClassifier()
    clf.fit(X_train, np.zeros_like(y_train))
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # for proba scorers
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('neg_log_loss')(clf, X_test, y_test)
Пример #8
0
def test_export_text_errors():
    clf = DecisionTreeClassifier(max_depth=2, random_state=0)
    clf.fit(X, y)

    err_msg = "max_depth bust be >= 0, given -1"
    with pytest.raises(ValueError, match=err_msg):
        export_text(clf, max_depth=-1)
    err_msg = "feature_names must contain 2 elements, got 1"
    with pytest.raises(ValueError, match=err_msg):
        export_text(clf, feature_names=['a'])
    err_msg = "decimals must be >= 0, given -1"
    with pytest.raises(ValueError, match=err_msg):
        export_text(clf, decimals=-1)
    err_msg = "spacing must be > 0, given 0"
    with pytest.raises(ValueError, match=err_msg):
        export_text(clf, spacing=0)
Пример #9
0
def test_classification():
    # Check classification for various parameter settings.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)
    grid = ParameterGrid({
        "max_samples": [0.5, 1.0],
        "max_features": [1, 2, 4],
        "bootstrap": [True, False],
        "bootstrap_features": [True, False]
    })

    for base_estimator in [
            None,
            DummyClassifier(),
            Perceptron(),
            DecisionTreeClassifier(),
            KNeighborsClassifier(),
            SVC()
    ]:
        for params in grid:
            BaggingClassifier(base_estimator=base_estimator,
                              random_state=rng,
                              **params).fit(X_train, y_train).predict(X_test)
Пример #10
0
def test_error():
    # Test that it gives proper exception on deficient input.
    X, y = iris.data, iris.target
    base = DecisionTreeClassifier()

    # Test max_samples
    assert_raises(ValueError,
                  BaggingClassifier(base, max_samples=-1).fit, X, y)
    assert_raises(ValueError,
                  BaggingClassifier(base, max_samples=0.0).fit, X, y)
    assert_raises(ValueError,
                  BaggingClassifier(base, max_samples=2.0).fit, X, y)
    assert_raises(ValueError,
                  BaggingClassifier(base, max_samples=1000).fit, X, y)
    assert_raises(ValueError,
                  BaggingClassifier(base, max_samples="foobar").fit, X, y)

    # Test max_features
    assert_raises(ValueError,
                  BaggingClassifier(base, max_features=-1).fit, X, y)
    assert_raises(ValueError,
                  BaggingClassifier(base, max_features=0.0).fit, X, y)
    assert_raises(ValueError,
                  BaggingClassifier(base, max_features=2.0).fit, X, y)
    assert_raises(ValueError,
                  BaggingClassifier(base, max_features=5).fit, X, y)
    assert_raises(ValueError,
                  BaggingClassifier(base, max_features="foobar").fit, X, y)

    # Test support of decision_function
    assert not hasattr(BaggingClassifier(base).fit(X, y), 'decision_function')
Пример #11
0
def test_oob_score_classification():
    # Check that oob prediction is a good estimation of the generalization
    # error.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    for base_estimator in [DecisionTreeClassifier(), SVC()]:
        clf = BaggingClassifier(base_estimator=base_estimator,
                                n_estimators=100,
                                bootstrap=True,
                                oob_score=True,
                                random_state=rng).fit(X_train, y_train)

        test_score = clf.score(X_test, y_test)

        assert abs(test_score - clf.oob_score_) < 0.1

        # Test with few estimators
        assert_warns(
            UserWarning,
            BaggingClassifier(base_estimator=base_estimator,
                              n_estimators=1,
                              bootstrap=True,
                              oob_score=True,
                              random_state=rng).fit, X_train, y_train)
Пример #12
0
def test_probability():
    # Predict probabilities.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    with np.errstate(divide="ignore", invalid="ignore"):
        # Normal case
        ensemble = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                     random_state=rng).fit(X_train, y_train)

        assert_array_almost_equal(
            np.sum(ensemble.predict_proba(X_test), axis=1),
            np.ones(len(X_test)))

        assert_array_almost_equal(ensemble.predict_proba(X_test),
                                  np.exp(ensemble.predict_log_proba(X_test)))

        # Degenerate case, where some classes are missing
        ensemble = BaggingClassifier(base_estimator=LogisticRegression(),
                                     random_state=rng,
                                     max_samples=5).fit(X_train, y_train)

        assert_array_almost_equal(
            np.sum(ensemble.predict_proba(X_test), axis=1),
            np.ones(len(X_test)))

        assert_array_almost_equal(ensemble.predict_proba(X_test),
                                  np.exp(ensemble.predict_log_proba(X_test)))
Пример #13
0
def test_plot_tree_gini(pyplot):
    # mostly smoke tests
    # Check correctness of export_graphviz for criterion = gini
    clf = DecisionTreeClassifier(max_depth=3,
                                 min_samples_split=2,
                                 criterion="gini",
                                 random_state=2)
    clf.fit(X, y)

    # Test export code
    feature_names = ['first feat', 'sepal_width']
    nodes = plot_tree(clf, feature_names=feature_names)
    assert len(nodes) == 3
    assert nodes[0].get_text() == ("first feat <= 0.0\ngini = 0.5\n"
                                   "samples = 6\nvalue = [3, 3]")
    assert nodes[1].get_text() == "gini = 0.0\nsamples = 3\nvalue = [3, 0]"
    assert nodes[2].get_text() == "gini = 0.0\nsamples = 3\nvalue = [0, 3]"
Пример #14
0
def test_ovr_coef_exceptions():
    # Not fitted exception!
    ovr = OneVsRestClassifier(LinearSVC(random_state=0))
    # lambda is needed because we don't want coef_ to be evaluated right away
    assert_raises(ValueError, lambda x: ovr.coef_, None)

    # Doesn't have coef_ exception!
    ovr = OneVsRestClassifier(DecisionTreeClassifier())
    ovr.fit(iris.data, iris.target)
    assert_raises(AttributeError, lambda x: ovr.coef_, None)
Пример #15
0
def test_raises_on_score_list():
    # Test that when a list of scores is returned, we raise proper errors.
    X, y = make_blobs(random_state=0)
    f1_scorer_no_average = make_scorer(f1_score, average=None)
    clf = DecisionTreeClassifier()
    with pytest.raises(ValueError):
        cross_val_score(clf, X, y, scoring=f1_scorer_no_average)
    grid_search = GridSearchCV(clf, scoring=f1_scorer_no_average,
                               param_grid={'max_depth': [1, 2]})
    with pytest.raises(ValueError):
        grid_search.fit(X, y)
Пример #16
0
def test_graphviz_errors():
    # Check for errors of export_graphviz
    clf = DecisionTreeClassifier(max_depth=3, min_samples_split=2)

    # Check not-fitted decision tree error
    out = StringIO()
    with pytest.raises(NotFittedError):
        export_graphviz(clf, out)

    clf.fit(X, y)

    # Check if it errors when length of feature_names
    # mismatches with number of features
    message = ("Length of feature_names, "
               "1 does not match number of features, 2")
    with pytest.raises(ValueError, match=message):
        export_graphviz(clf, None, feature_names=["a"])

    message = ("Length of feature_names, "
               "3 does not match number of features, 2")
    with pytest.raises(ValueError, match=message):
        export_graphviz(clf, None, feature_names=["a", "b", "c"])

    # Check error when argument is not an estimator
    message = "is not an estimator instance"
    with pytest.raises(TypeError, match=message):
        export_graphviz(clf.fit(X, y).tree_)

    # Check class_names error
    out = StringIO()
    with pytest.raises(IndexError):
        export_graphviz(clf, out, class_names=[])

    # Check precision error
    out = StringIO()
    with pytest.raises(ValueError, match="should be greater or equal"):
        export_graphviz(clf, out, precision=-1)
    with pytest.raises(ValueError, match="should be an integer"):
        export_graphviz(clf, out, precision="1")
Пример #17
0
def _make_estimators(X_train, y_train, y_ml_train):
    # Make estimators that make sense to test various scoring methods
    sensible_regr = DecisionTreeRegressor(random_state=0)
    # some of the regressions scorers require strictly positive input.
    sensible_regr.fit(X_train, y_train + 1)
    sensible_clf = DecisionTreeClassifier(random_state=0)
    sensible_clf.fit(X_train, y_train)
    sensible_ml_clf = DecisionTreeClassifier(random_state=0)
    sensible_ml_clf.fit(X_train, y_ml_train)
    return dict(
        [(name, sensible_regr) for name in REGRESSION_SCORERS] +
        [(name, sensible_clf) for name in CLF_SCORERS] +
        [(name, sensible_clf) for name in CLUSTER_SCORERS] +
        [(name, sensible_ml_clf) for name in MULTILABEL_ONLY_SCORERS]
    )
Пример #18
0
def test_multimetric_scorer_sanity_check():
    # scoring dictionary returned is the same as calling each scorer separately
    scorers = {'a1': 'accuracy', 'a2': 'accuracy',
               'll1': 'neg_log_loss', 'll2': 'neg_log_loss',
               'ra1': 'roc_auc', 'ra2': 'roc_auc'}

    X, y = make_classification(random_state=0)

    clf = DecisionTreeClassifier()
    clf.fit(X, y)

    scorer_dict, _ = _check_multimetric_scoring(clf, scorers)
    multi_scorer = _MultimetricScorer(**scorer_dict)

    result = multi_scorer(clf, X, y)

    separate_scores = {
        name: get_scorer(name)(clf, X, y)
        for name in ['accuracy', 'neg_log_loss', 'roc_auc']}

    for key, value in result.items():
        score_name = scorers[key]
        assert_allclose(value, separate_scores[score_name])
Пример #19
0
def test_errors(pyplot):
    X, y_multiclass = make_classification(n_classes=3,
                                          n_samples=50,
                                          n_informative=3,
                                          random_state=0)
    y_binary = y_multiclass == 0

    # Unfitted classifer
    binary_clf = DecisionTreeClassifier()
    with pytest.raises(NotFittedError):
        plot_precision_recall_curve(binary_clf, X, y_binary)
    binary_clf.fit(X, y_binary)

    multi_clf = DecisionTreeClassifier().fit(X, y_multiclass)

    # Fitted multiclass classifier with binary data
    msg = "DecisionTreeClassifier should be a binary classifier"
    with pytest.raises(ValueError, match=msg):
        plot_precision_recall_curve(multi_clf, X, y_binary)

    reg = DecisionTreeRegressor().fit(X, y_multiclass)
    msg = "DecisionTreeRegressor should be a binary classifier"
    with pytest.raises(ValueError, match=msg):
        plot_precision_recall_curve(reg, X, y_binary)
Пример #20
0
def test_gridsearch():
    # Check that base trees can be grid-searched.
    # AdaBoost classification
    boost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
    parameters = {
        'n_estimators': (1, 2),
        'base_estimator__max_depth': (1, 2),
        'algorithm': ('SAMME', 'SAMME.R')
    }
    clf = GridSearchCV(boost, parameters)
    clf.fit(iris.data, iris.target)

    # AdaBoost regression
    boost = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(),
                              random_state=0)
    parameters = {'n_estimators': (1, 2), 'base_estimator__max_depth': (1, 2)}
    clf = GridSearchCV(boost, parameters)
    clf.fit(boston.data, boston.target)
Пример #21
0
def test_base_estimator():
    # Check base_estimator and its default values.
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    ensemble = BaggingClassifier(None, n_jobs=3,
                                 random_state=0).fit(X_train, y_train)

    assert isinstance(ensemble.base_estimator_, DecisionTreeClassifier)

    ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                 n_jobs=3,
                                 random_state=0).fit(X_train, y_train)

    assert isinstance(ensemble.base_estimator_, DecisionTreeClassifier)

    ensemble = BaggingClassifier(Perceptron(), n_jobs=3,
                                 random_state=0).fit(X_train, y_train)

    assert isinstance(ensemble.base_estimator_, Perceptron)

    # Regression
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    ensemble = BaggingRegressor(None, n_jobs=3,
                                random_state=0).fit(X_train, y_train)

    assert isinstance(ensemble.base_estimator_, DecisionTreeRegressor)

    ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                n_jobs=3,
                                random_state=0).fit(X_train, y_train)

    assert isinstance(ensemble.base_estimator_, DecisionTreeRegressor)

    ensemble = BaggingRegressor(SVR(), n_jobs=3,
                                random_state=0).fit(X_train, y_train)
    assert isinstance(ensemble.base_estimator_, SVR)
Пример #22
0
def test_precision():

    rng_reg = RandomState(2)
    rng_clf = RandomState(8)
    for X, y, clf in zip(
            (rng_reg.random_sample((5, 2)),
             rng_clf.random_sample((1000, 4))),
            (rng_reg.random_sample((5, )),
             rng_clf.randint(2, size=(1000, ))),
            (DecisionTreeRegressor(criterion="friedman_mse", random_state=0,
                                   max_depth=1),
             DecisionTreeClassifier(max_depth=1, random_state=0))):

        clf.fit(X, y)
        for precision in (4, 3):
            dot_data = export_graphviz(clf, out_file=None, precision=precision,
                                       proportion=True)

            # With the current random state, the impurity and the threshold
            # will have the number of precision set in the export_graphviz
            # function. We will check the number of precision with a strict
            # equality. The value reported will have only 2 precision and
            # therefore, only a less equal comparison will be done.

            # check value
            for finding in finditer(r"value = \d+\.\d+", dot_data):
                assert (
                    len(search(r"\.\d+", finding.group()).group()) <=
                    precision + 1)
            # check impurity
            if is_classifier(clf):
                pattern = r"gini = \d+\.\d+"
            else:
                pattern = r"friedman_mse = \d+\.\d+"

            # check impurity
            for finding in finditer(pattern, dot_data):
                assert (len(search(r"\.\d+", finding.group()).group()) ==
                             precision + 1)
            # check threshold
            for finding in finditer(r"<= \d+\.\d+", dot_data):
                assert (len(search(r"\.\d+", finding.group()).group()) ==
                             precision + 1)
Пример #23
0
def test_score_sample_weight():

    rng = np.random.RandomState(0)

    # test both ClassifierMixin and RegressorMixin
    estimators = [
        DecisionTreeClassifier(max_depth=2),
        DecisionTreeRegressor(max_depth=2)
    ]
    sets = [datasets.load_iris(), datasets.load_boston()]

    for est, ds in zip(estimators, sets):
        est.fit(ds.data, ds.target)
        # generate random sample weights
        sample_weight = rng.randint(1, 10, size=len(ds.target))
        # check that the score with and without sample weights are different
        assert (est.score(ds.data, ds.target) != est.score(
            ds.data, ds.target,
            sample_weight=sample_weight)), ("Unweighted and weighted scores "
                                            "are unexpectedly equal")
Пример #24
0
def test_thresholded_scorers_multilabel_indicator_data():
    # Test that the scorer work with multilabel-indicator format
    # for multilabel and multi-output multi-class classifier
    X, y = make_multilabel_classification(allow_unlabeled=False,
                                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Multi-output multi-class predict_proba
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    y_proba = clf.predict_proba(X_test)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, np.vstack([p[:, -1] for p in y_proba]).T)
    assert_almost_equal(score1, score2)

    # Multi-output multi-class decision_function
    # TODO Is there any yet?
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    clf._predict_proba = clf.predict_proba
    clf.predict_proba = None
    clf.decision_function = lambda X: [p[:, 1] for p in clf._predict_proba(X)]

    y_proba = clf.decision_function(X_test)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, np.vstack([p for p in y_proba]).T)
    assert_almost_equal(score1, score2)

    # Multilabel predict_proba
    clf = OneVsRestClassifier(DecisionTreeClassifier())
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test))
    assert_almost_equal(score1, score2)

    # Multilabel decision function
    clf = OneVsRestClassifier(LinearSVC(random_state=0))
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    assert_almost_equal(score1, score2)
Пример #25
0
def test_bagging_with_pipeline():
    estimator = BaggingClassifier(make_pipeline(SelectKBest(k=1),
                                                DecisionTreeClassifier()),
                                  max_features=2)
    estimator.fit(iris.data, iris.target)
    assert isinstance(estimator[0].steps[-1][1].random_state, int)
Пример #26
0
def test_export_text():
    clf = DecisionTreeClassifier(max_depth=2, random_state=0)
    clf.fit(X, y)

    expected_report = dedent("""
    |--- feature_1 <= 0.00
    |   |--- class: -1
    |--- feature_1 >  0.00
    |   |--- class: 1
    """).lstrip()

    assert export_text(clf) == expected_report
    # testing that leaves at level 1 are not truncated
    assert export_text(clf, max_depth=0) == expected_report
    # testing that the rest of the tree is truncated
    assert export_text(clf, max_depth=10) == expected_report

    expected_report = dedent("""
    |--- b <= 0.00
    |   |--- class: -1
    |--- b >  0.00
    |   |--- class: 1
    """).lstrip()
    assert export_text(clf, feature_names=['a', 'b']) == expected_report

    expected_report = dedent("""
    |--- feature_1 <= 0.00
    |   |--- weights: [3.00, 0.00] class: -1
    |--- feature_1 >  0.00
    |   |--- weights: [0.00, 3.00] class: 1
    """).lstrip()
    assert export_text(clf, show_weights=True) == expected_report

    expected_report = dedent("""
    |- feature_1 <= 0.00
    | |- class: -1
    |- feature_1 >  0.00
    | |- class: 1
    """).lstrip()
    assert export_text(clf, spacing=1) == expected_report

    X_l = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [-1, 1]]
    y_l = [-1, -1, -1, 1, 1, 1, 2]
    clf = DecisionTreeClassifier(max_depth=4, random_state=0)
    clf.fit(X_l, y_l)
    expected_report = dedent("""
    |--- feature_1 <= 0.00
    |   |--- class: -1
    |--- feature_1 >  0.00
    |   |--- truncated branch of depth 2
    """).lstrip()
    assert export_text(clf, max_depth=0) == expected_report

    X_mo = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
    y_mo = [[-1, -1], [-1, -1], [-1, -1], [1, 1], [1, 1], [1, 1]]

    reg = DecisionTreeRegressor(max_depth=2, random_state=0)
    reg.fit(X_mo, y_mo)

    expected_report = dedent("""
    |--- feature_1 <= 0.0
    |   |--- value: [-1.0, -1.0]
    |--- feature_1 >  0.0
    |   |--- value: [1.0, 1.0]
    """).lstrip()
    assert export_text(reg, decimals=1) == expected_report
    assert export_text(reg, decimals=1, show_weights=True) == expected_report

    X_single = [[-2], [-1], [-1], [1], [1], [2]]
    reg = DecisionTreeRegressor(max_depth=2, random_state=0)
    reg.fit(X_single, y_mo)

    expected_report = dedent("""
    |--- first <= 0.0
    |   |--- value: [-1.0, -1.0]
    |--- first >  0.0
    |   |--- value: [1.0, 1.0]
    """).lstrip()
    assert export_text(reg, decimals=1,
                       feature_names=['first']) == expected_report
    assert export_text(reg, decimals=1, show_weights=True,
                       feature_names=['first']) == expected_report
Пример #27
0
def test_set_params_updates_valid_params():
    # Check that set_params tries to set SVC().C, not
    # DecisionTreeClassifier().C
    gscv = GridSearchCV(DecisionTreeClassifier(), {})
    gscv.set_params(estimator=SVC(), estimator__C=42.0)
    assert gscv.estimator.C == 42.0
Пример #28
0
def test_graphviz_toy():
    # Check correctness of export_graphviz
    clf = DecisionTreeClassifier(max_depth=3,
                                 min_samples_split=2,
                                 criterion="gini",
                                 random_state=2)
    clf.fit(X, y)

    # Test export code
    contents1 = export_graphviz(clf, out_file=None)
    contents2 = 'digraph Tree {\n' \
                'node [shape=box] ;\n' \
                '0 [label="X[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \
                'value = [3, 3]"] ;\n' \
                '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '}'

    assert contents1 == contents2

    # Test with feature_names
    contents1 = export_graphviz(clf, feature_names=["feature0", "feature1"],
                                out_file=None)
    contents2 = 'digraph Tree {\n' \
                'node [shape=box] ;\n' \
                '0 [label="feature0 <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \
                'value = [3, 3]"] ;\n' \
                '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '}'

    assert contents1 == contents2

    # Test with class_names
    contents1 = export_graphviz(clf, class_names=["yes", "no"], out_file=None)
    contents2 = 'digraph Tree {\n' \
                'node [shape=box] ;\n' \
                '0 [label="X[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \
                'value = [3, 3]\\nclass = yes"] ;\n' \
                '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]\\n' \
                'class = yes"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]\\n' \
                'class = no"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '}'

    assert contents1 == contents2

    # Test plot_options
    contents1 = export_graphviz(clf, filled=True, impurity=False,
                                proportion=True, special_characters=True,
                                rounded=True, out_file=None)
    contents2 = 'digraph Tree {\n' \
                'node [shape=box, style="filled, rounded", color="black", ' \
                'fontname=helvetica] ;\n' \
                'edge [fontname=helvetica] ;\n' \
                '0 [label=<X<SUB>0</SUB> &le; 0.0<br/>samples = 100.0%<br/>' \
                'value = [0.5, 0.5]>, fillcolor="#ffffff"] ;\n' \
                '1 [label=<samples = 50.0%<br/>value = [1.0, 0.0]>, ' \
                'fillcolor="#e58139"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label=<samples = 50.0%<br/>value = [0.0, 1.0]>, ' \
                'fillcolor="#399de5"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '}'

    assert contents1 == contents2

    # Test max_depth
    contents1 = export_graphviz(clf, max_depth=0,
                                class_names=True, out_file=None)
    contents2 = 'digraph Tree {\n' \
                'node [shape=box] ;\n' \
                '0 [label="X[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \
                'value = [3, 3]\\nclass = y[0]"] ;\n' \
                '1 [label="(...)"] ;\n' \
                '0 -> 1 ;\n' \
                '2 [label="(...)"] ;\n' \
                '0 -> 2 ;\n' \
                '}'

    assert contents1 == contents2

    # Test max_depth with plot_options
    contents1 = export_graphviz(clf, max_depth=0, filled=True,
                                out_file=None, node_ids=True)
    contents2 = 'digraph Tree {\n' \
                'node [shape=box, style="filled", color="black"] ;\n' \
                '0 [label="node #0\\nX[0] <= 0.0\\ngini = 0.5\\n' \
                'samples = 6\\nvalue = [3, 3]", fillcolor="#ffffff"] ;\n' \
                '1 [label="(...)", fillcolor="#C0C0C0"] ;\n' \
                '0 -> 1 ;\n' \
                '2 [label="(...)", fillcolor="#C0C0C0"] ;\n' \
                '0 -> 2 ;\n' \
                '}'

    assert contents1 == contents2

    # Test multi-output with weighted samples
    clf = DecisionTreeClassifier(max_depth=2,
                                 min_samples_split=2,
                                 criterion="gini",
                                 random_state=2)
    clf = clf.fit(X, y2, sample_weight=w)

    contents1 = export_graphviz(clf, filled=True,
                                impurity=False, out_file=None)
    contents2 = 'digraph Tree {\n' \
                'node [shape=box, style="filled", color="black"] ;\n' \
                '0 [label="X[0] <= 0.0\\nsamples = 6\\n' \
                'value = [[3.0, 1.5, 0.0]\\n' \
                '[3.0, 1.0, 0.5]]", fillcolor="#ffffff"] ;\n' \
                '1 [label="samples = 3\\nvalue = [[3, 0, 0]\\n' \
                '[3, 0, 0]]", fillcolor="#e58139"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="X[0] <= 1.5\\nsamples = 3\\n' \
                'value = [[0.0, 1.5, 0.0]\\n' \
                '[0.0, 1.0, 0.5]]", fillcolor="#f1bd97"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '3 [label="samples = 2\\nvalue = [[0, 1, 0]\\n' \
                '[0, 1, 0]]", fillcolor="#e58139"] ;\n' \
                '2 -> 3 ;\n' \
                '4 [label="samples = 1\\nvalue = [[0.0, 0.5, 0.0]\\n' \
                '[0.0, 0.0, 0.5]]", fillcolor="#e58139"] ;\n' \
                '2 -> 4 ;\n' \
                '}'

    assert contents1 == contents2

    # Test regression output with plot_options
    clf = DecisionTreeRegressor(max_depth=3,
                                min_samples_split=2,
                                criterion="mse",
                                random_state=2)
    clf.fit(X, y)

    contents1 = export_graphviz(clf, filled=True, leaves_parallel=True,
                                out_file=None, rotate=True, rounded=True)
    contents2 = 'digraph Tree {\n' \
                'node [shape=box, style="filled, rounded", color="black", ' \
                'fontname=helvetica] ;\n' \
                'graph [ranksep=equally, splines=polyline] ;\n' \
                'edge [fontname=helvetica] ;\n' \
                'rankdir=LR ;\n' \
                '0 [label="X[0] <= 0.0\\nmse = 1.0\\nsamples = 6\\n' \
                'value = 0.0", fillcolor="#f2c09c"] ;\n' \
                '1 [label="mse = 0.0\\nsamples = 3\\nvalue = -1.0", ' \
                'fillcolor="#ffffff"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="mse = 0.0\\nsamples = 3\\nvalue = 1.0", ' \
                'fillcolor="#e58139"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="False"] ;\n' \
                '{rank=same ; 0} ;\n' \
                '{rank=same ; 1; 2} ;\n' \
                '}'

    assert contents1 == contents2

    # Test classifier with degraded learning set
    clf = DecisionTreeClassifier(max_depth=3)
    clf.fit(X, y_degraded)

    contents1 = export_graphviz(clf, filled=True, out_file=None)
    contents2 = 'digraph Tree {\n' \
                'node [shape=box, style="filled", color="black"] ;\n' \
                '0 [label="gini = 0.0\\nsamples = 6\\nvalue = 6.0", ' \
                'fillcolor="#ffffff"] ;\n' \
                '}'
Пример #29
0
    voter = clone(voter)
    voter.fit(X, y, sample_weight=np.ones(y.shape))
    voter.set_params(lr=drop)
    with pytest.warns(None) as record:
        voter.fit(X, y, sample_weight=np.ones(y.shape))
    assert record if drop is None else not record
    y_pred = voter.predict(X)
    assert y_pred.shape == y.shape


@pytest.mark.parametrize("estimator", [
    VotingRegressor(estimators=[(
        'lr',
        LinearRegression()), ('tree', DecisionTreeRegressor(random_state=0))]),
    VotingClassifier(estimators=[('lr', LogisticRegression(
        random_state=0)), ('tree', DecisionTreeClassifier(random_state=0))])
],
                         ids=['VotingRegressor', 'VotingClassifier'])
def test_check_estimators_voting_estimator(estimator):
    # FIXME: to be removed when meta-estimators can specified themselves
    # their testing parameters (for required parameters).
    check_estimator(estimator)
    check_no_attributes_set_in_init(estimator.__class__.__name__, estimator)


# TODO: Remove in 0.24 when None is removed in Voting*
@pytest.mark.parametrize("Voter, BaseEstimator",
                         [(VotingClassifier, DecisionTreeClassifier),
                          (VotingRegressor, DecisionTreeRegressor)])
def test_deprecate_none_transformer(Voter, BaseEstimator):
    est = Voter(estimators=[('lr',