Example #1
def test_bagging_classifier_with_missing_inputs():
    # Check that BaggingClassifier can accept X with missing/infinite data
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        [2, -np.inf, 6],
    ])
    y = np.array([3, 6, 6, 6, 6])
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(FunctionTransformer(replace), classifier)
    pipeline.fit(X, y).predict(X)
    bagging_classifier = BaggingClassifier(pipeline)
    bagging_classifier.fit(X, y)
    y_hat = bagging_classifier.predict(X)
    assert y.shape == y_hat.shape
    bagging_classifier.predict_log_proba(X)
    bagging_classifier.predict_proba(X)

    # Verify that exceptions can be raised by wrapper classifier
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(classifier)
    with pytest.raises(ValueError):
        pipeline.fit(X, y)
    bagging_classifier = BaggingClassifier(pipeline)
    with pytest.raises(ValueError):
        bagging_classifier.fit(X, y)
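
Both this test and the next rely on a replace helper that is not shown in these excerpts, and the imports are likewise omitted. A minimal sketch, assuming the helper simply casts to float and imputes non-finite entries:

import numpy as np
import pytest
from sklearn.ensemble import BaggingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.tree import DecisionTreeClassifier


def replace(X):
    # Hypothetical stand-in for the omitted helper: cast to float
    # (None becomes NaN) and impute non-finite entries with zero.
    X = np.array(X, dtype=float)
    X[~np.isfinite(X)] = 0
    return X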
Example #2
def test_bagging_classifier_with_missing_inputs():
    # Check that BaggingClassifier can accept X with missing/infinite data
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        [2, -np.inf, 6],
    ])
    y = np.array([3, 6, 6, 6, 6])
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(
        FunctionTransformer(replace, validate=False),
        classifier
    )
    pipeline.fit(X, y).predict(X)
    bagging_classifier = BaggingClassifier(pipeline)
    bagging_classifier.fit(X, y)
    y_hat = bagging_classifier.predict(X)
    assert_equal(y.shape, y_hat.shape)
    bagging_classifier.predict_log_proba(X)
    bagging_classifier.predict_proba(X)

    # Verify that exceptions can be raised by wrapper classifier
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(classifier)
    assert_raises(ValueError, pipeline.fit, X, y)
    bagging_classifier = BaggingClassifier(pipeline)
    assert_raises(ValueError, bagging_classifier.fit, X, y)
Example #3
def test_probability():
    # Predict probabilities.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    with np.errstate(divide="ignore", invalid="ignore"):
        # Normal case
        ensemble = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                     random_state=rng).fit(X_train, y_train)

        assert_array_almost_equal(
            np.sum(ensemble.predict_proba(X_test), axis=1),
            np.ones(len(X_test)))

        assert_array_almost_equal(ensemble.predict_proba(X_test),
                                  np.exp(ensemble.predict_log_proba(X_test)))

        # Degenerate case, where some classes are missing
        ensemble = BaggingClassifier(base_estimator=LogisticRegression(),
                                     random_state=rng,
                                     max_samples=5).fit(X_train, y_train)

        assert_array_almost_equal(
            np.sum(ensemble.predict_proba(X_test), axis=1),
            np.ones(len(X_test)))

        assert_array_almost_equal(ensemble.predict_proba(X_test),
                                  np.exp(ensemble.predict_log_proba(X_test)))
Example #4
def test_probability():
    # Predict probabilities.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    with np.errstate(divide="ignore", invalid="ignore"):
        # Normal case
        ensemble = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                     random_state=rng).fit(X_train, y_train)

        assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test),
                                         axis=1),
                                  np.ones(len(X_test)))

        assert_array_almost_equal(ensemble.predict_proba(X_test),
                                  np.exp(ensemble.predict_log_proba(X_test)))

        # Degenerate case, where some classes are missing
        ensemble = BaggingClassifier(base_estimator=LogisticRegression(),
                                     random_state=rng,
                                     max_samples=5).fit(X_train, y_train)

        assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test),
                                         axis=1),
                                  np.ones(len(X_test)))

        assert_array_almost_equal(ensemble.predict_proba(X_test),
                                  np.exp(ensemble.predict_log_proba(X_test)))
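
Both copies of this test assert the same two invariants: the averaged class probabilities form a distribution per sample, and predict_log_proba is their element-wise logarithm. A standalone sketch of those checks (the iris data and tree estimator are placeholder choices, not taken from the tests):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
clf = BaggingClassifier(DecisionTreeClassifier(), random_state=0).fit(X, y)

proba = clf.predict_proba(X)
assert np.allclose(proba.sum(axis=1), 1.0)  # each row is a distribution
with np.errstate(divide="ignore", invalid="ignore"):  # log(0) for absent classes
    assert np.allclose(proba, np.exp(clf.predict_log_proba(X)))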
Example #5
class BaggedDecisionTreeClassifier:
    def __init__(self,
                 n_estimators=20,
                 bootstrap=True,
                 bootstrap_features=False,
                 oob_score=False,
                 max_depth=None,
                 min_samples_leaf=20,
                 warm_start=False,
                 n_jobs=None,
                 verbose=0,
                 random_state=None):
        self.tree = DecisionTreeClassifier(max_depth=max_depth,
                                           min_samples_leaf=min_samples_leaf)
        self.BagDT = BaggingClassifier(base_estimator=self.tree,
                                       n_estimators=n_estimators,
                                       bootstrap=bootstrap,
                                       bootstrap_features=bootstrap_features,
                                       oob_score=oob_score,
                                       warm_start=warm_start,
                                       n_jobs=n_jobs,
                                       random_state=random_state,
                                       verbose=verbose)

    def decision_function(self, X):
        return self.BagDT.decision_function(X)

    def fit(self, X, y, sample_weight=None):
        self.BagDT.fit(X, y, sample_weight=sample_weight)
        return self.BagDT

    def get_params(self, deep=True):
        return self.BagDT.get_params(deep=deep)

    def predict(self, X):
        return self.BagDT.predict(X)

    def predict_log_proba(self, X):
        return self.BagDT.predict_log_proba(X)

    def predict_proba(self, X):
        return self.BagDT.predict_proba(X)

    def score(self, X, y, sample_weight=None):
        return self.BagDT.score(X, y, sample_weight=sample_weight)

    def set_params(self, **params):
        return self.BagDT.set_params(**params)
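
A short usage sketch of the wrapper above; it assumes the class and its scikit-learn imports (DecisionTreeClassifier, BaggingClassifier) are already in scope, and the iris data is a placeholder:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = BaggedDecisionTreeClassifier(n_estimators=10, min_samples_leaf=5)
model.fit(X_train, y_train)  # note: fit returns the inner BaggingClassifier, not self
print(model.score(X_test, y_test))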
Example #6
# X and y are assumed to be defined earlier in the original script
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=214)
# Create the model objects
dt = DecisionTreeClassifier(criterion='gini', max_depth=5)
algo = BaggingClassifier(base_estimator=dt,
                         n_estimators=10,
                         oob_score=True)
# Train the model
algo.fit(X_train, y_train)
# 7. Evaluate model performance
print('Accuracy on the training set: {}'.format(algo.score(X_train, y_train)))
print('Accuracy on the test set: {}'.format(algo.score(X_test, y_test)))
# 8. Inspect the attribute API
X_test = [[6.9, 3.1, 5.1, 2.3], [6.1, 2.8, 4.0, 1.3], [5.2, 3.4, 1.4, 0.2]]
print('Predicted labels:')
print(algo.predict(X_test))
print('Predicted class probabilities:')
print(algo.predict_proba(X_test))
print('Log of the predicted class probabilities:')
print(algo.predict_log_proba(X_test))

print('All fitted sub-models:\n{}'.format(algo.estimators_))
X_test = [[6.9, 3.1, 5.1, 2.3], [6.1, 2.8, 4.0, 1.3], [5.2, 3.4, 1.4, 0.2]]
for k, estimator in enumerate(algo.estimators_):
    print('Predictions of sub-model #{}: {}'.format(k + 1, estimator.predict(X_test)))
print('Feature subsets used to train each sub-model: {}'.format(algo.estimators_features_))
print('Out-of-bag accuracy of the bagging model: {}'.format(algo.oob_score_))

# Visualize all sub-models
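
The original snippet stops at the visualization comment. A hedged sketch of what that step could look like, using sklearn.tree.plot_tree on each fitted sub-model (matplotlib and the fitted algo from above are assumed):

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

for k, estimator in enumerate(algo.estimators_):
    plt.figure(figsize=(8, 6))
    plot_tree(estimator, filled=True)
    plt.title('Sub-model #{}'.format(k + 1))
plt.show()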
Example #7
class _BaggingClassifierImpl:
    def __init__(
        self,
        base_estimator=None,
        n_estimators=10,
        *,
        max_samples=1.0,
        max_features=1.0,
        bootstrap=True,
        bootstrap_features=False,
        oob_score=False,
        warm_start=False,
        n_jobs=None,
        random_state=None,
        verbose=0,
    ):
        estimator_impl = base_estimator

        self._hyperparams = {
            "base_estimator": estimator_impl,
            "n_estimators": n_estimators,
            "max_samples": max_samples,
            "max_features": max_features,
            "bootstrap": bootstrap,
            "bootstrap_features": bootstrap_features,
            "oob_score": oob_score,
            "warm_start": warm_start,
            "n_jobs": n_jobs,
            "random_state": random_state,
            "verbose": verbose,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)
        self._hyperparams["base_estimator"] = base_estimator

    def get_params(self, deep=True):
        out = self._wrapped_model.get_params(deep=deep)
        # we want to return the lale operator, not the underlying impl
        out["base_estimator"] = self._hyperparams["base_estimator"]
        return out

    def fit(self, X, y, sample_weight=None):
        if isinstance(X, pd.DataFrame):
            feature_transformer = FunctionTransformer(
                func=lambda X_prime: pd.DataFrame(X_prime, columns=X.columns),
                inverse_func=None,
                check_inverse=False,
            )
            self._hyperparams["base_estimator"] = (
                feature_transformer >> self._hyperparams["base_estimator"])
            self._wrapped_model = SKLModel(**self._hyperparams)
        self._wrapped_model.fit(X, y, sample_weight)

        return self

    def predict(self, X, **predict_params):
        return self._wrapped_model.predict(X, **predict_params)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def predict_log_proba(self, X):
        return self._wrapped_model.predict_log_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)

    def score(self, X, y, sample_weight=None):
        return self._wrapped_model.score(X, y, sample_weight)
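
This wrapper is written in the style of the lale library: SKLModel is presumably an alias for sklearn.ensemble.BaggingClassifier, and >> is lale's pipeline-composition operator, used in fit to prepend a transformer that restores DataFrame column names before each base estimator sees the data. A minimal usage sketch under those assumptions, using ndarray input so the DataFrame branch is not exercised:

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)  # ndarray input keeps fit on the plain path
impl = _BaggingClassifierImpl(base_estimator=DecisionTreeClassifier())
impl.fit(X, y)
print(impl.score(X, y))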
Example #8
class HistRandomForestClassifier:
    def __init__(self,
                 loss='auto',
                 max_leaf_nodes=31,
                 max_depth=None,
                 min_samples_leaf=20,
                 l2_regularization=0,
                 max_bins=255,
                 n_estimators=20,
                 max_samples=1.0,
                 bootstrap=True,
                 bootstrap_features=False,
                 oob_score=False,
                 categorical_features=None,
                 monotonic_cst=None,
                 warm_start=False,
                 n_jobs=None,
                 early_stopping='auto',
                 scoring='loss',
                 validation_fraction=0.1,
                 n_iter_no_change=10,
                 tol=1e-7,
                 verbose=0,
                 random_state=None):
        self.loss = loss
        self.max_leaf_nodes = max_leaf_nodes
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.l2_regularization = l2_regularization
        self.max_bins = max_bins
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.bootstrap = bootstrap
        self.bootstrap_features = bootstrap_features
        self.oob_score = oob_score
        self.categorical_features = categorical_features
        self.monotonic_cst = monotonic_cst
        self.warm_start = warm_start
        self.n_jobs = n_jobs
        self.early_stopping = early_stopping
        self.scoring = scoring
        self.validation_fraction = validation_fraction
        self.n_iter_no_change = n_iter_no_change
        self.tol = tol
        self.verbose = verbose
        self.random_state = random_state

        self.tree = HistGradientBoostingClassifier(
            loss=loss,
            learning_rate=1,
            max_iter=1,
            max_leaf_nodes=max_leaf_nodes,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
            l2_regularization=l2_regularization,
            max_bins=max_bins,
            categorical_features=categorical_features,
            monotonic_cst=monotonic_cst,
            early_stopping=early_stopping,
            scoring=scoring,
            validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change,
            tol=tol,
            verbose=verbose,
            random_state=random_state)
        self.HistRF = BaggingClassifier(base_estimator=self.tree,
                                        n_estimators=n_estimators,
                                        max_samples=max_samples,
                                        bootstrap=bootstrap,
                                        bootstrap_features=bootstrap_features,
                                        oob_score=oob_score,
                                        warm_start=warm_start,
                                        n_jobs=n_jobs,
                                        random_state=random_state,
                                        verbose=verbose)

    def decision_function(self, X):
        return self.HistRF.decision_function(X)

    def fit(self, X, y, sample_weight=None):
        self.HistRF.fit(X, y, sample_weight=sample_weight)
        return self.HistRF

    def get_params(self, deep=True):
        return self.HistRF.get_params(deep=deep)

    def predict(self, X):
        return self.HistRF.predict(X)

    def predict_log_proba(self, X):
        return self.HistRF.predict_log_proba(X)

    def predict_proba(self, X):
        return self.HistRF.predict_proba(X)

    def score(self, X, y, sample_weight=None):
        return self.HistRF.score(X, y, sample_weight=sample_weight)

    def set_params(self, **params):
        return self.HistRF.set_params(**params)
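
Each base learner here is a HistGradientBoostingClassifier capped at one boosting iteration (max_iter=1, learning_rate=1), i.e. a single histogram-based tree, so bagging n_estimators of them approximates a random forest of histogram-based trees. A usage sketch, assuming the class and its scikit-learn imports (HistGradientBoostingClassifier, BaggingClassifier) are in scope:

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = HistRandomForestClassifier(n_estimators=20, random_state=0)
model.fit(X_train, y_train)  # as in Example #5, fit returns the inner bagger
print(model.score(X_test, y_test))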