Пример #1
0
    def test_dirichlet_switch(self):
        """Check that ``dirichlet`` can be changed on an already fitted forest.

        Two forests are fitted with the same seed on the same breast-cancer
        split: one without an explicit ``dirichlet`` and one with
        ``dirichlet=2.0``. Their predicted probabilities must differ at
        first, and must agree (up to 1e-5) once the ``dirichlet`` attribute
        is aligned on the fitted estimators, without any refit.
        """
        data = self.breast_cancer
        features, labels = data["data"], data["target"]
        X_train, X_test, y_train, y_test = train_test_split(
            features,
            labels,
            shuffle=True,
            stratify=labels,
            random_state=42,
            test_size=0.3,
        )

        default_clf = ForestClassifier(class_weight="balanced",
                                       random_state=42)
        tuned_clf = ForestClassifier(class_weight="balanced",
                                     random_state=42,
                                     dirichlet=2.0)
        default_clf.fit(X_train, y_train)
        tuned_clf.fit(X_train, y_train)

        # Distinct dirichlet values must give visibly different scores.
        proba_default = default_clf.predict_proba(X_test)
        proba_tuned = tuned_clf.predict_proba(X_test)
        largest_gap = np.max(np.abs(proba_default - proba_tuned))
        assert largest_gap >= 0.01

        # Moving the second forest to 0.5 must reproduce the scores of the
        # first one (presumably 0.5 is the default value -- TODO confirm).
        tuned_clf.dirichlet = 0.5
        proba_tuned = tuned_clf.predict_proba(X_test)
        assert proba_default == pytest.approx(proba_tuned, abs=1e-5)

        # Switching both forests to a common new value keeps them in sync.
        for fitted in (default_clf, tuned_clf):
            fitted.dirichlet = 1.1
        proba_default = default_clf.predict_proba(X_test)
        proba_tuned = tuned_clf.predict_proba(X_test)
        assert proba_default == pytest.approx(proba_tuned, abs=1e-5)
Пример #2
0
 def test_performance_breast_cancer(self):
     """Check the test ROC AUC on breast cancer for several configurations.

     The forest is fitted with and without ``class_weight="balanced"``, using
     the criterion left unspecified and then ``criterion="entropy"``. Every
     configuration must reach a ROC AUC of at least 0.98 on the held-out
     split, using the second column of ``predict_proba`` as the score.
     """
     data = self.breast_cancer
     features, labels = data["data"], data["target"]
     X_train, X_test, y_train, y_test = train_test_split(
         features,
         labels,
         shuffle=True,
         stratify=labels,
         random_state=42,
         test_size=0.3,
     )
     # Extra keyword arguments for each run, in the order they are checked.
     configurations = (
         {"class_weight": "balanced"},
         {},
         {"class_weight": "balanced", "criterion": "entropy"},
         {"criterion": "entropy"},
     )
     for extra_params in configurations:
         clf = ForestClassifier(random_state=42, **extra_params)
         clf.fit(X_train, y_train)
         positive_proba = clf.predict_proba(X_test)[:, 1]
         assert roc_auc_score(y_test, positive_proba) >= 0.98
Пример #3
0
    def test_performance_iris(self):
        """Check the one-vs-one ROC AUC on iris for several configurations.

        The forest is fitted with and without ``class_weight="balanced"``,
        using the criterion left unspecified and then
        ``criterion="entropy"``. Every configuration must reach a
        multi-class ("ovo") ROC AUC of at least 0.985 on the held-out split.
        """
        data = self.iris
        features, labels = data["data"], data["target"]
        X_train, X_test, y_train, y_test = train_test_split(
            features,
            labels,
            shuffle=True,
            stratify=labels,
            random_state=42,
            test_size=0.3,
        )
        # Extra keyword arguments for each run, in the order they are checked.
        configurations = (
            {"class_weight": "balanced"},
            {},
            {"class_weight": "balanced", "criterion": "entropy"},
            {"criterion": "entropy"},
        )
        for extra_params in configurations:
            clf = ForestClassifier(random_state=42, **extra_params)
            clf.fit(X_train, y_train)
            test_proba = clf.predict_proba(X_test)
            auc = roc_auc_score(y_test, test_proba, multi_class="ovo")
            assert auc >= 0.985
Пример #4
0
    def test_cat_split_strategy_on_car(self):
        """Compare the "binary" and "all" categorical split strategies.

        A single-tree forest without aggregation is fitted on the car
        dataset (no one-hot encoding, one fifth of the data kept for
        testing) once per strategy. The log loss obtained with
        ``cat_split_strategy="all"`` must be strictly smaller than the one
        obtained with ``"binary"``, both on the train and on the test split.
        """
        seed = 42
        dataset = load_car()
        dataset.one_hot_encode = False
        dataset.test_size = 1.0 / 5
        X_train, X_test, y_train, y_test = dataset.extract(random_state=seed)

        # Everything except the split strategy is shared between both runs.
        shared_params = dict(
            n_estimators=1,
            n_jobs=1,
            multiclass="multinomial",
            aggregation=False,
            max_features=None,
            class_weight="balanced",
            categorical_features=dataset.categorical_features_,
            random_state=seed,
            dirichlet=0.0,
        )

        train_loss = {}
        test_loss = {}
        for strategy in ("binary", "all"):
            clf = ForestClassifier(cat_split_strategy=strategy,
                                   **shared_params)
            clf.fit(X_train, y_train)
            train_proba = clf.predict_proba(X_train)
            test_proba = clf.predict_proba(X_test)
            train_loss[strategy] = log_loss(y_train, train_proba)
            test_loss[strategy] = log_loss(y_test, test_proba)

        assert train_loss["all"] < train_loss["binary"]
        assert test_loss["all"] < test_loss["binary"]
Пример #5
0
        def do_test_bootstrap(n_estimators, n_jobs, random_state):
            """Check bootstrap sampling and seeding of ``ForestClassifier``.

            ``X`` and ``y`` are taken from the enclosing scope.

            Parameters
            ----------
            n_estimators : number of trees in every forest that is fitted.
            n_jobs : forwarded as-is to ``ForestClassifier``.
            random_state : seed forwarded to ``ForestClassifier``; ``None``
                means the fits are expected to differ from one another.
            """

            def fit_forest():
                # Build and fit a fresh forest with the parameters under test.
                forest = ForestClassifier(n_estimators=n_estimators,
                                          n_jobs=n_jobs,
                                          random_state=random_state)
                forest.fit(X, y)
                return forest

            seeded = random_state is not None

            # 1. Within one forest, every pair of distinct trees must have
            #    been trained on different bootstrap samples.
            clf = fit_forest()
            for first_idx, second_idx in product(range(n_estimators),
                                                 repeat=2):
                if first_idx >= second_idx:
                    continue
                first_tree = clf.trees[first_idx]
                second_tree = clf.trees[second_idx]
                assert first_tree._train_indices != approx(
                    second_tree._train_indices)
                assert first_tree._valid_indices != approx(
                    second_tree._valid_indices)

            # 2. With a seed, two independent fits must draw identical
            #    bootstrap samples; without a seed they must differ.
            clf1 = fit_forest()
            clf2 = fit_forest()
            for tree_idx in range(n_estimators):
                tree1 = clf1.trees[tree_idx]
                tree2 = clf2.trees[tree_idx]
                if seeded:
                    assert tree1._train_indices == approx(
                        tree2._train_indices)
                    assert tree1._valid_indices == approx(
                        tree2._valid_indices)
                else:
                    assert tree1._train_indices != approx(
                        tree2._train_indices)
                    assert tree1._valid_indices != approx(
                        tree2._valid_indices)

            # 3. apply() must return the exact same leaves when seeded (which
            #    checks that the trees themselves are identical, namely that
            #    random column subsampling is correctly seeded), and so must
            #    the predictions; without a seed both must differ.
            clf1 = fit_forest()
            clf2 = fit_forest()
            if seeded:
                assert clf1.apply(X) == approx(clf2.apply(X))
                assert clf1.predict_proba(X) == approx(clf2.predict_proba(X))
            else:
                assert clf1.apply(X) != approx(clf2.apply(X))
                assert clf1.predict_proba(X) != approx(clf2.predict_proba(X))
Пример #6
0
    def test_ovr_with_two_classes(self):
        """Check that 'ovr' and 'multinomial' agree on a binary problem.

        Two otherwise identical forests are fitted on the adult dataset
        (no one-hot encoding), one per ``multiclass`` mode, and their test
        probabilities are compared with ``approx``.
        """
        seed = 42
        dataset = self.adult
        dataset.one_hot_encode = False
        X_train, X_test, y_train, y_test = dataset.extract(random_state=seed)

        # Everything except the multiclass mode is shared between both runs.
        shared_params = dict(
            n_estimators=2,
            n_jobs=-1,
            aggregation=False,
            max_features=None,
            class_weight="balanced",
            categorical_features=dataset.categorical_features_,
            random_state=seed,
            dirichlet=0.0,
        )

        test_scores = []
        for multiclass in ("multinomial", "ovr"):
            clf = ForestClassifier(multiclass=multiclass, **shared_params)
            clf.fit(X_train, y_train)
            test_scores.append(clf.predict_proba(X_test))

        multinomial_scores, ovr_scores = test_scores
        assert multinomial_scores == approx(ovr_scores)
Пример #7
0
def test_several_max_bins_for_classification(loader, is_categorical_,
                                             required_log_loss, max_bins,
                                             aggregation):
    """Check categorical detection and test log loss for a given ``max_bins``.

    Parameters
    ----------
    loader : callable invoked as ``loader(raw=True)`` and returning ``(X, y)``.
    is_categorical_ : expected value of the fitted ``clf.is_categorical_``.
    required_log_loss : strict upper bound on the test log loss.
    max_bins : forwarded to ``ForestClassifier``.
    aggregation : forwarded to ``ForestClassifier``.
    """
    features, labels = loader(raw=True)
    seed = 42
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        random_state=seed)
    forest_params = dict(
        n_estimators=10,
        n_jobs=-1,
        aggregation=aggregation,
        max_bins=max_bins,
        dirichlet=1e-2,
        class_weight="balanced",
        random_state=seed,
    )
    clf = ForestClassifier(**forest_params)
    clf.fit(X_train, y_train)

    # The features flagged as categorical must match the expectation exactly.
    np.testing.assert_equal(clf.is_categorical_, is_categorical_)

    test_loss = log_loss(y_test, clf.predict_proba(X_test))
    assert test_loss < required_log_loss
Пример #8
0
 def test_performance_cat_split_strategy_iris(self):
     """Check the iris ROC AUC for the "all" and "random" split strategies.

     For each value of ``cat_split_strategy`` a seeded forest is fitted on
     a stratified split and must reach a multi-class ("ovo") ROC AUC of at
     least 0.985 on the held-out part.
     """
     data = self.iris
     features, labels = data["data"], data["target"]
     X_train, X_test, y_train, y_test = train_test_split(
         features,
         labels,
         shuffle=True,
         stratify=labels,
         random_state=42,
         test_size=0.3,
     )
     for strategy in ("all", "random"):
         clf = ForestClassifier(cat_split_strategy=strategy, random_state=42)
         clf.fit(X_train, y_train)
         test_proba = clf.predict_proba(X_test)
         auc = roc_auc_score(y_test, test_proba, multi_class="ovo")
         assert auc >= 0.985
Пример #9
0
    def test_performance_cat_split_strategy_breast_cancer(self):
        """Check the breast-cancer ROC AUC for "all" and "random" strategies.

        For each value of ``cat_split_strategy`` a seeded forest is fitted
        on a stratified split and must reach a ROC AUC of at least 0.98 on
        the held-out part, using the second column of ``predict_proba`` as
        the score.
        """
        data = self.breast_cancer
        features, labels = data["data"], data["target"]
        X_train, X_test, y_train, y_test = train_test_split(
            features,
            labels,
            shuffle=True,
            stratify=labels,
            random_state=42,
            test_size=0.3,
        )

        for strategy in ("all", "random"):
            clf = ForestClassifier(cat_split_strategy=strategy,
                                   random_state=42)
            clf.fit(X_train, y_train)
            positive_proba = clf.predict_proba(X_test)[:, 1]
            assert roc_auc_score(y_test, positive_proba) >= 0.98
Пример #10
0
def test_forest_classifier_serialization(
    dataset_name,
    n_estimators,
    aggregation,
    class_weight,
    dirichlet,
    n_jobs,
    max_features,
    random_state,
    step,
    multiclass,
    cat_split_strategy,
):
    """Check that a fitted ``ForestClassifier`` survives a pickle round trip.

    The classifier is fitted, dumped to a file with pickle, loaded back and
    compared with the original through ``assert_forests_equal`` and through
    the outputs of ``predict_proba``, ``predict`` and ``apply`` on the test
    split.

    Parameters
    ----------
    dataset_name : "adult" or "iris"; selects the data used for fitting.
    n_estimators, aggregation, class_weight, dirichlet, n_jobs, max_features,
    random_state, step, multiclass, cat_split_strategy :
        forwarded unchanged to ``ForestClassifier`` (``random_state`` is also
        used for the train/test split).

    Raises
    ------
    ValueError
        If ``dataset_name`` is not one of the supported datasets.
    """
    # Local import: only this test needs it.
    import tempfile

    if dataset_name == "adult":
        X, y = load_adult(raw=True)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=random_state)
    elif dataset_name == "iris":
        iris = datasets.load_iris()
        X = iris.data
        y = iris.target
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=1 / 5, random_state=random_state)
    else:
        # Fail early and explicitly instead of hitting an unbound variable
        # further down.
        raise ValueError("Unknown dataset_name: {!r}".format(dataset_name))

    clf1 = ForestClassifier(
        n_estimators=n_estimators,
        n_jobs=n_jobs,
        multiclass=multiclass,
        max_bins=37,
        cat_split_strategy=cat_split_strategy,
        aggregation=aggregation,
        max_features=max_features,
        class_weight=class_weight,
        random_state=random_state,
        dirichlet=dirichlet,
        step=step,
    )
    clf1.fit(X_train, y_train)

    # Write the pickle inside a private temporary directory: it is removed
    # even when dumping or loading raises, and test cases running
    # concurrently cannot overwrite each other's file.
    with tempfile.TemporaryDirectory() as tmp_dir:
        filename = os.path.join(tmp_dir, "forest_classifier.pkl")
        with open(filename, "wb") as f:
            pkl.dump(clf1, f)

        with open(filename, "rb") as f:
            clf2 = pkl.load(f)

    assert_forests_equal(clf1, clf2, is_classifier=True)

    y_pred1 = clf1.predict_proba(X_test)
    y_pred2 = clf2.predict_proba(X_test)
    np.testing.assert_equal(y_pred1, y_pred2)

    y_pred1 = clf1.predict(X_test)
    y_pred2 = clf2.predict(X_test)
    np.testing.assert_equal(y_pred1, y_pred2)

    apply1 = clf1.apply(X_test)
    apply2 = clf2.apply(X_test)
    np.testing.assert_equal(apply1, apply2)
Пример #11
0
    def test_class_weight_sample_weights(self):
        """Check the interplay between ``class_weight`` and ``sample_weight``.

        The test has four parts:

        1. Fitting without ``sample_weight`` and fitting with all weights
           equal to one give the same ``apply`` and ``predict_proba`` outputs.
        2. ``class_weight="balanced"`` gives the same outputs as passing
           ``compute_sample_weight("balanced", y)`` as ``sample_weight``.
        3. On an unbalanced subset of iris, ``class_weight="balanced"`` gives
           per-label precision, recall and f1-score at least as good as
           ``class_weight=None``.
        4. On an unbalanced subset of breast cancer,
           ``class_weight="balanced"`` gives a strictly larger weighted
           average precision than ``class_weight=None``.
        """
        iris = self.iris
        X, y = iris["data"], iris["target"]
        # Check that no sample_weight and all sample weights equal to 1. is the same
        clf1 = ForestClassifier(class_weight=None, random_state=42)
        clf1.fit(X, y)
        clf2 = ForestClassifier(class_weight=None, random_state=42)
        clf2.fit(X, y, sample_weight=np.ones(y.shape[0]))
        assert clf1.apply(X) == approx(clf2.apply(X))
        assert clf1.predict_proba(X) == approx(clf2.predict_proba(X))

        # Check that class_weight="balanced" is the same as passing the
        # "balanced" sample weights explicitly
        clf1 = ForestClassifier(class_weight="balanced", random_state=42)
        clf1.fit(X, y)
        clf2 = ForestClassifier(class_weight=None, random_state=42)
        sample_weight = compute_sample_weight("balanced", y)
        clf2.fit(X, y, sample_weight=sample_weight)
        assert clf1.apply(X) == approx(clf2.apply(X))
        assert clf1.predict_proba(X) == approx(clf2.predict_proba(X))

        # Simulate unbalanced data from the iris datasets
        # (rows 0-49, 50-55 and 100-105; presumably 50 samples of the first
        # class and 6 of each of the other two -- TODO confirm row ordering)
        X_unb = np.concatenate((X[0:50], X[50:56], X[100:106]), axis=0)
        y_unb = np.concatenate((y[0:50], y[50:56], y[100:106]), axis=0)

        X_train, X_test, y_train, y_test = train_test_split(X_unb,
                                                            y_unb,
                                                            shuffle=True,
                                                            stratify=y_unb,
                                                            random_state=42,
                                                            test_size=0.5)

        # Baseline: no class weighting
        clf = ForestClassifier(class_weight=None,
                               random_state=42,
                               aggregation=True)
        clf.fit(X_train, y_train)
        y_scores = clf.predict(X_test)
        report1 = classification_report(y_test, y_scores, output_dict=True)

        # Same forest, with balanced class weights
        clf = ForestClassifier(class_weight="balanced",
                               random_state=42,
                               aggregation=True)
        clf.fit(X_train, y_train)
        y_scores = clf.predict(X_test)
        report2 = classification_report(y_test, y_scores, output_dict=True)

        # In the considered case, class_weight should improve all metrics
        for label in ["0", "1", "2"]:
            label_report1 = report1[label]
            label_report2 = report2[label]
            assert label_report2["precision"] >= label_report1["precision"]
            assert label_report2["recall"] >= label_report1["recall"]
            assert label_report2["f1-score"] >= label_report1["f1-score"]

        # Same kind of check on breast cancer: keep every sample of class 0
        # and only the first 10 samples of class 1
        breast_cancer = self.breast_cancer
        X, y = breast_cancer["data"], breast_cancer["target"]
        idx_0 = y == 0
        idx_1 = y == 1

        X_unb = np.concatenate((X[idx_0], X[idx_1][:10]), axis=0)
        y_unb = np.concatenate((y[idx_0], y[idx_1][:10]), axis=0)

        X_train, X_test, y_train, y_test = train_test_split(X_unb,
                                                            y_unb,
                                                            shuffle=True,
                                                            stratify=y_unb,
                                                            random_state=42,
                                                            test_size=0.5)

        clf = ForestClassifier(class_weight=None,
                               random_state=42,
                               aggregation=True)
        clf.fit(X_train, y_train)
        # NOTE(review): y_scores holds the output of predict(), not of
        # predict_proba(), so the average precision below is computed from
        # predicted labels -- confirm this is intended.
        y_scores = clf.predict(X_test)

        y_test_binary = LabelBinarizer().fit_transform(y_test)

        avg_prec1 = average_precision_score(y_test_binary,
                                            y_scores,
                                            average="weighted")

        clf = ForestClassifier(class_weight="balanced",
                               random_state=42,
                               aggregation=True)
        clf.fit(X_train, y_train)
        y_scores = clf.predict(X_test)
        avg_prec2 = average_precision_score(y_test_binary,
                                            y_scores,
                                            average="weighted")

        # Balanced class weights must strictly improve the average precision
        assert avg_prec2 > avg_prec1