def test_importance(self, iris_X, iris_y, importance, scale_permutation_importance, local_importance):
        rfc = RangerForestClassifier(
            importance=importance,
            scale_permutation_importance=scale_permutation_importance,
            local_importance=local_importance,
        )

        if importance not in ["none", "impurity", "impurity_corrected", "permutation"]:
            with pytest.raises(ValueError):
                rfc.fit(iris_X, iris_y)
            return

        rfc.fit(iris_X, iris_y)
        if importance == "none":
            assert rfc.importance_mode_ == 0
        elif importance == "impurity":
            assert rfc.importance_mode_ == 1
        elif importance == "impurity_corrected":
            assert rfc.importance_mode_ == 5
        elif importance == "permutation":
            if local_importance:
                assert rfc.importance_mode_ == 6
            elif scale_permutation_importance:
                assert rfc.importance_mode_ == 2
            else:
                assert rfc.importance_mode_ == 3
    def test_split_rule(self, iris_X, iris_y, split_rule):
        rfc = RangerForestClassifier(split_rule=split_rule)

        if split_rule not in ["gini", "extratrees", "hellinger"]:
            with pytest.raises(ValueError):
                rfc.fit(iris_X, iris_y)
            return

        # hellinger can only be used in binary classification
        if split_rule == "hellinger":
            with pytest.raises(ValueError):
                rfc.fit(iris_X, iris_y)

        iris_2 = [0 if v == 2 else v for v in iris_y]
        rfc.fit(iris_X, iris_2)

        if split_rule == "gini":
            assert rfc.split_rule_ == 1
        elif split_rule == "extratrees":
            assert rfc.split_rule_ == 5
        if split_rule == "hellinger":
            assert rfc.split_rule_ == 7

        if split_rule == "extratrees":
            rfc = RangerForestClassifier(
                split_rule=split_rule, respect_categorical_features="partition", save_memory=True
            )
            with pytest.raises(ValueError):
                rfc.fit(iris_X, iris_y)
        else:
            rfc = RangerForestClassifier(split_rule=split_rule, num_random_splits=2)
            with pytest.raises(ValueError):
                rfc.fit(iris_X, iris_y)
    def test_categorical_features(self, iris_X, iris_y,
                                  respect_categorical_features):
        # add a categorical feature
        categorical_col = np.atleast_2d(
            np.array([random.choice([0, 1]) for _ in range(iris_X.shape[0])]))
        iris_X_c = np.hstack((iris_X, categorical_col.transpose()))
        categorical_features = [iris_X.shape[1]]

        rfc = RangerForestClassifier(
            respect_categorical_features=respect_categorical_features,
            categorical_features=categorical_features,
        )

        if respect_categorical_features not in [
                "partition", "ignore", "order"
        ]:
            with pytest.raises(ValueError):
                rfc.fit(iris_X_c, iris_y)
            return

        rfc.fit(iris_X_c, iris_y)

        if respect_categorical_features in ("ignore", "order"):
            assert rfc.categorical_features_ == []
        else:
            assert rfc.categorical_features_ == [
                str(c).encode() for c in categorical_features
            ]
    def test_verbose(self, iris_X, iris_y, verbose, capfd):
        rfc = RangerForestClassifier(verbose=verbose)
        rfc.fit(iris_X, iris_y)
        captured = capfd.readouterr()
        if verbose:
            assert len(captured.out) > 0
        else:
            assert len(captured.out) == 0
    def test_sample_fraction_replace(self, iris_X, iris_y, replace):
        rfc = RangerForestClassifier(replace=replace)
        rfc.fit(iris_X, iris_y)

        if replace:
            assert rfc.sample_fraction_ == [1.0]
        else:
            assert rfc.sample_fraction_ == [0.632]
    def test_serialize(self, iris_X, iris_y):
        tf = tempfile.TemporaryFile()
        rfc = RangerForestClassifier()
        rfc.fit(iris_X, iris_y)
        pickle.dump(rfc, tf)
        tf.seek(0)
        new_rfc = pickle.load(tf)
        pred = new_rfc.predict(iris_X)
        assert len(pred) == iris_X.shape[0]
    def test_predict_log_proba(self, iris_X, iris_y):
        rfc = RangerForestClassifier()
        rfc.fit(iris_X, iris_y)
        pred = rfc.predict_log_proba(iris_X)
        assert len(pred) == iris_X.shape[0]

        # test with single record
        iris_X_record = iris_X[0:1, :]
        pred = rfc.predict_log_proba(iris_X_record)
        assert len(pred) == 1
Example #8
def test_shap_classifier(iris_X, iris_y):
    from shap import TreeExplainer

    forest = RangerForestClassifier(enable_tree_details=True)
    forest.fit(iris_X, iris_y)

    # shap_patch is a compatibility helper defined elsewhere in the test suite (not shown here)
    with shap_patch():
        explainer = TreeExplainer(model=forest)
    shap_values = explainer.shap_values(iris_X)
    print(shap_values)
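
# The examples in this listing rely on pytest fixtures (iris_X, iris_y, plus
# parametrized options such as importance, split_rule, verbose, mtry, ...)
# that are defined outside the excerpt. A minimal sketch of what the data
# fixtures might look like in a conftest.py; the real suite's definitions
# may differ.
import pytest
from sklearn.datasets import load_iris


@pytest.fixture
def iris_X():
    return load_iris().data


@pytest.fixture
def iris_y():
    return load_iris().target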
Example #9
    def test_predict(self, iris_X, iris_y):
        forest = RangerForestClassifier()
        forest.fit(iris_X, iris_y)
        pred = forest.predict(iris_X)
        assert len(pred) == iris_X.shape[0]

        # test with single record
        iris_X_record = iris_X[0:1, :]
        pred = forest.predict(iris_X_record)
        assert len(pred) == 1
    def test_fit(self, iris_X, iris_y):
        rfc = RangerForestClassifier()
        with pytest.raises(NotFittedError):
            check_is_fitted(rfc)
        rfc.fit(iris_X, iris_y)
        check_is_fitted(rfc)
        assert hasattr(rfc, "classes_")
        assert hasattr(rfc, "n_classes_")
        assert hasattr(rfc, "ranger_forest_")
        assert hasattr(rfc, "ranger_class_order_")
        assert hasattr(rfc, "n_features_in_")
    def test_regularization(self, iris_X, iris_y):
        rfc = RangerForestClassifier()
        rfc.fit(iris_X, iris_y)
        assert rfc.regularization_factor_ == []
        assert not rfc.use_regularization_factor_

        # values must be between 0 and 1, and the vector length must match the number of features
        for r in [[1.1], [-0.1], [1, 1]]:
            rfc = RangerForestClassifier(regularization_factor=r)
            with pytest.raises(ValueError):
                rfc.fit(iris_X, iris_y)

        # vector of ones isn't applied
        rfc = RangerForestClassifier(regularization_factor=[1] * iris_X.shape[1])
        rfc.fit(iris_X, iris_y)
        assert rfc.regularization_factor_ == []
        assert not rfc.use_regularization_factor_

        # regularization vector is used
        reg = [0.5]
        rfc = RangerForestClassifier(regularization_factor=reg, n_jobs=2)
        # warns if n_jobs is not one since parallelization can't be used
        with pytest.warns(Warning):
            rfc.fit(iris_X, iris_y)
        assert rfc.n_jobs_ == 1
        assert rfc.regularization_factor_ == reg
        assert rfc.use_regularization_factor_
    def test_importance_pvalues(self, iris_X_mod, iris_y, importance, mod):
        rfc = RangerForestClassifier(importance=importance)
        np.random.seed(42)

        if importance not in [
                "none", "impurity", "impurity_corrected", "permutation"
        ]:
            with pytest.raises(ValueError):
                rfc.fit(iris_X_mod, iris_y)
            return

        if importance != "impurity_corrected":
            rfc.fit(iris_X_mod, iris_y)
            with pytest.raises(ValueError):
                rfc.get_importance_pvalues()
            return

        # Test the error raised when there are no negative importance values
        if mod == "none":
            rfc.fit(iris_X_mod, iris_y)
            with pytest.raises(ValueError):
                rfc.get_importance_pvalues()
            return

        rfc.fit(iris_X_mod, iris_y)
        assert len(rfc.get_importance_pvalues()) == iris_X_mod.shape[1]
Example #13
    def test_estimators_(self, iris_X, iris_y):
        forest = RangerForestClassifier(n_estimators=10)
        with pytest.raises(AttributeError):
            _ = forest.estimators_
        forest.fit(iris_X, iris_y)
        with pytest.raises(ValueError):
            _ = forest.estimators_
        forest = RangerForestClassifier(n_estimators=10,
                                        enable_tree_details=True)
        forest.fit(iris_X, iris_y)
        estimators = forest.estimators_
        assert len(estimators) == 10
        assert isinstance(estimators[0], RangerTreeClassifier)
        check_is_fitted(estimators[0])
Example #14
    def test_get_estimator(self, iris_X, iris_y):
        forest = RangerForestClassifier(n_estimators=10)
        with pytest.raises(NotFittedError):
            _ = forest.get_estimator(idx=0)
        forest.fit(iris_X, iris_y)
        with pytest.raises(ValueError):
            _ = forest.get_estimator(0)
        forest = RangerForestClassifier(n_estimators=10,
                                        enable_tree_details=True)
        forest.fit(iris_X, iris_y)
        estimator = forest.get_estimator(0)
        estimator.predict(iris_X)
        assert isinstance(estimator, RangerTreeClassifier)
        with pytest.raises(IndexError):
            _ = forest.get_estimator(idx=20)
    def test_sample_fraction(self, iris_X, iris_y):
        rfc = RangerForestClassifier(sample_fraction=[0.69])
        rfc.fit(iris_X, iris_y)
        assert rfc.sample_fraction_ == [0.69]
        rfc = RangerForestClassifier(sample_fraction=0.69)
        rfc.fit(iris_X, iris_y)
        assert rfc.sample_fraction_ == [0.69]

        # test with single record
        iris_X_record = iris_X[0:1, :]
        pred = rfc.predict(iris_X_record)
        assert len(pred) == 1
        pred = rfc.predict_proba(iris_X_record)
        assert len(pred) == 1
        pred = rfc.predict_log_proba(iris_X_record)
        assert len(pred) == 1
    def test_feature_importances_(self, iris_X, iris_y, importance, local_importance):
        rfc = RangerForestClassifier(importance=importance, local_importance=local_importance)
        with pytest.raises(AttributeError):
            _ = rfc.feature_importances_

        if importance == "INVALID":
            with pytest.raises(ValueError):
                rfc.fit(iris_X, iris_y)
            return

        rfc.fit(iris_X, iris_y)
        if importance == "none":
            with pytest.raises(ValueError):
                _ = rfc.feature_importances_
        else:
            assert len(rfc.feature_importances_) == iris_X.shape[1]
    def test_accuracy(self, iris_X, iris_y):
        X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=0.33, random_state=42)

        # train and test a random forest classifier
        rf = RandomForestClassifier()
        rf.fit(X_train, y_train)
        y_pred_rf = rf.predict(X_test)
        rf_acc = accuracy_score(y_test, y_pred_rf)

        # train and test a ranger classifier
        ra = RangerForestClassifier()
        ra.fit(X_train, y_train)
        y_pred_ra = ra.predict(X_test)
        ranger_acc = accuracy_score(y_test, y_pred_ra)

        # both classifiers should reach high accuracy on iris
        assert rf_acc > 0.9
        assert ranger_acc > 0.9
Example #18
    def test_class_weights(self, iris_X, iris_y):
        X_train, X_test, y_train, y_test = train_test_split(iris_X,
                                                            iris_y,
                                                            test_size=0.5,
                                                            random_state=42)
        forest = RangerForestClassifier()
        weights = {
            0: 0.7,
            1: 0.2,
            2: 0.1,
        }
        forest.fit(X_train, y_train, class_weights=weights)
        forest.predict(X_test)

        forest = RangerForestClassifier()
        m = {0: "a", 1: "b", 2: "c"}
        y_train_str = [m.get(v) for v in y_train]
        weights = {
            "a": 0.7,
            "b": 0.2,
            "c": 0.1,
        }
        forest.fit(X_train, y_train_str, class_weights=weights)
        forest.predict(X_test)

        weights = {
            0: 0.7,
        }
        with pytest.raises(ValueError):
            forest.fit(X_train, y_train, class_weights=weights)
Example #19
    def test_categorical_features(self, iris_X, iris_y,
                                  respect_categorical_features):
        # add a categorical feature
        categorical_col = np.atleast_2d(
            np.array([random.choice([0, 1]) for _ in range(iris_X.shape[0])]))
        iris_X_c = np.hstack((iris_X, categorical_col.transpose()))
        categorical_features = [iris_X.shape[1]]

        forest = RangerForestClassifier(
            respect_categorical_features=respect_categorical_features,
        )

        if respect_categorical_features not in [
                "partition", "ignore", "order"
        ]:
            with pytest.raises(ValueError):
                forest.fit(iris_X_c,
                           iris_y,
                           categorical_features=categorical_features)
            return

        forest.fit(iris_X_c, iris_y, categorical_features=categorical_features)
        forest.predict(iris_X_c)
    def test_inbag(self, iris_X, iris_y):
        inbag = [[1, 2, 3], [2, 3, 4]]
        rfc = RangerForestClassifier(n_estimators=2, inbag=inbag)
        rfc.fit(iris_X, iris_y)

        # inbag list different length from n_estimators
        rfc = RangerForestClassifier(n_estimators=1, inbag=inbag)
        with pytest.raises(ValueError):
            rfc.fit(iris_X, iris_y)

        # can't use inbag with sample weight
        rfc = RangerForestClassifier(inbag=inbag)
        with pytest.raises(ValueError):
            rfc.fit(iris_X, iris_y, sample_weight=[1] * len(iris_y))

        # can't use class sampling and inbag
        rfc = RangerForestClassifier(inbag=inbag, sample_fraction=[1, 1])
        with pytest.raises(ValueError):
            rfc.fit(iris_X, iris_y)
Example #21
    def test_split_select_weights(self, iris_X, iris_y):
        n_trees = 10
        weights = [0.1] * iris_X.shape[1]
        forest = RangerForestClassifier(n_estimators=n_trees)
        forest.fit(iris_X, iris_y, split_select_weights=weights)

        weights = [0.1] * (iris_X.shape[1] - 1)
        forest = RangerForestClassifier(n_estimators=n_trees)

        with pytest.raises(RuntimeError):
            forest.fit(iris_X, iris_y, split_select_weights=weights)

        weights = [[0.1] * (iris_X.shape[1])] * n_trees
        forest = RangerForestClassifier(n_estimators=n_trees)
        forest.fit(iris_X, iris_y, split_select_weights=weights)

        weights = [[0.1] * (iris_X.shape[1])] * (n_trees + 1)
        forest = RangerForestClassifier(n_estimators=n_trees)
        with pytest.raises(RuntimeError):
            forest.fit(iris_X, iris_y, split_select_weights=weights)
    def test_mtry(self, iris_X, iris_y, mtry):
        rfc = RangerForestClassifier(mtry=mtry)

        if callable(mtry) and mtry(5) > 5:
            with pytest.raises(ValueError):
                rfc.fit(iris_X, iris_y)
            return
        elif not callable(mtry) and (mtry < 0 or mtry > iris_X.shape[0]):
            with pytest.raises(ValueError):
                rfc.fit(iris_X, iris_y)
            return

        rfc.fit(iris_X, iris_y)
        if callable(mtry):
            assert rfc.mtry_ == mtry(iris_X.shape[1])
        else:
            assert rfc.mtry_ == mtry
    def test_clone(self, iris_X, iris_y):
        rfc = RangerForestClassifier()
        rfc.fit(iris_X, iris_y)
        clone(rfc)
Example #24
    def test_clone(self, iris_X, iris_y):
        forest = RangerForestClassifier()
        forest.fit(iris_X, iris_y)
        clone(forest)
Example #25
def RangerForestIMPL(train_embeddings, train_labels, test_embeddings, test_labels, single_label=True):
    # single_label_metrics is an external helper from the surrounding project (not shown here)
    rfc = RangerForestClassifier()
    rfc.fit(train_embeddings, train_labels)
    predsranger = rfc.predict(test_embeddings)
    if single_label:
        return single_label_metrics(test_labels, predsranger)
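
# A usage sketch for RangerForestIMPL above. single_label_metrics is an
# external helper from the surrounding project; the accuracy-based stand-in
# below is assumed here only so the example runs end to end.
import numpy as np
from sklearn.metrics import accuracy_score


def single_label_metrics(y_true, y_pred):
    # stand-in for the project's helper, assumed for illustration
    return {"accuracy": accuracy_score(y_true, y_pred)}


rng = np.random.default_rng(0)
train_embeddings = rng.normal(size=(100, 16))
train_labels = rng.integers(0, 3, size=100)
test_embeddings = rng.normal(size=(20, 16))
test_labels = rng.integers(0, 3, size=20)
print(RangerForestIMPL(train_embeddings, train_labels, test_embeddings, test_labels))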
    def test_always_split_features(self, iris_X, iris_y):
        rfc = RangerForestClassifier(always_split_features=[0])
        rfc.fit(iris_X, iris_y)
        # feature 0 appears among the split variables of every tree
        for tree in rfc.ranger_forest_["forest"]["split_var_ids"]:
            assert 0 in tree
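
# A minimal end-to-end sketch consolidating the API exercised above (fit,
# predict_proba, feature_importances_). The import path follows skranger's
# documented usage; adjust it if your installed version differs.
from sklearn.datasets import load_iris
from skranger.ensemble import RangerForestClassifier

X, y = load_iris(return_X_y=True)
clf = RangerForestClassifier(n_estimators=100, importance="impurity")
clf.fit(X, y)
print(clf.predict_proba(X[:5]))
print(clf.feature_importances_)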