def test_dirichlet_switch(self):
    """Check that ``dirichlet`` can be changed on an already fitted forest.

    Two forests are fitted on the same breast-cancer split: one built
    without a ``dirichlet`` argument and one built with ``dirichlet=2.0``.
    Their predicted probabilities must differ by at least ``0.01``
    somewhere, and must agree (absolute tolerance ``1e-5``) once the
    attribute is set to a common value, without refitting.
    """
    data = self.breast_cancer
    features, labels = data["data"], data["target"]
    X_train, X_test, y_train, _ = train_test_split(
        features,
        labels,
        shuffle=True,
        stratify=labels,
        random_state=42,
        test_size=0.3,
    )

    default_forest = ForestClassifier(class_weight="balanced", random_state=42)
    switched_forest = ForestClassifier(
        class_weight="balanced", random_state=42, dirichlet=2.0
    )
    default_forest.fit(X_train, y_train)
    switched_forest.fit(X_train, y_train)

    proba_default = default_forest.predict_proba(X_test)
    proba_switched = switched_forest.predict_proba(X_test)
    # Distinct dirichlet values must give visibly distinct probabilities
    largest_gap = np.max(np.abs(proba_default - proba_switched))
    assert largest_gap >= 0.01

    # NOTE(review): 0.5 is presumably the constructor default used by
    # default_forest -- confirm against ForestClassifier.
    switched_forest.dirichlet = 0.5
    proba_switched = switched_forest.predict_proba(X_test)
    assert proba_default == pytest.approx(proba_switched, abs=1e-5)

    # Moving both forests to a new common value must keep them in agreement
    for forest in (default_forest, switched_forest):
        forest.dirichlet = 1.1
    proba_default = default_forest.predict_proba(X_test)
    proba_switched = switched_forest.predict_proba(X_test)
    assert proba_default == pytest.approx(proba_switched, abs=1e-5)
def test_performance_breast_cancer(self):
    """Check the ROC AUC reached on a stratified breast-cancer split.

    Four configurations are tried (with and without
    ``class_weight="balanced"``, with and without ``criterion="entropy"``);
    each must reach a ROC AUC of at least ``0.98`` on the test part, scored
    with the second column of ``predict_proba``.
    """
    data = self.breast_cancer
    features, labels = data["data"], data["target"]
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        shuffle=True,
        stratify=labels,
        random_state=42,
        test_size=0.3,
    )

    # Same order as the historical copy-pasted stanzas
    configurations = (
        {"class_weight": "balanced"},
        {},
        {"class_weight": "balanced", "criterion": "entropy"},
        {"criterion": "entropy"},
    )
    for extra_params in configurations:
        forest = ForestClassifier(random_state=42, **extra_params)
        forest.fit(X_train, y_train)
        proba = forest.predict_proba(X_test)
        assert roc_auc_score(y_test, proba[:, 1]) >= 0.98
def test_performance_iris(self):
    """Check the one-vs-one ROC AUC reached on a stratified iris split.

    Four configurations are tried (with and without
    ``class_weight="balanced"``, with and without ``criterion="entropy"``);
    each must reach a ROC AUC (``multi_class="ovo"``) of at least ``0.985``
    on the test part.
    """
    data = self.iris
    features, labels = data["data"], data["target"]
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        shuffle=True,
        stratify=labels,
        random_state=42,
        test_size=0.3,
    )

    # Same order as the historical copy-pasted stanzas
    configurations = (
        {"class_weight": "balanced"},
        {},
        {"class_weight": "balanced", "criterion": "entropy"},
        {"criterion": "entropy"},
    )
    for extra_params in configurations:
        forest = ForestClassifier(random_state=42, **extra_params)
        forest.fit(X_train, y_train)
        proba = forest.predict_proba(X_test)
        assert roc_auc_score(y_test, proba, multi_class="ovo") >= 0.985
def test_cat_split_strategy_on_car(self):
    """Compare ``cat_split_strategy="binary"`` and ``"all"`` on the car data.

    The dataset is extracted without one-hot encoding and with a test
    fraction of 1/5. A forest with ``n_estimators=1`` and
    ``aggregation=False`` is fitted once per strategy, all other settings
    being equal; the ``"all"`` strategy must give a strictly smaller
    log-loss than ``"binary"`` on both the train and the test parts.
    """
    dataset = load_car()
    dataset.one_hot_encode = False
    dataset.test_size = 1.0 / 5
    seed = 42
    X_train, X_test, y_train, y_test = dataset.extract(random_state=seed)

    # Everything except the split strategy is shared by both forests
    shared_params = dict(
        n_estimators=1,
        n_jobs=1,
        multiclass="multinomial",
        aggregation=False,
        max_features=None,
        class_weight="balanced",
        categorical_features=dataset.categorical_features_,
        random_state=seed,
        dirichlet=0.0,
    )

    train_loss = {}
    test_loss = {}
    for strategy in ("binary", "all"):
        forest = ForestClassifier(cat_split_strategy=strategy, **shared_params)
        forest.fit(X_train, y_train)
        proba_train = forest.predict_proba(X_train)
        proba_test = forest.predict_proba(X_test)
        train_loss[strategy] = log_loss(y_train, proba_train)
        test_loss[strategy] = log_loss(y_test, proba_test)

    assert train_loss["all"] < train_loss["binary"]
    assert test_loss["all"] < test_loss["binary"]
def do_test_bootstrap(n_estimators, n_jobs, random_state):
    """Check how ``random_state`` drives bootstrap samples and fitted trees.

    ``X`` and ``y`` are not parameters: they are read from the surrounding
    scope. Every forest is built with the given ``n_estimators``,
    ``n_jobs`` and ``random_state``.
    """

    def fit_forest():
        # Build and fit a fresh forest with the settings under test
        forest = ForestClassifier(
            n_estimators=n_estimators,
            n_jobs=n_jobs,
            random_state=random_state,
        )
        forest.fit(X, y)
        return forest

    seeded = random_state is not None

    # 1. Within one forest, every pair of distinct trees must have
    #    different bootstrap samples
    forest = fit_forest()
    for first in range(n_estimators):
        for second in range(first + 1, n_estimators):
            assert forest.trees[first]._train_indices != approx(
                forest.trees[second]._train_indices
            )
            assert forest.trees[first]._valid_indices != approx(
                forest.trees[second]._valid_indices
            )

    # 2. Across two fits, bootstrap samples must be identical when a seed
    #    is given and different when random_state is None
    forest_a = fit_forest()
    forest_b = fit_forest()
    for index in range(n_estimators):
        if seeded:
            assert forest_a.trees[index]._train_indices == approx(
                forest_b.trees[index]._train_indices
            )
            assert forest_a.trees[index]._valid_indices == approx(
                forest_b.trees[index]._valid_indices
            )
        else:
            assert forest_a.trees[index]._train_indices != approx(
                forest_b.trees[index]._train_indices
            )
            assert forest_a.trees[index]._valid_indices != approx(
                forest_b.trees[index]._valid_indices
            )

    # 3. apply() must return the same leaves for two seeded fits (which
    #    checks that the trees, hence the random column subsampling, are
    #    seeded too) and so must predict_proba(); without a seed both
    #    must differ
    forest_a = fit_forest()
    forest_b = fit_forest()
    if seeded:
        assert forest_a.apply(X) == approx(forest_b.apply(X))
        assert forest_a.predict_proba(X) == approx(forest_b.predict_proba(X))
    else:
        assert forest_a.apply(X) != approx(forest_b.apply(X))
        assert forest_a.predict_proba(X) != approx(forest_b.predict_proba(X))
def test_ovr_with_two_classes(self):
    """Check that "multinomial" and "ovr" agree on the adult dataset.

    Two forests differing only by their ``multiclass`` argument are fitted
    on the same split; their test probabilities must be equal up to the
    default tolerance of ``approx``.
    """
    dataset = self.adult
    dataset.one_hot_encode = False
    seed = 42
    X_train, X_test, y_train, y_test = dataset.extract(random_state=seed)

    # Everything except the multiclass mode is shared by both forests
    shared_params = dict(
        n_estimators=2,
        n_jobs=-1,
        aggregation=False,
        max_features=None,
        class_weight="balanced",
        categorical_features=dataset.categorical_features_,
        random_state=seed,
        dirichlet=0.0,
    )

    test_proba = {}
    for mode in ("multinomial", "ovr"):
        forest = ForestClassifier(multiclass=mode, **shared_params)
        forest.fit(X_train, y_train)
        test_proba[mode] = forest.predict_proba(X_test)

    assert test_proba["multinomial"] == approx(test_proba["ovr"])
def test_several_max_bins_for_classification(loader, is_categorical_,
                                             required_log_loss, max_bins,
                                             aggregation):
    """Check categorical detection and log-loss for a given ``max_bins``.

    ``loader(raw=True)`` must return the features and the labels. A
    10-estimator forest is fitted on a train/test split of them; its
    ``is_categorical_`` attribute must equal the expected
    ``is_categorical_`` and its test log-loss must be strictly below
    ``required_log_loss``.

    NOTE(review): the arguments are presumably supplied by a
    ``pytest.mark.parametrize`` decorator that is not visible here.
    """
    seed = 42
    features, labels = loader(raw=True)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, random_state=seed
    )

    forest = ForestClassifier(
        n_estimators=10,
        n_jobs=-1,
        aggregation=aggregation,
        max_bins=max_bins,
        dirichlet=1e-2,
        class_weight="balanced",
        random_state=seed,
    )
    forest.fit(X_train, y_train)

    np.testing.assert_equal(forest.is_categorical_, is_categorical_)
    test_proba = forest.predict_proba(X_test)
    assert log_loss(y_test, test_proba) < required_log_loss
def test_performance_cat_split_strategy_iris(self):
    """Check the iris ROC AUC for two ``cat_split_strategy`` values.

    With ``"all"`` and with ``"random"``, the one-vs-one ROC AUC on the
    test part of a stratified split must be at least ``0.985``.
    """
    data = self.iris
    features, labels = data["data"], data["target"]
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        shuffle=True,
        stratify=labels,
        random_state=42,
        test_size=0.3,
    )

    for strategy in ("all", "random"):
        forest = ForestClassifier(cat_split_strategy=strategy, random_state=42)
        forest.fit(X_train, y_train)
        proba = forest.predict_proba(X_test)
        assert roc_auc_score(y_test, proba, multi_class="ovo") >= 0.985
def test_performance_cat_split_strategy_breast_cancer(self):
    """Check the breast-cancer ROC AUC for two ``cat_split_strategy`` values.

    With ``"all"`` and with ``"random"``, the ROC AUC on the test part of a
    stratified split, scored with the second column of ``predict_proba``,
    must be at least ``0.98``.
    """
    data = self.breast_cancer
    features, labels = data["data"], data["target"]
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        shuffle=True,
        stratify=labels,
        random_state=42,
        test_size=0.3,
    )

    for strategy in ("all", "random"):
        forest = ForestClassifier(cat_split_strategy=strategy, random_state=42)
        forest.fit(X_train, y_train)
        proba = forest.predict_proba(X_test)
        assert roc_auc_score(y_test, proba[:, 1]) >= 0.98
def test_forest_classifier_serialization(
    dataset_name,
    n_estimators,
    aggregation,
    class_weight,
    dirichlet,
    n_jobs,
    max_features,
    random_state,
    step,
    multiclass,
    cat_split_strategy,
):
    """Check that a fitted ``ForestClassifier`` survives a pickle round trip.

    A forest is fitted on the requested dataset, pickled to a file, loaded
    back, and compared with the original: ``assert_forests_equal`` must
    pass, and ``predict_proba``, ``predict`` and ``apply`` must return
    exactly the same values on the test part.

    Parameters
    ----------
    dataset_name : str
        ``"adult"`` or ``"iris"``.
    n_estimators, aggregation, class_weight, dirichlet, n_jobs,
    max_features, random_state, step, multiclass, cat_split_strategy
        Forwarded unchanged to ``ForestClassifier``; ``random_state`` is
        also used for the train/test split.

    Raises
    ------
    ValueError
        If ``dataset_name`` is neither ``"adult"`` nor ``"iris"``.
    """
    # Local import: only this test needs a scratch directory
    import tempfile

    if dataset_name == "adult":
        X, y = load_adult(raw=True)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=random_state
        )
    elif dataset_name == "iris":
        iris = datasets.load_iris()
        X = iris.data
        y = iris.target
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=1 / 5, random_state=random_state
        )
    else:
        # Fail with a clear message instead of an unbound X_train later on
        raise ValueError(
            "Unknown dataset_name %r: expected 'adult' or 'iris'" % (dataset_name,)
        )

    clf1 = ForestClassifier(
        n_estimators=n_estimators,
        n_jobs=n_jobs,
        multiclass=multiclass,
        max_bins=37,
        cat_split_strategy=cat_split_strategy,
        aggregation=aggregation,
        max_features=max_features,
        class_weight=class_weight,
        random_state=random_state,
        dirichlet=dirichlet,
        step=step,
    )
    clf1.fit(X_train, y_train)

    # Round-trip through a real file, but inside a private temporary
    # directory: nothing is left behind when dumping or loading fails, and
    # concurrent runs cannot collide on a shared filename.
    with tempfile.TemporaryDirectory() as tmp_dir:
        filename = os.path.join(tmp_dir, "forest_classifier.pkl")
        with open(filename, "wb") as f:
            pkl.dump(clf1, f)
        with open(filename, "rb") as f:
            # Unpickling is fine here: the file was written just above
            clf2 = pkl.load(f)

    assert_forests_equal(clf1, clf2, is_classifier=True)

    y_pred1 = clf1.predict_proba(X_test)
    y_pred2 = clf2.predict_proba(X_test)
    np.testing.assert_equal(y_pred1, y_pred2)

    y_pred1 = clf1.predict(X_test)
    y_pred2 = clf2.predict(X_test)
    np.testing.assert_equal(y_pred1, y_pred2)

    apply1 = clf1.apply(X_test)
    apply2 = clf2.apply(X_test)
    np.testing.assert_equal(apply1, apply2)
def test_class_weight_sample_weights(self):
    """Check the interplay of ``class_weight`` and ``sample_weight``.

    Four things are verified:

    * on iris, fitting without ``sample_weight`` matches fitting with
      weights all equal to one;
    * on iris, ``class_weight="balanced"`` matches ``class_weight=None``
      combined with the ``"balanced"`` weights of ``compute_sample_weight``;
    * on an unbalanced subset of iris, ``class_weight="balanced"`` does not
      degrade precision, recall or f1-score for any class;
    * on an unbalanced subset of breast cancer, ``class_weight="balanced"``
      strictly improves the weighted average precision.
    """
    iris = self.iris
    iris_X, iris_y = iris["data"], iris["target"]

    # No sample_weight and unit sample weights must give the same forest
    unweighted = ForestClassifier(class_weight=None, random_state=42)
    unweighted.fit(iris_X, iris_y)
    unit_weighted = ForestClassifier(class_weight=None, random_state=42)
    unit_weighted.fit(iris_X, iris_y, sample_weight=np.ones(iris_y.shape[0]))
    assert unweighted.apply(iris_X) == approx(unit_weighted.apply(iris_X))
    assert unweighted.predict_proba(iris_X) == approx(
        unit_weighted.predict_proba(iris_X)
    )

    # class_weight="balanced" must match explicit balanced sample weights
    auto_balanced = ForestClassifier(class_weight="balanced", random_state=42)
    auto_balanced.fit(iris_X, iris_y)
    hand_balanced = ForestClassifier(class_weight=None, random_state=42)
    balanced_weights = compute_sample_weight("balanced", iris_y)
    hand_balanced.fit(iris_X, iris_y, sample_weight=balanced_weights)
    assert auto_balanced.apply(iris_X) == approx(hand_balanced.apply(iris_X))
    assert auto_balanced.predict_proba(iris_X) == approx(
        hand_balanced.predict_proba(iris_X)
    )

    # Unbalanced iris: rows 0-49 in full, then only 6 rows from each of
    # the ranges starting at 50 and at 100
    kept = (slice(0, 50), slice(50, 56), slice(100, 106))
    X_unb = np.concatenate([iris_X[rows] for rows in kept], axis=0)
    y_unb = np.concatenate([iris_y[rows] for rows in kept], axis=0)
    X_train, X_test, y_train, y_test = train_test_split(
        X_unb,
        y_unb,
        shuffle=True,
        stratify=y_unb,
        random_state=42,
        test_size=0.5,
    )

    reports = []
    for class_weight in (None, "balanced"):
        forest = ForestClassifier(
            class_weight=class_weight, random_state=42, aggregation=True
        )
        forest.fit(X_train, y_train)
        predictions = forest.predict(X_test)
        reports.append(
            classification_report(y_test, predictions, output_dict=True)
        )
    plain_report, balanced_report = reports

    # In the considered case, class_weight should improve all metrics
    for label in ("0", "1", "2"):
        for metric in ("precision", "recall", "f1-score"):
            assert balanced_report[label][metric] >= plain_report[label][metric]

    # Unbalanced breast cancer: every class-0 row, only 10 class-1 rows
    cancer = self.breast_cancer
    cancer_X, cancer_y = cancer["data"], cancer["target"]
    is_class_0 = cancer_y == 0
    is_class_1 = cancer_y == 1
    X_unb = np.concatenate((cancer_X[is_class_0], cancer_X[is_class_1][:10]), axis=0)
    y_unb = np.concatenate((cancer_y[is_class_0], cancer_y[is_class_1][:10]), axis=0)
    X_train, X_test, y_train, y_test = train_test_split(
        X_unb,
        y_unb,
        shuffle=True,
        stratify=y_unb,
        random_state=42,
        test_size=0.5,
    )
    y_test_binary = LabelBinarizer().fit_transform(y_test)

    average_precisions = []
    for class_weight in (None, "balanced"):
        forest = ForestClassifier(
            class_weight=class_weight, random_state=42, aggregation=True
        )
        forest.fit(X_train, y_train)
        predictions = forest.predict(X_test)
        average_precisions.append(
            average_precision_score(y_test_binary, predictions, average="weighted")
        )
    plain_precision, balanced_precision = average_precisions
    assert balanced_precision > plain_precision