def do_test_bootstrap(n_estimators, n_jobs, random_state):
    # 1. Test that all bootstrap samples are different
    clf = ForestClassifier(
        n_estimators=n_estimators, n_jobs=n_jobs, random_state=random_state
    )
    clf.fit(X, y)
    for n_estimator1, n_estimator2 in product(
        range(n_estimators), range(n_estimators)
    ):
        if n_estimator1 < n_estimator2:
            assert clf.trees[n_estimator1]._train_indices != approx(
                clf.trees[n_estimator2]._train_indices
            )
            assert clf.trees[n_estimator1]._valid_indices != approx(
                clf.trees[n_estimator2]._valid_indices
            )

    # 2. Test that random_state makes bootstrap samples identical and that,
    #    when no random_state is used, bootstrap samples are different
    clf1 = ForestClassifier(
        n_estimators=n_estimators, n_jobs=n_jobs, random_state=random_state
    )
    clf1.fit(X, y)
    clf2 = ForestClassifier(
        n_estimators=n_estimators, n_jobs=n_jobs, random_state=random_state
    )
    clf2.fit(X, y)
    for n_estimator in range(n_estimators):
        if random_state is None:
            assert clf1.trees[n_estimator]._train_indices != approx(
                clf2.trees[n_estimator]._train_indices
            )
            assert clf1.trees[n_estimator]._valid_indices != approx(
                clf2.trees[n_estimator]._valid_indices
            )
        else:
            assert clf1.trees[n_estimator]._train_indices == approx(
                clf2.trees[n_estimator]._train_indices
            )
            assert clf1.trees[n_estimator]._valid_indices == approx(
                clf2.trees[n_estimator]._valid_indices
            )

    # 3. Test that the apply() method gives the exact same leaves (this checks
    #    that the trees are the same, namely that random column subsampling is
    #    indeed correctly seeded) and that predictions are the same (or not)
    clf1 = ForestClassifier(
        n_estimators=n_estimators, n_jobs=n_jobs, random_state=random_state
    )
    clf1.fit(X, y)
    clf2 = ForestClassifier(
        n_estimators=n_estimators, n_jobs=n_jobs, random_state=random_state
    )
    clf2.fit(X, y)
    if random_state is None:
        assert clf1.apply(X) != approx(clf2.apply(X))
        assert clf1.predict_proba(X) != approx(clf2.predict_proba(X))
    else:
        assert clf1.apply(X) == approx(clf2.apply(X))
        assert clf1.predict_proba(X) == approx(clf2.predict_proba(X))
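# A hypothetical pytest driver for the helper above (the actual test module
# may parametrize differently): it exercises both the unseeded case, where
# bootstrap samples must differ between fits, and the seeded case, where they
# must match. Assumes `import pytest` at module level.
@pytest.mark.parametrize("n_estimators", [2, 5])
@pytest.mark.parametrize("n_jobs", [1, 2])
@pytest.mark.parametrize("random_state", [None, 42])
def test_bootstrap(n_estimators, n_jobs, random_state):
    do_test_bootstrap(n_estimators, n_jobs, random_state)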
def test_forest_classifier_serialization(
    dataset_name,
    n_estimators,
    aggregation,
    class_weight,
    dirichlet,
    n_jobs,
    max_features,
    random_state,
    step,
    multiclass,
    cat_split_strategy,
):
    if dataset_name == "adult":
        X, y = load_adult(raw=True)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=random_state
        )
    elif dataset_name == "iris":
        iris = datasets.load_iris()
        X = iris.data
        y = iris.target
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=1 / 5, random_state=random_state
        )

    clf1 = ForestClassifier(
        n_estimators=n_estimators,
        n_jobs=n_jobs,
        multiclass=multiclass,
        max_bins=37,
        cat_split_strategy=cat_split_strategy,
        aggregation=aggregation,
        max_features=max_features,
        class_weight=class_weight,
        random_state=random_state,
        dirichlet=dirichlet,
        step=step,
    )
    clf1.fit(X_train, y_train)

    # Round-trip the fitted forest through pickle
    filename = "forest_classifier_on_iris.pkl"
    with open(filename, "wb") as f:
        pkl.dump(clf1, f)
    with open(filename, "rb") as f:
        clf2 = pkl.load(f)
    os.remove(filename)

    # The deserialized forest must be identical to the original one
    assert_forests_equal(clf1, clf2, is_classifier=True)

    # It must also produce identical predictions and leaves
    y_pred1 = clf1.predict_proba(X_test)
    y_pred2 = clf2.predict_proba(X_test)
    np.testing.assert_equal(y_pred1, y_pred2)

    y_pred1 = clf1.predict(X_test)
    y_pred2 = clf2.predict(X_test)
    np.testing.assert_equal(y_pred1, y_pred2)

    apply1 = clf1.apply(X_test)
    apply2 = clf2.apply(X_test)
    np.testing.assert_equal(apply1, apply2)
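# Minimal sketch (illustrative, not part of the original suite) of how the
# same pickle round-trip can persist a fitted forest outside the tests. The
# helper names are hypothetical; only `pkl` (pickle) from the imports above
# is used.
def save_forest(clf, path):
    # Serialize a fitted ForestClassifier to disk
    with open(path, "wb") as f:
        pkl.dump(clf, f)


def load_forest(path):
    # Reload the forest; the test above asserts that predict(),
    # predict_proba() and apply() are unchanged by the round trip
    with open(path, "rb") as f:
        return pkl.load(f)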
def test_class_weight_sample_weights(self):
    iris = self.iris
    X, y = iris["data"], iris["target"]

    # Check that passing no sample_weight and passing all sample weights
    # equal to 1.0 give the same forest
    clf1 = ForestClassifier(class_weight=None, random_state=42)
    clf1.fit(X, y)
    clf2 = ForestClassifier(class_weight=None, random_state=42)
    clf2.fit(X, y, sample_weight=np.ones(y.shape[0]))
    assert clf1.apply(X) == approx(clf2.apply(X))
    assert clf1.predict_proba(X) == approx(clf2.predict_proba(X))

    # Check that class_weight="balanced" matches the sample weights computed
    # by compute_sample_weight("balanced", y)
    clf1 = ForestClassifier(class_weight="balanced", random_state=42)
    clf1.fit(X, y)
    clf2 = ForestClassifier(class_weight=None, random_state=42)
    sample_weight = compute_sample_weight("balanced", y)
    clf2.fit(X, y, sample_weight=sample_weight)
    assert clf1.apply(X) == approx(clf2.apply(X))
    assert clf1.predict_proba(X) == approx(clf2.predict_proba(X))

    # Simulate unbalanced data from the iris dataset
    X_unb = np.concatenate((X[0:50], X[50:56], X[100:106]), axis=0)
    y_unb = np.concatenate((y[0:50], y[50:56], y[100:106]), axis=0)
    X_train, X_test, y_train, y_test = train_test_split(
        X_unb, y_unb, shuffle=True, stratify=y_unb, random_state=42, test_size=0.5
    )
    clf = ForestClassifier(class_weight=None, random_state=42, aggregation=True)
    clf.fit(X_train, y_train)
    y_scores = clf.predict(X_test)
    report1 = classification_report(y_test, y_scores, output_dict=True)

    clf = ForestClassifier(class_weight="balanced", random_state=42, aggregation=True)
    clf.fit(X_train, y_train)
    y_scores = clf.predict(X_test)
    report2 = classification_report(y_test, y_scores, output_dict=True)

    # In the considered case, class_weight should improve all metrics
    for label in ["0", "1", "2"]:
        label_report1 = report1[label]
        label_report2 = report2[label]
        assert label_report2["precision"] >= label_report1["precision"]
        assert label_report2["recall"] >= label_report1["recall"]
        assert label_report2["f1-score"] >= label_report1["f1-score"]

    # Same check on an unbalanced version of the breast_cancer dataset
    breast_cancer = self.breast_cancer
    X, y = breast_cancer["data"], breast_cancer["target"]
    idx_0 = y == 0
    idx_1 = y == 1
    X_unb = np.concatenate((X[idx_0], X[idx_1][:10]), axis=0)
    y_unb = np.concatenate((y[idx_0], y[idx_1][:10]), axis=0)
    X_train, X_test, y_train, y_test = train_test_split(
        X_unb, y_unb, shuffle=True, stratify=y_unb, random_state=42, test_size=0.5
    )
    clf = ForestClassifier(class_weight=None, random_state=42, aggregation=True)
    clf.fit(X_train, y_train)
    y_scores = clf.predict(X_test)
    y_test_binary = LabelBinarizer().fit_transform(y_test)
    avg_prec1 = average_precision_score(y_test_binary, y_scores, average="weighted")

    clf = ForestClassifier(class_weight="balanced", random_state=42, aggregation=True)
    clf.fit(X_train, y_train)
    y_scores = clf.predict(X_test)
    avg_prec2 = average_precision_score(y_test_binary, y_scores, average="weighted")

    # class_weight="balanced" should improve the weighted average precision
    assert avg_prec2 > avg_prec1
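# Reference sketch (illustrative, not part of the original suite): the
# "balanced" heuristic used above weights each sample by
#     n_samples / (n_classes * count of that sample's class),
# which is what compute_sample_weight("balanced", y) returns. Assumes the
# `np` and `compute_sample_weight` imports already used above.
def _check_balanced_weight_formula():
    y_demo = np.array([0, 0, 0, 0, 1, 1, 2])
    counts = np.bincount(y_demo)
    manual = y_demo.shape[0] / (counts.size * counts[y_demo])
    np.testing.assert_allclose(manual, compute_sample_weight("balanced", y_demo))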
def do_test_bootstrap_again(n_estimators, n_jobs):
    # 4. When bootstrap seeds and column subsampling seeds are the same,
    #    the trees are all the same
    clf = ForestClassifier(n_estimators=n_estimators, n_jobs=n_jobs)

    def _my_generate_random_states(self, n_states=None):
        # All bootstrap seeds and all column subsampling seeds are identical
        self._random_states_bootstrap = np.ones(
            (n_states or clf.n_estimators,), dtype=np.int32
        )
        self._random_states_trees = np.ones(
            (n_states or clf.n_estimators,), dtype=np.int32
        )

    # Monkey-patch the classifier
    clf._generate_random_states = types.MethodType(_my_generate_random_states, clf)
    clf.fit(X, y)
    leaves = clf.apply(X)
    for n_estimator1, n_estimator2 in product(
        range(n_estimators), range(n_estimators)
    ):
        if n_estimator1 < n_estimator2:
            assert clf.trees[n_estimator1]._train_indices == approx(
                clf.trees[n_estimator2]._train_indices
            )
            assert clf.trees[n_estimator1]._valid_indices == approx(
                clf.trees[n_estimator2]._valid_indices
            )
            assert leaves[n_estimator1] == approx(leaves[n_estimator2])

    # 5. When bootstrap seeds are the same but column subsampling seeds are
    #    different, all the trees are different
    clf = ForestClassifier(n_estimators=n_estimators, n_jobs=n_jobs)

    def _my_generate_random_states(self, n_states=None):
        # All bootstrap seeds are the same
        self._random_states_bootstrap = np.ones(
            (n_states or clf.n_estimators,), dtype=np.int32
        )
        # But column subsampling seeds are different
        self._random_states_trees = np.arange(
            n_states or clf.n_estimators, dtype=np.int32
        )

    # Monkey-patch the classifier
    clf._generate_random_states = types.MethodType(_my_generate_random_states, clf)
    clf.fit(X, y)
    leaves = clf.apply(X)
    for n_estimator1, n_estimator2 in product(
        range(n_estimators), range(n_estimators)
    ):
        if n_estimator1 < n_estimator2:
            assert clf.trees[n_estimator1]._train_indices == approx(
                clf.trees[n_estimator2]._train_indices
            )
            assert clf.trees[n_estimator1]._valid_indices == approx(
                clf.trees[n_estimator2]._valid_indices
            )
            assert leaves[n_estimator1] != approx(leaves[n_estimator2])

    # 6. When bootstrap seeds are different but column subsampling seeds are
    #    identical, all the trees are different
    clf = ForestClassifier(n_estimators=n_estimators, n_jobs=n_jobs)

    def _my_generate_random_states(self, n_states=None):
        # All bootstrap seeds are different
        self._random_states_bootstrap = np.arange(
            n_states or clf.n_estimators, dtype=np.int32
        )
        # But column subsampling seeds are the same
        self._random_states_trees = np.ones(
            (n_states or clf.n_estimators,), dtype=np.int32
        )

    # Monkey-patch the classifier
    clf._generate_random_states = types.MethodType(_my_generate_random_states, clf)
    clf.fit(X, y)
    leaves = clf.apply(X)
    for n_estimator1, n_estimator2 in product(
        range(n_estimators), range(n_estimators)
    ):
        if n_estimator1 < n_estimator2:
            assert clf.trees[n_estimator1]._train_indices != approx(
                clf.trees[n_estimator2]._train_indices
            )
            assert clf.trees[n_estimator1]._valid_indices != approx(
                clf.trees[n_estimator2]._valid_indices
            )
            assert leaves[n_estimator1] != approx(leaves[n_estimator2])
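# The monkey-patching pattern above is plain Python: types.MethodType binds a
# free function to one instance, so it shadows the class method for that
# object only. A minimal self-contained illustration (toy class, not from the
# suite):
def _methodtype_demo():
    class Greeter:
        def greet(self):
            return "hello"

    g = Greeter()

    def _patched_greet(self):
        # Replacement behavior, bound to this single instance
        return "hi"

    g.greet = types.MethodType(_patched_greet, g)
    assert g.greet() == "hi"  # the patched instance
    assert Greeter().greet() == "hello"  # the class itself is untouched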