def is_parallel_split_faster(n_estimators, aggregation):
    """Assert that a multi-job fit is strictly faster than a single-job fit.

    Two forests are fitted on ``X`` and ``y`` taken from the enclosing scope,
    first with ``n_jobs=1`` and then with ``n_jobs=effective_n_jobs`` (also
    from the enclosing scope), each timed with ``time()``.

    Parameters
    ----------
    n_estimators : number of trees, forwarded to ``ForestClassifier``.
    aggregation : aggregation flag, forwarded to ``ForestClassifier``.
    """
    elapsed = {}
    for label, jobs in (("sequential", 1), ("parallel", effective_n_jobs)):
        forest = ForestClassifier(
            random_state=random_state,
            n_estimators=n_estimators,
            n_jobs=jobs,
            aggregation=aggregation,
        )
        started = time()
        forest.fit(X, y)
        elapsed[label] = time() - started

    # Only require the parallel fit to beat the sequential one, without
    # demanding any particular speed-up factor.
    assert elapsed["sequential"] > elapsed["parallel"]
def test_nodes_on_classification_datasets(
    data_loader,
    n_estimators,
    aggregation,
    class_weight,
    n_jobs,
    max_features,
    random_state,
    dirichlet,
    step,
    multiclass,
    cat_split_strategy,
    criterion,
):
    """Fit a forest on a loaded dataset and run ``check_nodes`` on each tree.

    ``data_loader`` is called with ``raw=True`` and must return ``(X, y)``.
    All remaining parameters are forwarded unchanged to ``ForestClassifier``;
    ``aggregation`` is additionally passed to ``check_nodes``.
    """
    X, y = data_loader(raw=True)
    forest_params = dict(
        n_estimators=n_estimators,
        n_jobs=n_jobs,
        multiclass=multiclass,
        cat_split_strategy=cat_split_strategy,
        aggregation=aggregation,
        criterion=criterion,
        max_features=max_features,
        class_weight=class_weight,
        random_state=random_state,
        dirichlet=dirichlet,
        step=step,
    )
    forest = ForestClassifier(**forest_params)
    forest.fit(X, y)

    for fitted_tree in forest.trees:
        inner = fitted_tree._tree_classifier
        n_used = inner.node_count
        # Only the first node_count entries of the node array are inspected
        used_nodes = inner.nodes[:n_used]
        partitions = inner.bin_partitions
        # The node array must hold at least node_count entries
        assert inner.nodes.size >= n_used
        check_nodes(used_nodes, partitions, aggregation)
def test_dirichlet_switch(self):
    """Check that changing ``dirichlet`` after fitting changes predictions.

    Two forests are fitted on the same breast-cancer split, one with the
    default ``dirichlet`` and one with ``dirichlet=2.0``. Their probabilities
    must differ at first, and must agree (up to 1e-5) once both forests are
    given the same ``dirichlet`` value.
    """
    data = self.breast_cancer
    X, y = data["data"], data["target"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, shuffle=True, stratify=y, random_state=42, test_size=0.3
    )

    forest_a = ForestClassifier(class_weight="balanced", random_state=42)
    forest_b = ForestClassifier(
        class_weight="balanced", random_state=42, dirichlet=2.0
    )
    forest_a.fit(X_train, y_train)
    forest_b.fit(X_train, y_train)

    # Different dirichlet values: predictions must be noticeably different
    proba_a = forest_a.predict_proba(X_test)
    proba_b = forest_b.predict_proba(X_test)
    largest_gap = np.max(np.abs(proba_a - proba_b))
    assert largest_gap >= 0.01

    # NOTE(review): presumably 0.5 is the default dirichlet of forest_a --
    # confirm against ForestClassifier
    forest_b.dirichlet = 0.5
    proba_b = forest_b.predict_proba(X_test)
    assert proba_a == pytest.approx(proba_b, abs=1e-5)

    # Same value set on both forests after fitting: predictions must agree
    forest_a.dirichlet = 1.1
    forest_b.dirichlet = 1.1
    proba_a = forest_a.predict_proba(X_test)
    proba_b = forest_b.predict_proba(X_test)
    assert proba_a == pytest.approx(proba_b, abs=1e-5)
def is_parallel_split_faster(n_estimators, aggregation):
    """Assert that the parallel fit reaches the required speed-up.

    Two forests are fitted on ``X`` and ``y`` taken from the enclosing scope,
    first with ``n_jobs=1`` and then with ``n_jobs=effective_n_jobs`` (also
    from the enclosing scope), each timed with ``time()``. The sequential
    time must be at least ``effective_n_jobs / 3`` times the parallel time.

    Parameters
    ----------
    n_estimators : number of trees, forwarded to ``ForestClassifier``.
    aggregation : aggregation flag, forwarded to ``ForestClassifier``.

    Raises
    ------
    AssertionError
        If the measured speed-up is below ``effective_n_jobs / 3``.
    """
    clf = ForestClassifier(
        random_state=random_state,
        n_estimators=n_estimators,
        n_jobs=1,
        aggregation=aggregation,
    )
    tic = time()
    clf.fit(X, y)
    toc = time()
    time_no_parallel = toc - tic

    clf = ForestClassifier(
        random_state=random_state,
        n_estimators=n_estimators,
        n_jobs=effective_n_jobs,
        aggregation=aggregation,
    )
    tic = time()
    clf.fit(X, y)
    toc = time()
    time_parallel = toc - tic

    # Report the timings BEFORE asserting: printed after the assert they were
    # never emitted on failure, which is exactly when they are needed.
    print("time_no_parallel:", time_no_parallel)
    print("time_parallel:", time_parallel)

    # We want parallel code to be effective_n_jobs / 3 faster when using
    # effectively effective_n_jobs threads
    assert time_no_parallel >= effective_n_jobs * time_parallel / 3, (
        "parallel fit is not at least effective_n_jobs / 3 times faster "
        "than the sequential fit"
    )
def test_several_max_bins_for_classification(loader, is_categorical_,
                                             required_log_loss, max_bins,
                                             aggregation):
    """Check categorical detection and test log-loss for a given ``max_bins``.

    ``loader`` is called with ``raw=True`` and must return ``(X, y)``. After
    fitting, the forest's ``is_categorical_`` must equal the expected
    ``is_categorical_`` and the test log-loss must be strictly below
    ``required_log_loss``.
    """
    X, y = loader(raw=True)
    seed = 42
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)

    forest = ForestClassifier(
        n_estimators=10,
        n_jobs=-1,
        aggregation=aggregation,
        max_bins=max_bins,
        dirichlet=1e-2,
        class_weight="balanced",
        random_state=seed,
    )
    forest.fit(X_train, y_train)

    np.testing.assert_equal(forest.is_categorical_, is_categorical_)

    test_proba = forest.predict_proba(X_test)
    test_loss = log_loss(y_test, test_proba)
    assert test_loss < required_log_loss
def test_aggregation_dirichlet(self):
    """Fitting with ``aggregation=True`` and ``dirichlet=0.0`` must fail.

    The fit on the iris data is expected to raise a ``ValueError`` whose
    message matches the pattern checked below.
    """
    data = self.iris
    features, labels = data["data"], data["target"]
    forest = ForestClassifier(dirichlet=0.0, aggregation=True)
    expected_message = "dirichlet must be > 0 when aggregation=True"
    with pytest.raises(ValueError, match=expected_message):
        forest.fit(features, labels)
def test_cat_split_strategy_on_car(self):
    """Compare ``cat_split_strategy="all"`` against ``"binary"`` on car data.

    A single-tree forest is fitted once per strategy with otherwise identical
    settings. The "all" strategy must give a strictly lower log-loss than
    "binary" on both the train and the test split.
    """
    dataset = load_car()
    dataset.one_hot_encode = False
    dataset.test_size = 1.0 / 5
    seed = 42
    X_train, X_test, y_train, y_test = dataset.extract(random_state=seed)
    categorical_features = dataset.categorical_features_

    def _train_and_test_losses(strategy):
        # Fit one forest with the given strategy and score it on both splits
        forest = ForestClassifier(
            n_estimators=1,
            n_jobs=1,
            multiclass="multinomial",
            aggregation=False,
            max_features=None,
            class_weight="balanced",
            categorical_features=categorical_features,
            cat_split_strategy=strategy,
            random_state=seed,
            dirichlet=0.0,
        )
        forest.fit(X_train, y_train)
        train_proba = forest.predict_proba(X_train)
        test_proba = forest.predict_proba(X_test)
        return log_loss(y_train, train_proba), log_loss(y_test, test_proba)

    lloss_train_binary, lloss_test_binary = _train_and_test_losses("binary")
    lloss_train_all, lloss_test_all = _train_and_test_losses("all")

    assert lloss_train_all < lloss_train_binary
    assert lloss_test_all < lloss_test_binary
def test_n_classes_classes_n_features_n_samples(self):
    """Check the metadata attributes a fitted forest exposes.

    With six samples, two features and three string labels, the forest must
    report sorted ``classes_``, ``n_classes_ == 3``, ``n_features_in_ == 2``
    and ``n_samples_in_ == 6``.
    """
    labels = np.array(["one", "two", "three", "one", "one", "two"])
    # Rows are [0, 1], [1, 2], ..., [5, 6]
    features = np.array([[row, row + 1] for row in range(6)])

    forest = ForestClassifier()
    forest.fit(features, labels)

    assert tuple(forest.classes_) == ("one", "three", "two")
    assert forest.n_classes_ == 3
    assert forest.n_features_in_ == 2
    assert forest.n_samples_in_ == 6
def do_test_bootstrap(n_estimators, n_jobs, random_state):
    """Check bootstrap sampling and seeding of ``ForestClassifier``.

    Every forest is fitted on ``X`` and ``y`` taken from the enclosing scope
    with the given ``n_estimators``, ``n_jobs`` and ``random_state``.
    """

    def _new_fitted_forest():
        # Build and fit a fresh forest with the parameters under test
        forest = ForestClassifier(
            n_estimators=n_estimators, n_jobs=n_jobs, random_state=random_state
        )
        forest.fit(X, y)
        return forest

    # 1. Within one forest, every pair of distinct trees must have different
    #    bootstrap samples
    forest = _new_fitted_forest()
    for first in range(n_estimators):
        for second in range(first + 1, n_estimators):
            tree_a = forest.trees[first]
            tree_b = forest.trees[second]
            assert tree_a._train_indices != approx(tree_b._train_indices)
            assert tree_a._valid_indices != approx(tree_b._valid_indices)

    # 2. Two forests built with the same random_state share their bootstrap
    #    samples tree by tree; with random_state=None they must differ
    forest_a = _new_fitted_forest()
    forest_b = _new_fitted_forest()
    for idx in range(n_estimators):
        tree_a = forest_a.trees[idx]
        tree_b = forest_b.trees[idx]
        if random_state is None:
            assert tree_a._train_indices != approx(tree_b._train_indices)
            assert tree_a._valid_indices != approx(tree_b._valid_indices)
        else:
            assert tree_a._train_indices == approx(tree_b._train_indices)
            assert tree_a._valid_indices == approx(tree_b._valid_indices)

    # 3. apply() must return exactly the same leaves for two seeded forests
    #    (this checks that the trees are the same, namely that random column
    #    subsampling is correctly seeded), and predictions must match too;
    #    with random_state=None both must differ
    forest_a = _new_fitted_forest()
    forest_b = _new_fitted_forest()
    if random_state is None:
        assert forest_a.apply(X) != approx(forest_b.apply(X))
        assert forest_a.predict_proba(X) != approx(forest_b.predict_proba(X))
    else:
        assert forest_a.apply(X) == approx(forest_b.apply(X))
        assert forest_a.predict_proba(X) == approx(forest_b.predict_proba(X))
def test_n_features_(self):
    """Check ``n_features_`` before and after fitting.

    Reading the attribute on an unfitted forest must raise ``ValueError``;
    after fitting on a 10 x 3 matrix it must equal 3.
    """
    forest = ForestClassifier(n_estimators=2)
    unfitted_message = "You must call fit before asking for n_features_"
    with pytest.raises(ValueError, match=unfitted_message):
        _ = forest.n_features_

    np.random.seed(42)
    features = np.random.randn(10, 3)
    # Five samples of class 0 followed by five samples of class 1
    labels = np.array([0] * 5 + [1] * 5)
    forest.fit(features, labels)
    assert forest.n_features_ == 3
def test_performance_breast_cancer(self):
    """Require a test ROC AUC of at least 0.98 on breast cancer.

    The threshold is checked for four configurations: with and without
    ``class_weight="balanced"``, each with the default criterion and with
    ``criterion="entropy"``.
    """
    data = self.breast_cancer
    X, y = data["data"], data["target"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, shuffle=True, stratify=y, random_state=42, test_size=0.3
    )

    configurations = (
        {"class_weight": "balanced", "random_state": 42},
        {"random_state": 42},
        {"class_weight": "balanced", "random_state": 42, "criterion": "entropy"},
        {"random_state": 42, "criterion": "entropy"},
    )
    for forest_kwargs in configurations:
        forest = ForestClassifier(**forest_kwargs)
        forest.fit(X_train, y_train)
        test_proba = forest.predict_proba(X_test)
        # Score with the probabilities of the second class column
        assert roc_auc_score(y_test, test_proba[:, 1]) >= 0.98
def test_performance_iris(self):
    """Require a one-vs-one test ROC AUC of at least 0.985 on iris.

    The threshold is checked for four configurations: with and without
    ``class_weight="balanced"``, each with the default criterion and with
    ``criterion="entropy"``.
    """
    data = self.iris
    X, y = data["data"], data["target"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, shuffle=True, stratify=y, random_state=42, test_size=0.3
    )

    configurations = (
        {"class_weight": "balanced", "random_state": 42},
        {"random_state": 42},
        {"class_weight": "balanced", "random_state": 42, "criterion": "entropy"},
        {"random_state": 42, "criterion": "entropy"},
    )
    for forest_kwargs in configurations:
        forest = ForestClassifier(**forest_kwargs)
        forest.fit(X_train, y_train)
        test_proba = forest.predict_proba(X_test)
        assert roc_auc_score(y_test, test_proba, multi_class="ovo") >= 0.985
def test_parallel_fit(self):
    """Check that fitting with several jobs reaches the required speed-up.

    A 100,000-sample two-moons dataset is fitted with ``n_jobs=1`` and with
    ``n_jobs=self.effective_n_jobs``, for ``aggregation=True`` and
    ``aggregation=False``. The sequential time must be at least
    ``effective_n_jobs / 3`` times the parallel time.
    """
    n_samples = 100_000
    X, y = make_moons(n_samples=n_samples, noise=0.2, random_state=42)

    # Precompile: warm-up fits on ten samples for both aggregation modes
    # NOTE(review): presumably this triggers JIT compilation so that it is
    # not counted in the timings below -- confirm
    clf = ForestClassifier(n_estimators=1, n_jobs=1, aggregation=True)
    clf.fit(X[:10], y[:10])
    clf = ForestClassifier(n_estimators=1, n_jobs=1, aggregation=False)
    clf.fit(X[:10], y[:10])

    random_state = 42
    effective_n_jobs = self.effective_n_jobs
    print("effective_n_jobs: ", effective_n_jobs)

    def is_parallel_split_faster(n_estimators, aggregation):
        # Time a sequential fit, then a parallel fit with the same settings
        clf = ForestClassifier(
            random_state=random_state,
            n_estimators=n_estimators,
            n_jobs=1,
            aggregation=aggregation,
        )
        tic = time()
        clf.fit(X, y)
        toc = time()
        time_no_parallel = toc - tic

        clf = ForestClassifier(
            random_state=random_state,
            n_estimators=n_estimators,
            n_jobs=effective_n_jobs,
            aggregation=aggregation,
        )
        tic = time()
        clf.fit(X, y)
        toc = time()
        time_parallel = toc - tic

        # Report the timings BEFORE asserting: printed after the assert they
        # were never emitted on failure, which is exactly when they are
        # needed.
        print("time_no_parallel:", time_no_parallel)
        print("time_parallel:", time_parallel)

        # We want parallel code to be effective_n_jobs / 3 faster when using
        # effectively effective_n_jobs threads
        assert time_no_parallel >= effective_n_jobs * time_parallel / 3, (
            "parallel fit is not at least effective_n_jobs / 3 times faster "
            "than the sequential fit"
        )

    # We want each thread to handle 4 trees
    n_estimators = 4 * effective_n_jobs
    is_parallel_split_faster(n_estimators=n_estimators, aggregation=True)
    is_parallel_split_faster(n_estimators=n_estimators, aggregation=False)
def test_ovr_with_two_classes(self):
    """Check that "ovr" and "multinomial" agree on a binary problem.

    Two forests that differ only in ``multiclass`` are fitted on the adult
    dataset; their test probabilities must be approximately equal.
    """
    dataset = self.adult
    dataset.one_hot_encode = False
    seed = 42
    X_train, X_test, y_train, y_test = dataset.extract(random_state=seed)
    categorical_features = dataset.categorical_features_

    def _test_probabilities(multiclass):
        # Fit with the given multiclass mode and predict on the test split
        forest = ForestClassifier(
            n_estimators=2,
            n_jobs=-1,
            multiclass=multiclass,
            aggregation=False,
            max_features=None,
            class_weight="balanced",
            categorical_features=categorical_features,
            random_state=seed,
            dirichlet=0.0,
        )
        forest.fit(X_train, y_train)
        return forest.predict_proba(X_test)

    proba_multinomial = _test_probabilities("multinomial")
    proba_ovr = _test_probabilities("ovr")
    assert proba_multinomial == approx(proba_ovr)
def test_performance_cat_split_strategy_iris(self):
    """Require a one-vs-one test ROC AUC of at least 0.985 on iris.

    The threshold is checked for ``cat_split_strategy`` set to "all" and
    then to "random".
    """
    data = self.iris
    X, y = data["data"], data["target"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, shuffle=True, stratify=y, random_state=42, test_size=0.3
    )

    for strategy in ("all", "random"):
        forest = ForestClassifier(cat_split_strategy=strategy, random_state=42)
        forest.fit(X_train, y_train)
        test_proba = forest.predict_proba(X_test)
        assert roc_auc_score(y_test, test_proba, multi_class="ovo") >= 0.985
def test_performance_cat_split_strategy_breast_cancer(self):
    """Require a test ROC AUC of at least 0.98 on breast cancer.

    The threshold is checked for ``cat_split_strategy`` set to "all" and
    then to "random".
    """
    data = self.breast_cancer
    X, y = data["data"], data["target"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, shuffle=True, stratify=y, random_state=42, test_size=0.3
    )

    for strategy in ("all", "random"):
        forest = ForestClassifier(cat_split_strategy=strategy, random_state=42)
        forest.fit(X_train, y_train)
        test_proba = forest.predict_proba(X_test)
        # Score with the probabilities of the second class column
        assert roc_auc_score(y_test, test_proba[:, 1]) >= 0.98
def test_min_samples_split_min_samples_leaf_on_adult(
    aggregation,
    max_features,
    random_state,
    min_samples_split,
    min_samples_leaf,
    criterion,
):
    """Check the minimum-sample constraints on every node (adult dataset).

    Each node of each fitted tree must hold at least
    ``min(min_samples_split, min_samples_leaf)`` training samples, and the
    same number of validation samples when ``aggregation`` is truthy.
    """
    X, y = load_adult(raw=True)
    forest = ForestClassifier(
        n_estimators=3,
        n_jobs=-1,
        multiclass="multinomial",
        aggregation=aggregation,
        max_features=max_features,
        criterion=criterion,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        class_weight="balanced",
        random_state=random_state,
        step=1.0,
    )
    forest.fit(X, y)

    smallest_allowed = min(min_samples_split, min_samples_leaf)
    for fitted_tree in forest.trees:
        inner = fitted_tree._tree_classifier
        n_used = inner.node_count
        # Only the first node_count entries of the node array are inspected
        for node in inner.nodes[:n_used]:
            assert node["n_samples_train"] >= smallest_allowed
            if aggregation:
                assert node["n_samples_valid"] >= smallest_allowed
def do_test_bootstrap_again(n_estimators, n_jobs):
    """Check how bootstrap and tree seeds determine the fitted trees.

    ``_generate_random_states`` is monkey-patched on each forest so that the
    bootstrap seeds and the tree (column subsampling) seeds are forced to
    chosen values. Forests are fitted on ``X`` and ``y`` taken from the
    enclosing scope.
    """

    def _identical_seeds(size):
        # Every tree gets the same seed
        return np.ones(size, dtype=np.int32)

    def _distinct_seeds(size):
        # Every tree gets its own seed
        return np.arange(size, dtype=np.int32)

    def _fit_with_seeds(bootstrap_seeds, tree_seeds):
        # Fit a forest whose random states are forced by the two factories
        forest = ForestClassifier(n_estimators=n_estimators, n_jobs=n_jobs)

        def _forced_random_states(self, n_states=None):
            size = n_states or forest.n_estimators
            self._random_states_bootstrap = bootstrap_seeds(size)
            self._random_states_trees = tree_seeds(size)

        # Monkey patch the classifier
        forest._generate_random_states = types.MethodType(
            _forced_random_states, forest
        )
        forest.fit(X, y)
        return forest, forest.apply(X)

    def _check_all_pairs(forest, leaves, same_samples, same_leaves):
        # Compare every pair of distinct trees against the expectations
        for first in range(n_estimators):
            for second in range(first + 1, n_estimators):
                tree_a = forest.trees[first]
                tree_b = forest.trees[second]
                if same_samples:
                    assert tree_a._train_indices == approx(
                        tree_b._train_indices)
                    assert tree_a._valid_indices == approx(
                        tree_b._valid_indices)
                else:
                    assert tree_a._train_indices != approx(
                        tree_b._train_indices)
                    assert tree_a._valid_indices != approx(
                        tree_b._valid_indices)
                if same_leaves:
                    assert leaves[first] == approx(leaves[second])
                else:
                    assert leaves[first] != approx(leaves[second])

    # 4. When bootstrap seeds and column subsampling seeds are the same,
    #    the trees are all the same
    forest, leaves = _fit_with_seeds(_identical_seeds, _identical_seeds)
    _check_all_pairs(forest, leaves, same_samples=True, same_leaves=True)

    # 5. When bootstrap seeds are the same but column subsampling seeds are
    #    different, all the trees are different
    forest, leaves = _fit_with_seeds(_identical_seeds, _distinct_seeds)
    _check_all_pairs(forest, leaves, same_samples=True, same_leaves=False)

    # 6. When bootstrap seeds are different but column subsampling seeds are
    #    identical, all the trees are different
    forest, leaves = _fit_with_seeds(_distinct_seeds, _identical_seeds)
    _check_all_pairs(forest, leaves, same_samples=False, same_leaves=False)
clf1 = ForestClassifier( n_estimators=n_estimators, n_jobs=n_jobs, multiclass=multiclass, cat_split_strategy=cat_split_strategy, aggregation=aggregation, max_features=max_features, class_weight=class_weight, random_state=random_state, dirichlet=dirichlet, step=step, ) clf1.fit(X_train, y_train) filename = "forest_classifier_on_iris.pkl" with open(filename, "wb") as f: pkl.dump(clf1, f) with open(filename, "rb") as f: clf2 = pkl.load(f) # os.remove(filename) # # assert_forests_equal(clf1, clf2, is_classifier=True) # # y_pred1 = clf1.predict_proba(X_test)
def test_class_weight_sample_weights(self):
    """Check the interplay of ``class_weight`` and ``sample_weight``.

    Covers four points: unit sample weights equal no weights; balanced
    ``class_weight`` equals balanced ``sample_weight``; on unbalanced iris
    data, balanced weights do not degrade any per-class metric; on
    unbalanced breast-cancer data, balanced weights improve the weighted
    average precision.
    """
    iris = self.iris
    X_iris, y_iris = iris["data"], iris["target"]

    def _assert_same_forest(forest_a, forest_b):
        # Same leaves and same probabilities on the full iris data
        assert forest_a.apply(X_iris) == approx(forest_b.apply(X_iris))
        assert forest_a.predict_proba(X_iris) == approx(
            forest_b.predict_proba(X_iris))

    def _aggregated_predictions(class_weight, train_X, train_y, test_X):
        # Fit with aggregation on the train split and predict the test split
        forest = ForestClassifier(
            class_weight=class_weight, random_state=42, aggregation=True
        )
        forest.fit(train_X, train_y)
        return forest.predict(test_X)

    # Check that no sample_weight and all sample weights equal to 1. is the
    # same
    unweighted = ForestClassifier(class_weight=None, random_state=42)
    unweighted.fit(X_iris, y_iris)
    unit_weighted = ForestClassifier(class_weight=None, random_state=42)
    unit_weighted.fit(X_iris, y_iris, sample_weight=np.ones(y_iris.shape[0]))
    _assert_same_forest(unweighted, unit_weighted)

    # Balanced class_weight must match explicit balanced sample weights
    class_balanced = ForestClassifier(class_weight="balanced", random_state=42)
    class_balanced.fit(X_iris, y_iris)
    sample_balanced = ForestClassifier(class_weight=None, random_state=42)
    balanced_weights = compute_sample_weight("balanced", y_iris)
    sample_balanced.fit(X_iris, y_iris, sample_weight=balanced_weights)
    _assert_same_forest(class_balanced, sample_balanced)

    # Simulate unbalanced data from the iris datasets
    X_unb = np.concatenate(
        (X_iris[0:50], X_iris[50:56], X_iris[100:106]), axis=0)
    y_unb = np.concatenate(
        (y_iris[0:50], y_iris[50:56], y_iris[100:106]), axis=0)
    X_train, X_test, y_train, y_test = train_test_split(
        X_unb, y_unb, shuffle=True, stratify=y_unb, random_state=42,
        test_size=0.5,
    )
    report_plain = classification_report(
        y_test,
        _aggregated_predictions(None, X_train, y_train, X_test),
        output_dict=True,
    )
    report_balanced = classification_report(
        y_test,
        _aggregated_predictions("balanced", X_train, y_train, X_test),
        output_dict=True,
    )
    # In the considered case, class_weight should improve all metrics
    for label in ["0", "1", "2"]:
        plain_metrics = report_plain[label]
        balanced_metrics = report_balanced[label]
        for metric in ("precision", "recall", "f1-score"):
            assert balanced_metrics[metric] >= plain_metrics[metric]

    # Unbalanced breast cancer: all of class 0, only ten samples of class 1
    breast_cancer = self.breast_cancer
    X_bc, y_bc = breast_cancer["data"], breast_cancer["target"]
    is_class_0 = y_bc == 0
    is_class_1 = y_bc == 1
    X_unb = np.concatenate((X_bc[is_class_0], X_bc[is_class_1][:10]), axis=0)
    y_unb = np.concatenate((y_bc[is_class_0], y_bc[is_class_1][:10]), axis=0)
    X_train, X_test, y_train, y_test = train_test_split(
        X_unb, y_unb, shuffle=True, stratify=y_unb, random_state=42,
        test_size=0.5,
    )

    plain_pred = _aggregated_predictions(None, X_train, y_train, X_test)
    y_test_binary = LabelBinarizer().fit_transform(y_test)
    avg_prec_plain = average_precision_score(
        y_test_binary, plain_pred, average="weighted")

    balanced_pred = _aggregated_predictions(
        "balanced", X_train, y_train, X_test)
    avg_prec_balanced = average_precision_score(
        y_test_binary, balanced_pred, average="weighted")

    assert avg_prec_balanced > avg_prec_plain
def test_forest_classifier_serialization(
    dataset_name,
    n_estimators,
    aggregation,
    class_weight,
    dirichlet,
    n_jobs,
    max_features,
    random_state,
    step,
    multiclass,
    cat_split_strategy,
):
    """Check that a fitted forest survives a pickle round trip unchanged.

    The forest is fitted, dumped to a file, loaded back, and the two objects
    are compared with ``assert_forests_equal`` and through the outputs of
    ``predict_proba``, ``predict`` and ``apply`` on the test split.

    Parameters
    ----------
    dataset_name : either "adult" or "iris".
    All other parameters are forwarded unchanged to ``ForestClassifier``.

    Raises
    ------
    ValueError
        If ``dataset_name`` is not one of the supported names.
    """
    if dataset_name == "adult":
        X, y = load_adult(raw=True)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=random_state)
    elif dataset_name == "iris":
        iris = datasets.load_iris()
        X = iris.data
        y = iris.target
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=1 / 5, random_state=random_state)
    else:
        # Fail with a clear message instead of an unbound-variable error
        raise ValueError(f"Unknown dataset_name: {dataset_name!r}")

    clf1 = ForestClassifier(
        n_estimators=n_estimators,
        n_jobs=n_jobs,
        multiclass=multiclass,
        max_bins=37,
        cat_split_strategy=cat_split_strategy,
        aggregation=aggregation,
        max_features=max_features,
        class_weight=class_weight,
        random_state=random_state,
        dirichlet=dirichlet,
        step=step,
    )
    clf1.fit(X_train, y_train)

    filename = "forest_classifier_on_iris.pkl"
    try:
        with open(filename, "wb") as f:
            pkl.dump(clf1, f)
        with open(filename, "rb") as f:
            clf2 = pkl.load(f)
    finally:
        # Always clean up, so a failed dump/load does not leave the file in
        # the working directory
        if os.path.exists(filename):
            os.remove(filename)

    assert_forests_equal(clf1, clf2, is_classifier=True)

    y_pred1 = clf1.predict_proba(X_test)
    y_pred2 = clf2.predict_proba(X_test)
    np.testing.assert_equal(y_pred1, y_pred2)

    y_pred1 = clf1.predict(X_test)
    y_pred2 = clf2.predict(X_test)
    np.testing.assert_equal(y_pred1, y_pred2)

    apply1 = clf1.apply(X_test)
    apply2 = clf2.apply(X_test)
    np.testing.assert_equal(apply1, apply2)
y0="y", x1="x0", y1="y0", line_color="#151515", line_alpha=0.4, source=source_tree, ) tooltips = [(attribute, "@" + attribute) for attribute in attributes] tree_hover = HoverTool(renderers=[circles], tooltips=tooltips) fig.add_tools(tree_hover) fig.text(x="x", y="y", text="node_id", source=source_tree) return fig if __name__ == "__main__": X = np.repeat(np.arange(5), 20).reshape((-1, 1)) y = np.repeat([1, 0, 0, 1, 0], 20) clf = ForestClassifier(n_estimators=1, random_state=42, categorical_features=[True], dirichlet=0.0) # X_onehot = OneHotEncoder(sparse=False).fit_transform(X) clf.fit(X, y) df = clf.get_nodes(0) print(df) fig = plot_tree(clf, 0) show(fig)