def test_mae_tree(training_data_1d):
    """An MAE-optimized tree with min_samples_leaf=2 should carve the 12
    training samples into 4 leaves of 3, predicting each group's median."""
    X, y = training_data_1d
    tree = DecisionTree(node_class=MedianNode, min_samples_leaf=2)
    tree.fit(X, y, splitter=splitters.MAESplitter())

    preds = tree.predict(X)
    if isinstance(X, pd.DataFrame):
        # Predictions should be aligned to the input's index.
        assert list(preds.index) == list(X.index)
    if isinstance(y, pd.DataFrame):
        # ...and carry the target's column labels; flatten for comparison.
        assert list(preds.columns) == list(y.columns)
        preds = preds[0].tolist()

    assert preds == pytest.approx(
        [15.1, 15.1, 15.1, 25.2, 25.2, 25.2, 9.8, 9.8, 9.8, 17.0, 17.0, 17.0],
        abs=1e-1)

    # 4 leaves are formed:
    node_ids = tree.apply(X)
    assert len(set(node_ids)) == 4
    for group_id in range(4):
        begin_idx = 3 * group_id
        # BUG FIX: end index was `3 * group_id + 2`, so the slice covered only
        # 2 of the 3 samples in each group and the third was never checked.
        end_idx = 3 * group_id + 3
        # All 3 samples of a group must land in the same leaf.
        assert len(set(node_ids[begin_idx:end_idx])) == 1

    # TODO: write a test where training X is a dataframe, and the predicting X
    # has a different column order.
def test_min_samples_leaf_affects_mae_split(training_data_1d):
    """Raising min_samples_leaf to half the sample count forces the MAE
    splitter away from its unconstrained optimum (8.5) and onto 5.5."""
    features, targets = training_data_1d
    data = TrainingData(features, targets)

    # NOTE(review): `np` is not imported in the visible import blocks of this
    # chunk — presumably imported elsewhere in the file; verify.
    splitter = splitters.MAESplitter()
    coeffs, cutpoint, _cost = splitter.select_feature_to_cut(
        data.X, data.Y, len(data.X) / 2)

    # The chosen feature is the one with a nonzero coefficient.
    cut_index = np.argwhere(coeffs != 0.0)[0][0]
    assert data.X_names[cut_index] in {'x', 0}
    assert cutpoint == 5.5
def test_mae_splitter(training_data_1d):
    """Unconstrained (min_samples_leaf=2), the MAE splitter should pick the
    single feature and the cutpoint that best reduces MAE."""
    features, targets = training_data_1d
    data = TrainingData(features, targets)

    coeffs, cutpoint, _cost = splitters.MAESplitter().select_feature_to_cut(
        data.X, data.Y, 2)

    # Exactly one feature participates in the cut; recover its name.
    cut_index = np.argwhere(coeffs != 0.0)[0][0]
    assert data.X_names[cut_index] in {'x', 0}

    # Brute force calculation reveals 8.5 results in best MAE reduction:
    assert cutpoint == 8.5
import pytest
import pandas as pd

from pyboretum import (
    MeanNode,
    MedianNode,
    splitters,  # MAE and MSE splitters
    DecisionTree,
)


@pytest.mark.parametrize('splitter, node_class', [
    (splitters.MSESplitter(), MeanNode),
    (splitters.MAESplitter(), MedianNode),
])
def test_with_one_sample_nodes(splitter, node_class, training_data_1d):
    """With min_samples_leaf=1 every training sample becomes its own leaf,
    so predictions on the training set reproduce the targets exactly."""
    X, y = training_data_1d
    tree = DecisionTree(node_class=node_class, min_samples_leaf=1)
    tree.fit(X, y, splitter=splitter)

    # All y should form their own leaves:
    preds = tree.predict(X)
    if isinstance(X, pd.DataFrame):
        assert list(preds.index) == list(X.index)
    if isinstance(y, pd.DataFrame):
        assert list(preds.columns) == list(y.columns)
        preds = preds[0].tolist()

    # NOTE(review): the final comparison calls y.tolist(), which would raise
    # if y were a DataFrame — presumably the fixture supplies ndarray/Series
    # targets here; confirm against the fixture definition.
    assert preds == pytest.approx(y.tolist(), abs=1e-1)
import pytest

from pyboretum import (
    splitters,
    TrainingData,
)


@pytest.mark.parametrize('splitter', [
    splitters.MSESplitter(),
    splitters.MAESplitter(),
])
def test_min_samples_leaf_can_stop_splitting(splitter, training_data_1d):
    """When min_samples_leaf equals the full sample count, no split can
    satisfy the constraint and the splitter must signal "no cut"."""
    X, Y = training_data_1d
    training_data = TrainingData(X, Y)
    feature, cutpoint, cost = splitter.select_feature_to_cut(
        training_data.X, training_data.Y, len(training_data.X))
    # "No cut" contract: (None, None, inf).
    # IDIOM FIX: compare to None with `is`, not `==` (PEP 8 / E711) —
    # `== None` can be hijacked by __eq__ on array-like returns.
    assert feature is None
    assert cutpoint is None
    assert cost == float('inf')