def test_ohe_equivalence(min_samples_leaf, n_unique_categories, target):
    # Make sure that native categorical splits are equivalent to using an
    # OHE, when given enough depth
    rng = np.random.RandomState(0)
    n_samples = 10_000
    X_binned = rng.randint(
        0, n_unique_categories, size=(n_samples, 1), dtype=np.uint8
    )

    X_ohe = OneHotEncoder(sparse=False).fit_transform(X_binned)
    X_ohe = np.asfortranarray(X_ohe).astype(np.uint8)

    if target == "equal":
        gradients = X_binned.reshape(-1)
    elif target == "binary":
        gradients = (X_binned % 2).reshape(-1)
    else:
        gradients = rng.randn(n_samples)
    gradients = gradients.astype(G_H_DTYPE)

    hessians = np.ones(shape=1, dtype=G_H_DTYPE)

    grower_params = {
        "min_samples_leaf": min_samples_leaf,
        "max_depth": None,
        "max_leaf_nodes": None,
    }

    grower = TreeGrower(
        X_binned, gradients, hessians, is_categorical=[True], **grower_params
    )
    grower.grow()
    # we pass undefined binning_thresholds because we won't use predict()
    predictor = grower.make_predictor(
        binning_thresholds=np.zeros((1, n_unique_categories))
    )
    preds = predictor.predict_binned(
        X_binned, missing_values_bin_idx=255, n_threads=n_threads
    )

    grower_ohe = TreeGrower(X_ohe, gradients, hessians, **grower_params)
    grower_ohe.grow()
    predictor_ohe = grower_ohe.make_predictor(
        binning_thresholds=np.zeros((X_ohe.shape[1], n_unique_categories))
    )
    preds_ohe = predictor_ohe.predict_binned(
        X_ohe, missing_values_bin_idx=255, n_threads=n_threads
    )

    assert predictor.get_max_depth() <= predictor_ohe.get_max_depth()
    if target == "binary" and n_unique_categories > 2:
        # OHE needs more splits to achieve the same predictions
        assert predictor.get_max_depth() < predictor_ohe.get_max_depth()

    np.testing.assert_allclose(preds, preds_ohe)
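# A standalone sketch (not from the test suite) of the reasoning behind the
# "binary" assertion above, under the assumption that a native categorical
# split may send any subset of categories to the left child while a one-hot
# column can only isolate a single category: with more than 2 categories, no
# depth-1 OHE split can reproduce the parity target that a single bitset
# split handles, so the OHE tree must be deeper. The helper name is
# hypothetical; run it manually to check the claim.
def _ohe_vs_native_parity_sketch():
    categories = np.arange(10)
    parity = (categories % 2).astype(bool)  # the "binary" target used above

    # depth-1 native categorical split: left bitset = the odd categories
    native_goes_left = np.isin(categories, categories[categories % 2 == 1])
    assert np.array_equal(native_goes_left, parity)  # pure after one split

    # a one-hot column only isolates one category, so a single OHE split
    # always leaves both parities mixed on one side when there are more than
    # 2 categories
    for c in categories:
        ohe_goes_left = categories == c
        assert not np.array_equal(ohe_goes_left, parity)


_ohe_vs_native_parity_sketch()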
def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins, constant_hessian, noise):
    rng = np.random.RandomState(seed=0)
    # data = linear target, 3 features, 1 irrelevant.
    X = rng.normal(size=(n_samples, 3))
    y = X[:, 0] - X[:, 1]
    if noise:
        y_scale = y.std()
        y += rng.normal(scale=noise, size=n_samples) * y_scale

    mapper = _BinMapper(n_bins=n_bins)
    X = mapper.fit_transform(X)

    all_gradients = y.astype(G_H_DTYPE)
    shape_hessian = 1 if constant_hessian else all_gradients.shape
    all_hessians = np.ones(shape=shape_hessian, dtype=G_H_DTYPE)
    grower = TreeGrower(
        X,
        all_gradients,
        all_hessians,
        n_bins=n_bins,
        shrinkage=1.0,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=n_samples,
    )
    grower.grow()
    predictor = grower.make_predictor(binning_thresholds=mapper.bin_thresholds_)

    if n_samples >= min_samples_leaf:
        for node in predictor.nodes:
            if node["is_leaf"]:
                assert node["count"] >= min_samples_leaf
    else:
        assert predictor.nodes.shape[0] == 1
        assert predictor.nodes[0]["is_leaf"]
        assert predictor.nodes[0]["count"] == n_samples
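# A small worked example (illustrative only) of why the `else` branch above
# must see a single-leaf tree: the grower only accepts a split when both
# children keep at least min_samples_leaf samples, which is impossible as
# soon as n_samples < 2 * min_samples_leaf, and in particular whenever
# n_samples < min_samples_leaf. The helper name below is hypothetical.
def _split_respects_min_samples_leaf(n_left, n_right, min_samples_leaf):
    return n_left >= min_samples_leaf and n_right >= min_samples_leaf


# with 19 samples and min_samples_leaf=10, no partition of the samples into
# two children is acceptable, so the root stays a leaf
assert not any(
    _split_respects_min_samples_leaf(n_left, 19 - n_left, min_samples_leaf=10)
    for n_left in range(20)
)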
def test_regression_dataset(n_bins):
    X, y = make_regression(
        n_samples=500, n_features=10, n_informative=5, random_state=42
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    mapper = _BinMapper(n_bins=n_bins, random_state=42)
    X_train_binned = mapper.fit_transform(X_train)

    # Init gradients and hessians to that of least squares loss
    gradients = -y_train.astype(G_H_DTYPE)
    hessians = np.ones(1, dtype=G_H_DTYPE)

    min_samples_leaf = 10
    max_leaf_nodes = 30
    grower = TreeGrower(
        X_train_binned,
        gradients,
        hessians,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
        n_bins=n_bins,
        n_bins_non_missing=mapper.n_bins_non_missing_,
    )
    grower.grow()

    predictor = grower.make_predictor(binning_thresholds=mapper.bin_thresholds_)

    known_cat_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE)
    f_idx_map = np.zeros(0, dtype=np.uint32)

    y_pred_train = predictor.predict(X_train, known_cat_bitsets, f_idx_map, n_threads)
    assert r2_score(y_train, y_pred_train) > 0.82

    y_pred_test = predictor.predict(X_test, known_cat_bitsets, f_idx_map, n_threads)
    assert r2_score(y_test, y_pred_test) > 0.67
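# Why the test above initializes gradients to -y_train and hessians to 1:
# for the least squares loss l(y, p) = 0.5 * (p - y) ** 2 with an initial
# prediction p = 0, the first derivative w.r.t. p is p - y = -y and the
# second derivative is 1. Leaf values proportional to
# -sum(gradients) / sum(hessians) then approximate the mean target in each
# leaf, which is why predict() can be scored directly against y. A quick
# finite-difference check of the gradient (a sketch, not part of the
# original tests; the helper name is hypothetical):
def _check_least_squares_gradient():
    # verify numerically that d/dp [0.5 * (p - y) ** 2] == p - y
    y = np.array([1.0, 3.0])
    p, eps = 0.0, 1e-6

    def loss(pred):
        return 0.5 * (pred - y) ** 2

    numeric_grad = (loss(p + eps) - loss(p - eps)) / (2 * eps)
    np.testing.assert_allclose(numeric_grad, p - y, rtol=1e-6)


_check_least_squares_gradient()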
def test_nodes_values(monotonic_cst, seed):
    # Build a single tree with only one feature, and make sure the nodes
    # values respect the monotonic constraints.

    # Considering the following tree with a monotonic POS constraint, we
    # should have:
    #
    #       root
    #      /    \
    #     5     10    # middle = 7.5
    #    / \   / \
    #   a   b c   d
    #
    # a <= b and c <= d  (assert_children_values_monotonic)
    # a, b <= middle <= c, d (assert_children_values_bounded)
    # a <= b <= c <= d (assert_leaves_values_monotonic)
    #
    # The last one is a consequence of the others, but can't hurt to check

    rng = np.random.RandomState(seed)
    n_samples = 1000
    n_features = 1
    X_binned = rng.randint(0, 255, size=(n_samples, n_features), dtype=np.uint8)
    X_binned = np.asfortranarray(X_binned)

    gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
    hessians = np.ones(shape=1, dtype=G_H_DTYPE)

    grower = TreeGrower(
        X_binned, gradients, hessians, monotonic_cst=[monotonic_cst], shrinkage=0.1
    )
    grower.grow()

    # grow() will shrink the leaves values at the very end. For our comparison
    # tests, we need to revert the shrinkage of the leaves, else we would
    # compare the value of a leaf (shrunk) with a node (not shrunk) and the
    # test would not be correct.
    for leave in grower.finalized_leaves:
        leave.value /= grower.shrinkage

    # We pass undefined binning_thresholds because we won't use predict anyway
    predictor = grower.make_predictor(
        binning_thresholds=np.zeros((X_binned.shape[1], X_binned.max() + 1))
    )

    # The consistency of the bounds can only be checked on the tree grower
    # as the node bounds are not copied into the predictor tree. The
    # consistency checks on the values of node children and leaves can be
    # done either on the grower tree or on the predictor tree. We only
    # do those checks on the predictor tree as the latter is derived from
    # the former.
    assert_children_values_monotonic(predictor, monotonic_cst)
    assert_children_values_bounded(grower, monotonic_cst)
    assert_leaves_values_monotonic(predictor, monotonic_cst)
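# A simplified sketch of what a check in the spirit of
# assert_leaves_values_monotonic could look like (the real helpers are
# imported from the monotonic-constraints test utilities; this standalone
# version is illustrative only). With a single feature, a depth-first
# traversal that visits left children before right children enumerates the
# leaves from the smallest to the largest feature values, so a POS
# constraint means the collected leaf values must be non-decreasing.
def _collect_leaves_values_left_to_right(predictor):
    values = []

    def _visit(node_idx):
        node = predictor.nodes[node_idx]
        if node["is_leaf"]:
            values.append(node["value"])
            return
        _visit(node["left"])
        _visit(node["right"])

    _visit(0)  # start at the root
    return values


# e.g. for a POS constraint one would then check:
#   values = _collect_leaves_values_left_to_right(predictor)
#   assert all(a <= b for a, b in zip(values, values[1:]))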
def test_grow_tree_categories():
    # Check that the grower produces the right predictor tree when a split is
    # categorical
    X_binned = np.array([[0, 1] * 11 + [1]], dtype=X_BINNED_DTYPE).T
    X_binned = np.asfortranarray(X_binned)

    all_gradients = np.array([10, 1] * 11 + [1], dtype=G_H_DTYPE)
    all_hessians = np.ones(1, dtype=G_H_DTYPE)
    is_categorical = np.ones(1, dtype=np.uint8)

    grower = TreeGrower(
        X_binned,
        all_gradients,
        all_hessians,
        n_bins=4,
        shrinkage=1.0,
        min_samples_leaf=1,
        is_categorical=is_categorical,
    )
    grower.grow()
    assert grower.n_nodes == 3

    categories = [np.array([4, 9], dtype=X_DTYPE)]
    predictor = grower.make_predictor(binning_thresholds=categories)
    root = predictor.nodes[0]
    assert root["count"] == 23
    assert root["depth"] == 0
    assert root["is_categorical"]

    left, right = predictor.nodes[root["left"]], predictor.nodes[root["right"]]

    # arbitrary validation, but this means ones go to the left.
    assert left["count"] >= right["count"]

    # check binned category value (1)
    expected_binned_cat_bitset = [2**1] + [0] * 7
    binned_cat_bitset = predictor.binned_left_cat_bitsets
    assert_array_equal(binned_cat_bitset[0], expected_binned_cat_bitset)

    # check raw category value (9)
    expected_raw_cat_bitsets = [2**9] + [0] * 7
    raw_cat_bitsets = predictor.raw_left_cat_bitsets
    assert_array_equal(raw_cat_bitsets[0], expected_raw_cat_bitsets)

    # Note that since there were no missing values during training, the
    # missing values aren't part of the bitsets. However, we expect the
    # missing values to go to the biggest child (i.e. the left one).
    # The left child has a value of -1 = negative gradient.
    assert root["missing_go_to_left"]

    # make sure binned missing values are mapped to the left child during
    # prediction
    prediction_binned = predictor.predict_binned(
        np.asarray([[6]]).astype(X_BINNED_DTYPE),
        missing_values_bin_idx=6,
        n_threads=n_threads,
    )
    assert_allclose(prediction_binned, [-1])  # negative gradient

    # make sure raw missing values are mapped to the left child during
    # prediction
    known_cat_bitsets = np.zeros((1, 8), dtype=np.uint32)  # ignored anyway
    f_idx_map = np.array([0], dtype=np.uint32)
    prediction = predictor.predict(
        np.array([[np.nan]]), known_cat_bitsets, f_idx_map, n_threads
    )
    assert_allclose(prediction, [-1])
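# A short sketch of the bitset layout asserted above, assuming the same
# scheme as the expected values: each left-child category set is stored as
# 8 uint32 words (8 * 32 = 256 possible bins), and category `c` maps to bit
# `c % 32` of word `c // 32`. Setting raw category 9 therefore yields
# [2**9, 0, 0, 0, 0, 0, 0, 0]. The helper below is hypothetical, not the
# actual bitset implementation.
def _set_category_in_bitset(bitset, category):
    # flip the single bit that represents `category`
    bitset[category // 32] |= np.uint32(1) << np.uint32(category % 32)


_bitset = np.zeros(8, dtype=np.uint32)
_set_category_in_bitset(_bitset, 9)
assert_array_equal(_bitset, [2**9] + [0] * 7)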
def test_predictor_from_grower():
    # Build a tree on the toy 3-leaf dataset to extract the predictor.
    n_bins = 256
    X_binned, all_gradients, all_hessians = _make_training_data(n_bins=n_bins)
    grower = TreeGrower(
        X_binned,
        all_gradients,
        all_hessians,
        n_bins=n_bins,
        shrinkage=1.0,
        max_leaf_nodes=3,
        min_samples_leaf=5,
    )
    grower.grow()
    assert grower.n_nodes == 5  # (2 decision nodes + 3 leaves)

    # Check that the node structure can be converted into a predictor
    # object to perform predictions at scale
    # We pass undefined binning_thresholds because we won't use predict anyway
    predictor = grower.make_predictor(
        binning_thresholds=np.zeros((X_binned.shape[1], n_bins))
    )
    assert predictor.nodes.shape[0] == 5
    assert predictor.nodes["is_leaf"].sum() == 3

    # Probe some predictions for each leaf of the tree
    # each group of 3 samples corresponds to a condition in _make_training_data
    input_data = np.array(
        [
            [0, 0],
            [42, 99],
            [128, 254],
            [129, 0],
            [129, 85],
            [254, 85],
            [129, 86],
            [129, 254],
            [242, 100],
        ],
        dtype=np.uint8,
    )
    missing_values_bin_idx = n_bins - 1
    predictions = predictor.predict_binned(
        input_data, missing_values_bin_idx, n_threads
    )
    expected_targets = [1, 1, 1, 1, 1, 1, -1, -1, -1]
    assert np.allclose(predictions, expected_targets)

    # Check that training set can be recovered exactly:
    predictions = predictor.predict_binned(X_binned, missing_values_bin_idx, n_threads)
    assert np.allclose(predictions, -all_gradients)
def test_split_on_nan_with_infinite_values():
    # Make sure the split on nan situations are respected even when there are
    # samples with +inf values (we set the threshold to +inf when we have a
    # split on nan so this test makes sure this does not introduce edge-case
    # bugs). We need to use the private API so that we can also test
    # predict_binned().

    X = np.array([0, 1, np.inf, np.nan, np.nan]).reshape(-1, 1)
    # the gradient values will force a split on nan situation
    gradients = np.array([0, 0, 0, 100, 100], dtype=G_H_DTYPE)
    hessians = np.ones(shape=1, dtype=G_H_DTYPE)

    bin_mapper = _BinMapper()
    X_binned = bin_mapper.fit_transform(X)

    n_bins_non_missing = 3
    has_missing_values = True
    grower = TreeGrower(
        X_binned,
        gradients,
        hessians,
        n_bins_non_missing=n_bins_non_missing,
        has_missing_values=has_missing_values,
        min_samples_leaf=1,
        n_threads=n_threads,
    )

    grower.grow()
    predictor = grower.make_predictor(binning_thresholds=bin_mapper.bin_thresholds_)

    # sanity check: this was a split on nan
    assert predictor.nodes[0]["num_threshold"] == np.inf
    assert predictor.nodes[0]["bin_threshold"] == n_bins_non_missing - 1

    known_cat_bitsets, f_idx_map = bin_mapper.make_known_categories_bitsets()

    # Make sure in particular that the +inf sample is mapped to the left child
    # Note that lightgbm "fails" here and will assign the inf sample to the
    # right child, even though it's a "split on nan" situation.
    predictions = predictor.predict(X, known_cat_bitsets, f_idx_map, n_threads)
    predictions_binned = predictor.predict_binned(
        X_binned,
        missing_values_bin_idx=bin_mapper.missing_values_bin_idx_,
        n_threads=n_threads,
    )
    np.testing.assert_allclose(predictions, -gradients)
    np.testing.assert_allclose(predictions_binned, -gradients)
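# A minimal sketch of the numeric decision rule the assertions above rely on
# (a hypothetical standalone function mirroring the behaviour the test
# expects, not the actual predictor code): with a split on nan the numeric
# threshold is set to +inf, so every non-missing value, including +inf
# itself (since inf <= inf is True), goes to the left child, and only nan
# samples follow the missing_go_to_left flag.
def _goes_left(value, num_threshold=np.inf, missing_go_to_left=False):
    if np.isnan(value):
        return missing_go_to_left
    return value <= num_threshold


assert _goes_left(0.0) and _goes_left(1.0) and _goes_left(np.inf)
assert not _goes_left(np.nan)  # the nan samples take the right child here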
def test_missing_value_predict_only():
    # Make sure that missing values are supported at predict time even if they
    # were not encountered in the training data: the missing values are
    # assigned to whichever child has the most samples.
    rng = np.random.RandomState(0)
    n_samples = 100
    X_binned = rng.randint(0, 256, size=(n_samples, 1), dtype=np.uint8)
    X_binned = np.asfortranarray(X_binned)

    gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
    hessians = np.ones(shape=1, dtype=G_H_DTYPE)

    grower = TreeGrower(
        X_binned, gradients, hessians, min_samples_leaf=5, has_missing_values=False
    )
    grower.grow()

    # We pass undefined binning_thresholds because we won't use predict anyway
    predictor = grower.make_predictor(
        binning_thresholds=np.zeros((X_binned.shape[1], X_binned.max() + 1))
    )

    # go from root to a leaf, always following node with the most samples.
    # That's the path nans are supposed to take
    node = predictor.nodes[0]
    while not node["is_leaf"]:
        left = predictor.nodes[node["left"]]
        right = predictor.nodes[node["right"]]
        node = left if left["count"] > right["count"] else right

    prediction_main_path = node["value"]

    # now build X_test with only nans, and make sure all predictions are equal
    # to prediction_main_path
    all_nans = np.full(shape=(n_samples, 1), fill_value=np.nan)
    known_cat_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE)
    f_idx_map = np.zeros(0, dtype=np.uint32)
    y_pred = predictor.predict(all_nans, known_cat_bitsets, f_idx_map, n_threads)
    assert np.all(y_pred == prediction_main_path)