def test_boston_dataset(n_bins):
    X, y = load_boston(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    mapper = _BinMapper(n_bins=n_bins, random_state=42)
    X_train_binned = mapper.fit_transform(X_train)

    # Initialize gradients and hessians to those of the least squares loss
    gradients = -y_train.astype(G_H_DTYPE)
    hessians = np.ones(1, dtype=G_H_DTYPE)

    min_samples_leaf = 8
    max_leaf_nodes = 31
    grower = TreeGrower(X_train_binned, gradients, hessians,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=max_leaf_nodes, n_bins=n_bins,
                        n_bins_non_missing=mapper.n_bins_non_missing_)
    grower.grow()

    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    assert r2_score(y_train, predictor.predict(X_train)) > 0.85
    assert r2_score(y_test, predictor.predict(X_test)) > 0.70
def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins,
                          constant_hessian, noise):
    rng = np.random.RandomState(seed=0)
    # data = linear target, 3 features, 1 irrelevant.
    X = rng.normal(size=(n_samples, 3))
    y = X[:, 0] - X[:, 1]
    if noise:
        y_scale = y.std()
        y += rng.normal(scale=noise, size=n_samples) * y_scale
    mapper = _BinMapper(n_bins=n_bins)
    X = mapper.fit_transform(X)

    all_gradients = y.astype(G_H_DTYPE)
    shape_hessian = 1 if constant_hessian else all_gradients.shape
    all_hessians = np.ones(shape=shape_hessian, dtype=G_H_DTYPE)
    grower = TreeGrower(X, all_gradients, all_hessians,
                        n_bins=n_bins, shrinkage=1.,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=n_samples)
    grower.grow()
    predictor = grower.make_predictor(
        bin_thresholds=mapper.bin_thresholds_)

    if n_samples >= min_samples_leaf:
        for node in predictor.nodes:
            if node['is_leaf']:
                assert node['count'] >= min_samples_leaf
    else:
        assert predictor.nodes.shape[0] == 1
        assert predictor.nodes[0]['is_leaf']
        assert predictor.nodes[0]['count'] == n_samples
def test_missing_value_predict_only():
    # Make sure that missing values are supported at predict time even if they
    # were not encountered in the training data: the missing values are
    # assigned to whichever child has the most samples.

    rng = np.random.RandomState(0)
    n_samples = 100
    X_binned = rng.randint(0, 256, size=(n_samples, 1), dtype=np.uint8)
    X_binned = np.asfortranarray(X_binned)

    gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
    hessians = np.ones(shape=1, dtype=G_H_DTYPE)

    grower = TreeGrower(X_binned, gradients, hessians, min_samples_leaf=5,
                        has_missing_values=False)
    grower.grow()

    predictor = grower.make_predictor()

    # Go from the root to a leaf, always following the node with the most
    # samples. That's the path nans are supposed to take.
    node = predictor.nodes[0]
    while not node['is_leaf']:
        left = predictor.nodes[node['left']]
        right = predictor.nodes[node['right']]
        node = left if left['count'] > right['count'] else right

    prediction_main_path = node['value']

    # Now build X_test with only nans, and make sure all predictions are equal
    # to prediction_main_path.
    all_nans = np.full(shape=(n_samples, 1), fill_value=np.nan)
    assert np.all(predictor.predict(all_nans) == prediction_main_path)
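# The `_make_training_data` helper used by several tests below is not shown in
# this excerpt. The following is only a minimal sketch of what it could look
# like, inferred from how the tests exercise it: two pre-binned features, an
# asymmetric 3-leaf ground truth that splits first on feature 0 at bin
# n_bins // 2 and then on feature 1 at bin n_bins // 3, and gradients equal to
# minus the expected predictions. The sample count and random seed here are
# assumptions, not taken from the original helper.
def _make_training_data(n_bins=256, constant_hessian=True):
    rng = np.random.RandomState(42)
    n_samples = 10000

    # Generate data directly in binned (uint8) space so that the grower can be
    # tested independently of the binning logic.
    X_binned = rng.randint(0, n_bins - 1, size=(n_samples, 2), dtype=np.uint8)
    X_binned = np.asfortranarray(X_binned)

    def true_decision_function(input_features):
        # Simple yet asymmetric 3-leaf decision function: the grower should
        # have no trouble recovering it from this many samples.
        if input_features[0] <= n_bins // 2:
            return -1
        if input_features[1] <= n_bins // 3:
            return -1
        return 1

    target = np.array([true_decision_function(x) for x in X_binned])

    # Gradients and hessians of a squared error loss for a model that always
    # predicts 0: the fitted predictions should then recover -gradients.
    all_gradients = target.astype(G_H_DTYPE)
    shape_hessians = 1 if constant_hessian else all_gradients.shape
    all_hessians = np.ones(shape=shape_hessians, dtype=G_H_DTYPE)

    return X_binned, all_gradients, all_hessians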
def test_init_parameters_validation():
    X_binned, all_gradients, all_hessians = _make_training_data()
    with pytest.raises(ValueError,
                       match="min_gain_to_split=-1 must be positive"):
        TreeGrower(X_binned, all_gradients, all_hessians,
                   min_gain_to_split=-1)

    with pytest.raises(ValueError,
                       match="min_hessian_to_split=-1 must be positive"):
        TreeGrower(X_binned, all_gradients, all_hessians,
                   min_hessian_to_split=-1)
def test_input_validation():
    X_binned, all_gradients, all_hessians = _make_training_data()

    X_binned_float = X_binned.astype(np.float32)
    with pytest.raises(NotImplementedError,
                       match="X_binned must be of type uint8"):
        TreeGrower(X_binned_float, all_gradients, all_hessians)

    X_binned_C_array = np.ascontiguousarray(X_binned)
    with pytest.raises(
            ValueError,
            match="X_binned should be passed as Fortran contiguous array"):
        TreeGrower(X_binned_C_array, all_gradients, all_hessians)
def test_max_depth(max_depth):
    # Make sure max_depth parameter works as expected
    rng = np.random.RandomState(seed=0)

    n_bins = 256
    n_samples = 1000

    # data = linear target, 3 features, 1 irrelevant.
    X = rng.normal(size=(n_samples, 3))
    y = X[:, 0] - X[:, 1]
    mapper = _BinMapper(n_bins=n_bins)
    X = mapper.fit_transform(X)

    all_gradients = y.astype(G_H_DTYPE)
    all_hessians = np.ones(shape=1, dtype=G_H_DTYPE)
    grower = TreeGrower(X, all_gradients, all_hessians, max_depth=max_depth)
    grower.grow()

    depth = max(leaf.depth for leaf in grower.finalized_leaves)
    assert depth == max_depth
def test_split_on_nan_with_infinite_values():
    # Make sure split-on-nan situations are respected even when there are
    # samples with +inf values (we set the threshold to +inf when we split on
    # nan, so this test makes sure this does not introduce edge-case bugs).
    # We need to use the private API so that we can also test
    # predict_binned().

    X = np.array([0, 1, np.inf, np.nan, np.nan]).reshape(-1, 1)
    # the gradient values will force a split on nan situation
    gradients = np.array([0, 0, 0, 100, 100], dtype=G_H_DTYPE)
    hessians = np.ones(shape=1, dtype=G_H_DTYPE)

    bin_mapper = _BinMapper()
    X_binned = bin_mapper.fit_transform(X)

    n_bins_non_missing = 3
    has_missing_values = True
    grower = TreeGrower(X_binned, gradients, hessians,
                        n_bins_non_missing=n_bins_non_missing,
                        has_missing_values=has_missing_values,
                        min_samples_leaf=1)

    grower.grow()

    predictor = grower.make_predictor(
        bin_thresholds=bin_mapper.bin_thresholds_
    )

    # sanity check: this was a split on nan
    assert predictor.nodes[0]['threshold'] == np.inf
    assert predictor.nodes[0]['bin_threshold'] == n_bins_non_missing - 1

    # Make sure in particular that the +inf sample is mapped to the left
    # child. Note that lightgbm "fails" here and will assign the inf sample
    # to the right child, even though it's a "split on nan" situation.
    predictions = predictor.predict(X)
    predictions_binned = predictor.predict_binned(
        X_binned, missing_values_bin_idx=bin_mapper.missing_values_bin_idx_)
    assert np.all(predictions == -gradients)
    assert np.all(predictions_binned == -gradients)
def test_predictor_from_grower():
    # Build a tree on the toy 3-leaf dataset to extract the predictor.
    n_bins = 256
    X_binned, all_gradients, all_hessians = _make_training_data(
        n_bins=n_bins)
    grower = TreeGrower(X_binned, all_gradients, all_hessians,
                        n_bins=n_bins, shrinkage=1.,
                        max_leaf_nodes=3, min_samples_leaf=5)
    grower.grow()
    assert grower.n_nodes == 5  # (2 decision nodes + 3 leaves)

    # Check that the node structure can be converted into a predictor
    # object to perform predictions at scale
    predictor = grower.make_predictor()
    assert predictor.nodes.shape[0] == 5
    assert predictor.nodes['is_leaf'].sum() == 3

    # Probe some predictions for each leaf of the tree.
    # Each group of 3 samples corresponds to a condition in
    # _make_training_data.
    input_data = np.array([
        [0, 0],
        [42, 99],
        [128, 254],

        [129, 0],
        [129, 85],
        [254, 85],

        [129, 86],
        [129, 254],
        [242, 100],
    ], dtype=np.uint8)
    missing_values_bin_idx = n_bins - 1
    predictions = predictor.predict_binned(input_data, missing_values_bin_idx)
    expected_targets = [1, 1, 1, 1, 1, 1, -1, -1, -1]
    assert np.allclose(predictions, expected_targets)

    # Check that the training set can be recovered exactly:
    predictions = predictor.predict_binned(X_binned, missing_values_bin_idx)
    assert np.allclose(predictions, -all_gradients)
def test_min_samples_leaf_root(n_samples, min_samples_leaf):
    # Make sure root node isn't split if n_samples is not at least twice
    # min_samples_leaf
    rng = np.random.RandomState(seed=0)

    n_bins = 256

    # data = linear target, 3 features, 1 irrelevant.
    X = rng.normal(size=(n_samples, 3))
    y = X[:, 0] - X[:, 1]
    mapper = _BinMapper(n_bins=n_bins)
    X = mapper.fit_transform(X)

    all_gradients = y.astype(G_H_DTYPE)
    all_hessians = np.ones(shape=1, dtype=G_H_DTYPE)
    grower = TreeGrower(X, all_gradients, all_hessians,
                        n_bins=n_bins, shrinkage=1.,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=n_samples)
    grower.grow()
    if n_samples >= min_samples_leaf * 2:
        assert len(grower.finalized_leaves) >= 2
    else:
        assert len(grower.finalized_leaves) == 1
def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage):
    X_binned, all_gradients, all_hessians = _make_training_data(
        n_bins=n_bins, constant_hessian=constant_hessian)
    n_samples = X_binned.shape[0]

    if stopping_param == "max_leaf_nodes":
        stopping_param = {"max_leaf_nodes": 3}
    else:
        stopping_param = {"min_gain_to_split": 0.01}

    grower = TreeGrower(X_binned, all_gradients, all_hessians,
                        n_bins=n_bins, shrinkage=shrinkage,
                        min_samples_leaf=1, **stopping_param)

    # The root node is not yet split, but the best possible split has
    # already been evaluated:
    assert grower.root.left_child is None
    assert grower.root.right_child is None

    root_split = grower.root.split_info
    assert root_split.feature_idx == 0
    assert root_split.bin_idx == n_bins // 2
    assert len(grower.splittable_nodes) == 1

    # Calling split_next applies the next split and computes the best split
    # for each of the two newly introduced child nodes.
    left_node, right_node = grower.split_next()

    # All training samples have been split between the two nodes,
    # approximately 50%/50%.
    _check_children_consistency(grower.root, left_node, right_node)
    assert len(left_node.sample_indices) > 0.4 * n_samples
    assert len(left_node.sample_indices) < 0.6 * n_samples

    if grower.min_gain_to_split > 0:
        # The left node is too pure: there is no gain to split it further.
        assert left_node.split_info.gain < grower.min_gain_to_split
        assert left_node in grower.finalized_leaves

    # The right node can still be split further, this time on feature #1
    split_info = right_node.split_info
    assert split_info.gain > 1.
    assert split_info.feature_idx == 1
    assert split_info.bin_idx == n_bins // 3
    assert right_node.left_child is None
    assert right_node.right_child is None

    # The right split has not been applied yet. Let's do it now:
    assert len(grower.splittable_nodes) == 1
    right_left_node, right_right_node = grower.split_next()
    _check_children_consistency(right_node, right_left_node, right_right_node)
    assert len(right_left_node.sample_indices) > 0.1 * n_samples
    assert len(right_left_node.sample_indices) < 0.2 * n_samples

    assert len(right_right_node.sample_indices) > 0.2 * n_samples
    assert len(right_right_node.sample_indices) < 0.4 * n_samples

    # All the leaves are pure, it is not possible to split any further:
    assert not grower.splittable_nodes

    # Check the values of the leaves:
    assert grower.root.left_child.value == approx(shrinkage)
    assert grower.root.right_child.left_child.value == approx(shrinkage)
    assert grower.root.right_child.right_child.value == approx(-shrinkage,
                                                               rel=1e-3)
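# Not part of the original suite: a small sketch illustrating the relationship
# that test_grow_tree drives by hand, namely that growing the tree in one
# grow() call and splitting node by node with split_next() lead to the same
# tree structure. This assumes grow() builds the tree by repeatedly applying
# split_next() until no splittable nodes remain (leaf values may additionally
# be post-processed by grow(), so only the structure is compared here).
def test_grow_matches_manual_split_next_sketch():
    n_bins = 256
    X_binned, all_gradients, all_hessians = _make_training_data(n_bins=n_bins)

    # Grow the whole tree in one call.
    grower_auto = TreeGrower(X_binned, all_gradients, all_hessians,
                             n_bins=n_bins, min_samples_leaf=1)
    grower_auto.grow()

    # Grow the same tree by applying one split at a time.
    grower_manual = TreeGrower(X_binned, all_gradients, all_hessians,
                               n_bins=n_bins, min_samples_leaf=1)
    while grower_manual.splittable_nodes:
        grower_manual.split_next()

    # Both strategies should end up with the same tree structure.
    assert grower_auto.n_nodes == grower_manual.n_nodes
    assert (len(grower_auto.finalized_leaves)
            == len(grower_manual.finalized_leaves))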