def test_boston_dataset():
    """Grow a single tree on the Boston data and check its R2 scores.

    The features are binned with ``BinMapper``, a ``TreeGrower`` is grown on
    the binned training split, and the resulting predictor is required to:

    * exceed an R2 of 0.75 on the training split and 0.65 on the test split,
      both through ``predict_binned`` and through ``predict``;
    * give matching outputs from ``predict`` (raw features) and
      ``predict_binned`` (binned features) on each split.
    """
    dataset = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, random_state=42)

    bin_mapper = BinMapper(random_state=42)
    X_train_binned = bin_mapper.fit_transform(X_train)
    X_test_binned = bin_mapper.transform(X_test)

    # Gradients are the float32 targets; the hessians array holds a single 1.
    # NOTE(review): confirm the gradient sign convention TreeGrower expects.
    tree_grower = TreeGrower(
        X_train_binned,
        y_train.astype(np.float32),
        np.ones(1, dtype=np.float32),
        min_samples_leaf=8,
        max_leaf_nodes=31,
    )
    tree_grower.grow()
    predictor = tree_grower.make_predictor(
        bin_thresholds=bin_mapper.bin_thresholds_)

    # (raw features, binned features, targets, minimum accepted R2)
    splits = (
        (X_train, X_train_binned, y_train, 0.75),
        (X_test, X_test_binned, y_test, 0.65),
    )

    # Score on binned inputs.
    for _, X_binned, y_true, min_r2 in splits:
        assert r2_score(y_true, predictor.predict_binned(X_binned)) > min_r2

    # Raw and binned prediction paths must agree.
    for X_raw, X_binned, _, _ in splits:
        assert_allclose(predictor.predict(X_raw),
                        predictor.predict_binned(X_binned))

    # Score on raw inputs.
    for X_raw, _, y_true, min_r2 in splits:
        assert r2_score(y_true, predictor.predict(X_raw)) > min_r2
def test_boston_dataset(max_bins):
    """Grow a single tree on the Boston data for a given ``max_bins``.

    The features are binned with ``BinMapper(max_bins=max_bins)``, a
    ``TreeGrower`` is grown on the binned training split, and the resulting
    predictor is required to:

    * exceed an R2 of 0.85 on the training split and 0.70 on the test split,
      both through ``predict_binned`` and through ``predict``;
    * give matching outputs from ``predict`` (raw features) and
      ``predict_binned`` (binned features) on each split.

    Parameters
    ----------
    max_bins : int
        Forwarded to both ``BinMapper`` and ``TreeGrower``.
    """
    dataset = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, random_state=42)

    bin_mapper = BinMapper(max_bins=max_bins, random_state=42)
    X_train_binned = bin_mapper.fit_transform(X_train)
    X_test_binned = bin_mapper.transform(X_test)

    # Init gradients and hessians to that of least squares loss:
    # negated float32 targets and a single unit hessian.
    tree_grower = TreeGrower(
        X_train_binned,
        -y_train.astype(np.float32),
        np.ones(1, dtype=np.float32),
        min_samples_leaf=8,
        max_leaf_nodes=31,
        max_bins=max_bins,
        n_bins_per_feature=bin_mapper.n_bins_per_feature_,
    )
    tree_grower.grow()
    predictor = tree_grower.make_predictor(
        bin_thresholds=bin_mapper.bin_thresholds_)

    # (raw features, binned features, targets, minimum accepted R2)
    splits = (
        (X_train, X_train_binned, y_train, 0.85),
        (X_test, X_test_binned, y_test, 0.70),
    )

    # Score on binned inputs.
    for _, X_binned, y_true, min_r2 in splits:
        assert r2_score(y_true, predictor.predict_binned(X_binned)) > min_r2

    # Raw and binned prediction paths must agree.
    for X_raw, X_binned, _, _ in splits:
        assert_allclose(predictor.predict(X_raw),
                        predictor.predict_binned(X_binned))

    # Score on raw inputs.
    for X_raw, _, y_true, min_r2 in splits:
        assert r2_score(y_true, predictor.predict(X_raw)) > min_r2
def test_bin_mapper_random_data(n_bins):
    """Check ``BinMapper`` on the module-level ``DATA`` array.

    Fits a ``BinMapper`` with ``max_bins=n_bins`` on ``DATA`` and verifies:

    * the binned output has the same shape as ``DATA`` and dtype ``uint8``;
    * every feature uses the full bin range ``[0, n_bins - 1]``;
    * there is one threshold array per feature, each of shape
      ``(n_bins - 1,)`` with the same dtype as ``DATA``;
    * ``n_bins_per_feature_`` equals ``n_bins`` for every feature;
    * each bin holds about ``n_samples // n_bins`` samples (within 5%).

    Parameters
    ----------
    n_bins : int
        Number of bins requested from the mapper.
    """
    n_samples, n_features = DATA.shape

    expected_count_per_bin = n_samples // n_bins
    tol = int(0.05 * expected_count_per_bin)

    mapper = BinMapper(max_bins=n_bins, random_state=42).fit(DATA)
    binned = mapper.transform(DATA)

    assert binned.shape == (n_samples, n_features)
    assert binned.dtype == np.uint8

    # Expected extrema are sized from n_features instead of being hard-coded
    # for two columns, so the check follows the shape of DATA.
    assert_array_equal(binned.min(axis=0), np.zeros(n_features))
    assert_array_equal(binned.max(axis=0), np.full(n_features, n_bins - 1))

    assert len(mapper.numerical_thresholds_) == n_features
    for thresholds in mapper.numerical_thresholds_:
        assert thresholds.shape == (n_bins - 1,)
        assert thresholds.dtype == DATA.dtype

    assert np.all(mapper.n_bins_per_feature_ == n_bins)

    # Check that the binned data is approximately balanced across bins.
    for feature_idx in range(n_features):
        for bin_idx in range(n_bins):
            count = (binned[:, feature_idx] == bin_idx).sum()
            assert abs(count - expected_count_per_bin) < tol