def test_find_binning_thresholds_regular_data():
    """Check the thresholds found on evenly spaced, single-feature data.

    The input is 1001 evenly spaced values from 0 to 10, reshaped into one
    column. For each ``max_bins`` setting the thresholds returned for that
    column must match the expected evenly spaced cut points.
    """
    data = np.linspace(0, 10, 1001).reshape(-1, 1)

    # (max_bins, expected thresholds for the only feature), checked in order.
    cases = (
        (10, [1, 2, 3, 4, 5, 6, 7, 8, 9]),
        (5, [2, 4, 6, 8]),
    )
    for max_bins, expected in cases:
        bin_thresholds = _find_binning_thresholds(data, max_bins=max_bins)
        assert_allclose(bin_thresholds[0], expected)
def test_find_binning_thresholds_low_n_bins():
    """Check threshold count and dtype when ``max_bins`` is set to 128.

    Two threshold arrays are expected for ``DATA``; each must hold
    ``max_bins - 1`` values and share the dtype of ``DATA``.
    """
    max_bins = 128
    bin_thresholds = _find_binning_thresholds(
        DATA, max_bins=max_bins, random_state=0
    )

    assert len(bin_thresholds) == 2
    for thresholds in bin_thresholds:
        # One fewer threshold than bins: 128 - 1 == 127.
        assert thresholds.shape == (max_bins - 1,)
        assert thresholds.dtype == DATA.dtype
def test_find_binning_thresholds_random_data():
    """Check thresholds computed on ``DATA`` with the default ``max_bins``.

    Two threshold arrays of 255 values each are expected, with the dtype of
    ``DATA``. Thresholds at positions 64, 128 and 192 are then spot-checked
    against reference values for both arrays.
    """
    bin_thresholds = _find_binning_thresholds(DATA, random_state=0)

    assert len(bin_thresholds) == 2
    for thresholds in bin_thresholds:
        assert thresholds.shape == (255,)  # 256 - 1
        assert thresholds.dtype == DATA.dtype

    # Positions at one, two and three quarters of the 256-bin range.
    probe_positions = [64, 128, 192]
    assert_allclose(
        bin_thresholds[0][probe_positions],
        np.array([-0.7, 0.0, 0.7]),
        atol=1e-1,
    )
    assert_allclose(
        bin_thresholds[1][probe_positions],
        np.array([9.99, 10.00, 10.01]),
        atol=1e-2,
    )
def test_map_to_bins(n_bins):
    """Check that ``_map_to_bins`` bins ``DATA`` into a well-formed array.

    The binned output must keep the shape of ``DATA``, use ``uint8`` storage
    and be Fortran-contiguous. For every feature, the row holding the
    smallest raw value must land in bin 0 and the row holding the largest raw
    value in bin ``n_bins - 1``.
    """
    # NOTE(review): n_bins is presumably supplied by a parametrize decorator
    # or a fixture that is not visible in this chunk -- confirm.
    thresholds = _find_binning_thresholds(
        DATA, max_bins=n_bins, random_state=0
    )
    binned = _map_to_bins(DATA, thresholds)

    assert binned.shape == DATA.shape
    assert binned.dtype == np.uint8
    assert binned.flags.f_contiguous

    # Row index of the extreme raw value in each feature column.
    rows_of_minima = DATA.argmin(axis=0)
    rows_of_maxima = DATA.argmax(axis=0)

    for column, row in enumerate(rows_of_minima):
        assert binned[row, column] == 0
    for column, row in enumerate(rows_of_maxima):
        assert binned[row, column] == n_bins - 1
def test_find_binning_thresholds_invalid_n_bins():
    """Requesting 1024 bins must be rejected with a ``ValueError``."""
    rejected_max_bins = 1024
    with pytest.raises(ValueError):
        _find_binning_thresholds(DATA, max_bins=rejected_max_bins)
from joblib import Memory

from pygbm.binning import _find_binning_thresholds, _map_to_bins

# Benchmark script: generate a random float matrix, extract bin thresholds
# from it, then time how long _map_to_bins takes to bin the data.

# Results of make_data are cached through joblib under /tmp.
m = Memory(location='/tmp')


@m.cache
def make_data(n_samples=int(1e6), n_features=5, seed=42, dtype=np.float32):
    """Return a seeded random matrix of shape ``(n_samples, n_features)``.

    Values are drawn with ``RandomState(seed).randn`` and cast to ``dtype``.
    """
    samples = np.random.RandomState(seed).randn(n_samples, n_features)
    return samples.astype(dtype)


print("Generating random data...")
data = make_data(n_samples=int(1e6), n_features=5, seed=42, dtype=np.float32)

print("Extracting bins from subsample of data...")
bins = _find_binning_thresholds(data, random_state=0)

# First call on a tiny Fortran-ordered slice, timed separately from the full
# run. NOTE(review): per the message below this presumably triggers a one-off
# compilation of _map_to_bins -- confirm.
print("Compiling map_to_bins...")
tic = time()
binned = _map_to_bins(np.asfortranarray(data[:5]), bins)
toc = time()
duration = toc - tic
print(f"done in {duration:0.3f}s")

# Timed run over the full data set; throughput is derived from data.nbytes.
print("Mapping data to integer bins...")
tic = time()
binned = _map_to_bins(data, bins)
toc = time()
duration = toc - tic
print(
    f"Processed {data.nbytes/1e9:0.3f} GB in {duration:0.3f}s"
    f" ({data.nbytes / 1e6 / duration:0.1f} MB/s)"
)