Example #1
def test_bin_mapper_idempotence(max_bins_small, max_bins_large):
    assert max_bins_large >= max_bins_small
    data = np.random.RandomState(42).normal(size=30000).reshape(-1, 1)
    mapper_small = _BinMapper(n_bins=max_bins_small + 1)
    mapper_large = _BinMapper(n_bins=max_bins_large + 1)
    binned_small = mapper_small.fit_transform(data)
    binned_large = mapper_large.fit_transform(binned_small)
    assert_array_equal(binned_small, binned_large)
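These snippets omit the shared module-level setup of their test file. A minimal sketch of what they appear to assume is below; the import paths follow the scikit-learn layout (a fork such as mrex would use its own package name), and the DATA fixture parameters and parametrize values are assumptions for illustration only.

import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_equal

from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE

# Two-feature dataset shared by several tests below (assumed fixture).
DATA = np.random.RandomState(42).normal(
    loc=[0, 10], scale=[1, 0.01], size=(int(1e6), 2)
).astype(np.float32)

# Tests taking arguments are assumed to be parametrized, e.g.:
# @pytest.mark.parametrize('max_bins_small, max_bins_large',
#                          [(2, 256), (128, 256), (2, 255)])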
Example #2
def test_subsample():
    # Make sure bin thresholds are different when applying subsampling
    mapper_no_subsample = _BinMapper(subsample=None, random_state=0).fit(DATA)
    mapper_subsample = _BinMapper(subsample=256, random_state=0).fit(DATA)

    for feature in range(DATA.shape[1]):
        assert not np.allclose(mapper_no_subsample.bin_thresholds_[feature],
                               mapper_subsample.bin_thresholds_[feature],
                               rtol=1e-4)
Example #3
def test_boston_dataset(n_bins):
    X, y = load_boston(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    mapper = _BinMapper(n_bins=n_bins, random_state=42)
    X_train_binned = mapper.fit_transform(X_train)

    # Initialize gradients and hessians to those of the least squares loss
    gradients = -y_train.astype(G_H_DTYPE)
    hessians = np.ones(1, dtype=G_H_DTYPE)

    min_samples_leaf = 8
    max_leaf_nodes = 31
    grower = TreeGrower(X_train_binned,
                        gradients,
                        hessians,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=max_leaf_nodes,
                        n_bins=n_bins,
                        n_bins_non_missing=mapper.n_bins_non_missing_)
    grower.grow()

    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    assert r2_score(y_train, predictor.predict(X_train)) > 0.85
    assert r2_score(y_test, predictor.predict(X_test)) > 0.70
Example #4
def test_bin_mapper_random_data(max_bins):
    n_samples, n_features = DATA.shape

    expected_count_per_bin = n_samples // max_bins
    tol = int(0.05 * expected_count_per_bin)

    # max_bins is the number of bins for non-missing values
    n_bins = max_bins + 1
    mapper = _BinMapper(n_bins=n_bins, random_state=42).fit(DATA)
    binned = mapper.transform(DATA)

    assert binned.shape == (n_samples, n_features)
    assert binned.dtype == np.uint8
    assert_array_equal(binned.min(axis=0), np.array([0, 0]))
    assert_array_equal(binned.max(axis=0),
                       np.array([max_bins - 1, max_bins - 1]))
    assert len(mapper.bin_thresholds_) == n_features
    for bin_thresholds_feature in mapper.bin_thresholds_:
        assert bin_thresholds_feature.shape == (max_bins - 1, )
        assert bin_thresholds_feature.dtype == DATA.dtype
    assert np.all(mapper.n_bins_non_missing_ == max_bins)

    # Check that the binned data is approximately balanced across bins.
    for feature_idx in range(n_features):
        for bin_idx in range(max_bins):
            count = (binned[:, feature_idx] == bin_idx).sum()
            assert abs(count - expected_count_per_bin) < tol
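The comment above notes that max_bins counts only the bins for non-missing values. A small sketch of that convention, assuming scikit-learn-style _BinMapper semantics (one extra bin is reserved for missing values at index n_bins - 1):

X = np.array([[1.0], [2.0], [np.nan], [3.0]])
mapper = _BinMapper(n_bins=4).fit(X)          # 3 value bins + 1 missing bin
assert np.all(mapper.n_bins_non_missing_ == 3)
assert mapper.missing_values_bin_idx_ == 3    # last bin is reserved for NaN
assert mapper.transform(X)[2, 0] == 3         # the NaN sample maps there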
Example #5
def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins,
                          constant_hessian, noise):
    rng = np.random.RandomState(seed=0)
    # data = linear target, 3 features, 1 irrelevant.
    X = rng.normal(size=(n_samples, 3))
    y = X[:, 0] - X[:, 1]
    if noise:
        y_scale = y.std()
        y += rng.normal(scale=noise, size=n_samples) * y_scale
    mapper = _BinMapper(n_bins=n_bins)
    X = mapper.fit_transform(X)

    all_gradients = y.astype(G_H_DTYPE)
    shape_hessian = 1 if constant_hessian else all_gradients.shape
    all_hessians = np.ones(shape=shape_hessian, dtype=G_H_DTYPE)
    grower = TreeGrower(X, all_gradients, all_hessians,
                        n_bins=n_bins, shrinkage=1.,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=n_samples)
    grower.grow()
    predictor = grower.make_predictor(
        bin_thresholds=mapper.bin_thresholds_)

    if n_samples >= min_samples_leaf:
        for node in predictor.nodes:
            if node['is_leaf']:
                assert node['count'] >= min_samples_leaf
    else:
        assert predictor.nodes.shape[0] == 1
        assert predictor.nodes[0]['is_leaf']
        assert predictor.nodes[0]['count'] == n_samples
Example #6
def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
                                     max_leaf_nodes):
    # Make sure mrex has the same predictions as lightgbm for easy targets.
    #
    # In particular, when the tree sizes are bounded and the number of
    # samples is large enough, the structure of the prediction trees found by
    # LightGBM and mrex should be exactly identical.
    #
    # Notes:
    # - Several candidate splits may have equal gains when the number of
    #   samples in a node is low (and because of float errors). Therefore the
    #   predictions on the test set might differ if the structure of the tree
    #   is not exactly the same. To avoid this issue we only compare the
    #   predictions on the test set when the number of samples is large enough
    #   and max_leaf_nodes is low enough.
    # - To avoid discrepancies caused by small differences in the binning
    #   strategy, the data is pre-binned if n_samples > 255.

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 255

    X, y = make_regression(n_samples=n_samples,
                           n_features=5,
                           n_informative=5,
                           random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_mrex = HistGradientBoostingRegressor(max_iter=max_iter,
                                             max_bins=max_bins,
                                             learning_rate=1,
                                             n_iter_no_change=None,
                                             min_samples_leaf=min_samples_leaf,
                                             max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_mrex, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_mrex.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_mrex = est_mrex.predict(X_train)
    # less than ~1% of the predictions differ by more than 1e-3
    assert np.mean(abs(pred_lightgbm - pred_mrex) > 1e-3) < .011

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_mrex = est_mrex.predict(X_test)
        # less than 1% of the predictions differ by more than 1e-4
        assert np.mean(abs(pred_lightgbm - pred_mrex) > 1e-4) < .01
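The parametrize decorators are omitted from the snippet. The branch guarded by max_leaf_nodes < 10 and n_samples >= 1000 suggests a parametrization along these lines (values are an assumption):

@pytest.mark.parametrize('seed', range(5))
@pytest.mark.parametrize('min_samples_leaf', (1, 20))
@pytest.mark.parametrize('n_samples, max_leaf_nodes', [(255, 4096), (1000, 8)])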
Example #7
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 255

    X, y = make_classification(n_samples=n_samples,
                               n_classes=2,
                               n_features=5,
                               n_informative=5,
                               n_redundant=0,
                               random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_mrex = HistGradientBoostingClassifier(
        loss='binary_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_mrex, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_mrex.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_mrex = est_mrex.predict(X_train)
    assert np.mean(pred_mrex == pred_lightgbm) > .89

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_mrex = accuracy_score(y_train, pred_mrex)
    np.testing.assert_almost_equal(acc_lightgbm, acc_mrex)

    if max_leaf_nodes < 10 and n_samples >= 1000:

        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_mrex = est_mrex.predict(X_test)
        assert np.mean(pred_mrex == pred_lightgbm) > .89

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_mrex = accuracy_score(y_test, pred_mrex)
        np.testing.assert_almost_equal(acc_lightgbm, acc_mrex, decimal=2)
Example #8
def test_n_bins_non_missing(n_bins, diff):
    # Check that n_bins_non_missing_ equals n_unique_values when there are
    # few unique values, and n_bins - 1 otherwise.

    n_unique_values = n_bins + diff
    X = list(range(n_unique_values)) * 2
    X = np.array(X).reshape(-1, 1)
    mapper = _BinMapper(n_bins=n_bins).fit(X)
    assert np.all(mapper.n_bins_non_missing_ ==
                  min(n_bins - 1, n_unique_values))
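A worked instance of that rule, with illustrative values and assuming the semantics above: n_bins=5 allows at most 4 non-missing bins, so 3 unique values keep 3 bins while 15 unique values are capped at 4.

X = np.arange(3, dtype=float).reshape(-1, 1)      # 3 unique values < 4 bins
assert np.all(_BinMapper(n_bins=5).fit(X).n_bins_non_missing_ == 3)
X = np.arange(15, dtype=float).reshape(-1, 1)     # 15 unique values > 4 bins
assert np.all(_BinMapper(n_bins=5).fit(X).n_bins_non_missing_ == 4)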
Example #9
def test_infinite_values():
    # Make sure infinite values are properly handled.
    bin_mapper = _BinMapper()

    X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1)

    bin_mapper.fit(X)
    assert_allclose(bin_mapper.bin_thresholds_[0], [-np.inf, .5, ALMOST_INF])
    assert bin_mapper.n_bins_non_missing_ == [4]

    expected_binned_X = np.array([0, 1, 2, 3]).reshape(-1, 1)
    assert_array_equal(bin_mapper.transform(X), expected_binned_X)
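ALMOST_INF is not defined in the snippet. In the scikit-learn implementation this mirrors, it is a large finite sentinel (1e300) used in place of +inf as the last threshold so that midpoint computations on thresholds stay finite; a quick sketch of the rationale:

ALMOST_INF = 1e300  # assumed sentinel standing in for +inf
assert not np.isfinite((1.0 + np.inf) / 2)   # a naive midpoint would be inf
assert np.isfinite((1.0 + ALMOST_INF) / 2)   # the clipped midpoint is finite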
Example #10
def test_bin_mapper_small_random_data(n_samples, max_bins):
    data = np.random.RandomState(42).normal(size=n_samples).reshape(-1, 1)
    assert len(np.unique(data)) == n_samples

    # max_bins is the number of bins for non-missing values
    n_bins = max_bins + 1
    mapper = _BinMapper(n_bins=n_bins, random_state=42)
    binned = mapper.fit_transform(data)

    assert binned.shape == data.shape
    assert binned.dtype == np.uint8
    assert_array_equal(binned.ravel()[np.argsort(data.ravel())],
                       np.arange(n_samples))
Example #11
def test_bin_mapper_repeated_values_invariance(n_distinct):
    rng = np.random.RandomState(42)
    distinct_values = rng.normal(size=n_distinct)
    assert len(np.unique(distinct_values)) == n_distinct

    repeated_indices = rng.randint(low=0, high=n_distinct, size=1000)
    data = distinct_values[repeated_indices]
    rng.shuffle(data)
    assert_array_equal(np.unique(data), np.sort(distinct_values))

    data = data.reshape(-1, 1)

    mapper_1 = _BinMapper(n_bins=n_distinct + 1)
    binned_1 = mapper_1.fit_transform(data)
    assert_array_equal(np.unique(binned_1[:, 0]), np.arange(n_distinct))

    # Adding more bins to the mapper yields the same results (same thresholds)
    mapper_2 = _BinMapper(n_bins=min(256, n_distinct * 3) + 1)
    binned_2 = mapper_2.fit_transform(data)

    assert_allclose(mapper_1.bin_thresholds_[0], mapper_2.bin_thresholds_[0])
    assert_array_equal(binned_1, binned_2)
Example #12
def test_max_depth(max_depth):
    # Make sure max_depth parameter works as expected
    rng = np.random.RandomState(seed=0)

    n_bins = 256
    n_samples = 1000

    # data = linear target, 3 features, 1 irrelevant.
    X = rng.normal(size=(n_samples, 3))
    y = X[:, 0] - X[:, 1]
    mapper = _BinMapper(n_bins=n_bins)
    X = mapper.fit_transform(X)

    all_gradients = y.astype(G_H_DTYPE)
    all_hessians = np.ones(shape=1, dtype=G_H_DTYPE)
    grower = TreeGrower(X, all_gradients, all_hessians, max_depth=max_depth)
    grower.grow()

    depth = max(leaf.depth for leaf in grower.finalized_leaves)
    assert depth == max_depth
Example #13
def test_split_on_nan_with_infinite_values():
    # Make sure split-on-nan situations are respected even when there are
    # samples with +inf values (we set the threshold to +inf when we have a
    # split on nan so this test makes sure this does not introduce edge-case
    # bugs). We need to use the private API so that we can also test
    # predict_binned().

    X = np.array([0, 1, np.inf, np.nan, np.nan]).reshape(-1, 1)
    # the gradient values will force a split on nan situation
    gradients = np.array([0, 0, 0, 100, 100], dtype=G_H_DTYPE)
    hessians = np.ones(shape=1, dtype=G_H_DTYPE)

    bin_mapper = _BinMapper()
    X_binned = bin_mapper.fit_transform(X)

    n_bins_non_missing = 3
    has_missing_values = True
    grower = TreeGrower(X_binned, gradients, hessians,
                        n_bins_non_missing=n_bins_non_missing,
                        has_missing_values=has_missing_values,
                        min_samples_leaf=1)

    grower.grow()

    predictor = grower.make_predictor(
        bin_thresholds=bin_mapper.bin_thresholds_
    )

    # sanity check: this was a split on nan
    assert predictor.nodes[0]['threshold'] == np.inf
    assert predictor.nodes[0]['bin_threshold'] == n_bins_non_missing - 1

    # Make sure in particular that the +inf sample is mapped to the left child
    # Note that lightgbm "fails" here and will assign the inf sample to the
    # right child, even though it's a "split on nan" situation.
    predictions = predictor.predict(X)
    predictions_binned = predictor.predict_binned(
        X_binned, missing_values_bin_idx=bin_mapper.missing_values_bin_idx_)
    assert np.all(predictions == -gradients)
    assert np.all(predictions_binned == -gradients)
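Why predictions == -gradients holds: with unit hessians, no shrinkage, and no regularization, each leaf predicts -sum(gradients) / sum(hessians) over its samples, and min_samples_leaf=1 lets the grower separate the two gradient groups exactly. A sketch of that leaf-value formula under those assumptions:

left_grads = gradients[:3]    # the 0, 1 and +inf samples end up on the left
right_grads = gradients[3:]   # the two nan samples go right
assert -left_grads.sum() / left_grads.size == 0.0
assert -right_grads.sum() / right_grads.size == -100.0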
Example #14
def test_missing_values_support(n_bins, n_bins_non_missing, X_trans_expected):
    # check for missing values: make sure nans are mapped to the last bin
    # and that the _BinMapper attributes are correct

    X = [[1, 1, 0], [np.nan, np.nan, 0], [2, 1, 0], [np.nan, 2, 1], [3, 2, 1],
         [4, 1, 0]]

    X = np.array(X)

    mapper = _BinMapper(n_bins=n_bins)
    mapper.fit(X)

    assert_array_equal(mapper.n_bins_non_missing_, n_bins_non_missing)

    for feature_idx in range(X.shape[1]):
        assert len(mapper.bin_thresholds_[feature_idx]) == \
            n_bins_non_missing[feature_idx] - 1

    assert mapper.missing_values_bin_idx_ == n_bins - 1

    X_trans = mapper.transform(X)
    assert_array_equal(X_trans, X_trans_expected)
Example #15
def test_binning_train_validation_are_separated():
    # Make sure training and validation data are binned separately.
    # See issue 13926

    rng = np.random.RandomState(0)
    validation_fraction = .2
    gb = HistGradientBoostingClassifier(
        n_iter_no_change=5,
        validation_fraction=validation_fraction,
        random_state=rng
    )
    gb.fit(X_classification, y_classification)
    mapper_training_data = gb.bin_mapper_

    # Note that since the data is small there is no subsampling and the
    # random_state doesn't matter
    mapper_whole_data = _BinMapper(random_state=0)
    mapper_whole_data.fit(X_classification)

    n_samples = X_classification.shape[0]
    assert np.all(mapper_training_data.n_bins_non_missing_ ==
                  int((1 - validation_fraction) * n_samples))
    assert np.all(mapper_training_data.n_bins_non_missing_ !=
                  mapper_whole_data.n_bins_non_missing_)
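X_classification and y_classification are assumed module-level fixtures. Something like the following would be consistent with the assertions above (100 samples by default, of which 80 land in the training split):

from sklearn.datasets import make_classification
X_classification, y_classification = make_classification(random_state=0)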
Example #16
def test_min_samples_leaf_root(n_samples, min_samples_leaf):
    # Make sure root node isn't split if n_samples is not at least twice
    # min_samples_leaf
    rng = np.random.RandomState(seed=0)

    n_bins = 256

    # data = linear target, 3 features, 1 irrelevant.
    X = rng.normal(size=(n_samples, 3))
    y = X[:, 0] - X[:, 1]
    mapper = _BinMapper(n_bins=n_bins)
    X = mapper.fit_transform(X)

    all_gradients = y.astype(G_H_DTYPE)
    all_hessians = np.ones(shape=1, dtype=G_H_DTYPE)
    grower = TreeGrower(X, all_gradients, all_hessians,
                        n_bins=n_bins, shrinkage=1.,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=n_samples)
    grower.grow()
    if n_samples >= min_samples_leaf * 2:
        assert len(grower.finalized_leaves) >= 2
    else:
        assert len(grower.finalized_leaves) == 1
Example #17
def test_invalid_n_bins(n_bins):
    err_msg = ('n_bins={} should be no smaller than 3 '
               'and no larger than 256'.format(n_bins))
    with pytest.raises(ValueError, match=err_msg):
        _BinMapper(n_bins=n_bins).fit(DATA)
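The parametrize values are omitted; presumably they sit on either side of the valid [3, 256] range, e.g. (an assumed decorator):

@pytest.mark.parametrize('n_bins', [2, 257])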
Example #18
def test_bin_mapper_n_features_transform():
    mapper = _BinMapper(n_bins=42, random_state=42).fit(DATA)
    err_msg = 'This estimator was fitted with 2 features but 4 got passed'
    with pytest.raises(ValueError, match=err_msg):
        mapper.transform(np.repeat(DATA, 2, axis=1))
Example #19
def test_bin_mapper_identity_repeated_values(max_bins, n_distinct, multiplier):
    data = np.array(list(range(n_distinct)) * multiplier).reshape(-1, 1)
    # max_bins is the number of bins for non-missing values
    n_bins = max_bins + 1
    binned = _BinMapper(n_bins=n_bins).fit_transform(data)
    assert_array_equal(data, binned)
Example #20
def test_bin_mapper_identity_small(max_bins, scale, offset):
    data = np.arange(max_bins).reshape(-1, 1) * scale + offset
    # max_bins is the number of bins for non-missing values
    n_bins = max_bins + 1
    binned = _BinMapper(n_bins=n_bins).fit_transform(data)
    assert_array_equal(binned, np.arange(max_bins).reshape(-1, 1))