Example #1
import numpy as np
from numpy.testing import assert_raises
from sklearn.neighbors import KernelDensity, NearestNeighbors


def test_kernel_density_sampling(n_samples=100, n_features=3):
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)

    bandwidth = 0.2

    for kernel in ['gaussian', 'tophat']:
        # draw a sample the same size as the training set
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        samp = kde.sample(100)
        assert X.shape == samp.shape

        # check that samples are in the right range: every sample should
        # lie near at least one training point
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, ind = nbrs.kneighbors(samp, return_distance=True)

        if kernel == 'tophat':
            assert np.all(dist < bandwidth)
        elif kernel == 'gaussian':
            # 5 standard deviations is safe for 100 samples, but there's a
            # very small chance this test could fail.
            assert np.all(dist < 5 * bandwidth)

    # sampling is not implemented for the remaining kernels
    for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']:
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)

    # non-regression test: used to return a scalar
    X = rng.randn(4, 1)
    kde = KernelDensity(kernel="gaussian").fit(X)
    assert kde.sample().shape == (1, 1)
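
For reference, `KernelDensity.sample` draws new points from the fitted density by picking training points at random and perturbing them with the kernel; it is only implemented for the 'gaussian' and 'tophat' kernels, which is what the test above exercises. A minimal usage sketch (the data and bandwidth below are illustrative, not taken from the test):

import numpy as np
from sklearn.neighbors import KernelDensity

# illustrative data: 200 points from a 2D standard normal
rng = np.random.RandomState(42)
X = rng.randn(200, 2)

# fit a Gaussian KDE and draw 10 new points from the estimated density
kde = KernelDensity(kernel='gaussian', bandwidth=0.3).fit(X)
new_points = kde.sample(n_samples=10, random_state=0)
print(new_points.shape)  # (10, 2): one row per sample, one column per feature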
Example #2
import numpy as np
from numpy.testing import assert_allclose
from sklearn.neighbors import KDTree, KernelDensity


def test_kde_sample_weights():
    n_samples = 400
    size_test = 20
    # constant weights: fitting with these must match an unweighted fit
    weights_neutral = np.full(n_samples, 3.)
    for d in [1, 2, 10]:
        rng = np.random.RandomState(0)
        X = rng.rand(n_samples, d)
        # integer weights, plus the equivalent dataset with rows repeated
        weights = 1 + (10 * X.sum(axis=1)).astype(np.int8)
        X_repetitions = np.repeat(X, weights, axis=0)
        n_samples_test = size_test // d
        test_points = rng.rand(n_samples_test, d)
        for algorithm in ['auto', 'ball_tree', 'kd_tree']:
            for metric in ['euclidean', 'minkowski', 'manhattan', 'chebyshev']:
                if algorithm != 'kd_tree' or metric in KDTree.valid_metrics:
                    kde = KernelDensity(algorithm=algorithm, metric=metric)

                    # Test that adding a constant sample weight has no effect
                    kde.fit(X, sample_weight=weights_neutral)
                    scores_const_weight = kde.score_samples(test_points)
                    sample_const_weight = kde.sample(random_state=1234)
                    kde.fit(X)
                    scores_no_weight = kde.score_samples(test_points)
                    sample_no_weight = kde.sample(random_state=1234)
                    assert_allclose(scores_const_weight, scores_no_weight)
                    assert_allclose(sample_const_weight, sample_no_weight)

                    # Test equivalence between sampling and (integer) weights
                    kde.fit(X, sample_weight=weights)
                    scores_weight = kde.score_samples(test_points)
                    sample_weight = kde.sample(random_state=1234)
                    kde.fit(X_repetitions)
                    scores_ref_sampling = kde.score_samples(test_points)
                    sample_ref_sampling = kde.sample(random_state=1234)
                    assert_allclose(scores_weight, scores_ref_sampling)
                    assert_allclose(sample_weight, sample_ref_sampling)

                    # Test that sample weights have a non-trivial effect
                    diff = np.max(np.abs(scores_no_weight - scores_weight))
                    assert diff > 0.001

                    # Test invariance with respect to arbitrary scaling
                    scale_factor = rng.rand()
                    kde.fit(X, sample_weight=(scale_factor * weights))
                    scores_scaled_weight = kde.score_samples(test_points)
                    assert_allclose(scores_scaled_weight, scores_weight)
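
The core property the test above verifies is that fitting with an integer `sample_weight` is equivalent to fitting on the training set with the corresponding rows repeated. A short standalone sketch of that equivalence (the sizes and variable names here are illustrative):

import numpy as np
from numpy.testing import assert_allclose
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
X = rng.rand(50, 2)
weights = rng.randint(1, 4, size=50)  # integer weights in {1, 2, 3}

# weighted fit vs. fit on the explicitly repeated rows
kde_weighted = KernelDensity().fit(X, sample_weight=weights)
kde_repeated = KernelDensity().fit(np.repeat(X, weights, axis=0))

# both estimators assign (numerically) identical log-densities
grid = rng.rand(5, 2)
assert_allclose(kde_weighted.score_samples(grid),
                kde_repeated.score_samples(grid))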