Пример #1
0
    def percentiles_from_tdigest(self, q, *digests):
        """Merge the given t-digests and return the quantiles *q* as a numpy array."""
        # pylint: disable = import-outside-toplevel
        from crick import TDigest

        combined = TDigest()
        combined.merge(*digests)
        return np.array(combined.quantile(q))
Пример #2
0
    def tdigest_chunk(self, arr):
        """Build and return a t-digest summarizing the values in *arr*."""
        # pylint: disable = import-outside-toplevel
        from crick import TDigest

        digest = TDigest()
        digest.update(arr)
        return digest
Пример #3
0
def _percentiles_from_tdigest(qs, digests):
    """Merge *digests* and evaluate the percentiles *qs* (0-100 scale) on the result."""
    from crick import TDigest

    merged = TDigest()
    merged.merge(*digests)

    # TDigest.quantile expects fractions in [0, 1], so rescale the percentiles.
    return np.array(merged.quantile(qs / 100.0))
Пример #4
0
def _tdigest_chunk(a):
    """Return a t-digest built from the values of a single chunk *a*."""
    from crick import TDigest

    digest = TDigest()
    digest.update(a)
    return digest
Пример #5
0
def _percentiles_from_tdigest(qs, digests):
    """Combine the partial *digests* and return percentile values for *qs*.

    *qs* is on the 0-100 percentile scale and is divided by 100 before
    being passed to the digest, which works with fractional quantiles.
    """
    from crick import TDigest

    pooled = TDigest()
    pooled.merge(*digests)

    return np.array(pooled.quantile(qs / 100.0))
Пример #6
0
def _tdigest_chunk(a):
    """Summarize one data chunk *a* into a fresh t-digest and return it."""
    from crick import TDigest

    chunk_digest = TDigest()
    chunk_digest.update(a)

    return chunk_digest
Пример #7
0
    def _compute_image_stats_chunked(dataset: 'DatasetReader') -> Optional[Dict[str, Any]]:
        """Compute statistics for the given rasterio dataset by looping over chunks.

        Iterates block windows of band 1, accumulating a t-digest, summary
        statistics, and the convex hull of valid (unmasked, finite) pixels.
        Returns a dict with valid-pixel percentage, value range, mean, stdev,
        percentiles 1..99, and the convex hull in WGS84 coordinates — or
        ``None`` when the dataset contains no valid data at all.
        """
        from rasterio import features, warp, windows
        from shapely import geometry

        total_count = valid_data_count = 0
        tdigest = TDigest()
        sstats = SummaryStats()
        convex_hull = geometry.Polygon()

        block_windows = [w for _, w in dataset.block_windows(1)]

        for w in block_windows:
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore', message='invalid value encountered.*')
                block_data = dataset.read(1, window=w, masked=True)

            # handle NaNs for float rasters
            block_data = np.ma.masked_invalid(block_data, copy=False)

            total_count += int(block_data.size)
            valid_data = block_data.compressed()

            if valid_data.size == 0:
                continue

            valid_data_count += int(valid_data.size)

            if np.any(block_data.mask):
                # Partially masked block: vectorize the valid-pixel mask into
                # shapes that contribute to the convex hull.
                hull_candidates = RasterDriver._hull_candidate_mask(~block_data.mask)
                hull_shapes = [geometry.shape(s) for s, _ in features.shapes(
                    np.ones(hull_candidates.shape, 'uint8'),
                    mask=hull_candidates,
                    transform=windows.transform(w, dataset.transform)
                )]
            else:
                # Fully valid block: its bounding box is the hull contribution.
                # Bounds renamed from `w, s, e, n` — the original clobbered the
                # loop variable `w` (the window) with the western bound.
                west, south, east, north = windows.bounds(w, dataset.transform)
                hull_shapes = [geometry.Polygon([
                    (west, south), (east, south), (east, north), (west, north)
                ])]
            convex_hull = geometry.MultiPolygon([convex_hull, *hull_shapes]).convex_hull

            tdigest.update(valid_data)
            sstats.update(valid_data)

        if sstats.count() == 0:
            # No valid pixels anywhere in the dataset.
            return None

        convex_hull_wgs = warp.transform_geom(
            dataset.crs, 'epsg:4326', geometry.mapping(convex_hull)
        )

        return {
            'valid_percentage': valid_data_count / total_count * 100,
            'range': (sstats.min(), sstats.max()),
            'mean': sstats.mean(),
            'stdev': sstats.std(),
            'percentiles': tdigest.quantile(np.arange(0.01, 1, 0.01)),
            'convex_hull': convex_hull_wgs
        }
def test_init():
    """Compression is stored; non-numeric or non-finite values are rejected."""
    t = TDigest(500)
    assert t.compression == 500

    with pytest.raises(TypeError):
        TDigest('foo')

    for bad in (np.nan, np.inf, -np.inf):
        with pytest.raises(ValueError):
            TDigest(bad)
def test_histogram_empty():
    """An empty digest yields an all-zero histogram with strictly increasing bins."""
    digest = TDigest()
    for bins_arg, rng in [(5, None), (5, (-1, 1)), (np.arange(6), None)]:
        counts, edges = digest.histogram(bins=bins_arg, range=rng)
        assert len(counts) == 5
        assert len(edges) == 6
        if rng is not None:
            # Explicit range pins the outermost edges.
            assert edges[0] == rng[0]
            assert edges[-1] == rng[1]
        assert (counts == 0).all()
        assert (np.diff(edges) > 0).all()
def test_serialize():
    """Pickle round-trip preserves all digest state, for empty and filled digests."""
    filled = TDigest()
    filled.update(gamma)
    for original in (filled, TDigest()):
        restored = pickle.loads(pickle.dumps(original))
        assert original.compression == restored.compression
        assert (original.centroids() == restored.centroids()).all()
        np.testing.assert_equal(original.min(), restored.min())
        np.testing.assert_equal(original.max(), restored.max())
        np.testing.assert_equal(original.size(), restored.size())
def test_update_non_numeric_errors():
    """Non-numeric inputs to update/add raise TypeError."""
    strings = np.array(['foo', 'bar', 'baz'])
    digest = TDigest()

    with pytest.raises(TypeError):
        digest.update(strings)

    # Non-numeric weights are rejected as well.
    with pytest.raises(TypeError):
        digest.update(1, strings)

    with pytest.raises(TypeError):
        digest.add('foo')

    with pytest.raises(TypeError):
        digest.add(1, 'foo')
def test_empty():
    """A freshly created digest has zero size and NaN summary statistics."""
    digest = TDigest()
    assert digest.size() == 0
    assert len(digest.centroids()) == 0
    for stat in (digest.min(), digest.max(), digest.quantile(0.5), digest.cdf(0.5)):
        assert np.isnan(stat)
Пример #13
0
        def __init__(self, loop=None, intervals=(5, 60, 3600)):
            """Set up one t-digest per interval and schedule the periodic shift.

            Parameters
            ----------
            loop : optional
                Event loop to schedule callbacks on; defaults to the
                current ``IOLoop``.
            intervals : tuple of int
                Shift intervals in seconds, one digest per interval.
            """
            self.intervals = intervals
            # One digest per interval; the loop variable was unused, so `_`.
            self.components = [TDigest() for _ in self.intervals]

            self.loop = loop or IOLoop.current()
            # PeriodicCallback takes milliseconds; shift at the shortest interval.
            self._pc = PeriodicCallback(self.shift, self.intervals[0] * 1000)
            self.loop.add_callback(self._pc.start)
def test_quantile_and_cdf_shape():
    """quantile/cdf return scalars for scalars and preserve array input shapes."""
    digest = TDigest()
    digest.update(np.arange(5))

    # Scalar in, numpy scalar out.
    assert isinstance(digest.quantile(0.5), np.float64)
    assert isinstance(digest.cdf(2), np.float64)

    # Empty input yields an empty 1-d result.
    assert digest.quantile(()).shape == (0,)
    assert digest.cdf(()).shape == (0,)

    probe_arrays = [np.array([0.5, 0.9]),
                    np.array([[0.5, 0.9], [0, 1]]),
                    np.linspace(0, 1, 100)[10:-10:2]]
    for probes in probe_arrays:
        assert digest.quantile(probes).shape == probes.shape
        assert digest.cdf(probes).shape == probes.shape
def test_histogram_small_n():
    """Histograms behave sensibly for digests holding only one or two points."""
    digest = TDigest()
    digest.add(1)

    counts, edges = digest.histogram(10)
    assert len(counts) == 10
    assert len(edges) == 11
    # A single point gets a unit-wide range centered on it.
    assert edges[0] == 0.5
    assert edges[-1] == 1.5
    assert counts.sum() == 1

    digest.add(2)
    counts, edges = digest.histogram(10)
    assert counts.sum() == 2
    assert edges[0] == 1
    assert edges[-1] == 2

    # A range disjoint from the data yields an empty histogram.
    counts, edges = digest.histogram(range=(-5, -3))
    assert counts.sum() == 0
def test_distributions(data):
    """Quantile and CDF estimates stay close to the empirical distribution."""
    digest = TDigest()
    digest.update(data)

    assert digest.size() == len(data)
    assert digest.min() == data.min()
    assert digest.max() == data.max()

    check_valid_quantile_and_cdf(digest)

    probs = np.array([0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99, 0.999])

    # Quantile accuracy: map the estimates back to probabilities and compare.
    estimates = digest.quantile(probs)
    np.testing.assert_allclose(probs, quantiles_to_q(data, estimates),
                               atol=0.012, rtol=0)

    # CDF accuracy at the true quantile locations.
    locations = q_to_x(data, probs)
    np.testing.assert_allclose(probs, digest.cdf(locations), atol=0.005)
def test_scale():
    """scale() returns a new digest with all centroid weights multiplied."""
    digest = TDigest()
    digest.update(uniform)

    for factor in (0.5, 2):
        scaled = digest.scale(factor)
        assert scaled is not digest
        assert digest.size() * factor == scaled.size()
        assert digest.min() == scaled.min()
        assert digest.max() == scaled.max()
        before = digest.centroids()
        after = scaled.centroids()
        # Means are untouched; only weights scale.
        np.testing.assert_array_equal(before['mean'], after['mean'])
        np.testing.assert_allclose(before['weight'] * factor, after['weight'])

    # Non-positive or non-finite factors are rejected.
    for bad in (-0.5, 0, np.nan, np.inf):
        with pytest.raises(ValueError):
            digest.scale(bad)

    with pytest.raises(TypeError):
        digest.scale('foobar')

    # Scaling by machine epsilon compacts tiny-weight centroids away.
    eps = np.finfo('f8').eps
    digest = TDigest()
    digest.update([1, 2, 3, 4, 5],
                  [1, 1000, 1, 10000, 1])
    assert len(digest.scale(eps).centroids()) == 2

    # All weights negligible: everything compacts to nothing.
    digest = TDigest()
    digest.update([1, 2, 3, 4, 5])
    assert len(digest.scale(eps).centroids()) == 0
def test_merge():
    """merge() combines digests without mutating its arguments."""
    target = TDigest()
    lo_digest = TDigest()
    hi_digest = TDigest()
    lo = np.random.uniform(0, 1, N)
    hi = np.random.uniform(2, 3, N)
    pooled = np.concatenate([lo, hi])
    lo_digest.update(lo)
    hi_digest.update(hi)

    lo_centroids_before = lo_digest.centroids()

    target.merge(lo_digest, hi_digest)
    assert target.min() == min(lo_digest.min(), hi_digest.min())
    assert target.max() == max(lo_digest.max(), hi_digest.max())
    assert target.size() == lo_digest.size() + hi_digest.size()
    # Merging must leave the source digests untouched.
    assert (lo_digest.centroids() == lo_centroids_before).all()

    probs = np.array([0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99, 0.999])

    # Quantile estimates should match the pooled data.
    estimates = target.quantile(probs)
    np.testing.assert_allclose(probs, quantiles_to_q(pooled, estimates),
                               atol=0.012, rtol=0)

    # CDF estimates at the pooled data's quantile locations.
    locations = q_to_x(pooled, probs)
    np.testing.assert_allclose(probs, target.cdf(locations), atol=0.005)

    with pytest.raises(TypeError):
        target.merge(lo_digest, 'not a tdigest')
def test_repr():
    """The repr reflects the configured compression and current size."""
    digest = TDigest(500)
    assert str(digest) == "TDigest<compression=500.0, size=0.0>"
    digest.update(np.arange(100))
    assert str(digest) == "TDigest<compression=500.0, size=100.0>"
def test_weights():
    """Scalar and per-element weights accumulate into the digest size."""
    digest = TDigest()
    digest.add(1, 10)
    assert digest.size() == 10

    values = np.arange(5)
    weights = np.array([1, 2, 1, 2, 1])

    # A scalar weight broadcasts over every value.
    digest = TDigest()
    digest.update(values, 10)
    assert digest.size() == len(values) * 10

    # Per-element weights sum to the total size.
    digest = TDigest()
    digest.update(values, weights)
    assert digest.size() == weights.sum()
def test_histogram_errors():
    """Invalid ``bins``/``range`` arguments to histogram() raise errors."""
    t = TDigest()
    # BUG FIX: the original called np.random.uniform(1000), which passes 1000
    # as ``low`` (with default high=1.0) and draws a single scalar; the intent
    # was clearly 1000 uniform samples.
    t.update(np.random.uniform(size=1000))

    for r in [('a', 'b'), 1]:
        with pytest.raises(TypeError):
            t.histogram(range=r)

    with pytest.raises(Exception):
        t.histogram(range=1)

    # Non-finite range endpoints are rejected.
    for r in [(np.nan, 1), (np.inf, 1), (1, np.nan), (1, np.inf)]:
        with pytest.raises(ValueError):
            t.histogram(range=r)

    # Reversed range is rejected.
    with pytest.raises(ValueError):
        t.histogram(range=(1, 0))

    # Bad bins: wrong type, negative count, wrong rank, non-monotonic, NaN.
    for b in ['a', -1, np.arange(4).reshape((2, 2)),
              np.arange(0, 10, -1), np.array([np.nan, 0, 1])]:
        with pytest.raises(ValueError):
            t.histogram(bins=b)
def test_single():
    """A digest holding one value reports it for every summary statistic."""
    digest = TDigest()
    digest.add(10)
    assert digest.min() == 10
    assert digest.max() == 10
    assert digest.size() == 1

    # Every quantile of a one-point distribution is that point.
    for q in (0, 0.5, 1):
        assert digest.quantile(q) == 10

    # The CDF steps from 0 to 1, passing through 0.5 at the point itself.
    assert digest.cdf(9) == 0
    assert digest.cdf(10) == 0.5
    assert digest.cdf(11) == 1
def test_nonfinite():
    """NaN/inf samples are skipped; non-finite weights raise ValueError."""
    digest = TDigest()
    samples = gamma.copy()
    samples[::10] = np.nan
    samples[::7] = np.inf
    digest.update(samples)
    finite = samples[np.isfinite(samples)]
    # Only the finite samples count toward size and extremes.
    assert digest.size() == len(finite)
    assert digest.min() == finite.min()
    assert digest.max() == finite.max()

    # Adding only non-finite values leaves the digest empty.
    digest = TDigest()
    digest.add(np.nan)
    digest.add(np.inf)
    digest.add(-np.inf)
    assert digest.size() == 0

    for bad_weight in (np.inf, -np.inf, np.nan):
        digest = TDigest()
        with pytest.raises(ValueError):
            digest.add(1, bad_weight)

        digest = TDigest()
        with pytest.raises(ValueError):
            digest.update(np.ones(5), np.array([1, 2, bad_weight, 3, 4]))
def test_small_w():
    """Weights at machine epsilon are too small to register in the digest."""
    tiny = np.finfo('f8').eps

    digest = TDigest()
    digest.update(gamma, tiny)
    assert digest.size() == 0
    assert len(digest.centroids()) == 0

    digest = TDigest()
    digest.add(1, tiny)
    assert digest.size() == 0
    assert len(digest.centroids()) == 0
def test_quantile_and_cdf_non_numeric():
    """Non-numeric arguments to quantile/cdf/update raise TypeError."""
    digest = TDigest()
    digest.update(np.arange(5))

    with pytest.raises(TypeError):
        digest.quantile('foo')

    with pytest.raises(TypeError):
        digest.update(['foo'])

    with pytest.raises(TypeError):
        digest.cdf('foo')

    with pytest.raises(TypeError):
        digest.cdf(['foo'])
def test_histogram():
    """Histogram counts agree with the CDF and respect explicit bin edges."""
    t = TDigest()
    data = np.random.normal(size=10000)
    t.update(data)
    hist, bins = t.histogram(100)
    assert len(hist) == 100
    assert len(bins) == 101
    # Each bin's count equals its CDF mass times the total size.
    c = t.cdf(bins)
    np.testing.assert_allclose((c[1:] - c[:-1]) * t.size(), hist)

    # Renamed from `min`/`max` — the original shadowed the builtins.
    lo = t.min()
    hi = t.max()
    eps = np.finfo('f8').eps
    bins = np.array([lo - 1, lo - eps,
                     lo, lo + (hi - lo) / 2, hi,
                     hi + eps, hi + 1])
    hist, bins2 = t.histogram(bins)
    np.testing.assert_allclose(bins, bins2)
    # Bins entirely outside [min, max] hold no mass.
    assert hist[0] == 0
    assert hist[1] == 0
    assert hist[-2] == 0
    assert hist[-1] == 0
    assert hist.sum() == t.size()

    # range ignored when bins provided
    hist2, bins2 = t.histogram(bins, range=(-5, -3))
    np.testing.assert_allclose(hist, hist2)
    np.testing.assert_allclose(bins, bins2)