def test_missing():
    """Check that a ``None`` point is tallied as missing, not binned.

    Five points (one of them ``None``) are streamed into a two-bin
    histogram; exactly one missing value and two bins are expected.
    """
    samples = [1, None, 1, 4, 6]
    hist = StreamHist(maxbins=2)
    for sample in samples:
        hist.update(sample)

    assert hist.missing_count == 1
    assert len(hist.bins) == 2
    # Index 0 of a bin is presumably its value/mean -- TODO confirm.
    assert hist.bins[0][0] == 1
    assert hist.bins[1][0] == 5
def test_negative_densities():
    """Check that ``pdf`` is non-negative on a grid spanning [min, max]."""
    from numpy import linspace

    sample_size = 10000
    hist = StreamHist()
    hist.update(make_normal(sample_size))

    grid = linspace(hist.min(), hist.max(), 100)
    # Evaluate every grid point and keep those that fail the ``>= 0`` check.
    offenders = [t for t in grid if not hist.pdf(t) >= 0.]
    assert not offenders
def test_freeze():
    """Check summary statistics of a histogram built with ``freeze=500``.

    Samples come from ``make_normal``; the sum at 0, median, mean and
    variance are compared with the values expected for that data using
    the ``about`` helper.
    """
    sample_size = 100000
    hist = StreamHist(freeze=500)
    for sample in make_normal(sample_size):
        hist.update(sample)

    half = sample_size / 2.0
    slack = sample_size / 50.0
    assert about(hist.sum(0), half, slack)
    assert about(hist.median(), 0, 0.05)
    assert about(hist.mean(), 0, 0.05)
    assert about(hist.var(), 1, 0.05)
def test_string():
    """Check ``str()`` of an empty histogram and of one holding 0..4."""
    hist = StreamHist(maxbins=5)
    assert str(hist) == "Empty histogram"

    hist.update(range(5))
    # Header, one "value<TAB>count" row per bin, then the footer lines.
    expected = (
        "Mean\tCount\n----\t-----\n"
        "0\t1\n1\t1\n2\t1\n3\t1\n4\t1"
        "\n----\t-----\nMissing values: 0\nTotal count: 5"
    )
    assert str(hist) == expected
def test_weighted_gap():
    """Compare the edge bins of weighted and non-weighted histograms.

    Histograms using weighted gaps are less eager to merge bins with
    large counts, so a non-weighted histogram should spend more of its
    bins on the tails of the distribution. Both variants are fed the
    same normally distributed samples, and the test requires the first
    and last bins of the weighted histogram to hold more points in
    total than the first and last bins of the non-weighted one.
    """
    sample_size = 10000
    weighted = StreamHist(maxbins=32, weighted=True)
    plain = StreamHist(maxbins=32, weighted=False)
    for sample in make_normal(sample_size):
        weighted.update(sample)
        plain.update(sample)

    def edge_count(bins):
        # Combined count of the lowest and highest bin.
        return bins[0].count + bins[-1].count

    weighted_bins = weighted.bins
    plain_bins = plain.bins
    assert edge_count(weighted_bins) > edge_count(plain_bins)
def test_min_max():
    """Check ``min``/``max`` when empty, after updates, and after a merge."""
    hist = StreamHist()
    assert hist.min() is None
    assert hist.max() is None

    # NOTE(review): relies on rand_int(10) covering both 0 and 10 within
    # 1000 draws -- presumably an inclusive range; confirm against helper.
    for _ in range(1000):
        hist.update(rand_int(10))
    assert hist.min() == 0
    assert hist.max() == 10

    lower = StreamHist()
    upper = StreamHist()
    for offset in range(4):
        lower.update(offset)       # 0..3
        upper.update(offset + 2)   # 2..5
    combined = lower.merge(upper)
    assert combined.min() == 0
    assert combined.max() == 5
def test_compute_breaks():
    """Compare ``compute_breaks`` with ``numpy.histogram`` on the same data."""
    from numpy import histogram, allclose

    sample_size = 10000
    n_bins = 25
    samples = make_normal(sample_size)
    hist = StreamHist().update(samples)

    np_counts, np_edges = histogram(samples, bins=n_bins)
    sh_counts, sh_edges = hist.compute_breaks(n_bins)

    # Edges must agree closely; counts only within a loose tolerance.
    assert allclose(np_edges, sh_edges)
    assert allclose(np_counts, sh_counts, rtol=1,
                    atol=sample_size / (n_bins**2))
def test_sum_first_half_of_first_bin():
    """Check ``sum`` at points between the minimum and the first bin value.

    Regression test for
    https://github.com/carsonfarmer/streamhist/issues/13
    """
    hist = StreamHist(maxbins=5)
    hist.update((1, 2, 3, 4, 5, .5))
    assert hist.min() == 0.5

    first = hist.bins[0]
    assert first.value == 0.75
    assert first.count == 2

    assert hist.sum(hist.min()) == 0
    # Query halfway between the minimum and the first bin's value.
    halfway = (hist.min() + first.value) / 2
    assert hist.sum(halfway) == (.5**2) * first.count / 2
def test_paper_example():
    """Test Appendix A example from Ben-Haim paper."""
    from numpy import allclose

    def as_pairs(hist):
        # Flatten a histogram into (value, count) tuples for comparison.
        return [(b.value, b.count) for b in hist.bins]

    first = StreamHist(maxbins=5)
    first.update((23, 19, 10, 16, 36, 2, 9))
    expected_first = [(2, 1), (9.5, 2), (17.5, 2), (23, 1), (36, 1)]
    assert allclose(as_pairs(first), expected_first)

    second = StreamHist(maxbins=5)
    second.update((32, 30, 45))

    combined = first + second
    expected_combined = [(2, 1), (9.5, 2), (19.33, 3), (32.67, 3), (45, 1)]
    assert allclose(as_pairs(combined), expected_combined, rtol=1e-3)
    assert about(combined.sum(15), 3.275, 1e-3)
def test_compute_breaks_with_min_max():
    """Compare ``compute_breaks`` with ``numpy.histogram`` on a fixed range.

    Both are asked for 25 bins over the explicit interval [-10, 10].
    """
    from numpy import histogram, allclose

    sample_size = 10000
    n_bins = 25
    samples = make_normal(sample_size)
    hist = StreamHist().update(samples)

    np_counts, np_edges = histogram(samples, bins=n_bins, range=(-10, 10))
    sh_counts, sh_edges = hist.compute_breaks(n_bins, minimum=-10, maximum=10)

    # Edges must agree closely; counts only within a loose tolerance.
    assert allclose(np_edges, sh_edges)
    assert allclose(np_counts, sh_counts, rtol=1,
                    atol=sample_size / (n_bins**2))
def test_describe():
    """Check the entries of ``describe()`` for samples from ``make_uniform``."""
    sample_size = 10000
    samples = make_uniform(sample_size)
    hist = StreamHist().update(samples)

    summary = hist.describe(quantiles=[0.5])
    print(summary)

    # (key in the summary, expected value); each is checked to within 0.05.
    expectations = (
        ("50%", 0.5),
        ("min", 0.0),
        ("max", 1.0),
        ("mean", 0.5),
        ("var", 0.08),
    )
    for key, target in expectations:
        assert about(summary[key], target, 0.05)
    assert summary["count"] == sample_size
def test_density():
    """Check ``density`` at fixed points for the data 1, 2, 2, 3."""
    hist = StreamHist()
    for sample in [1., 2., 2., 3.]:
        hist.update(sample)

    # (expected density, query point), in the original order.
    cases = (
        (0.0, 0.0),
        (0.0, 0.5),
        (0.5, 1.0),
        (1.5, 1.5),
        (2.0, 2.0),
        (1.5, 2.5),
        (0.5, 3.0),
        (0.0, 3.5),
        (0.0, 4.0),
    )
    for expected, point in cases:
        assert about(expected, hist.density(point), 1e-10)
def test_histogram_exact():
    """A StreamHist which is not at capacity matches numpy statistics"""
    import numpy as np

    capacity = 50
    # Exactly as many samples as bins, so the histogram is not over capacity.
    samples = [random.expovariate(1 / 5) for _ in range(capacity)]
    hist = StreamHist(capacity)
    hist.update(samples)

    # Every percentile from 0% to 100%.
    levels = [i / 100 for i in range(101)]
    assert hist.quantiles(*levels) == approx(np.quantile(samples, levels))

    assert hist.mean() == approx(np.mean(samples))
    assert hist.var() == approx(np.var(samples))
    assert hist.min() == min(samples)
    assert hist.max() == max(samples)
    assert hist.count() == capacity
def test_merge():
    """Check ``merge`` for empty, one-point, many-way and missing-value cases."""
    # Merging with empty histograms on either side.
    assert len(StreamHist().merge(StreamHist()).bins) == 0
    assert len(StreamHist().merge(StreamHist().update(1)).bins) == 1
    assert len(StreamHist().update(1).merge(StreamHist()).bins) == 1

    per_hist = 1000
    n_hists = 10
    parts = []
    for _ in range(n_hists):
        part = StreamHist()
        for sample in make_normal(per_hist):
            part.update(sample)
        parts.append(part)

    # Left fold: ((p0.merge(p1)).merge(p2)) ...
    merged = parts[0]
    for part in parts[1:]:
        merged = merged.merge(part)

    total_points = per_hist * n_hists
    assert about(merged.sum(0), total_points / 2.0, total_points / 50.0)

    # Each side holds one real point and one None; the merged total is 2.
    left = StreamHist().update(1).update(None)
    right = StreamHist().update(2).update(None)
    merged = left.merge(right)
    assert merged.total == 2
def test_histogram_approx(max_bins, num_points, expected_error):
    """Test accuracy of StreamHist over capacity, especially quantiles.

    For each percentile the absolute quantile error is divided by the
    spread between the numpy quantiles seven percentiles below and above
    (clamped to 0% and 100%), and the sum of these normalized errors must
    not exceed ``expected_error``.

    NOTE(review): the arguments are presumably supplied by a
    parametrize decorator not visible here -- confirm.
    """
    import numpy as np

    samples = [random.expovariate(1 / 5) for _ in range(num_points)]
    hist = StreamHist(max_bins)
    hist.update(samples)

    levels = [i / 100 for i in range(101)]
    rows = zip(
        levels,
        hist.quantiles(*levels),
        np.quantile(samples, levels),
        np.quantile(samples, [0] * 7 + levels),      # window lower bound
        np.quantile(samples, levels[7:] + [1] * 7),  # window upper bound
    )
    total_error = sum(
        abs(estimate - exact) / (upper - lower)
        for _, estimate, exact, lower, upper in rows
    )
    assert total_error <= expected_error

    assert hist.mean() == approx(np.mean(samples))
    assert hist.var() == approx(np.var(samples), rel=.05)
    assert hist.min() == min(samples)
    assert hist.max() == max(samples)
    assert hist.count() == num_points
def test_iris_regression():
    """Regression test: a 32-bin histogram of 150 fixed measurements.

    The serialized bins from ``to_dict()`` must equal the recorded
    ``count``/``mean`` values exactly.
    """
    sepal_length = [
        5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9,
        5.4, 4.8, 4.8, 4.3, 5.8, 5.7, 5.4, 5.1, 5.7, 5.1,
        5.4, 5.1, 4.6, 5.1, 4.8, 5.0, 5.0, 5.2, 5.2, 4.7,
        4.8, 5.4, 5.2, 5.5, 4.9, 5.0, 5.5, 4.9, 4.4, 5.1,
        5.0, 4.5, 4.4, 5.0, 5.1, 4.8, 5.1, 4.6, 5.3, 5.0,
        7.0, 6.4, 6.9, 5.5, 6.5, 5.7, 6.3, 4.9, 6.6, 5.2,
        5.0, 5.9, 6.0, 6.1, 5.6, 6.7, 5.6, 5.8, 6.2, 5.6,
        5.9, 6.1, 6.3, 6.1, 6.4, 6.6, 6.8, 6.7, 6.0, 5.7,
        5.5, 5.5, 5.8, 6.0, 5.4, 6.0, 6.7, 6.3, 5.6, 5.5,
        5.5, 6.1, 5.8, 5.0, 5.6, 5.7, 5.7, 6.2, 5.1, 5.7,
        6.3, 5.8, 7.1, 6.3, 6.5, 7.6, 4.9, 7.3, 6.7, 7.2,
        6.5, 6.4, 6.8, 5.7, 5.8, 6.4, 6.5, 7.7, 7.7, 6.0,
        6.9, 5.6, 7.7, 6.3, 6.7, 7.2, 6.2, 6.1, 6.4, 7.2,
        7.4, 7.9, 6.4, 6.3, 6.1, 7.7, 6.3, 6.4, 6.0, 6.9,
        6.7, 6.9, 5.8, 6.8, 6.7, 6.7, 6.3, 6.5, 6.2, 5.9,
    ]
    hist = StreamHist(maxbins=32)
    hist.update(sepal_length)

    # Recorded bins as (count, mean) pairs, lowest mean first.
    recorded = [
        (1, 4.3),
        (4, 4.425000000000001),
        (4, 4.6),
        (7, 4.771428571428571),
        (6, 4.8999999999999995),
        (10, 5.0),
        (9, 5.1),
        (4, 5.2),
        (1, 5.3),
        (6, 5.3999999999999995),
        (7, 5.5),
        (6, 5.6000000000000005),
        (15, 5.746666666666667),
        (3, 5.900000000000001),
        (6, 6.0),
        (6, 6.1000000000000005),
        (4, 6.2),
        (9, 6.299999999999999),
        (7, 6.3999999999999995),
        (5, 6.5),
        (2, 6.6),
        (8, 6.700000000000001),
        (3, 6.8),
        (4, 6.9),
        (1, 7.0),
        (1, 7.1),
        (3, 7.2),
        (1, 7.3),
        (1, 7.4),
        (1, 7.6),
        (4, 7.7),
        (1, 7.9),
    ]
    expected_bins = [{'count': count, 'mean': mean} for count, mean in recorded]
    assert hist.to_dict()["bins"] == expected_bins
def test_sum_edges():
    """Check ``sum`` at the midpoint and at both extremes of a 2-point histogram."""
    hist = StreamHist().update(0).update(10)

    midpoint_sum = hist.sum(5)
    assert midpoint_sum == 1

    lower_edge_sum = hist.sum(0)
    assert lower_edge_sum == 0.5

    upper_edge_sum = hist.sum(10)
    assert upper_edge_sum == 2
def test_point_density_at_zero():
    """Check ``density(0)`` for a three-point and a single-point histogram."""
    spread = StreamHist().update(-1).update(0).update(1)
    assert spread.density(0) == 1

    # With 0 as the only point, the density there is expected to be infinite.
    single = StreamHist().update(0)
    assert single.density(0) == float("inf")
def test_exception():
    """Check that the ``sum`` calls below raise ``TypeError``.

    NOTE(review): both calls share one ``pytest.raises`` block, so the
    block is satisfied by whichever call raises first -- confirm that
    this is the intended structure.
    """
    with pytest.raises(TypeError):
        empty = StreamHist()
        empty.sum(5)
        single = StreamHist().update(4)
        single.sum(None)
def test_negative_zero():
    """Check that ``0.0`` and ``-0.0`` end up in a single bin."""
    hist = StreamHist().update(0.0).update(-0.0)
    bin_count = len(hist.bins)
    assert bin_count == 1
def test_weighted():
    """Check that a weighted histogram's ``total`` equals the points fed in."""
    samples = [1, 2, 2, 3, 4]
    hist = StreamHist(maxbins=3, weighted=True)
    for sample in samples:
        hist.update(sample)

    expected_total = len(samples)
    assert hist.total == expected_total
def test_update_total():
    """Check that ``total`` and ``count()`` agree after each bulk update."""
    hist = StreamHist(maxbins=5)

    hist.update(range(5))
    total, counted = hist.total, hist.count()
    assert total == counted
    assert counted == 5

    # A second identical batch doubles both figures.
    hist.update(range(5))
    total, counted = hist.total, hist.count()
    assert total == counted
    assert counted == 10
def test_hist():
    """Smoke test: a default ``StreamHist`` can be constructed."""
    hist = StreamHist()
    assert hist is not None
def test_mean():
    """Check the mean of the integers 0..1000 streamed one at a time."""
    n_points = 1001
    hist = StreamHist()
    for value in range(n_points):
        hist.update(value)

    # Mean of 0..n-1 is (n - 1) / 2.
    expected_mean = (n_points - 1) / 2.0
    assert hist.mean() == expected_mean
import random

from streamhist import StreamHist


def make_normal(size):
    """Return a list of ``size`` draws from a normal distribution (mean 0, sd 1)."""
    draws = []
    for _ in range(size):
        draws.append(random.normalvariate(0.0, 1.0))
    return draws


# Workload: stream 10000 normal samples into a 50-bin histogram.
points = 10000
data = make_normal(points)
h1 = StreamHist(maxbins=50)
h1.update(data)

# Recorded times (in seconds) for successive implementations:
# 1.421 - bins (getter/setter)
# 0.955 - bins (direct access)
# 0.977 - bins (slots)
# 0.824 - bins (slots) w/out numpy
# 0.737 - current version
def test_cdf_pdf():
    """Check that about half of the ``make_normal`` samples lie at or below 0."""
    sample_size = 10000
    hist = StreamHist()
    samples = make_normal(sample_size)
    hist.update(samples)

    expected_below = sample_size / 2.0
    tolerance = sample_size / 50.0
    assert about(hist.sum(0), expected_below, tolerance)
def test_count():
    """Check that ``count()`` and ``total`` both equal the points added."""
    n_points = 15
    hist = StreamHist().update(make_normal(n_points))

    counted = hist.count()
    total = hist.total
    assert counted == total
    assert total == n_points
def test_round_trip():
    """Check that ``to_dict`` followed by ``from_dict`` reproduces the histogram."""
    hist = StreamHist().update([1, 1, 4])

    serialized = hist.to_dict()
    restored = hist.from_dict(hist.to_dict())
    assert serialized == restored.to_dict()