import operator
import random
from functools import reduce

import numpy as np
from pytest import approx

from streamhist import StreamHist

# Helper functions used throughout these tests (about(), make_normal(),
# make_uniform(), rand_int()) are assumed to be defined elsewhere in the
# test module; they are used below but not shown here.


def test_counts():
    data = [605, 760, 610, 615, 605, 780, 605, 905]
    h = StreamHist(maxbins=4, weighted=False)
    for p in data:
        h.update(p)
    counts = [b[1] for b in h.bins]
    assert len(data) == reduce(operator.add, counts) == h.total

def test_warm_start_with_history():
    normal_data = np.random.normal(0, 10, 10)
    h1 = StreamHist(maxbins=8)
    h1.update(normal_data)

    d = h1.to_dict()
    h2 = StreamHist.from_dict(d)
    assert str(h2) == str(h1)

def test_len():
    h = StreamHist(maxbins=5)
    assert len(h) == 0

    h.update(range(5))
    assert len(h) == len(h.bins) == 5

    h.update(range(5))
    assert len(h) == len(h.bins) == 5

def test_bounds():
    points = range(15)
    h = StreamHist(maxbins=8)
    h.update(points)
    assert h.bounds() == (0, 14)

    h = StreamHist()
    assert h.bounds() == (None, None)

def test_missing():
    data = [1, None, 1, 4, 6]
    h = StreamHist(maxbins=2)
    for p in data:
        h.update(p)
    assert h.missing_count == 1
    assert len(h.bins) == 2
    assert h.bins[0][0] == 1 and h.bins[1][0] == 5

def test_string():
    h = StreamHist(maxbins=5)
    assert str(h) == "Empty histogram"

    h.update(range(5))
    string = "Mean\tCount\n----\t-----\n"
    string += "0\t1\n1\t1\n2\t1\n3\t1\n4\t1"
    string += "\n----\t-----\nMissing values: 0\nTotal count: 5"
    assert str(h) == string

def test_negative_densities():
    points = 10000
    h = StreamHist()
    data = make_normal(points)
    h.update(data)

    x = np.linspace(h.min(), h.max(), 100)
    assert all([h.pdf(t) >= 0. for t in x])

def test_freeze():
    points = 100000
    h = StreamHist(freeze=500)
    for p in make_normal(points):
        h.update(p)
    assert about(h.sum(0), points / 2.0, points / 50.0)
    assert about(h.median(), 0, 0.05)
    assert about(h.mean(), 0, 0.05)
    assert about(h.var(), 1, 0.05)

def test_var():
    assert StreamHist().update(1).var() is None

    h = StreamHist()
    for p in [1, 1, 2, 3, 4, 5, 6, 6]:
        h.update(p)
    assert h.var() == 3.75

    h = StreamHist()
    for p in make_normal(10000):
        h.update(p)
    assert about(h.var(), 1, 0.05)

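# A small cross-check, not part of the original suite: 3.75 above is the
# population variance (ddof=0) of the listed sample, which is also what
# numpy.var computes by default in the comparisons further down.
def test_var_is_population_variance():
    data = [1, 1, 2, 3, 4, 5, 6, 6]
    mean = sum(data) / len(data)                               # 3.5
    pop_var = sum((x - mean) ** 2 for x in data) / len(data)   # 30 / 8
    assert pop_var == 3.75
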
def test_exact_median():
    points = range(15)  # Odd number of points
    h = StreamHist(maxbins=17)
    h.update(points)
    assert h.median() == 7

    points = range(16)  # Even number of points
    h = StreamHist(maxbins=17)
    h.update(points)
    assert h.median() == 7.5

def test_copy():
    h1 = StreamHist()
    h2 = h1.copy()
    assert h1.bins == h2.bins

    h1.update(make_normal(1000))
    assert h1.bins != h2.bins

    h2 = h1.copy()
    assert h1.bins == h2.bins

    h1 = StreamHist().update([p for p in range(4)])
    h2 = h1.copy()
    assert h1.to_dict() == h2.to_dict()

def test_sum_first_half_of_first_bin():
    # Test sum at a point between the minimum and the first bin value.
    # https://github.com/carsonfarmer/streamhist/issues/13
    h = StreamHist(maxbins=5)
    h.update((1, 2, 3, 4, 5, .5))
    assert h.min() == 0.5

    bin0 = h.bins[0]
    assert bin0.value == 0.75
    assert bin0.count == 2
    assert h.sum(h.min()) == 0
    assert h.sum((h.min() + bin0.value) / 2) == (.5**2) * bin0.count / 2

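# A minimal sketch, not part of the original suite, spelling out where the
# (.5 ** 2) factor above comes from, assuming the usual trapezoidal
# interpolation between the minimum (a zero-count endpoint) and the first
# bin centroid.
def test_sum_first_half_of_first_bin_worked():
    h = StreamHist(maxbins=5)
    h.update((1, 2, 3, 4, 5, .5))
    bin0 = h.bins[0]                                # centroid 0.75, count 2
    x = (h.min() + bin0.value) / 2                  # 0.625
    frac = (x - h.min()) / (bin0.value - h.min())   # 0.5 of the way to the centroid
    # Trapezoid rising from height 0 at the minimum to frac * count at x:
    # area = frac * (0 + frac * count) / 2 = frac ** 2 * count / 2 = 0.25
    assert h.sum(x) == approx(frac ** 2 * bin0.count / 2)
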
def test_trim():
    points = 1000
    h = StreamHist(maxbins=10)
    for _ in range(points):
        h.update(rand_int(10))
    assert len(h.bins) == 10 and h.total == points

    h = StreamHist(maxbins=10)
    for _ in range(points):
        h.insert(rand_int(10), 1)
    h.trim()
    assert len(h.bins) == 10 and h.total == points

def test_update_vs_insert():
    points = 1000
    data = make_normal(points)

    h1 = StreamHist(maxbins=50)
    h1.update(data)

    h2 = StreamHist(maxbins=50)
    for p in data:
        h2.insert(p, 1)
    h2.trim()
    h2.trim()

    assert h1.to_dict() == h2.to_dict()

def test_median_mean():
    points = 10000

    h = StreamHist()
    for p in make_uniform(points):
        h.update(p)
    assert about(h.median(), 0.5, 0.05)

    h = StreamHist()
    for p in make_normal(points):
        h.update(p)
    assert about(h.median(), 0, 0.05)
    assert about(h.mean(), 0, 0.05)

def test_density():
    h = StreamHist()
    for p in [1., 2., 2., 3.]:
        h.update(p)

    assert about(0.0, h.density(0.0), 1e-10)
    assert about(0.0, h.density(0.5), 1e-10)
    assert about(0.5, h.density(1.0), 1e-10)
    assert about(1.5, h.density(1.5), 1e-10)
    assert about(2.0, h.density(2.0), 1e-10)
    assert about(1.5, h.density(2.5), 1e-10)
    assert about(0.5, h.density(3.0), 1e-10)
    assert about(0.0, h.density(3.5), 1e-10)
    assert about(0.0, h.density(4.0), 1e-10)

def test_quantiles():
    points = 10000

    h = StreamHist()
    for p in make_uniform(points):
        h.update(p)
    assert about(h.quantiles(0.5)[0], 0.5, 0.05)

    h = StreamHist()
    for p in make_normal(points):
        h.update(p)
    a, b, c = h.quantiles(0.25, 0.5, 0.75)
    assert about(a, -0.66, 0.05)
    assert about(b, 0.00, 0.05)
    assert about(c, 0.66, 0.05)

def test_histogram_exact():
    """A StreamHist which is not at capacity matches numpy statistics."""
    max_bins = 50
    points = [random.expovariate(1 / 5) for _ in range(max_bins)]
    h = StreamHist(max_bins)
    h.update(points)

    q = [i / 100 for i in range(101)]
    assert h.quantiles(*q) == approx(np.quantile(points, q))
    assert h.mean() == approx(np.mean(points))
    assert h.var() == approx(np.var(points))
    assert h.min() == min(points)
    assert h.max() == max(points)
    assert h.count() == max_bins

def test_paper_example():
    """Test Appendix A example from Ben-Haim paper."""
    from numpy import allclose

    h = StreamHist(maxbins=5)
    h.update((23, 19, 10, 16, 36, 2, 9))
    assert allclose(
        [(bin.value, bin.count) for bin in h.bins],
        [(2, 1), (9.5, 2), (17.5, 2), (23, 1), (36, 1)])

    h2 = StreamHist(maxbins=5)
    h2.update((32, 30, 45))
    h3 = h + h2
    assert allclose(
        [(bin.value, bin.count) for bin in h3.bins],
        [(2, 1), (9.5, 2), (19.33, 3), (32.67, 3), (45, 1)],
        rtol=1e-3)
    assert about(h3.sum(15), 3.275, 1e-3)

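# A small sketch, not part of the original suite, showing where the merged
# centroids above come from, assuming bins combine by count-weighted mean
# of their values.
def test_paper_example_merged_centroids():
    def combine(bins):
        total = sum(c for _, c in bins)
        return sum(v * c for v, c in bins) / total, total

    # (17.5, 2) and (23, 1) collapse to ~(19.33, 3); the three added points
    # 30, 32 and 36 collapse to ~(32.67, 3).
    assert combine([(17.5, 2), (23, 1)]) == approx((58 / 3, 3))
    assert combine([(30, 1), (32, 1), (36, 1)]) == approx((98 / 3, 3))
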
def test_weighted_gap():
    """Histograms using weighted gaps are less eager to merge bins with
    large counts.

    This test builds weighted and non-weighted histograms using samples
    from a normal distribution. The non-weighted histogram should spend
    more of its bins capturing the tails of the distribution. With that
    in mind this test makes sure the bins bracketing the weighted
    histogram have larger counts than the bins bracketing the
    non-weighted histogram.
    """
    points = 10000
    h1 = StreamHist(maxbins=32, weighted=True)
    h2 = StreamHist(maxbins=32, weighted=False)
    for p in make_normal(points):
        h1.update(p)
        h2.update(p)

    wt = h1.bins
    nm = h2.bins
    assert wt[0].count + wt[-1].count > nm[0].count + nm[-1].count

def test_merge():
    assert len(StreamHist().merge(StreamHist()).bins) == 0
    assert len(StreamHist().merge(StreamHist().update(1)).bins) == 1
    assert len(StreamHist().update(1).merge(StreamHist()).bins) == 1

    points = 1000
    count = 10
    hists = []
    for _ in range(count):
        h = StreamHist()
        for p in make_normal(points):
            h.update(p)
        hists.append(h)
    merged = reduce(lambda a, b: a.merge(b), hists)
    assert about(merged.sum(0), (points * count) / 2.0, (points * count) / 50.0)

    h1 = StreamHist().update(1).update(None)
    h2 = StreamHist().update(2).update(None)
    merged = h1.merge(h2)
    assert merged.total == 2

def test_histogram_approx(max_bins, num_points, expected_error):
    """Test accuracy of StreamHist over capacity, especially quantiles."""
    # The arguments (max_bins, num_points, expected_error) are expected to be
    # supplied by a pytest parametrization defined alongside this test.
    points = [random.expovariate(1 / 5) for _ in range(num_points)]
    h = StreamHist(max_bins)
    h.update(points)

    q = [i / 100 for i in range(101)]
    err_sum = 0  # accumulated relative error across the sampled quantiles
    # Compare each approximate quantile against numpy's exact quantile,
    # scaling the error by the spread between the exact quantiles seven
    # percentiles below and above it (clamped to [0, 1]).
    for p, b, b_np, b_np_min, b_np_max in zip(
            q,
            h.quantiles(*q),
            np.quantile(points, q),
            np.quantile(points, [0] * 7 + q),
            np.quantile(points, q[7:] + [1] * 7)):
        err_denom = b_np_max - b_np_min
        err_sum += abs(b - b_np) / err_denom
    assert err_sum <= expected_error

    assert h.mean() == approx(np.mean(points))
    assert h.var() == approx(np.var(points), rel=.05)
    assert h.min() == min(points)
    assert h.max() == max(points)
    assert h.count() == num_points

def test_iris_regression():
    sepal_length = [
        5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9,
        5.4, 4.8, 4.8, 4.3, 5.8, 5.7, 5.4, 5.1, 5.7, 5.1,
        5.4, 5.1, 4.6, 5.1, 4.8, 5.0, 5.0, 5.2, 5.2, 4.7,
        4.8, 5.4, 5.2, 5.5, 4.9, 5.0, 5.5, 4.9, 4.4, 5.1,
        5.0, 4.5, 4.4, 5.0, 5.1, 4.8, 5.1, 4.6, 5.3, 5.0,
        7.0, 6.4, 6.9, 5.5, 6.5, 5.7, 6.3, 4.9, 6.6, 5.2,
        5.0, 5.9, 6.0, 6.1, 5.6, 6.7, 5.6, 5.8, 6.2, 5.6,
        5.9, 6.1, 6.3, 6.1, 6.4, 6.6, 6.8, 6.7, 6.0, 5.7,
        5.5, 5.5, 5.8, 6.0, 5.4, 6.0, 6.7, 6.3, 5.6, 5.5,
        5.5, 6.1, 5.8, 5.0, 5.6, 5.7, 5.7, 6.2, 5.1, 5.7,
        6.3, 5.8, 7.1, 6.3, 6.5, 7.6, 4.9, 7.3, 6.7, 7.2,
        6.5, 6.4, 6.8, 5.7, 5.8, 6.4, 6.5, 7.7, 7.7, 6.0,
        6.9, 5.6, 7.7, 6.3, 6.7, 7.2, 6.2, 6.1, 6.4, 7.2,
        7.4, 7.9, 6.4, 6.3, 6.1, 7.7, 6.3, 6.4, 6.0, 6.9,
        6.7, 6.9, 5.8, 6.8, 6.7, 6.7, 6.3, 6.5, 6.2, 5.9]

    h = StreamHist(maxbins=32)
    h.update(sepal_length)

    b = [{'count': 1, 'mean': 4.3}, {'count': 4, 'mean': 4.425000000000001},
         {'count': 4, 'mean': 4.6}, {'count': 7, 'mean': 4.771428571428571},
         {'count': 6, 'mean': 4.8999999999999995}, {'count': 10, 'mean': 5.0},
         {'count': 9, 'mean': 5.1}, {'count': 4, 'mean': 5.2},
         {'count': 1, 'mean': 5.3}, {'count': 6, 'mean': 5.3999999999999995},
         {'count': 7, 'mean': 5.5}, {'count': 6, 'mean': 5.6000000000000005},
         {'count': 15, 'mean': 5.746666666666667}, {'count': 3, 'mean': 5.900000000000001},
         {'count': 6, 'mean': 6.0}, {'count': 6, 'mean': 6.1000000000000005},
         {'count': 4, 'mean': 6.2}, {'count': 9, 'mean': 6.299999999999999},
         {'count': 7, 'mean': 6.3999999999999995}, {'count': 5, 'mean': 6.5},
         {'count': 2, 'mean': 6.6}, {'count': 8, 'mean': 6.700000000000001},
         {'count': 3, 'mean': 6.8}, {'count': 4, 'mean': 6.9},
         {'count': 1, 'mean': 7.0}, {'count': 1, 'mean': 7.1},
         {'count': 3, 'mean': 7.2}, {'count': 1, 'mean': 7.3},
         {'count': 1, 'mean': 7.4}, {'count': 1, 'mean': 7.6},
         {'count': 4, 'mean': 7.7}, {'count': 1, 'mean': 7.9}]
    assert h.to_dict()["bins"] == b

def test_regression():
    random.seed(1700)
    data = make_normal(10000)

    hist1 = StreamHist(maxbins=5)
    hist2 = StreamHist(maxbins=5, weighted=True)
    # hist3 = StreamHist(maxbins=5, weighted=True)
    hist4 = StreamHist(maxbins=5)

    hist1.update(data)
    hist2.update(data)
    hist3 = hist2 + hist1
    hist4.update(range(10000))

    reg = [{'count': 1176.0, 'mean': -1.622498097884402},
           {'count': 5290.0, 'mean': -0.3390892100898127},
           {'count': 3497.0, 'mean': 1.0310297400593385},
           {'count': 35.0, 'mean': 2.2157182954841126},
           {'count': 2.0, 'mean': 3.563619987633774}]
    assert hist1.to_dict()["bins"] == reg

    reg = [-1.022649473089556, -0.5279748744244142, 0.1476067074922296,
           0.9815338358189885, 1.6627248917927795]
    assert hist1.quantiles(0.1, 0.25, 0.5, 0.75, 0.9) == reg

    reg = [{'count': 579.0, 'mean': -2.017257931684027},
           {'count': 1902.0, 'mean': -1.0677091300958608},
           {'count': 3061.0, 'mean': -0.24660751313691653},
           {'count': 2986.0, 'mean': 0.5523120572161528},
           {'count': 1472.0, 'mean': 1.557598912751095}]
    assert hist2.to_dict()["bins"] == reg

    reg = [-1.1941285587341846, -0.6041467139342105, 0.08840996549170466,
           0.8247014091807423, 1.557598912751095]
    assert hist2.quantiles(0.1, 0.25, 0.5, 0.75, 0.9) == reg

    reg = [{'count': 1755.0, 'mean': -1.7527351028815432},
           {'count': 1902.0, 'mean': -1.0677091300958608},
           {'count': 8351.0, 'mean': -0.3051906980106826},
           {'count': 6483.0, 'mean': 0.8105375295133331},
           {'count': 1509.0, 'mean': 1.5755221868037264}]
    assert hist3.to_dict()["bins"] == reg

    reg = [-1.0074328972882012, -0.5037558708214145, 0.11958766584785563,
           0.8874923692642509, 1.432517386448461]
    assert hist3.quantiles(0.1, 0.25, 0.5, 0.75, 0.9) == reg

    reg = [{'count': 1339.0, 'mean': 669.0},
           {'count': 2673.0, 'mean': 2675.0},
           {'count': 1338.0, 'mean': 4680.5},
           {'count': 2672.0, 'mean': 6685.5},
           {'count': 1978.0, 'mean': 9010.5}]
    assert hist4.to_dict()["bins"] == reg

    reg = [1830.581598358843, 3063.70150218845, 5831.110283907479,
           8084.851093080222, 9010.5]
    assert hist4.quantiles(0.1, 0.25, 0.5, 0.75, 0.9) == reg

def test_min_max():
    h = StreamHist()
    assert h.min() is None
    assert h.max() is None

    for _ in range(1000):
        h.update(rand_int(10))
    assert h.min() == 0
    assert h.max() == 10

    h1 = StreamHist()
    h2 = StreamHist()
    for p in range(4):
        h1.update(p)
        h2.update(p + 2)
    merged = h1.merge(h2)
    assert merged.min() == 0
    assert merged.max() == 5

def test_weighted():
    data = [1, 2, 2, 3, 4]
    h = StreamHist(maxbins=3, weighted=True)
    for p in data:
        h.update(p)
    assert h.total == len(data)

import random

from streamhist import StreamHist


def make_normal(size):
    return [random.normalvariate(0.0, 1.0) for _ in range(size)]


points = 10000
data = make_normal(points)
h1 = StreamHist(maxbins=50)
h1.update(data)

# Times (in seconds)
# 1.421 - bins (getter/setter)
# 0.955 - bins (direct access)
# 0.977 - bins (slots)
# 0.824 - bins (slots) w/out numpy
# 0.737 - current version

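# A minimal sketch of how the timings listed above could be reproduced; the
# original harness is not shown, so the use of time.perf_counter() here is an
# assumption rather than the method that produced those numbers.
import time


def time_update(maxbins=50, points=10000, repeats=5):
    """Return the best wall-clock time (seconds) to build one histogram."""
    sample = make_normal(points)
    best = float("inf")
    for _ in range(repeats):
        start = time.perf_counter()
        h = StreamHist(maxbins=maxbins)
        h.update(sample)
        best = min(best, time.perf_counter() - start)
    return best
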
def test_cdf_pdf():
    points = 10000
    h = StreamHist()
    data = make_normal(points)
    h.update(data)
    assert about(h.sum(0), points / 2.0, points / 50.0)

def test_mean():
    points = 1001
    h = StreamHist()
    for p in range(points):
        h.update(p)
    assert h.mean() == (points - 1) / 2.0

def test_update_total():
    h = StreamHist(maxbins=5)
    h.update(range(5))
    assert h.total == h.count() == 5

    h.update(range(5))
    assert h.total == h.count() == 10
