def test_regression():
    """Pin bin contents and quantiles of four histograms to recorded values.

    With the random module seeded at 1700, a 10000-point sample from
    ``make_normal`` is fed to a default histogram and to one constructed
    with ``weighted=True`` (both ``maxbins=5``).  Their sum (``+``) and a
    histogram of ``range(10000)`` are checked as well.  All comparisons are
    exact (``==``) against previously recorded outputs.
    """
    random.seed(1700)
    normal_sample = make_normal(10000)

    plain = StreamHist(maxbins=5)
    weighted = StreamHist(maxbins=5, weighted=True)
    ramp = StreamHist(maxbins=5)

    plain.update(normal_sample)
    weighted.update(normal_sample)
    merged = weighted + plain
    ramp.update(range(10000))

    # Every histogram is probed at the same five quantile levels.
    probes = (0.1, 0.25, 0.5, 0.75, 0.9)

    # Default histogram over the normal sample.
    plain_bins = [
        {'count': 1176.0, 'mean': -1.622498097884402},
        {'count': 5290.0, 'mean': -0.3390892100898127},
        {'count': 3497.0, 'mean': 1.0310297400593385},
        {'count': 35.0, 'mean': 2.2157182954841126},
        {'count': 2.0, 'mean': 3.563619987633774},
    ]
    plain_quantiles = [
        -1.022649473089556,
        -0.5279748744244142,
        0.1476067074922296,
        0.9815338358189885,
        1.6627248917927795,
    ]
    assert plain.to_dict()["bins"] == plain_bins
    assert plain.quantiles(*probes) == plain_quantiles

    # weighted=True histogram over the same sample.
    weighted_bins = [
        {'count': 579.0, 'mean': -2.017257931684027},
        {'count': 1902.0, 'mean': -1.0677091300958608},
        {'count': 3061.0, 'mean': -0.24660751313691653},
        {'count': 2986.0, 'mean': 0.5523120572161528},
        {'count': 1472.0, 'mean': 1.557598912751095},
    ]
    weighted_quantiles = [
        -1.1941285587341846,
        -0.6041467139342105,
        0.08840996549170466,
        0.8247014091807423,
        1.557598912751095,
    ]
    assert weighted.to_dict()["bins"] == weighted_bins
    assert weighted.quantiles(*probes) == weighted_quantiles

    # Result of adding the two histograms above (weighted + plain).
    merged_bins = [
        {'count': 1755.0, 'mean': -1.7527351028815432},
        {'count': 1902.0, 'mean': -1.0677091300958608},
        {'count': 8351.0, 'mean': -0.3051906980106826},
        {'count': 6483.0, 'mean': 0.8105375295133331},
        {'count': 1509.0, 'mean': 1.5755221868037264},
    ]
    merged_quantiles = [
        -1.0074328972882012,
        -0.5037558708214145,
        0.11958766584785563,
        0.8874923692642509,
        1.432517386448461,
    ]
    assert merged.to_dict()["bins"] == merged_bins
    assert merged.quantiles(*probes) == merged_quantiles

    # Default histogram over the integers 0..9999.
    ramp_bins = [
        {'count': 1339.0, 'mean': 669.0},
        {'count': 2673.0, 'mean': 2675.0},
        {'count': 1338.0, 'mean': 4680.5},
        {'count': 2672.0, 'mean': 6685.5},
        {'count': 1978.0, 'mean': 9010.5},
    ]
    ramp_quantiles = [
        1830.581598358843,
        3063.70150218845,
        5831.110283907479,
        8084.851093080222,
        9010.5,
    ]
    assert ramp.to_dict()["bins"] == ramp_bins
    assert ramp.quantiles(*probes) == ramp_quantiles
def test_quantiles():
    """Check quantile estimates of point-by-point updated histograms.

    A default ``StreamHist`` is fed 10000 values one at a time, first from
    ``make_uniform`` (median expected near 0.5) and then from
    ``make_normal`` (quartiles expected near -0.66, 0.00 and 0.66).  Each
    estimate is checked with ``about(value, target, 0.05)``.
    """
    sample_size = 10000
    tolerance = 0.05

    uniform_hist = StreamHist()
    for value in make_uniform(sample_size):
        uniform_hist.update(value)
    uniform_median = uniform_hist.quantiles(0.5)[0]
    assert about(uniform_median, 0.5, tolerance)

    normal_hist = StreamHist()
    for value in make_normal(sample_size):
        normal_hist.update(value)
    lower, middle, upper = normal_hist.quantiles(0.25, 0.5, 0.75)
    # Checked in the same order as the quantile levels requested above.
    for estimate, target in ((lower, -0.66), (middle, 0.00), (upper, 0.66)):
        assert about(estimate, target, tolerance)
def test_multi_merge():
    """Summing many small histograms should match one built from all data.

    100000 values from ``make_uniform`` are split into consecutive chunks
    of 100; one ``StreamHist`` is built per chunk and the chunks are
    combined with the built-in ``sum``.  The deciles of the combined
    histogram are compared with those of a single histogram updated with
    the full data set.

    This test was previously defined twice with identical bodies; the
    second definition simply rebound the name, so only one copy is kept.
    """
    from numpy import allclose

    points = 100000
    chunk_size = 100
    deciles = (.1, .2, .3, .4, .5, .6, .7, .8, .9)

    data = make_uniform(points)
    samples = [
        data[start:start + chunk_size]
        for start in range(0, len(data), chunk_size)
    ]
    # update() is used for its return value here, i.e. the test relies on
    # it handing back the histogram so the list holds StreamHist objects.
    hists = [StreamHist().update(chunk) for chunk in samples]

    merged = sum(hists)
    direct = StreamHist().update(data)

    merged_q = merged.quantiles(*deciles)
    direct_q = direct.quantiles(*deciles)

    # NOTE(review): numpy.allclose accepts |a - b| <= atol + rtol * |b|, so
    # rtol=1 permits a deviation of up to 0.025 + |direct_q| per decile and
    # atol is not the binding tolerance.  Tightening (e.g. rtol=0) should
    # be confirmed by running against the real implementation first.
    assert allclose(merged_q, direct_q, rtol=1, atol=0.025)
def test_histogram_exact():
    """A histogram holding no more points than bins agrees with numpy.

    Exactly ``max_bins`` (50) values from ``random.expovariate(1 / 5)`` are
    inserted.  All 101 percentiles, the mean and the variance are compared
    to numpy's results through ``approx``; min, max and count are compared
    exactly.
    """
    import numpy as np

    capacity = 50
    sample = [random.expovariate(1 / 5) for _ in range(capacity)]

    hist = StreamHist(capacity)
    hist.update(sample)

    # Quantile levels 0.00, 0.01, ..., 1.00.
    levels = [pct / 100 for pct in range(101)]
    reference_quantiles = np.quantile(sample, levels)
    assert hist.quantiles(*levels) == approx(reference_quantiles)

    assert hist.mean() == approx(np.mean(sample))
    assert hist.var() == approx(np.var(sample))
    assert hist.min() == min(sample)
    assert hist.max() == max(sample)
    assert hist.count() == capacity
def test_histogram_approx(max_bins, num_points, expected_error):
    """Bound the quantile error of a histogram fed ``num_points`` values.

    ``num_points`` values from ``random.expovariate(1 / 5)`` are inserted
    into ``StreamHist(max_bins)``.  For each of the 101 percentile levels
    the absolute difference between the histogram's quantile and numpy's
    is divided by the spread of numpy's quantiles over a window reaching 7
    percentile steps below and above that level (clamped to 0 and 1).  The
    sum of these ratios must not exceed ``expected_error``.  Mean (default
    ``approx`` tolerance), variance (``rel=.05``), min, max and count are
    checked as well.

    NOTE(review): nothing in this view supplies the three arguments;
    presumably a parametrize decorator or fixtures provide them — confirm.
    """
    import numpy as np

    sample = [random.expovariate(1 / 5) for _ in range(num_points)]
    hist = StreamHist(max_bins)
    hist.update(sample)

    window = 7  # half-width of the normalisation window, in percentile steps
    levels = [pct / 100 for pct in range(101)]

    estimated = hist.quantiles(*levels)
    exact = np.quantile(sample, levels)
    # Entry i is the exact quantile `window` steps below level i (floor 0).
    window_low = np.quantile(sample, [0] * window + levels)
    # Entry i is the exact quantile `window` steps above level i (cap 1).
    window_high = np.quantile(sample, levels[window:] + [1] * window)

    # zip stops at the shortest input, so the extra trailing entries of
    # window_low are ignored.
    total_error = 0
    for est, ref, low, high in zip(estimated, exact, window_low, window_high):
        spread = high - low
        total_error += abs(est - ref) / spread
    assert total_error <= expected_error

    assert hist.mean() == approx(np.mean(sample))
    assert hist.var() == approx(np.var(sample), rel=.05)
    assert hist.min() == min(sample)
    assert hist.max() == max(sample)
    assert hist.count() == num_points