예제 #1
0
def test_regression():
    """Pin the exact bins and quantiles produced from a fixed random seed.

    Four histograms with ``maxbins=5`` are checked: an unweighted one and a
    ``weighted=True`` one fed the same ``make_normal(10000)`` data, their
    sum (``weighted + unweighted``), and an unweighted one fed
    ``range(10000)``.  Every comparison is exact (``==``), so any change in
    the numeric output of ``StreamHist`` makes this test fail.
    """
    random.seed(1700)
    samples = make_normal(10000)

    plain = StreamHist(maxbins=5)
    weighted = StreamHist(maxbins=5, weighted=True)
    ramp = StreamHist(maxbins=5)

    plain.update(samples)
    weighted.update(samples)
    merged = weighted + plain
    ramp.update(range(10000))

    # The same quantile probes are used for every histogram.
    probes = (0.1, 0.25, 0.5, 0.75, 0.9)

    # (histogram, expected bins, expected quantiles at ``probes``)
    cases = [
        (
            plain,
            [{'count': 1176.0, 'mean': -1.622498097884402},
             {'count': 5290.0, 'mean': -0.3390892100898127},
             {'count': 3497.0, 'mean': 1.0310297400593385},
             {'count': 35.0, 'mean': 2.2157182954841126},
             {'count': 2.0, 'mean': 3.563619987633774}],
            [-1.022649473089556, -0.5279748744244142, 0.1476067074922296,
             0.9815338358189885, 1.6627248917927795],
        ),
        (
            weighted,
            [{'count': 579.0, 'mean': -2.017257931684027},
             {'count': 1902.0, 'mean': -1.0677091300958608},
             {'count': 3061.0, 'mean': -0.24660751313691653},
             {'count': 2986.0, 'mean': 0.5523120572161528},
             {'count': 1472.0, 'mean': 1.557598912751095}],
            [-1.1941285587341846, -0.6041467139342105, 0.08840996549170466,
             0.8247014091807423, 1.557598912751095],
        ),
        (
            merged,
            [{'count': 1755.0, 'mean': -1.7527351028815432},
             {'count': 1902.0, 'mean': -1.0677091300958608},
             {'count': 8351.0, 'mean': -0.3051906980106826},
             {'count': 6483.0, 'mean': 0.8105375295133331},
             {'count': 1509.0, 'mean': 1.5755221868037264}],
            [-1.0074328972882012, -0.5037558708214145, 0.11958766584785563,
             0.8874923692642509, 1.432517386448461],
        ),
        (
            ramp,
            [{'count': 1339.0, 'mean': 669.0},
             {'count': 2673.0, 'mean': 2675.0},
             {'count': 1338.0, 'mean': 4680.5},
             {'count': 2672.0, 'mean': 6685.5},
             {'count': 1978.0, 'mean': 9010.5}],
            [1830.581598358843, 3063.70150218845, 5831.110283907479,
             8084.851093080222, 9010.5],
        ),
    ]

    for hist, want_bins, want_quantiles in cases:
        assert hist.to_dict()["bins"] == want_bins
        assert hist.quantiles(*probes) == want_quantiles
예제 #2
0
def test_quantiles():
    """Check quantile estimates for ``make_uniform`` and ``make_normal`` data.

    Each histogram is filled one point at a time, then its quantiles are
    compared to the expected values through the ``about`` helper.
    """
    n_points = 10000
    # Third argument handed to ``about`` -- presumably the allowed
    # deviation; confirm against the helper's definition.
    tol = 0.05

    uniform_hist = StreamHist()
    for value in make_uniform(n_points):
        uniform_hist.update(value)
    median = uniform_hist.quantiles(0.5)[0]
    assert about(median, 0.5, tol)

    normal_hist = StreamHist()
    for value in make_normal(n_points):
        normal_hist.update(value)
    lower, middle, upper = normal_hist.quantiles(0.25, 0.5, 0.75)
    for estimate, target in ((lower, -0.66), (middle, 0.00), (upper, 0.66)):
        assert about(estimate, target, tol)
예제 #3
0
def test_quantiles():
    """Verify the median of ``make_uniform`` data and the quartiles of
    ``make_normal`` data as estimated by a default ``StreamHist``.

    Points are fed to the histogram individually rather than in bulk.
    """
    sample_size = 10000

    def filled(values):
        # Build a fresh default histogram and feed it one value at a time.
        hist = StreamHist()
        for item in values:
            hist.update(item)
        return hist

    from_uniform = filled(make_uniform(sample_size))
    assert about(from_uniform.quantiles(0.5)[0], 0.5, 0.05)

    from_normal = filled(make_normal(sample_size))
    q25, q50, q75 = from_normal.quantiles(0.25, 0.5, 0.75)
    assert about(q25, -0.66, 0.05)
    assert about(q50, 0.00, 0.05)
    assert about(q75, 0.66, 0.05)
예제 #4
0
def test_multi_merge():
    """Summing many small histograms should agree with one large histogram.

    The data from ``make_uniform`` is cut into consecutive slices of 100
    points, each slice gets its own ``StreamHist``, and the histograms are
    combined with ``sum``.  The deciles of the combined histogram are then
    compared with those of a single histogram fed all the data at once.
    """
    total = 100000
    chunk = 100
    values = make_uniform(total)

    batches = [values[start:start + chunk]
               for start in range(0, len(values), chunk)]
    partials = [StreamHist().update(batch) for batch in batches]
    merged = sum(partials)
    whole = StreamHist().update(values)

    deciles = (.1, .2, .3, .4, .5, .6, .7, .8, .9)
    from_merged = merged.quantiles(*deciles)
    from_whole = whole.quantiles(*deciles)

    import numpy as np
    # NOTE(review): numpy.allclose accepts |a - b| <= atol + rtol * |b|, so
    # rtol=1 permits a deviation as large as the reference value itself and
    # dominates atol=0.025.  Kept as-is; confirm whether that is intended.
    assert np.allclose(from_merged, from_whole, rtol=1, atol=0.025)
예제 #5
0
def test_multi_merge():
    """Compare deciles of a merged histogram against a single-pass histogram.

    ``make_uniform`` data is split into 100-point slices; one ``StreamHist``
    is built per slice and all of them are added together with ``sum``.
    The result is checked against a ``StreamHist`` updated with the full
    data in one call.
    """
    n_values = 100000
    slice_len = 100
    all_values = make_uniform(n_values)

    offsets = range(0, len(all_values), slice_len)
    pieces = [all_values[lo:lo + slice_len] for lo in offsets]
    per_piece = [StreamHist().update(piece) for piece in pieces]
    combined = sum(per_piece)
    single = StreamHist().update(all_values)

    probes = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
    combined_q = combined.quantiles(*probes)
    single_q = single.quantiles(*probes)

    import numpy
    # NOTE(review): with rtol=1 the relative term of numpy.allclose
    # (rtol * |b|) is as large as the reference quantile, which makes the
    # atol=0.025 bound largely irrelevant.  Left unchanged; verify intent.
    assert numpy.allclose(combined_q, single_q, rtol=1, atol=0.025)
예제 #6
0
def test_histogram_exact():
    """A StreamHist holding no more points than bins matches numpy.

    Fifty ``random.expovariate(1 / 5)`` draws are put into a histogram
    constructed with 50 as its first argument.  Its quantiles (at every
    whole percent from 0 to 100), mean and variance are compared with the
    numpy results via ``approx``; min, max and count are compared exactly.
    """
    capacity = 50
    draws = [random.expovariate(1 / 5) for _ in range(capacity)]
    hist = StreamHist(capacity)
    hist.update(draws)

    # 0.00, 0.01, ..., 1.00
    fractions = [pct / 100 for pct in range(101)]

    import numpy as np
    expected_quantiles = np.quantile(draws, fractions)
    assert hist.quantiles(*fractions) == approx(expected_quantiles)
    assert hist.mean() == approx(np.mean(draws))
    assert hist.var() == approx(np.var(draws))
    assert hist.min() == min(draws)
    assert hist.max() == max(draws)
    assert hist.count() == capacity
예제 #7
0
def test_histogram_approx(max_bins, num_points, expected_error):
    """Check StreamHist statistics against numpy on exponential samples.

    Args:
        max_bins: First positional argument given to ``StreamHist``.
        num_points: Number of ``random.expovariate(1 / 5)`` draws to feed in.
        expected_error: Upper bound on the summed normalised quantile error.

    For every whole percent from 0 to 100 the absolute difference between
    the histogram's quantile and numpy's quantile is divided by the spread
    between numpy's quantiles seven percentage points below and above
    (clamped to the 0 and 1 quantiles).  These terms are summed, not
    averaged, and the total must not exceed ``expected_error``.
    """
    samples = [random.expovariate(1 / 5) for _ in range(num_points)]
    hist = StreamHist(max_bins)
    hist.update(samples)

    import numpy as np
    fractions = [pct / 100 for pct in range(101)]
    shift = 7  # half-width of the normalisation window, in percent steps

    estimates = hist.quantiles(*fractions)
    exact = np.quantile(samples, fractions)
    # Entry i is the exact quantile at fractions[i - shift], clamped to 0.
    window_low = np.quantile(samples, [0] * shift + fractions)
    # Entry i is the exact quantile at fractions[i + shift], clamped to 1.
    window_high = np.quantile(samples, fractions[shift:] + [1] * shift)

    # Accumulate with a plain loop so the floating-point addition order is
    # exactly left to right.
    total_error = 0
    for est, ref, low, high in zip(estimates, exact, window_low, window_high):
        spread = high - low
        total_error += abs(est - ref) / spread

    assert total_error <= expected_error
    assert hist.mean() == approx(np.mean(samples))
    assert hist.var() == approx(np.var(samples), rel=.05)
    assert hist.min() == min(samples)
    assert hist.max() == max(samples)
    assert hist.count() == num_points