def test(cls, count=100000, registers=512):
    """Feed ``count`` random 30-letter strings to a HyperLogLog and print its estimate.

    Args:
        cls: Not used by the body; presumably the class when this is bound
            as a classmethod -- confirm against the enclosing definition.
        count: Number of random strings to add.
        registers: Value passed unchanged to the ``HyperLogLog`` constructor.
    """
    hll = HyperLogLog(registers)
    alphabet = string.ascii_letters
    for _ in range(count):
        # random.choice draws uniformly from the alphabet; it replaces the
        # manual randint-based index lookup.
        token = "".join(random.choice(alphabet) for _ in range(30))
        hll.add(token)
    # The parenthesised form prints the same text under Python 2 and also
    # parses under Python 3.
    print(hll.getEstimatedCardinality())
def test_init(self):
    """Check the parameters of ``HyperLogLog(0.05)`` after ``upgrade()``.

    Expects p == 9, alpha == 0.7197831133217303, m == 512 and a register
    container ``M`` holding 512 entries.
    """
    sketch = HyperLogLog(0.05)
    sketch.upgrade()
    expected_attrs = (
        ("p", 9),
        ("alpha", 0.7197831133217303),
        ("m", 512),
    )
    for attr_name, expected in expected_attrs:
        self.assertEqual(getattr(sketch, attr_name), expected)
    self.assertEqual(len(sketch.M), 512)
def test_pickle(self):
    """A pickle round trip must preserve ``M``, ``alpha``, ``p`` and ``m``."""
    original = HyperLogLog(0.05)
    for number in range(100):
        original.add(str(number))

    payload = pickle.dumps(original)
    restored = pickle.loads(payload)

    # Compared in the same order as the attributes are listed here.
    for attr_name in ("M", "alpha", "p", "m"):
        self.assertEqual(getattr(original, attr_name),
                         getattr(restored, attr_name))
class TestHyperLogLog(unittest.TestCase, BasicHLLTests):
    """Tests for ``HyperLogLog(16, 16)``, combined with the ``BasicHLLTests`` mixin."""

    def setUp(self):
        """Build the two sketches, ``self.hll`` and ``self.hll2``, used by the tests."""
        for attr_name in ("hll", "hll2"):
            setattr(self, attr_name, HyperLogLog(16, 16))

    def test_a_repeated_element_is_ignored(self):
        """Adding 37 once and adding it 1000 times must yield equal ``logs``."""
        self.hll.add_object(37)
        duplicates = [37] * 1000
        self.hll2.add_objects(duplicates)
        self.assertEqual(self.hll.logs, self.hll2.logs)
def test_add(self):
    """Adding "0".."9" must set exactly the expected (index, value) registers."""
    expected = [
        (31, 1), (120, 1), (122, 4), (151, 5), (171, 3),
        (176, 1), (196, 1), (268, 1), (443, 2), (474, 1),
    ]
    sketch = HyperLogLog(0.05)
    for number in range(10):
        sketch.add(str(number))

    # Collect only the registers that hold a positive value.
    occupied = []
    for index, value in enumerate(sketch.M):
        if value > 0:
            occupied.append((index, value))
    self.assertEqual(occupied, expected)
def test_add(self):
    """Adding "0".."9" must set exactly the expected (index, value) registers."""
    expected = [
        (1, 1), (41, 1), (44, 1), (76, 3), (103, 4),
        (182, 1), (442, 2), (464, 5), (497, 1), (506, 1),
    ]
    sketch = HyperLogLog(0.05)
    for number in range(10):
        sketch.add(str(number))

    # Collect only the registers that hold a positive value.
    occupied = []
    for index, value in enumerate(sketch.M):
        if value > 0:
            occupied.append((index, value))
    self.assertEqual(occupied, expected)
def test_pickle(self):
    """A pickle round trip of an upgraded sketch must preserve its state.

    Compares ``M`` element-wise via numpy, then ``alpha``, ``p``, ``m`` and
    finally ``len()`` of both objects.
    """
    original = HyperLogLog(0.05)
    for number in range(100):
        original.add(str(number))
    original.upgrade()

    payload = pickle.dumps(original)
    restored = pickle.loads(payload)

    numpy.testing.assert_array_equal(original.M, restored.M)
    for attr_name in ("alpha", "p", "m"):
        self.assertEqual(getattr(original, attr_name),
                         getattr(restored, attr_name))
    self.assertEqual(len(original), len(restored))
def test_calc_cardinality(self):
    """``len()`` of the sketch must lie strictly within 1.04/sqrt(m) of the truth.

    For each tested cardinality, that many distinct decimal strings are
    added and the estimate is checked against both bounds.
    """
    for cardinality in (1, 2, 3, 5, 10, 1500, 100000, 1000000):
        sketch = HyperLogLog(0.05)
        for number in xrange(cardinality):
            sketch.add(str(number))

        lower_bound = cardinality * (1.0 - 1.04 / math.sqrt(sketch.m))
        self.assertGreater(len(sketch), lower_bound)

        upper_bound = cardinality * (1.0 + 1.04 / math.sqrt(sketch.m))
        self.assertLess(len(sketch), upper_bound)
def test_hll(self):
    """An mmap-backed sketch starts at length 0 and reports 1 after one add.

    A zero-filled temporary file of ``m`` bytes is memory-mapped, wrapped
    in an ``MmapSlice`` and handed to ``HyperLogLog(0.01, ...)``.
    """
    m = 16384
    # The context manager closes (and thereby removes) the temp file even
    # when an assertion fails.
    with tempfile.NamedTemporaryFile('r+b') as f1:
        f1.write(b'\x00' * m)
        # The bytes must reach the file before it is mapped.
        f1.flush()
        mfile1 = mmap.mmap(f1.fileno(), 0)
        try:
            mslice1 = MmapSlice(mfile1, m, 0)
            test = HyperLogLog(0.01, mslice1)
            self.assertEqual(len(test), 0)
            test.add('test_val')
            self.assertEqual(len(test), 1)
        finally:
            # Release the mapping explicitly rather than relying on GC.
            mfile1.close()
def test_hll(self):
    """An mmap-backed sketch starts at length 0 and reports 1 after one add.

    A zero-filled temporary file of ``m`` bytes is memory-mapped, wrapped
    in an ``MmapSlice`` and handed to ``HyperLogLog(0.01, ...)``.
    """
    m = 16384
    # The context manager closes (and thereby removes) the temp file even
    # when an assertion fails.
    with tempfile.NamedTemporaryFile('r+b') as f1:
        f1.write(b'\x00' * m)
        # The bytes must reach the file before it is mapped.
        f1.flush()
        mfile1 = mmap.mmap(f1.fileno(), 0)
        try:
            mslice1 = MmapSlice(mfile1, m, 0)
            test = HyperLogLog(0.01, mslice1)
            self.assertEqual(len(test), 0)
            test.add('test_val')
            self.assertEqual(len(test), 1)
        finally:
            # Release the mapping explicitly rather than relying on GC.
            mfile1.close()
def test_add(self):
    """After adding "0".."9" and upgrading, the set registers must match.

    The comparison is done with ``numpy.testing.assert_array_equal`` on the
    list of (index, value) pairs whose value is positive.
    """
    expected = [
        (1, 1), (41, 1), (44, 1), (76, 3), (103, 4),
        (182, 1), (442, 2), (464, 5), (497, 1), (506, 1),
    ]
    sketch = HyperLogLog(0.05)
    for number in range(10):
        sketch.add(str(number))
    sketch.upgrade()

    occupied = []
    for index, value in enumerate(sketch.M):
        if value > 0:
            occupied.append((index, value))
    numpy.testing.assert_array_equal(occupied, expected)
def test_3(self):
    """Compare the sketch's distinct count with an exact set on random ints.

    Draws 1e5 integers from [-1e10, 1e10), feeds them to a SHA-1 based
    ``HyperLogLog`` with ``int(log2(size))`` bits, and asserts that the
    symmetric relative difference between the exact and estimated counts
    does not exceed ``|standard_error(2**b)|``.
    """
    size = int(1e5)
    data = np.random.randint(low=(-1) * int(1e10), high=int(1e10), size=size)
    b = int(np.log2(size))  # Number of bits handed to the sketch.

    hll = HyperLogLog(hash_fn=hashlib.sha1, num_bits=b)
    for sample in data:
        hll.add(sample)

    # Exact reference: number of distinct sampled values.
    exact_count = len(set(data))
    estimated_count = hll.get_num_distinct()

    difference = float(exact_count - estimated_count)
    total = float(exact_count + estimated_count)
    error_rate = np.abs(difference / total)
    accepted_error_rate = np.abs(standard_error(2 ** b))
    assert error_rate <= accepted_error_rate
def test_update(self):
    """Merging one mmap-backed sketch into another must add their counts.

    Three ``m``-byte slices of a single mapping back three sketches.
    ``hll1`` and ``hll2`` are filled from ``self.test_data1`` and
    ``self.test_data2``; after ``hll1.update(hll2)`` the length of ``hll1``
    must be within ``error_rate`` of ``2 * self.test_set_size``. The
    ``self.*`` fixtures are defined outside this block.
    """
    m = 16384
    mapped = m * 3
    # Round the file size up to the next page boundary above the mapped span.
    flen = mapped + mmap.PAGESIZE - mapped % mmap.PAGESIZE
    self.assertGreater(flen, mapped)

    # The context manager closes the temp file even if an assertion fails.
    with tempfile.TemporaryFile() as f:
        f.write(b'\x00' * flen)
        # Without a flush the file may still be shorter than the requested
        # mapping length when mmap inspects it.
        f.flush()
        fmap = mmap.mmap(f.fileno(), mapped)
        try:
            self.assertEqual(len(fmap), mapped)
            mslice1 = MmapSlice(fmap, m, 0)
            mslice2 = MmapSlice(fmap, m, m)
            mslice3 = MmapSlice(fmap, m, m * 2)
            hll1 = HyperLogLog(self.error_rate, mslice1)
            hll2 = HyperLogLog(self.error_rate, mslice2)
            # The third sketch is constructed but not exercised below.
            hll3 = HyperLogLog(self.error_rate, mslice3)
            for v in self.test_data1:
                hll1.add(v)
            for v in self.test_data2:
                hll2.add(v)
            hll1.update(hll2)
            expected = self.test_set_size * 2
            self.assertAlmostEqual(expected, len(hll1),
                                   delta=expected * self.error_rate)
        finally:
            # Release the mapping explicitly rather than relying on GC.
            fmap.close()
def test_mmap(self):
    """A sketch backed by an mmap slice must estimate the test set size.

    All of ``self.test_data1`` is added to a sketch stored in an ``m``-byte
    ``MmapSlice``; its length must be within ``self.error_rate`` of
    ``self.test_set_size``. The ``self.*`` fixtures are defined outside
    this block.
    """
    m = 16384
    # Round the file size up to the next page boundary above m.
    flen = m + mmap.PAGESIZE - m % mmap.PAGESIZE
    self.assertGreater(flen, m)

    # The context manager closes the temp file even if an assertion fails.
    with tempfile.TemporaryFile() as f:
        f.write(b'\x00' * flen)
        # Without a flush the file may still be shorter than the requested
        # mapping length when mmap inspects it.
        f.flush()
        fmap = mmap.mmap(f.fileno(), m)
        try:
            self.assertEqual(len(fmap), m)
            mslice = MmapSlice(fmap, m)
            hll = HyperLogLog(self.error_rate, mslice)
            for v in self.test_data1:
                hll.add(v)
            self.assertAlmostEqual(self.test_set_size, len(hll),
                                   delta=self.test_set_size * self.error_rate)
        finally:
            # Release the mapping explicitly rather than relying on GC.
            fmap.close()
def test_calc_cardinality(self):
    """The mean estimate over 30 runs must pass a two-sided 1.96 z-test.

    For each target cardinality, 30 fresh sketches are each fed that many
    20-byte ``os.urandom`` values. The z-score of the mean of ``card()``
    against the true cardinality, scaled by ``rel_err``, must lie strictly
    between -1.96 and 1.96.
    """
    trials = 30
    rel_err = 0.05
    for card in (1, 5, 10, 30, 60, 200, 1000, 10000, 60000):
        total = 0.0
        for _ in xrange(trials):
            sketch = HyperLogLog(rel_err)
            for _ in xrange(card):
                sketch.add(os.urandom(20))
            total += sketch.card()

        mean_estimate = float(total) / trials
        std_of_mean = rel_err * card / math.sqrt(trials)
        z = (mean_estimate - card) / std_of_mean
        self.assertLess(-1.96, z)
        self.assertGreater(1.96, z)
def test_mmap(self):
    """A sketch backed by an mmap slice must estimate the test set size.

    All of ``self.test_data1`` is added to a sketch stored in an ``m``-byte
    ``MmapSlice``; its length must be within ``self.error_rate`` of
    ``self.test_set_size``. The ``self.*`` fixtures are defined outside
    this block.
    """
    m = 16384
    # Round the file size up to the next page boundary above m.
    flen = m + mmap.PAGESIZE - m % mmap.PAGESIZE
    self.assertGreater(flen, m)

    # The context manager closes the temp file even if an assertion fails.
    with tempfile.TemporaryFile() as f:
        f.write(b'\x00' * flen)
        # Without a flush the file may still be shorter than the requested
        # mapping length when mmap inspects it.
        f.flush()
        fmap = mmap.mmap(f.fileno(), m)
        try:
            self.assertEqual(len(fmap), m)
            mslice = MmapSlice(fmap, m)
            hll = HyperLogLog(self.error_rate, mslice)
            for v in self.test_data1:
                hll.add(v)
            self.assertAlmostEqual(self.test_set_size, len(hll),
                                   delta=self.test_set_size * self.error_rate)
        finally:
            # Release the mapping explicitly rather than relying on GC.
            fmap.close()
def test_alpha(self):
    """``_get_alpha`` must return the expected constants for inputs 4 through 9."""
    expected = [
        0.673,
        0.697,
        0.709,
        0.7152704932638152,
        0.7182725932495458,
        0.7197831133217303,
    ]
    computed = []
    for bits in range(4, 10):
        computed.append(HyperLogLog._get_alpha(bits))
    self.assertEqual(computed, expected)
def test_update(self):
    """Updating one sketch with another must equal a sketch fed both inputs.

    ``a`` receives "0" and "1", ``b`` receives "2" and "3", and ``c``
    receives all four. After ``a.update(b)``, ``a`` must differ from ``b``,
    ``b`` must differ from ``c``, and ``a`` must equal ``c``.
    """
    a = HyperLogLog(0.05)
    b = HyperLogLog(0.05)
    c = HyperLogLog(0.05)

    # Each range feeds one partial sketch plus the combined reference.
    feeds = ((xrange(2), a), (xrange(2, 4), b))
    for numbers, partial in feeds:
        for number in numbers:
            partial.add(str(number))
            c.add(str(number))

    a.update(b)

    self.assertNotEqual(a, b)
    self.assertNotEqual(b, c)
    self.assertEqual(a, c)
def setUp(self):
    """Create two separately constructed ``HyperLogLog(16, 16)`` fixtures."""
    ctor_args = (16, 16)
    self.hll = HyperLogLog(*ctor_args)
    self.hll2 = HyperLogLog(*ctor_args)
def test_init(self):
    """Check the parameters of a freshly built ``HyperLogLog(0.05)``.

    Expects p == 9, alpha == 0.7197831133217303, m == 512 and a register
    container ``M`` holding 512 entries.
    """
    sketch = HyperLogLog(0.05)
    expected_attrs = (
        ("p", 9),
        ("alpha", 0.7197831133217303),
        ("m", 512),
    )
    for attr_name, expected in expected_attrs:
        self.assertEqual(getattr(sketch, attr_name), expected)
    self.assertEqual(len(sketch.M), 512)
# Script: feed every integer listed in the file 'random_ints' (one per line)
# to both a HyperLogLog and a MartingaleHyperLogLog, then print the plain
# sketch's unadjusted count followed by the martingale sketch's count.
import logging

from hll import HyperLogLog, MartingaleHyperLogLog

logging.basicConfig(level=logging.DEBUG)

hll = HyperLogLog(10, 54)
mhll = MartingaleHyperLogLog(10, 54)

# The context manager closes the input file even if a line fails to parse;
# iterating the file object streams it instead of loading every line at once.
with open('random_ints', 'r') as f:
    for line in f:
        if not line.strip():
            # Skip blank lines (e.g. a trailing empty line) instead of
            # failing in int().
            continue
        i = int(line)
        hll.add_object(i)
        mhll.add_object(i)

# The parenthesised form prints the same text under Python 2 and also parses
# under Python 3.
print(hll.unadjusted_count)
print(mhll.count)
def test_update_err(self):
    """Updating a 0.05 sketch with a 0.01 sketch must raise ``ValueError``."""
    target = HyperLogLog(0.05)
    mismatched = HyperLogLog(0.01)
    with self.assertRaises(ValueError):
        target.update(mismatched)
def test_update3(self):
    """Merging two mmap-backed sketches into a third must add all counts.

    Three ``m``-byte slices of a single mapping back three sketches filled
    from ``self.test_data1``, ``self.test_data2`` and ``self.test_data3``.
    After ``hll1.update([hll2, hll3])`` the length of ``hll1`` must be
    within ``error_rate`` of ``3 * self.test_set_size``. The ``self.*``
    fixtures are defined outside this block.
    """
    m = 16384
    mapped = m * 3
    # Round the file size up to the next page boundary above the mapped span.
    flen = mapped + mmap.PAGESIZE - mapped % mmap.PAGESIZE
    self.assertGreater(flen, mapped)

    # The context manager closes the temp file even if an assertion fails.
    with tempfile.TemporaryFile() as f:
        f.write(b'\x00' * flen)
        # Without a flush the file may still be shorter than the requested
        # mapping length when mmap inspects it.
        f.flush()
        fmap = mmap.mmap(f.fileno(), mapped)
        try:
            self.assertEqual(len(fmap), mapped)
            mslice1 = MmapSlice(fmap, m, 0)
            mslice2 = MmapSlice(fmap, m, m)
            mslice3 = MmapSlice(fmap, m, m * 2)
            hll1 = HyperLogLog(self.error_rate, mslice1)
            hll2 = HyperLogLog(self.error_rate, mslice2)
            hll3 = HyperLogLog(self.error_rate, mslice3)
            for v in self.test_data1:
                hll1.add(v)
            for v in self.test_data2:
                hll2.add(v)
            for v in self.test_data3:
                hll3.add(v)
            hll1.update([hll2, hll3])
            expected = self.test_set_size * 3
            self.assertAlmostEqual(expected, len(hll1),
                                   delta=expected * self.error_rate)
        finally:
            # Release the mapping explicitly rather than relying on GC.
            fmap.close()
import math
from unittest import TestCase

from hll import HyperLogLog


class HyperLogLogTestCase(TestCase):
    """Unit tests for ``HyperLogLog`` construction and its static helpers."""

    def test_alpha(self):
        """``_get_alpha`` must return the expected constants for inputs 4 through 9."""
        alpha = [HyperLogLog._get_alpha(b) for b in range(4, 10)]
        self.assertEqual(alpha, [0.673, 0.697, 0.709,
                                 0.7152704932638152,
                                 0.7182725932495458,
                                 0.7197831133217303])

    def test_alpha_bad(self):
        """``_get_alpha`` must reject the inputs 1 and 17 with ``ValueError``."""
        self.assertRaises(ValueError, HyperLogLog._get_alpha, 1)
        self.assertRaises(ValueError, HyperLogLog._get_alpha, 17)

    def test_rho(self):
        """``_get_rho`` must return the expected rank for each sample word.

        ``arr`` holds the powers of two 2**0 .. 2**32. The word ``1 << 32``
        must be rejected with ``ValueError``.
        """
        # A plain int literal replaces the Python 2-only ``1L``: ints promote
        # to long automatically there, and the suffix is a syntax error on
        # Python 3.
        arr = [1 << i for i in range(32 + 1)]
        cases = [
            (0, 33),
            (1, 32),
            (2, 31),
            (3, 31),
            (4, 30),
            (5, 30),
            (6, 30),
            (7, 30),
            (1 << 31, 1),
        ]
        for word, expected in cases:
            self.assertEqual(HyperLogLog._get_rho(word, arr), expected)
        self.assertRaises(ValueError, HyperLogLog._get_rho, 1 << 32, arr)

    def test_init(self):
        """``HyperLogLog(0.05)`` must have b == 9, the matching alpha and m == 512."""
        s = HyperLogLog(0.05)
        self.assertEqual(s.b, 9)
        self.assertEqual(s.alpha, 0.7197831133217303)
        self.assertEqual(s.m, 512)