def test_hyperloglog_small_card_est(self):
    """A sketch built from all-small registers should estimate via _linearcounting."""
    registers = np.array([1] * (1 << 4), dtype=np.int8)
    with patch.object(HyperLogLog, '_linearcounting') as linearcounting:
        linearcounting.return_value = 0
        sketch = HyperLogLog(reg=registers)
        sketch.count()
        self.assertTrue(linearcounting.called)
def test_hyperloglog_large_card_est(self):
    """A sketch built from all-large registers should estimate via _largerange_correction."""
    registers = np.array([27] * (1 << 4), dtype=np.int8)
    with patch.object(HyperLogLog, '_largerange_correction') as largerange:
        largerange.return_value = 0
        sketch = HyperLogLog(reg=registers)
        sketch.count()
        self.assertTrue(largerange.called)
def eg1():
    """Example: estimate the cardinality of data1 and compare to the exact value."""
    sketch = HyperLogLog()
    for item in data1:
        sketch.digest(sha1(item.encode('utf8')))
    print("Estimated cardinality is", sketch.count())
    exact = set(data1)
    print("Actual cardinality is", len(exact))
def test_merge(self):
    """Merging folds the other sketch's registers into this one (64-bit hash values)."""
    first = HyperLogLog(4)
    second = HyperLogLog(4)
    first.digest(FakeHash(0b0001111))
    second.digest(FakeHash(0xfffffffffffffff1))
    first.merge(second)
    self.assertEqual(first.reg[0b1111], 64 - 4 + 1)
    self.assertEqual(first.reg[1], 1)
def test_count(self):
    """Smoke test: count() runs without error after a few digests."""
    sketch = HyperLogLog(4)
    sketch.digest(FakeHash(0b0001111))
    sketch.digest(FakeHash(0xfffffffffffffff1))
    sketch.digest(FakeHash(0xfffffff5))
    # We can't really verify the correctness here, just to make sure
    # no syntax error
    # See benchmarks for the accuracy of the cardinality estimation.
    sketch.count()
def run_perf(card, p):
    """Benchmark digesting `card` items into a HyperLogLog of precision `p`.

    Returns the elapsed wall-clock time in seconds.
    """
    h = HyperLogLog(p=p)
    # Lazy %-style args avoid formatting when the log level is disabled.
    logging.info("HyperLogLog using p = %d ", p)
    # time.clock() was deprecated in 3.3 and removed in Python 3.8;
    # perf_counter() is the recommended high-resolution benchmark clock.
    start = time.perf_counter()
    for i in range(card):
        h.update(int_bytes(i))
    duration = time.perf_counter() - start
    logging.info("Digested %d hashes in %.4f sec", card, duration)
    return duration
def run_acc(size, seed, p):
    """Measure the relative error of a HyperLogLog estimate against an exact set."""
    logging.info("HyperLogLog using p = %d " % p)
    sketch = HyperLogLog(p=p)
    exact = set()
    random.seed(seed)
    for _ in range(size):
        value = int_bytes(random.randint(1, size))
        sketch.update(value)
        exact.add(value)
    return abs(float(len(exact)) - sketch.count()) / float(len(exact))
def test_merge(self):
    """Merging folds the other sketch's registers into this one (32-bit hash values)."""
    first = HyperLogLog(4)
    second = HyperLogLog(4)
    first.digest(FakeHash(0b00011111))
    second.digest(FakeHash(0xfffffff1))
    first.merge(second)
    self.assertEqual(first.reg[0b1111], 32 - 4)
    self.assertEqual(first.reg[1], 1)
def test_digest(self):
    """Digesting sets the expected register values (64-bit hash values)."""
    sketch = HyperLogLog(4)
    sketch.digest(FakeHash(0b0001111))
    self.assertEqual(sketch.reg[0b1111], 64 - 4 + 1)
    sketch.digest(FakeHash(0xfffffffffffffff1))
    self.assertEqual(sketch.reg[1], 1)
    sketch.digest(FakeHash(0xfffffff5))
    self.assertEqual(sketch.reg[5], 33)
def _hyperloglog_jaccard(h1, h2):
    """Estimate Jaccard similarity from two HyperLogLog sketches via inclusion-exclusion."""
    card1 = h1.count()
    card2 = h2.count()
    union_card = HyperLogLog.union(h1, h2).count()
    if union_card == 0.0:
        # Both sketches are empty; define similarity as 1.
        return 1.0
    return (card1 + card2 - union_card) / union_card
def _hyperloglog_inclusion(h1, h2):
    """Estimate the fraction of h1's items contained in h2 via inclusion-exclusion."""
    card1 = h1.count()
    if card1 == 0.0:
        # An empty h1 is trivially contained in anything.
        return 1.0
    card2 = h2.count()
    union_card = HyperLogLog.union(h1, h2).count()
    return (card1 + card2 - union_card) / card1
def _run_hyperloglog(A, B, data, seed, p):
    """Estimate the Jaccard similarity of two index slices of `data`.

    A and B are (start, end) index pairs into `data`; each slice is hashed
    with murmur3 (seeded) and fed into its own HyperLogLog sketch.
    """
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    h1 = HyperLogLog(p=p, hashobj=Hash)
    h2 = HyperLogLog(p=p, hashobj=Hash)
    # xrange is Python 2 only and raises NameError on Python 3;
    # range is the equivalent (and is lazy on Python 3).
    for i in range(a_start, a_end):
        h1.update(hasher(data[i], seed=seed))
    for i in range(b_start, b_end):
        h2.update(hasher(data[i], seed=seed))
    return _hyperloglog_jaccard(h1, h2)
def test_pickle(self):
    """A pickle round-trip preserves m, p, and the registers (64-bit hash values)."""
    sketch = HyperLogLog(4)
    for value in (123, 33, 12, 0xfffffffffffffff1):
        sketch.digest(FakeHash(value))
    restored = pickle.loads(pickle.dumps(sketch))
    self.assertEqual(restored.m, sketch.m)
    self.assertEqual(restored.p, sketch.p)
    self.assertEqual(restored.reg, sketch.reg)
def eg2():
    """Example: estimate the union cardinality of data1 and data2."""
    sketch_a = HyperLogLog()
    sketch_b = HyperLogLog()
    for item in data1:
        sketch_a.update(item.encode('utf8'))
    for item in data2:
        sketch_b.update(item.encode('utf8'))
    union_sketch = HyperLogLog.union(sketch_a, sketch_b)
    print("Estimated union cardinality is", union_sketch.count())
    exact_union = set(data1).union(set(data2))
    print("Actual union cardinality is", len(exact_union))
def test_digest(self):
    """Digesting sets the expected register values (32-bit hash values)."""
    sketch = HyperLogLog(4)
    sketch.digest(FakeHash(0b00011111))
    self.assertEqual(sketch.reg[0b1111], 32 - 4)
    sketch.digest(FakeHash(0xfffffff1))
    self.assertEqual(sketch.reg[1], 1)
    sketch.digest(FakeHash(0x000000f5))
    self.assertEqual(sketch.reg[5], 32 - 4 - 3)
def _run_hyperloglog(A, B, data, seed, p):
    """Estimate the inclusion score of slice A's sketch in slice B's sketch.

    A and B are (start, end) index pairs into `data`; each slice is hashed
    with murmur3 (seeded) and digested into its own HyperLogLog sketch.
    """
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    h1 = HyperLogLog(p=p)
    h2 = HyperLogLog(p=p)
    # xrange is Python 2 only and raises NameError on Python 3;
    # range is the equivalent (and is lazy on Python 3).
    for i in range(a_start, a_end):
        h1.digest(Hash(hasher(data[i], seed=seed)))
    for i in range(b_start, b_end):
        h2.digest(Hash(hasher(data[i], seed=seed)))
    return h1.inclusion(h2)
def test_pickle(self):
    """A pickle round-trip preserves m, p, and the registers (32-bit hash values)."""
    sketch = HyperLogLog(4)
    for value in (123, 33, 12, 0xffffff1):
        sketch.digest(FakeHash(value))
    restored = pickle.loads(pickle.dumps(sketch))
    self.assertEqual(restored.m, sketch.m)
    self.assertEqual(restored.p, sketch.p)
    self.assertEqual(restored.reg, sketch.reg)
def eg2():
    """Example: estimate the union cardinality of data1 and data2 using sha1 digests."""
    sketch_a = HyperLogLog()
    sketch_b = HyperLogLog()
    for item in data1:
        sketch_a.digest(sha1(item.encode('utf8')))
    for item in data2:
        sketch_b.digest(sha1(item.encode('utf8')))
    union_sketch = HyperLogLog.union(sketch_a, sketch_b)
    print("Estimated union cardinality is", union_sketch.count())
    exact_union = set(data1).union(set(data2))
    print("Actual union cardinality is", len(exact_union))
def test_union_count(self):
    """union_count matches count() for empty/equal peers, differs for a superset (32-bit)."""
    first = HyperLogLog(4)
    for value in (0b00011111, 0xfffffff1, 0xfffffff5):
        first.digest(FakeHash(value))
    second = HyperLogLog(4)
    # Union with an empty sketch equals the non-empty sketch's count.
    self.assertEqual(first.count(), first.union_count(second))
    for value in (0b00011111, 0xfffffff1, 0xfffffff5):
        second.digest(FakeHash(value))
    # Union with an identical sketch is unchanged.
    self.assertEqual(first.count(), first.union_count(second))
    second.digest(FakeHash(0xfffffff6))
    # An extra item in the peer changes the union estimate.
    self.assertNotEqual(first.count(), first.union_count(second))
def test_init_from_reg(self):
    """Constructing from a register list of length 2**4 infers p = 4."""
    registers = [1] * (1 << 4)
    sketch = HyperLogLog(reg=registers)
    self.assertEqual(sketch.p, 4)
    other = HyperLogLog(p=4)
    self.assertEqual(sketch.p, other.p)
def test_init(self):
    """A fresh sketch has 2**p zeroed registers."""
    sketch = HyperLogLog(4)
    self.assertEqual(sketch.m, 1 << 4)
    self.assertEqual(len(sketch.reg), sketch.m)
    self.assertTrue(all(value == 0 for value in sketch.reg))
def test_deserialize(self):
    """A serialize/deserialize round-trip preserves p, m, and registers (32-bit)."""
    sketch = HyperLogLog(4)
    for value in (123, 33, 12, 0xfffffff1):
        sketch.digest(FakeHash(value))
    buf = bytearray(sketch.bytesize())
    sketch.serialize(buf)
    restored = HyperLogLog.deserialize(buf)
    self.assertEqual(restored.p, sketch.p)
    self.assertEqual(restored.m, sketch.m)
    self.assertTrue(all(a == b for a, b in zip(sketch.reg, restored.reg)))
def test_serialize(self):
    """The first serialized byte equals the precision p."""
    sketch = HyperLogLog(4)
    buf = bytearray(sketch.bytesize())
    sketch.serialize(buf)
    self.assertEqual(sketch.p, struct.unpack_from('B', buf, 0)[0])
def test_inclusion(self):
    """inclusion is 0 against empty, ~1 against an equal or superset peer (32-bit)."""
    first = HyperLogLog(4)
    for value in (0b00011111, 0xfffffff1, 0xfffffff5):
        first.digest(FakeHash(value))
    second = HyperLogLog(4)
    self.assertEqual(first.inclusion(second), 0)
    for value in (0b00011111, 0xfffffff1, 0xfffffff5):
        second.digest(FakeHash(value))
    self.assertEqual(int(first.inclusion(second)), 1)
    second.digest(FakeHash(0xfffffff6))
    self.assertEqual(int(first.inclusion(second)), 1)
def test_inclusion(self):
    """inclusion is 0 against empty, ~1 against an equal or superset peer (64-bit)."""
    first = HyperLogLog(4)
    for value in (0b0001111, 0xfffffffffffffff1, 0xfffffff5):
        first.digest(FakeHash(value))
    second = HyperLogLog(4)
    self.assertEqual(first.inclusion(second), 0)
    for value in (0b0001111, 0xfffffffffffffff1, 0xfffffff5):
        second.digest(FakeHash(value))
    self.assertEqual(int(first.inclusion(second)), 1)
    second.digest(FakeHash(0xfffffff6))
    self.assertEqual(int(first.inclusion(second)), 1)
def _run_hyperloglog(data, seed, p):
    """Estimate the cardinality of `data` by digesting seeded murmur3 hashes."""
    hasher = pyhash.murmur3_32()
    sketch = HyperLogLog(p=p)
    for item in data:
        sketch.digest(Hash(hasher(item, seed=seed)))
    return sketch.count()
def test_union_count(self):
    """union_count matches count() for empty/equal peers, differs for a superset (64-bit)."""
    first = HyperLogLog(4)
    for value in (0b0001111, 0xfffffffffffffff1, 0xfffffff5):
        first.digest(FakeHash(value))
    second = HyperLogLog(4)
    # Union with an empty sketch equals the non-empty sketch's count.
    self.assertEqual(first.count(), first.union_count(second))
    for value in (0b0001111, 0xfffffffffffffff1, 0xfffffff5):
        second.digest(FakeHash(value))
    # Union with an identical sketch is unchanged.
    self.assertEqual(first.count(), first.union_count(second))
    second.digest(FakeHash(0xfffffff6))
    # An extra item in the peer changes the union estimate.
    self.assertNotEqual(first.count(), first.union_count(second))
def test_intersection_count(self):
    """intersection_count is 0 against empty and ~3 for three shared items (32-bit)."""
    first = HyperLogLog(4)
    for value in (0b00011111, 0xfffffff1, 0xfffffff5):
        first.digest(FakeHash(value))
    second = HyperLogLog(4)
    self.assertEqual(first.intersection_count(second), 0)
    for value in (0b00011111, 0xfffffff1, 0xfffffff5):
        second.digest(FakeHash(value))
    self.assertEqual(int(first.intersection_count(second)), 3)
def _run_hyperloglog(data, seed, p):
    """Estimate the cardinality of `data` by updating with seeded murmur3 hashes."""
    hasher = pyhash.murmur3_32()
    sketch = HyperLogLog(p=p, hashobj=Hash)
    for item in data:
        sketch.update(hasher(item, seed=seed))
    return sketch.count()
def test_jaccard(self):
    """jaccard is 0 against empty, ~1 for equal sets, not 1 for a superset (64-bit)."""
    first = HyperLogLog(4)
    for value in (0b0001111, 0xfffffffffffffff1, 0xfffffff5):
        first.digest(FakeHash(value))
    second = HyperLogLog(4)
    self.assertEqual(first.jaccard(second), 0)
    for value in (0b0001111, 0xfffffffffffffff1, 0xfffffff5):
        second.digest(FakeHash(value))
    self.assertEqual(int(first.jaccard(second)), 1)
    second.digest(FakeHash(0xfffffff6))
    self.assertNotEqual(first.jaccard(second), 1)
def test_intersection_count(self):
    """intersection_count is 0 against empty and ~3 for three shared items (64-bit)."""
    first = HyperLogLog(4)
    for value in (0b0001111, 0xfffffffffffffff1, 0xfffffff5):
        first.digest(FakeHash(value))
    second = HyperLogLog(4)
    self.assertEqual(first.intersection_count(second), 0)
    for value in (0b0001111, 0xfffffffffffffff1, 0xfffffff5):
        second.digest(FakeHash(value))
    self.assertEqual(int(first.intersection_count(second)), 3)
def test_jaccard(self):
    """jaccard is 0 against empty, ~1 for equal sets, not 1 for a superset (32-bit)."""
    first = HyperLogLog(4)
    for value in (0b00011111, 0xfffffff1, 0xfffffff5):
        first.digest(FakeHash(value))
    second = HyperLogLog(4)
    self.assertEqual(first.jaccard(second), 0)
    for value in (0b00011111, 0xfffffff1, 0xfffffff5):
        second.digest(FakeHash(value))
    self.assertEqual(int(first.jaccard(second)), 1)
    second.digest(FakeHash(0xfffffff6))
    self.assertNotEqual(first.jaccard(second), 1)
def test_deserialize(self):
    """A serialize/deserialize round-trip preserves p, m, and registers (64-bit)."""
    sketch = HyperLogLog(4)
    for value in (123, 33, 12, 0xfffffffffffffff1):
        sketch.digest(FakeHash(value))
    buf = bytearray(sketch.bytesize())
    sketch.serialize(buf)
    restored = HyperLogLog.deserialize(buf)
    self.assertEqual(restored.p, sketch.p)
    self.assertEqual(restored.m, sketch.m)
    self.assertTrue(all(a == b for a, b in zip(sketch.reg, restored.reg)))