示例#1
0
 def test_hyperloglog_small_card_est(self):
     """count() must call ``_linearcounting`` when all 16 registers hold 1."""
     registers = np.ones(1 << 4, dtype=np.int8)
     with patch.object(HyperLogLog, '_linearcounting',
                       return_value=0) as linear_counting:
         HyperLogLog(reg=registers).count()
     self.assertTrue(linear_counting.called)
 def test_hyperloglog_large_card_est(self):
     """count() must call ``_largerange_correction`` when all 16 registers hold 27."""
     registers = np.full(1 << 4, 27, dtype=np.int8)
     with patch.object(HyperLogLog, '_largerange_correction',
                       return_value=0) as large_range:
         HyperLogLog(reg=registers).count()
     self.assertTrue(large_range.called)
 def test_hyperloglog_small_card_est(self):
     """Verify that count() goes through the patched ``_linearcounting``."""
     # Every one of the 2**4 registers is set to 1 before counting.
     registers = np.array([1] * (1 << 4), dtype=np.int8)
     with patch.object(HyperLogLog, '_linearcounting') as spy:
         spy.return_value = 0
         sketch = HyperLogLog(reg=registers)
         sketch.count()
     self.assertTrue(spy.called)
示例#4
0
 def test_hyperloglog_large_card_est(self):
     """Verify that count() goes through the patched ``_largerange_correction``."""
     # Every one of the 2**4 registers is set to 27 before counting.
     registers = np.array([27] * (1 << 4), dtype=np.int8)
     with patch.object(HyperLogLog, '_largerange_correction') as spy:
         spy.return_value = 0
         sketch = HyperLogLog(reg=registers)
         sketch.count()
     self.assertTrue(spy.called)
示例#5
0
def eg1():
    """Print the HyperLogLog estimate for ``data1`` followed by its exact distinct count."""
    sketch = HyperLogLog()
    for item in data1:
        encoded = item.encode('utf8')
        sketch.digest(sha1(encoded))
    print("Estimated cardinality is", sketch.count())

    distinct = set(data1)
    print("Actual cardinality is", len(distinct))
示例#6
0
def eg1():
    """Compare a HyperLogLog cardinality estimate of ``data1`` with the exact value.

    Each item is UTF-8 encoded, wrapped in a SHA-1 hash object and digested;
    both the estimate and ``len(set(data1))`` are printed.
    """
    estimator = HyperLogLog()
    for text in data1:
        estimator.digest(sha1(text.encode('utf8')))
    estimate = estimator.count()
    print("Estimated cardinality is", estimate)

    exact = len(set(data1))
    print("Actual cardinality is", exact)
示例#7
0
 def test_merge(self):
     """After h1.merge(h2), h1.reg[0b1111] is 64 - 4 + 1 and h1.reg[1] is 1."""
     target, other = HyperLogLog(4), HyperLogLog(4)
     target.digest(FakeHash(0b0001111))
     other.digest(FakeHash(0xfffffffffffffff1))
     target.merge(other)
     expected_registers = ((0b1111, 64 - 4 + 1), (1, 1))
     for index, expected in expected_registers:
         self.assertEqual(target.reg[index], expected)
示例#8
0
 def test_count(self):
     """Smoke test: count() runs after three digests; its result is not checked."""
     sketch = HyperLogLog(4)
     for hash_value in (0b0001111, 0xfffffffffffffff1, 0xfffffff5):
         sketch.digest(FakeHash(hash_value))
     # The estimate itself cannot be verified here; this only makes sure
     # the call executes. See benchmarks for the accuracy of the
     # cardinality estimation.
     sketch.count()
def run_perf(card, p):
    """Time how long a HyperLogLog takes to ingest ``card`` values.

    Args:
        card: number of values to feed; ``int_bytes(i)`` is passed to
            ``update`` for every ``i`` in ``range(card)``.
        p: forwarded to ``HyperLogLog(p=p)``.

    Returns:
        The elapsed time of the update loop in seconds.
    """
    # time.clock() was removed in Python 3.8. Prefer perf_counter() and only
    # fall back to clock() on interpreters that predate perf_counter().
    timer = getattr(time, "perf_counter", None) or time.clock
    h = HyperLogLog(p=p)
    logging.info("HyperLogLog using p = %d ", p)
    start = timer()
    for i in range(card):
        h.update(int_bytes(i))
    duration = timer() - start
    logging.info("Digested %d hashes in %.4f sec", card, duration)
    return duration
def run_acc(size, seed, p):
    """Return the relative error of a HyperLogLog count against an exact set.

    After seeding the module-level ``random`` generator with ``seed``, draws
    ``size`` values with ``random.randint(1, size)``, converts each with
    ``int_bytes`` and adds it to both a ``HyperLogLog(p=p)`` and a Python set.
    The result is ``abs(len(set) - count) / len(set)``.
    """
    logging.info("HyperLogLog using p = %d " % p)
    sketch = HyperLogLog(p=p)
    exact = set()
    random.seed(seed)
    for _ in range(size):
        item = int_bytes(random.randint(1, size))
        sketch.update(item)
        exact.add(item)
    true_count = float(len(exact))
    return abs(true_count - sketch.count()) / true_count
示例#11
0
 def test_merge(self):
     """After h1.merge(h2), h1.reg[0b1111] is 32 - 4 and h1.reg[1] is 1."""
     target, other = HyperLogLog(4), HyperLogLog(4)
     target.digest(FakeHash(0b00011111))
     other.digest(FakeHash(0xfffffff1))
     target.merge(other)
     expected_registers = ((0b1111, 32 - 4), (1, 1))
     for index, expected in expected_registers:
         self.assertEqual(target.reg[index], expected)
示例#12
0
 def test_digest(self):
     """Each digest must leave the expected value in the expected register."""
     sketch = HyperLogLog(4)
     # (hash value, register index, expected register content)
     cases = (
         (0b0001111, 0b1111, 64 - 4 + 1),
         (0xfffffffffffffff1, 1, 1),
         (0xfffffff5, 5, 33),
     )
     for hash_value, index, expected in cases:
         sketch.digest(FakeHash(hash_value))
         self.assertEqual(sketch.reg[index], expected)
def _hyperloglog_jaccard(h1, h2):
    """Estimate the Jaccard similarity of two sketches from their counts.

    The intersection size is taken as ``count(h1) + count(h2) - count(union)``
    and divided by the union count. Returns 1.0 when the union count is 0.
    """
    size_1 = h1.count()
    size_2 = h2.count()
    union_size = HyperLogLog.union(h1, h2).count()
    if union_size == 0.0:
        return 1.0
    return (size_1 + size_2 - union_size) / union_size
示例#14
0
def _hyperloglog_jaccard(h1, h2):
    """Return ``(|h1| + |h2| - |union|) / |union|`` using sketch counts.

    When the union count is 0 the result is defined as 1.0.
    """
    count_a, count_b = h1.count(), h2.count()
    union_count = HyperLogLog.union(h1, h2).count()
    if union_count != 0.0:
        intersection_count = count_a + count_b - union_count
        return intersection_count / union_count
    return 1.0
示例#15
0
def _hyperloglog_inclusion(h1, h2):
    """Return ``(|h1| + |h2| - |union|) / |h1|`` using sketch counts.

    When ``h1.count()`` is 0 the result is 1.0 and neither ``h2.count()`` nor
    the union is computed.
    """
    size_1 = h1.count()
    if size_1 == 0.0:
        return 1.0
    size_2 = h2.count()
    union_size = HyperLogLog.union(h1, h2).count()
    return (size_1 + size_2 - union_size) / size_1
示例#16
0
def _run_hyperloglog(A, B, data, seed, p):
    """Estimate the Jaccard similarity of two index ranges of ``data``.

    Args:
        A: ``(start, end)`` pair; ``data[i]`` for ``start <= i < end`` is fed
            to the first sketch.
        B: ``(start, end)`` pair selecting the items for the second sketch.
        data: collection indexable by integer position.
        seed: seed forwarded to every murmur3_32 hasher call.
        p: forwarded to ``HyperLogLog(p=p, hashobj=Hash)``.

    Returns:
        The result of ``_hyperloglog_jaccard`` for the two sketches.
    """
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    h1 = HyperLogLog(p=p, hashobj=Hash)
    h2 = HyperLogLog(p=p, hashobj=Hash)
    # range() rather than xrange(): xrange does not exist on Python 3,
    # whereas range is available on both major versions.
    for i in range(a_start, a_end):
        h1.update(hasher(data[i], seed=seed))
    for i in range(b_start, b_end):
        h2.update(hasher(data[i], seed=seed))
    return _hyperloglog_jaccard(h1, h2)
示例#17
0
 def test_pickle(self):
     """A pickle round trip must preserve m, p and the register contents."""
     h = HyperLogLog(4)
     for value in (123, 33, 12, 0xfffffffffffffff1):
         h.digest(FakeHash(value))
     p = pickle.loads(pickle.dumps(h))
     self.assertEqual(p.m, h.m)
     self.assertEqual(p.p, h.p)
     # Compare registers element by element: assertEqual(p.reg, h.reg)
     # raises ValueError when reg is a multi-element numpy array, because
     # the array produced by == has no single truth value. The length
     # check is needed because zip() stops at the shorter sequence.
     self.assertEqual(len(p.reg), len(h.reg))
     self.assertTrue(all(i == j for i, j in zip(p.reg, h.reg)))
示例#18
0
def eg2():
    """Print the estimated and the exact cardinality of ``data1`` united with ``data2``.

    Each dataset is fed (UTF-8 encoded) into its own HyperLogLog via
    ``update``; the estimate comes from ``HyperLogLog.union`` of the two.
    """
    sketch_1, sketch_2 = HyperLogLog(), HyperLogLog()
    for item in data1:
        sketch_1.update(item.encode('utf8'))
    for item in data2:
        sketch_2.update(item.encode('utf8'))
    combined = HyperLogLog.union(sketch_1, sketch_2)
    print("Estimated union cardinality is", combined.count())

    exact_union = set(data1).union(set(data2))
    print("Actual union cardinality is", len(exact_union))
示例#19
0
 def test_digest(self):
     """Each digest must leave the expected value in the expected register."""
     sketch = HyperLogLog(4)
     # (hash value, register index, expected register content)
     cases = (
         (0b00011111, 0b1111, 32 - 4),
         (0xfffffff1, 1, 1),
         (0x000000f5, 5, 32 - 4 - 3),
     )
     for hash_value, index, expected in cases:
         sketch.digest(FakeHash(hash_value))
         self.assertEqual(sketch.reg[index], expected)
示例#20
0
def _run_hyperloglog(A, B, data, seed, p):
    """Estimate how much of one index range of ``data`` is included in another.

    Args:
        A: ``(start, end)`` pair; ``data[i]`` for ``start <= i < end`` is
            digested into the first sketch.
        B: ``(start, end)`` pair selecting the items for the second sketch.
        data: collection indexable by integer position.
        seed: seed forwarded to every murmur3_32 hasher call.
        p: forwarded to ``HyperLogLog(p=p)``.

    Returns:
        ``h1.inclusion(h2)`` for the two sketches.
    """
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    h1 = HyperLogLog(p=p)
    h2 = HyperLogLog(p=p)
    # range() rather than xrange(): xrange does not exist on Python 3,
    # whereas range is available on both major versions.
    for i in range(a_start, a_end):
        h1.digest(Hash(hasher(data[i], seed=seed)))
    for i in range(b_start, b_end):
        h2.digest(Hash(hasher(data[i], seed=seed)))
    return h1.inclusion(h2)
示例#21
0
 def test_pickle(self):
     """A pickle round trip must preserve m, p and the register contents."""
     h = HyperLogLog(4)
     for value in (123, 33, 12, 0xffffff1):
         h.digest(FakeHash(value))
     p = pickle.loads(pickle.dumps(h))
     self.assertEqual(p.m, h.m)
     self.assertEqual(p.p, h.p)
     # Compare registers element by element: assertEqual(p.reg, h.reg)
     # raises ValueError when reg is a multi-element numpy array, because
     # the array produced by == has no single truth value. The length
     # check is needed because zip() stops at the shorter sequence.
     self.assertEqual(len(p.reg), len(h.reg))
     self.assertTrue(all(i == j for i, j in zip(p.reg, h.reg)))
示例#22
0
def eg2():
    """Print the estimated and the exact cardinality of ``data1`` united with ``data2``.

    Each item is UTF-8 encoded, wrapped in a SHA-1 hash object and digested
    into the sketch of its dataset; the estimate comes from
    ``HyperLogLog.union`` of the two sketches.
    """
    sketch_1, sketch_2 = HyperLogLog(), HyperLogLog()
    for item in data1:
        sketch_1.digest(sha1(item.encode('utf8')))
    for item in data2:
        sketch_2.digest(sha1(item.encode('utf8')))
    combined = HyperLogLog.union(sketch_1, sketch_2)
    print("Estimated union cardinality is", combined.count())

    exact_union = set(data1).union(set(data2))
    print("Actual union cardinality is", len(exact_union))
示例#23
0
    def test_union_count(self):
        """union_count matches count() until h2 digests a hash h1 never digested."""
        shared = (0b00011111, 0xfffffff1, 0xfffffff5)
        h1 = HyperLogLog(4)
        for value in shared:
            h1.digest(FakeHash(value))
        h2 = HyperLogLog(4)
        # h2 has digested nothing yet.
        self.assertEqual(h1.count(), h1.union_count(h2))

        # h2 digests exactly the same hashes as h1.
        for value in shared:
            h2.digest(FakeHash(value))
        self.assertEqual(h1.count(), h1.union_count(h2))

        # One additional hash only in h2.
        h2.digest(FakeHash(0xfffffff6))
        self.assertNotEqual(h1.count(), h1.union_count(h2))
示例#24
0
 def test_init_from_reg(self):
     """A sketch built from 16 registers has p == 4, like one built with p=4."""
     registers = [1] * (1 << 4)
     from_registers = HyperLogLog(reg=registers)
     self.assertEqual(from_registers.p, 4)
     from_precision = HyperLogLog(p=4)
     self.assertEqual(from_registers.p, from_precision.p)
示例#25
0
 def test_init(self):
     """HyperLogLog(4) has m == 16 and exactly m registers, all equal to 0."""
     sketch = HyperLogLog(4)
     self.assertEqual(sketch.m, 1 << 4)
     self.assertEqual(len(sketch.reg), sketch.m)
     all_zero = all(0 == register for register in sketch.reg)
     self.assertTrue(all_zero)
示例#26
0
 def test_deserialize(self):
     """serialize() followed by deserialize() must preserve p, m and the registers."""
     original = HyperLogLog(4)
     for value in (123, 33, 12, 0xfffffff1):
         original.digest(FakeHash(value))
     buf = bytearray(original.bytesize())
     original.serialize(buf)
     restored = HyperLogLog.deserialize(buf)
     self.assertEqual(restored.p, original.p)
     self.assertEqual(restored.m, original.m)
     # Registers are compared pairwise.
     registers_match = all(a == b for a, b in zip(original.reg, restored.reg))
     self.assertTrue(registers_match)
示例#27
0
 def test_serialize(self):
     """The first byte written by serialize() must equal the sketch's p."""
     sketch = HyperLogLog(4)
     buf = bytearray(sketch.bytesize())
     sketch.serialize(buf)
     first_byte = struct.unpack_from('B', buf, 0)[0]
     self.assertEqual(sketch.p, first_byte)
示例#28
0
    def test_inclusion(self):
        """Check h1.inclusion(h2) for an empty h2, an identical h2, and h2 plus one hash."""
        shared = (0b00011111, 0xfffffff1, 0xfffffff5)
        h1 = HyperLogLog(4)
        for value in shared:
            h1.digest(FakeHash(value))
        h2 = HyperLogLog(4)
        # h2 has digested nothing yet.
        self.assertEqual(h1.inclusion(h2), 0)

        # h2 digests exactly the same hashes as h1.
        for value in shared:
            h2.digest(FakeHash(value))
        self.assertEqual(int(h1.inclusion(h2)), 1)

        # One additional hash only in h2.
        h2.digest(FakeHash(0xfffffff6))
        self.assertEqual(int(h1.inclusion(h2)), 1)
示例#29
0
 def test_serialize(self):
     """Serialize a fresh sketch and check that byte 0 of the buffer holds p."""
     sketch = HyperLogLog(4)
     size = sketch.bytesize()
     buf = bytearray(size)
     sketch.serialize(buf)
     self.assertEqual(sketch.p, struct.unpack_from('B', buf, 0)[0])
示例#30
0
    def test_inclusion(self):
        """Check h1.inclusion(h2) for an empty h2, an identical h2, and h2 plus one hash."""
        shared = (0b0001111, 0xfffffffffffffff1, 0xfffffff5)
        h1 = HyperLogLog(4)
        for value in shared:
            h1.digest(FakeHash(value))
        h2 = HyperLogLog(4)
        # h2 has digested nothing yet.
        self.assertEqual(h1.inclusion(h2), 0)

        # h2 digests exactly the same hashes as h1.
        for value in shared:
            h2.digest(FakeHash(value))
        self.assertEqual(int(h1.inclusion(h2)), 1)

        # One additional hash only in h2.
        h2.digest(FakeHash(0xfffffff6))
        self.assertEqual(int(h1.inclusion(h2)), 1)
def _run_hyperloglog(data, seed, p):
    """Return the HyperLogLog count of ``data``.

    Every item is hashed with murmur3_32 using ``seed``, wrapped in ``Hash``
    and digested into a ``HyperLogLog(p=p)``.
    """
    murmur = pyhash.murmur3_32()
    sketch = HyperLogLog(p=p)
    for item in data:
        hashed = Hash(murmur(item, seed=seed))
        sketch.digest(hashed)
    return sketch.count()
示例#32
0
    def test_union_count(self):
        """union_count matches count() until h2 digests a hash h1 never digested."""
        shared = (0b0001111, 0xfffffffffffffff1, 0xfffffff5)
        h1 = HyperLogLog(4)
        for value in shared:
            h1.digest(FakeHash(value))
        h2 = HyperLogLog(4)
        # h2 has digested nothing yet.
        self.assertEqual(h1.count(), h1.union_count(h2))

        # h2 digests exactly the same hashes as h1.
        for value in shared:
            h2.digest(FakeHash(value))
        self.assertEqual(h1.count(), h1.union_count(h2))

        # One additional hash only in h2.
        h2.digest(FakeHash(0xfffffff6))
        self.assertNotEqual(h1.count(), h1.union_count(h2))
示例#33
0
    def test_intersection_count(self):
        """intersection_count is 0 against an empty h2 and truncates to 3 against an identical one."""
        shared = (0b00011111, 0xfffffff1, 0xfffffff5)
        h1 = HyperLogLog(4)
        for value in shared:
            h1.digest(FakeHash(value))
        h2 = HyperLogLog(4)
        # h2 has digested nothing yet.
        self.assertEqual(h1.intersection_count(h2), 0)

        # h2 digests exactly the same hashes as h1.
        for value in shared:
            h2.digest(FakeHash(value))
        self.assertEqual(int(h1.intersection_count(h2)), 3)
def _run_hyperloglog(data, seed, p):
    """Return the HyperLogLog count of ``data``.

    Every item is hashed with murmur3_32 using ``seed`` and passed to
    ``update`` on a ``HyperLogLog(p=p, hashobj=Hash)``.
    """
    murmur = pyhash.murmur3_32()
    sketch = HyperLogLog(p=p, hashobj=Hash)
    for item in data:
        hashed = murmur(item, seed=seed)
        sketch.update(hashed)
    return sketch.count()
示例#35
0
    def test_jaccard(self):
        """Check h1.jaccard(h2) for an empty h2, an identical h2, and h2 plus one hash."""
        shared = (0b0001111, 0xfffffffffffffff1, 0xfffffff5)
        h1 = HyperLogLog(4)
        for value in shared:
            h1.digest(FakeHash(value))
        h2 = HyperLogLog(4)
        # h2 has digested nothing yet.
        self.assertEqual(h1.jaccard(h2), 0)

        # h2 digests exactly the same hashes as h1.
        for value in shared:
            h2.digest(FakeHash(value))
        self.assertEqual(int(h1.jaccard(h2)), 1)

        # One additional hash only in h2.
        h2.digest(FakeHash(0xfffffff6))
        self.assertNotEqual(h1.jaccard(h2), 1)
示例#36
0
    def test_intersection_count(self):
        """intersection_count is 0 against an empty h2 and truncates to 3 against an identical one."""
        shared = (0b0001111, 0xfffffffffffffff1, 0xfffffff5)
        h1 = HyperLogLog(4)
        for value in shared:
            h1.digest(FakeHash(value))
        h2 = HyperLogLog(4)
        # h2 has digested nothing yet.
        self.assertEqual(h1.intersection_count(h2), 0)

        # h2 digests exactly the same hashes as h1.
        for value in shared:
            h2.digest(FakeHash(value))
        self.assertEqual(int(h1.intersection_count(h2)), 3)
示例#37
0
    def test_jaccard(self):
        """Check h1.jaccard(h2) for an empty h2, an identical h2, and h2 plus one hash."""
        shared = (0b00011111, 0xfffffff1, 0xfffffff5)
        h1 = HyperLogLog(4)
        for value in shared:
            h1.digest(FakeHash(value))
        h2 = HyperLogLog(4)
        # h2 has digested nothing yet.
        self.assertEqual(h1.jaccard(h2), 0)

        # h2 digests exactly the same hashes as h1.
        for value in shared:
            h2.digest(FakeHash(value))
        self.assertEqual(int(h1.jaccard(h2)), 1)

        # One additional hash only in h2.
        h2.digest(FakeHash(0xfffffff6))
        self.assertNotEqual(h1.jaccard(h2), 1)
示例#38
0
 def test_deserialize(self):
     """serialize() followed by deserialize() must preserve p, m and the registers."""
     original = HyperLogLog(4)
     for value in (123, 33, 12, 0xfffffffffffffff1):
         original.digest(FakeHash(value))
     buf = bytearray(original.bytesize())
     original.serialize(buf)
     restored = HyperLogLog.deserialize(buf)
     self.assertEqual(restored.p, original.p)
     self.assertEqual(restored.m, original.m)
     # Registers are compared pairwise.
     registers_match = all(a == b for a, b in zip(original.reg, restored.reg))
     self.assertTrue(registers_match)