def eg1():
    """Print the HyperLogLog estimate and the exact distinct count of ``data1``.

    Each item of the module-level ``data1`` is UTF-8 encoded and fed to a
    default-constructed ``HyperLogLog``. The exact figure is the size of a
    ``set`` built from the same items.
    """
    sketch = HyperLogLog()
    for item in data1:
        sketch.update(item.encode('utf8'))
    print("Estimated cardinality is", sketch.count())

    distinct_items = set(data1)
    print("Actual cardinality is", len(distinct_items))
def run_perf(card, p):
    """Time how long a HyperLogLog takes to digest ``card`` values.

    Args:
        card: Number of values to feed. The integers ``0 .. card - 1`` are
            each converted with ``int_bytes`` and passed to ``update``.
        p: Precision parameter forwarded to ``HyperLogLog(p=p)``.

    Returns:
        Elapsed time of the update loop in seconds, as a float.
    """
    h = HyperLogLog(p=p)
    logging.info("HyperLogLog using p = %d ", p)

    # time.clock() was removed in Python 3.8. Prefer perf_counter(), the
    # high-resolution clock intended for timing short durations, and only
    # fall back to clock() on interpreters that predate perf_counter().
    timer = getattr(time, "perf_counter", None)
    if timer is None:
        timer = time.clock

    start = timer()
    for i in range(card):
        h.update(int_bytes(i))
    duration = timer() - start

    logging.info("Digested %d hashes in %.4f sec", card, duration)
    return duration
def _run_hyperloglog(A, B, data, seed, p):
    """Build one HyperLogLog per index range of ``data`` and compare them.

    Args:
        A: ``(start, end)`` pair; indices ``start .. end - 1`` of ``data``
            feed the first sketch.
        B: ``(start, end)`` pair; indices ``start .. end - 1`` of ``data``
            feed the second sketch.
        data: Indexable collection of items to hash.
        seed: Seed passed to the murmur3 hasher for every item.
        p: Precision parameter forwarded to both ``HyperLogLog`` instances.

    Returns:
        Whatever ``_hyperloglog_jaccard`` returns for the two sketches
        (presumably the estimated Jaccard similarity; see that helper).
    """
    a_start, a_end = A
    b_start, b_end = B
    hasher = pyhash.murmur3_32()
    first = HyperLogLog(p=p, hashobj=Hash)
    second = HyperLogLog(p=p, hashobj=Hash)

    def _digest(sketch, start, end):
        # Hash each item in data[start:end) with the shared seed and add it.
        for idx in xrange(start, end):
            sketch.update(hasher(data[idx], seed=seed))

    _digest(first, a_start, a_end)
    _digest(second, b_start, b_end)
    return _hyperloglog_jaccard(first, second)
def run_acc(size, seed, p):
    """Measure the relative error of a HyperLogLog estimate on random input.

    Seeds the global ``random`` generator with ``seed``, draws ``size``
    integers uniformly from ``[1, size]``, converts each with ``int_bytes``
    and adds it both to a ``HyperLogLog(p=p)`` and to an exact ``set``.

    Args:
        size: Number of draws, and also the upper bound of the drawn values.
        seed: Seed for ``random.seed``.
        p: Precision parameter forwarded to ``HyperLogLog``.

    Returns:
        ``abs(exact - estimate) / exact`` where ``exact`` is the size of the
        set of drawn values.

    Raises:
        ZeroDivisionError: If no values are drawn (``size`` <= 0), since the
            exact count is then zero.
    """
    logging.info("HyperLogLog using p = %d " % p)
    sketch = HyperLogLog(p=p)
    seen = set()
    random.seed(seed)
    for _ in range(size):
        encoded = int_bytes(random.randint(1, size))
        sketch.update(encoded)
        seen.add(encoded)

    exact = float(len(seen))
    return abs(exact - sketch.count()) / exact
def eg2():
    """Print the estimated and exact cardinality of ``data1`` union ``data2``.

    One default-constructed ``HyperLogLog`` is filled from each module-level
    dataset (items UTF-8 encoded), the two are combined with
    ``HyperLogLog.union``, and the estimate is printed next to the size of
    the exact set union.
    """
    sketch_a = HyperLogLog()
    sketch_b = HyperLogLog()
    for sketch, items in ((sketch_a, data1), (sketch_b, data2)):
        for item in items:
            sketch.update(item.encode('utf8'))

    merged = HyperLogLog.union(sketch_a, sketch_b)
    print("Estimated union cardinality is", merged.count())

    exact_union = set(data1).union(set(data2))
    print("Actual union cardinality is", len(exact_union))
def _run_hyperloglog(data, seed, p):
    """Return the HyperLogLog cardinality estimate for the items in ``data``.

    Args:
        data: Iterable of items to hash.
        seed: Seed passed to the murmur3 hasher for every item.
        p: Precision parameter forwarded to ``HyperLogLog``.

    Returns:
        The value of ``count()`` on the sketch after every item has been
        hashed with ``pyhash.murmur3_32`` and added.
    """
    murmur = pyhash.murmur3_32()
    sketch = HyperLogLog(p=p, hashobj=Hash)
    for item in data:
        sketch.update(murmur(item, seed=seed))
    return sketch.count()