def _hyperloglog_inclusion(h1, h2):
    """Estimate how much of ``h1`` is contained in ``h2``.

    The intersection size is obtained by inclusion-exclusion from the two
    individual counts and the count of ``HyperLogLog.union(h1, h2)``, and
    is then divided by the count of ``h1``.

    Args:
        h1: Sketch exposing ``count()``; its count is the denominator.
        h2: Sketch exposing ``count()``; only consulted when the count of
            ``h1`` is non-zero.

    Returns:
        ``1.0`` when ``h1.count()`` equals zero, otherwise the estimated
        intersection count divided by ``h1.count()``.
    """
    size_1 = h1.count()
    if size_1 == 0.0:
        # Nothing to divide by: report full inclusion without touching h2.
        return 1.0
    size_2 = h2.count()
    size_union = HyperLogLog.union(h1, h2).count()
    # Inclusion-exclusion: |A & B| = |A| + |B| - |A | B|.
    return (size_1 + size_2 - size_union) / size_1
def _hyperloglog_jaccard(h1, h2):
    """Estimate the Jaccard similarity of the sets behind two sketches.

    The intersection size is obtained by inclusion-exclusion from the two
    individual counts and the count of ``HyperLogLog.union(h1, h2)``, and
    is then divided by the union count.

    Args:
        h1: First sketch; must expose ``count()``.
        h2: Second sketch; must expose ``count()``.

    Returns:
        ``1.0`` when the union count equals zero, otherwise the estimated
        intersection count divided by the union count.
    """
    size_1, size_2 = h1.count(), h2.count()
    size_union = HyperLogLog.union(h1, h2).count()
    if size_union == 0.0:
        # The union count is the denominator; a zero union is reported
        # as similarity 1.0 instead of dividing by zero.
        return 1.0
    # Inclusion-exclusion: |A & B| = |A| + |B| - |A | B|.
    return (size_1 + size_2 - size_union) / size_union
def eg2():
    """Print the estimated and the exact cardinality of a two-set union.

    Two ``HyperLogLog`` sketches are filled from the outer-scope
    collections ``data1`` and ``data2``: each item is UTF-8 encoded,
    wrapped with ``sha1`` and handed to ``digest``. The sketches are merged
    with ``HyperLogLog.union`` and the merged count is printed, followed by
    the size of the exact set union of the same data.
    """
    first, second = HyperLogLog(), HyperLogLog()
    for sketch, corpus in ((first, data1), (second, data2)):
        for item in corpus:
            # presumably sha1 is hashlib.sha1, so digest() receives a hash
            # object rather than raw bytes -- confirm against the imports.
            sketch.digest(sha1(item.encode('utf8')))

    merged = HyperLogLog.union(first, second)
    print("Estimated union cardinality is", merged.count())

    # Ground truth computed with ordinary sets for comparison.
    exact_union = set(data1) | set(data2)
    print("Actual union cardinality is", len(exact_union))
def eg2():
    """Print the estimated and the exact cardinality of a two-set union.

    Two ``HyperLogLog`` sketches are filled from the outer-scope
    collections ``data1`` and ``data2`` by passing each item, UTF-8
    encoded, to ``update``. The sketches are merged with
    ``HyperLogLog.union`` and the merged count is printed, followed by the
    size of the exact set union of the same data.
    """
    # NOTE(review): another function named ``eg2`` appears earlier in the
    # visible source; if both live in the same module, this later
    # definition is the one that takes effect -- confirm which is intended.
    first, second = HyperLogLog(), HyperLogLog()
    for sketch, corpus in ((first, data1), (second, data2)):
        for item in corpus:
            sketch.update(item.encode('utf8'))

    merged = HyperLogLog.union(first, second)
    print("Estimated union cardinality is", merged.count())

    # Ground truth computed with ordinary sets for comparison.
    exact_union = set(data1) | set(data2)
    print("Actual union cardinality is", len(exact_union))