else: return round(estimate) elif estimate <= 2.5 * self.m and method == 0: return round(-self.m * math.log(zeros / self.m)) else: return round(estimate) def _bitscan(self, x, m): v = 1 while v<=m and not x&0x80000000: v+=1 x<<=1 return v if __name__ == '__main__': works = list(shakespeare.each_work()) for p in range(7, 19): T0 = [] T2 = [] for name, words in works: H = HyperLogLog(p) for word in words: H.add(word) T0.append(abs(len(words) - H.cardinality(method=0))/len(words)) T2.append(abs(len(words) - H.cardinality(method=2))/len(words)) print('\t'.join(map(str, (p, statistics.mean(T2), statistics.mean(T2)+statistics.stdev(T2), max(T2)))))
def real(works):
    """Compare every unordered pair of works with ``normal_compare``.

    Args:
        works: iterable of ``(name, words)`` pairs.

    Returns:
        A dict mapping ``(name1, name2)`` to
        ``normal_compare(words1, words2)`` for every pair in which the first
        work precedes the second in ``works``.  Empty when ``works`` has
        fewer than two entries.
    """
    # Local import keeps this function self-contained; the file's import
    # block is not visible from here.
    import itertools

    # combinations(works, 2) yields (works[i], works[j]) for i < j in the
    # same order as the nested index loops it replaces.
    return {
        (name1, name2): normal_compare(words1, words2)
        for (name1, words1), (name2, words2) in itertools.combinations(works, 2)
    }


if __name__ == '__main__':
    #shakespeare.write_duplicates()
    #sys.exit(0)
    works = list(shakespeare.each_work()) + list(shakespeare.each_work('duplicates'))
    # NOTE: this rebinds the module-level name ``real`` from the function to
    # the dict it returns; the function cannot be called again after this.
    real = real(works)
    hashes = {name: minhash(2048, A) for name, A in works}
    sys.stderr.write('_\n')  # progress marker on stderr
    for p in range(7, 19):
        hlls = {name: hll(p, A) for name, A in works}
        sys.stderr.write(str(p)+'\n')
        T = []
        # NOTE(review): the body of this pair loop continues past the end of
        # the visible source; only the visible statements are reproduced.
        for i in range(len(works)):
            for j in range(i+1, len(works)):
                name1, words1 = works[i]
                name2, words2 = works[j]