예제 #1
0
            else:
                return round(estimate)                
        elif estimate <= 2.5 * self.m and method == 0:
            return round(-self.m * math.log(zeros / self.m))
        else:
            return round(estimate)
        
    def _bitscan(self, x, m):
        v = 1
        while v<=m and not x&0x80000000:
            v+=1
            x<<=1
        return v

if __name__ == '__main__':
    # For each p in 7..18, build one HyperLogLog per work, feed it every
    # word, and record how far the estimate is from len(words), relative
    # to len(words).
    corpus = list(shakespeare.each_work())
    for p in range(7, 19):
        errors_method0 = []  # collected for method=0 but not reported below
        errors_method2 = []
        for _title, words in corpus:
            sketch = HyperLogLog(p)
            for token in words:
                sketch.add(token)
            expected = len(words)
            errors_method0.append(
                abs(expected - sketch.cardinality(method=0)) / expected)
            errors_method2.append(
                abs(expected - sketch.cardinality(method=2)) / expected)
        # One tab-separated row per p: p, mean error, mean + stdev, max error.
        average = statistics.mean(errors_method2)
        row = (
            p,
            average,
            average + statistics.stdev(errors_method2),
            max(errors_method2),
        )
        print('\t'.join(str(field) for field in row))
    

            
            
            
예제 #2
0

def real(works):
    """Compute the reference comparison for every unordered pair of works.

    Args:
        works: Iterable of ``(name, words)`` pairs.

    Returns:
        A dict mapping ``(name1, name2)`` to
        ``normal_compare(words1, words2)``, where the work called ``name1``
        appears earlier in ``works`` than the one called ``name2``.
    """
    # Imported locally so the function does not depend on a file-level import.
    from itertools import combinations

    # combinations() yields pairs in the same order as nested index loops
    # (i < j), so if a key repeats, the later pair still overwrites the
    # earlier one.
    return {
        (name1, name2): normal_compare(words1, words2)
        for (name1, words1), (name2, words2) in combinations(works, 2)
    }

if __name__ == '__main__':
    #shakespeare.write_duplicates()
    #sys.exit(0)

    works = list(shakespeare.each_work()) + list(shakespeare.each_work('duplicates'))

    real = real(works)
    
    hashes = {name: minhash(2048, A) for name, A in works}

    sys.stderr.write('_\n')
    for p in range(7, 19):
        hlls = {name: hll(p, A) for name, A in works}

        sys.stderr.write(str(p)+'\n')
        T = []
        for i in range(len(works)):
            for j in range(i+1, len(works)):
                name1, words1 = works[i]
                name2, words2 = works[j]