def estimateDistinctElementParallel(listOfItems, num_perm):
    """Estimate the number of distinct elements across a list of lists.

    Every element of every inner list is hashed into one shared MinHash
    sketch. The resulting estimate is appended to the ``estimate`` list
    (a name defined outside this function; per the original author's note
    it is consumed by an accuracy function) and is also printed.

    Args:
        listOfItems: Iterable of iterables whose elements support
            ``.encode('utf8')`` (i.e. strings).
        num_perm: Number of hash permutations passed to ``MinHash``.

    Returns:
        None. Results are reported via ``estimate`` and stdout.
    """
    h = MinHash(num_perm)
    for item in listOfItems:
        for i in item:  # walk each inner list
            # NOTE(review): calling digest() with a hash object as the
            # "add element" step looks like an older MinHash API; other
            # code in this file uses update(bytes) - confirm which API the
            # installed library provides.
            h.digest(sha1(i.encode('utf8')))

    # Evaluate the estimate once so the stored and the printed value are
    # the same by construction, instead of calling count() twice.
    count = h.count()
    estimate.append(count)
    print("Estimated number of elements: ", count)
def estimateDistinctElements(items, num_perm):
    """Print an estimate of the number of distinct elements in ``items``.

    A MinHash sketch with ``num_perm`` hash permutations is fed the SHA-1
    hash of each UTF-8 encoded element, and the sketch's count is printed.

    Author's note (kept from the original): the library default for
    ``num_perm`` is 128, but the value was adjusted after further reading:
    http://blog.cluster-text.com/tag/minhash/

    Args:
        items: Iterable of elements supporting ``.encode('utf8')``
            (i.e. strings).
        num_perm: Number of hash permutations passed to ``MinHash``.

    Returns:
        None. The estimate is only printed.
    """
    sketch = MinHash(num_perm)
    for element in items:
        encoded = element.encode('utf8')
        # NOTE(review): digest(hash_object) as the "add element" call looks
        # like an older MinHash API - confirm against the installed library.
        sketch.digest(sha1(encoded))
    print("Estimated number of elements: ", sketch.count())
def minHash_bml(SX, SY):
    """Print a MinHash-based inclusion-coefficient estimate for SX and SY.

    Builds one MinHash sketch per input, prints each sketch's estimated
    count, computes the fraction ``P`` of signature positions where the SX
    value is >= the SY value, and prints the result of ``lookup`` (defined
    outside this function) under the label "Inclusion Coefficient".

    Args:
        SX: Iterable of elements supporting ``.encode('utf8')``.
        SY: Iterable of elements supporting ``.encode('utf8')``.

    Returns:
        None. All results are printed.
    """
    print()
    print("MinHash BML")

    lookup_l = 32          # forwarded to lookup(); meaning not visible here
    exponent = 8           # number of permutations is 2 ** exponent
    num_perm = 2 ** exponent
    error = 10 ** -5
    print("Number of permutations is ", num_perm)

    sketch_x = MinHash(num_perm)
    sketch_y = MinHash(num_perm)
    # Feed SX into the first sketch, then SY into the second.
    for sketch, source in ((sketch_x, SX), (sketch_y, SY)):
        for element in source:
            sketch.update(element.encode('utf8'))

    nx = sketch_x.count()
    ny = sketch_y.count()
    print("Estimated nx is ", nx)
    print("Estimated ny is ", ny)

    sig_x = sketch_x.digest()
    sig_y = sketch_y.digest()
    # Count positions where the SX signature value is at least the SY one.
    # Indexing (rather than zip) keeps the exact bounds behaviour.
    hits = sum(1 for pos in range(num_perm) if sig_x[pos] >= sig_y[pos])

    P = hits / num_perm
    print("P is: ", P)

    coefficient = lookup(
        P, 0, min(nx, ny), nx, ny, error, exponent, num_perm, lookup_l, 0, 0
    )
    print("Inclusion Coefficient: ", coefficient)