예제 #1
0
def estimateDistinctElementParallel(listOfItems, num_perm):
    """Estimate the number of distinct elements across a list of lists.

    Every element of every inner list is UTF-8 encoded, hashed with
    ``sha1`` and fed into a single ``MinHash`` sketch. The resulting
    estimate is appended to the ``estimate`` list defined outside this
    function (kept for a later accuracy computation) and printed.

    Args:
        listOfItems: Iterable of iterables whose elements provide
            ``.encode('utf8')`` (i.e. strings).
        num_perm: Value passed to the ``MinHash`` constructor (number of
            hash permutations).

    Returns:
        None. The estimate is reported via ``estimate`` and stdout.
    """
    # NOTE(review): ``digest(hashobj)`` looks like the legacy datasketch API;
    # newer releases feed bytes through ``update()`` -- confirm the pinned
    # library version before upgrading.
    sketch = MinHash(num_perm)
    for group in listOfItems:
        for element in group:
            sketch.digest(sha1(element.encode('utf8')))

    # Compute the estimate once and reuse it for both the record and the
    # printout instead of evaluating count() twice.
    distinct_estimate = sketch.count()
    estimate.append(distinct_estimate)
    print("Estimated number of elements: ", distinct_estimate)
예제 #2
0
def estimateDistinctElements(items, num_perm):
    """Print an estimate of how many distinct elements ``items`` contains.

    Each element is UTF-8 encoded, hashed with ``sha1`` and digested by a
    ``MinHash`` sketch created with ``num_perm`` hash permutations. The
    original author notes that the usual permutation count is 128 and that
    the value was adjusted after further reading:
    http://blog.cluster-text.com/tag/minhash/

    Args:
        items: Iterable of objects providing ``.encode('utf8')`` (strings).
        num_perm: Number of hash permutations handed to ``MinHash``.

    Returns:
        None. The estimate is only printed.
    """
    sketch = MinHash(num_perm)
    for element in items:
        encoded = element.encode('utf8')
        # Fold this element's SHA-1 hash object into the sketch.
        sketch.digest(sha1(encoded))
    print("Estimated number of elements: ", sketch.count())
예제 #3
0
def minHash_bml(SX, SY):
    """Print a MinHash-based inclusion coefficient for ``SX`` and ``SY``.

    Builds one ``MinHash`` sketch per input (2**8 permutations), prints
    the estimated cardinality of each, computes the fraction ``P`` of
    signature positions where the ``SX`` value is >= the ``SY`` value,
    and prints the value returned by the external ``lookup`` helper for
    that ``P``.

    Args:
        SX: Iterable of objects providing ``.encode('utf8')`` (strings).
        SY: Iterable of objects providing ``.encode('utf8')`` (strings).

    Returns:
        None. All results are printed.
    """
    print()
    print("MinHash BML")

    # Fixed settings forwarded to ``lookup``.
    # NOTE(review): the meaning of ``width`` (32) and ``tolerance`` (1e-5)
    # is defined by ``lookup``, which is not visible here -- confirm there.
    width = 32
    exponent = 8
    num_perm = 2 ** exponent
    tolerance = 10 ** -5

    print("Number of permutations is ", num_perm)

    sketch_x = MinHash(num_perm)
    sketch_y = MinHash(num_perm)

    for doc in SX:
        sketch_x.update(doc.encode('utf8'))
    for doc in SY:
        sketch_y.update(doc.encode('utf8'))

    nx = sketch_x.count()
    ny = sketch_y.count()
    print("Estimated nx is ", nx)
    print("Estimated ny is ", ny)

    signature_x = sketch_x.digest()
    signature_y = sketch_y.digest()

    # Count the signature slots where SX's value is at least SY's value.
    hits = sum(
        1 for idx in range(num_perm) if signature_x[idx] >= signature_y[idx]
    )
    P = hits / num_perm

    print("P is: ", P)
    coefficient = lookup(P, 0, min(nx, ny), nx, ny, tolerance, exponent,
                         num_perm, width, 0, 0)
    print("Inclusion Coefficient: ", coefficient)