예제 #1
0
def mh2(data1, data2):
    """Estimate the Jaccard similarity of two token iterables via MinHash.

    Each token is UTF-8 encoded and folded into a sketch; the return value
    is the estimated Jaccard index between the two sketches.
    """
    left, right = MinHash(), MinHash()
    for token in data1:
        left.update(token.encode('utf8'))
    for token in data2:
        right.update(token.encode('utf8'))
    return left.jaccard(right)
예제 #2
0
def _run_minhash(A, B, data, seed, num_perm, b):
    """Sketch two index ranges of `data` and report their similarity.

    A and B are (start, end) index pairs into `data`. Items are hashed with
    seeded murmur3 and fed to MinHash via digest(). Returns a two-element
    list: [exact MinHash Jaccard estimate, b-bit MinHash Jaccard estimate].
    """
    hasher = pyhash.murmur3_32()
    m1 = MinHash(num_perm=num_perm)
    m2 = MinHash(num_perm=num_perm)
    # Fill each sketch from its own slice of the data.
    for sketch, (start, end) in ((m1, A), (m2, B)):
        for idx in xrange(start, end):
            sketch.digest(Hash(hasher(data[idx], seed=seed)))
    return [m1.jaccard(m2), _b_bit_minhash_jaccard(m1, m2, b)]
예제 #3
0
def _run_minhash(A, B, data, seed, num_perm, b):
    """Sketch two index ranges of `data` and report their similarity.

    A and B are (start, end) index pairs into `data`. Items are hashed with
    seeded murmur3; the MinHash instances wrap raw hash values via the
    `hashobj=Hash` adapter. Returns [exact Jaccard estimate, b-bit estimate].
    """
    hasher = pyhash.murmur3_32()
    m1 = MinHash(num_perm=num_perm, hashobj=Hash)
    m2 = MinHash(num_perm=num_perm, hashobj=Hash)
    # Fill each sketch from its own slice of the data.
    for sketch, (start, end) in ((m1, A), (m2, B)):
        for idx in xrange(start, end):
            sketch.update(hasher(data[idx], seed=seed))
    return [m1.jaccard(m2), _b_bit_minhash_jaccard(m1, m2, b)]
예제 #4
0
def eg1():
    """Print the MinHash-estimated and exact Jaccard similarity of the
    module-level `data1` and `data2` token lists."""
    sketch_a, sketch_b = MinHash(), MinHash()
    for token in data1:
        sketch_a.update(token.encode('utf8'))
    for token in data2:
        sketch_b.update(token.encode('utf8'))
    print("Estimated Jaccard for data1 and data2 is", sketch_a.jaccard(sketch_b))

    # Exact value for comparison: |intersection| / |union| over unique tokens.
    set_a, set_b = set(data1), set(data2)
    actual_jaccard = float(len(set_a & set_b)) / float(len(set_a | set_b))
    print("Actual Jaccard for data1 and data2 is", actual_jaccard)
예제 #5
0
def eg1():
    """Compare estimated (MinHash) vs. exact Jaccard similarity for the
    module-level `data1` and `data2` token lists, printing both."""
    first = MinHash()
    second = MinHash()
    for token in data1:
        first.update(token.encode('utf8'))
    for token in data2:
        second.update(token.encode('utf8'))
    print("Estimated Jaccard for data1 and data2 is", first.jaccard(second))

    # Ground truth from plain set arithmetic on the unique tokens.
    unique1 = set(data1)
    unique2 = set(data2)
    actual_jaccard = float(len(unique1.intersection(unique2))) / float(len(unique1.union(unique2)))
    print("Actual Jaccard for data1 and data2 is", actual_jaccard)
예제 #6
0
파일: main.py 프로젝트: gosom/go-minhash
def main():
    """Estimate the Jaccard similarity of plato1.txt and plato2.txt with
    64-permutation MinHash sketches and print the result with the elapsed
    time in milliseconds.

    Reads both files line-by-line (each line is one token), so the two
    files must exist in the working directory.
    """
    with open('plato1.txt', 'r') as f:
        tokens1 = [l for l in f]
    with open('plato2.txt', 'r') as f:
        tokens2 = [l for l in f]

    start = time.time()
    m1 = MinHash(num_perm=64, seed=0)
    for t in tokens1:
        m1.update(t.encode('utf8'))

    # Reuse m1's permutations so both sketches share the same hash functions
    # and are directly comparable.
    m2 = MinHash(num_perm=64, seed=0, permutations=m1.permutations)
    for t in tokens2:
        m2.update(t.encode('utf8'))
    similarity = m2.jaccard(m1)
    elapsed = time.time() - start
    # BUG FIX: print() is not printf-style — the original passed the format
    # string and the values as separate arguments, so "%f" was never
    # interpolated. Use %-formatting explicitly.
    print("Similar %f and Took %f ms" % (similarity, elapsed * 1000))
예제 #7
0
    eg2()

>>> 
>>> from datasketch import MinHash

# Two small token lists that overlap on most words; used to demonstrate
# how close the MinHash estimate gets to the true Jaccard similarity.
data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
        'estimating', 'the', 'similarity', 'between', 'datasets']
data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
        'estimating', 'the', 'similarity', 'between', 'documents']

# Build one sketch per list by folding in each UTF-8 encoded token.
m1, m2 = MinHash(), MinHash()
for d in data1:
    m1.update(d.encode('utf8'))
for d in data2:
    m2.update(d.encode('utf8'))
print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))

# Exact Jaccard over the unique tokens: |intersection| / |union|.
s1 = set(data1)
s2 = set(data2)
actual_jaccard = float(len(s1.intersection(s2)))/float(len(s1.union(s2)))
print("Actual Jaccard for data1 and data2 is", actual_jaccard)
>>> 
>>> m = MinHash(num_perm=256)
>>> m.count()
0.0
>>> from sparselsh import LSH
from scipy.sparse import csr_matrix

X = csr_matrix( [
    [ 3, 0, 0, 0, 0, 0, -1],
    [ 0, 1, 0, 0, 0, 0,  1],
예제 #8
0
total_correct = 0
# For every sentence in column question2, retrieve LSH candidates from
# question1 and keep those whose MinHash Jaccard clears the threshold.
for i in range(0, total_questions):
    probe = MinHash()
    for word in tokenize_sentence(str(df['question2'][i])):
        probe.update(word.encode('utf8'))
    candidates = lsh.query(probe)

    # Filter candidates by direct MinHash-vs-MinHash similarity.
    result = []
    for cand_key in candidates:
        cand_tokens = set(tokenize_sentence(str(df['question1'][int(cand_key)])))
        cand_sketch = MinHash()
        for word in cand_tokens:
            cand_sketch.update(word.encode('utf8'))
        if cand_sketch.jaccard(probe) >= threshold_jacard:
            result.append(str(cand_key))

    # Update the precision/recall counters.
    if df['is_duplicate'][i] == 1:
        total_correct += 1
    if len(result) > 0:
        return_result += 1
    if str(i) in result:
        total += 1
        if df['is_duplicate'][i]:
            correct += 1

print("Precision {}%".format(correct / return_result * 100))
print("Recall {}%".format(correct / total_correct * 100))