def mh2(data1, data2):
    """Estimate the Jaccard similarity of two token collections via MinHash.

    Every token from each input is UTF-8 encoded and folded into its own
    MinHash sketch; the sketches' estimated Jaccard similarity (a float in
    [0, 1]) is returned.
    """
    sketch_a, sketch_b = MinHash(), MinHash()
    for token in data1:
        sketch_a.update(token.encode('utf8'))
    for token in data2:
        sketch_b.update(token.encode('utf8'))
    return sketch_a.jaccard(sketch_b)
def _run_minhash(A, B, data, seed, num_perm, b):
    """Benchmark helper: sketch two slices of `data` and compare them.

    A, B     -- (start, end) index pairs selecting the two slices of `data`.
    data     -- indexable sequence of items accepted by the murmur3 hasher.
    seed     -- seed passed to the murmur3 hash of every item.
    num_perm -- number of permutations for each MinHash sketch.
    b        -- bit width for the b-bit MinHash estimate.

    Returns [full-sketch Jaccard estimate, b-bit MinHash estimate].

    NOTE(review): this module defines `_run_minhash` twice; this earlier
    definition is shadowed by the later one and appears to use an older
    datasketch API (`digest` with a wrapped Hash) — confirm which is intended.
    """
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    m1 = MinHash(num_perm=num_perm)
    m2 = MinHash(num_perm=num_perm)
    # BUG FIX: `xrange` is Python-2-only and raises NameError on Python 3,
    # which the rest of this file (print() with multiple args) targets.
    for i in range(a_start, a_end):
        m1.digest(Hash(hasher(data[i], seed=seed)))
    for i in range(b_start, b_end):
        m2.digest(Hash(hasher(data[i], seed=seed)))
    return [m1.jaccard(m2), _b_bit_minhash_jaccard(m1, m2, b)]
def _run_minhash(A, B, data, seed, num_perm, b):
    """Benchmark helper: sketch two slices of `data` and compare them.

    A, B     -- (start, end) index pairs selecting the two slices of `data`.
    data     -- indexable sequence of items accepted by the murmur3 hasher.
    seed     -- seed passed to the murmur3 hash of every item.
    num_perm -- number of permutations for each MinHash sketch.
    b        -- bit width for the b-bit MinHash estimate.

    Returns [full-sketch Jaccard estimate, b-bit MinHash estimate].
    """
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    # `hashobj=Hash` lets update() accept the raw murmur3 integer directly.
    m1 = MinHash(num_perm=num_perm, hashobj=Hash)
    m2 = MinHash(num_perm=num_perm, hashobj=Hash)
    # BUG FIX: `xrange` is Python-2-only and raises NameError on Python 3,
    # which the rest of this file (print() with multiple args) targets.
    for i in range(a_start, a_end):
        m1.update(hasher(data[i], seed=seed))
    for i in range(b_start, b_end):
        m2.update(hasher(data[i], seed=seed))
    return [m1.jaccard(m2), _b_bit_minhash_jaccard(m1, m2, b)]
def eg1():
    """Print MinHash-estimated vs. exact Jaccard for the module-level
    token lists `data1` and `data2`.
    """
    m1 = MinHash()
    m2 = MinHash()
    for d in data1:
        m1.update(d.encode('utf8'))
    for d in data2:
        m2.update(d.encode('utf8'))
    print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))
    s1 = set(data1)
    s2 = set(data2)
    # BUG FIX: the original carried a stray line-continuation backslash
    # ("/\") inside this expression, which is a syntax error; restore the
    # plain division. Exact Jaccard = |intersection| / |union|.
    actual_jaccard = float(len(s1.intersection(s2))) / float(len(s1.union(s2)))
    print("Actual Jaccard for data1 and data2 is", actual_jaccard)
def main():
    """Time the MinHash similarity of two token files.

    Reads one token per line from plato1.txt and plato2.txt, builds two
    64-permutation sketches that share one permutation set, and prints the
    estimated similarity plus elapsed milliseconds.
    """
    with open('plato1.txt', 'r') as f:
        tokens1 = [l for l in f]
    with open('plato2.txt', 'r') as f:
        tokens2 = [l for l in f]
    start = time.time()
    m1 = MinHash(num_perm=64, seed=0)
    for t in tokens1:
        m1.update(t.encode('utf8'))
    # Reuse m1's permutations so the sketches are directly comparable
    # without regenerating the permutation tables.
    m2 = MinHash(num_perm=64, seed=0, permutations=m1.permutations)
    for t in tokens2:
        m2.update(t.encode('utf8'))
    similarity = m2.jaccard(m1)
    elapsed = time.time() - start
    # BUG FIX: print() does not apply %-formatting; the original passed the
    # format string and the values as separate arguments, printing the raw
    # "%f" placeholders. Format explicitly instead.
    print("Similar %f and Took %f ms" % (similarity, elapsed * 1000))
eg2() >>> >>> from datasketch import MinHash data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for', 'estimating', 'the', 'similarity', 'between', 'datasets'] data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for', 'estimating', 'the', 'similarity', 'between', 'documents'] m1, m2 = MinHash(), MinHash() for d in data1: m1.update(d.encode('utf8')) for d in data2: m2.update(d.encode('utf8')) print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2)) s1 = set(data1) s2 = set(data2) actual_jaccard = float(len(s1.intersection(s2)))/float(len(s1.union(s2))) print("Actual Jaccard for data1 and data2 is", actual_jaccard) >>> >>> m = MinHash(num_perm=256) >>> m.count() 0.0 >>> from sparselsh import LSH from scipy.sparse import csr_matrix X = csr_matrix( [ [ 3, 0, 0, 0, 0, 0, -1], [ 0, 1, 0, 0, 0, 0, 1],
total_correct = 0
# For each sentence in column question2, find similar questions via LSH,
# then confirm candidates with an exact MinHash Jaccard comparison.
# NOTE(review): `return_result`, `total`, and `correct` are assumed to be
# initialized to 0 earlier in the script — confirm against the full file.
for i in range(0, total_questions):
    question_minhash = MinHash()
    question = tokenize_sentence(str(df['question2'][i]))
    for word in question:
        question_minhash.update(word.encode('utf8'))
    candidates = lsh.query(question_minhash)
    result = []
    # Keep only candidates whose MinHash Jaccard clears the threshold.
    for j in range(len(candidates)):
        candidate = df['question1'][int(candidates[j])]
        cand = set(tokenize_sentence(str(candidate)))
        cand_minhash = MinHash()
        for word in cand:
            cand_minhash.update(word.encode('utf8'))
        if cand_minhash.jaccard(question_minhash) >= threshold_jacard:
            result.append(str(candidates[j]))
    # Statistics.
    if df['is_duplicate'][i] == 1:
        total_correct += 1
    if len(result) > 0:
        return_result += 1
    if str(i) in result:
        total += 1
        if df['is_duplicate'][i]:
            correct += 1
# Guard the denominators: with no retrieved results (or no duplicates in
# the data) the original raised ZeroDivisionError.
if return_result:
    print("Precision {}%".format(correct / return_result * 100))
if total_correct:
    print("Recall {}%".format(correct / total_correct * 100))