def mh2(data1, data2):
    """Estimate the Jaccard similarity of two token collections via MinHash.

    Every token from each input is UTF-8 encoded and folded into its own
    MinHash sketch; the sketches' estimated Jaccard similarity (a float in
    [0, 1]) is returned.
    """
    sketch_a, sketch_b = MinHash(), MinHash()
    for token in data1:
        sketch_a.update(token.encode('utf8'))
    for token in data2:
        sketch_b.update(token.encode('utf8'))
    return sketch_a.jaccard(sketch_b)
def _run_minhash(A, B, data, seed, num_perm, b):
    """Benchmark helper: sketch two slices of `data` and compare them.

    A, B     -- (start, end) index pairs selecting the two slices of `data`.
    data     -- indexable sequence of items accepted by the murmur3 hasher.
    seed     -- seed passed to the murmur3 hash of every item.
    num_perm -- number of permutations for each MinHash sketch.
    b        -- bit width for the b-bit MinHash estimate.

    Returns [full-sketch Jaccard estimate, b-bit MinHash estimate].

    NOTE(review): this module defines `_run_minhash` twice; this earlier
    definition is shadowed by the later one and appears to use an older
    datasketch API (`digest` with a wrapped Hash) — confirm which is intended.
    """
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    m1 = MinHash(num_perm=num_perm)
    m2 = MinHash(num_perm=num_perm)
    # BUG FIX: `xrange` is Python-2-only and raises NameError on Python 3,
    # which the rest of this file (print() with multiple args) targets.
    for i in range(a_start, a_end):
        m1.digest(Hash(hasher(data[i], seed=seed)))
    for i in range(b_start, b_end):
        m2.digest(Hash(hasher(data[i], seed=seed)))
    return [m1.jaccard(m2), _b_bit_minhash_jaccard(m1, m2, b)]
def _run_minhash(A, B, data, seed, num_perm, b):
    """Benchmark helper: sketch two slices of `data` and compare them.

    A, B     -- (start, end) index pairs selecting the two slices of `data`.
    data     -- indexable sequence of items accepted by the murmur3 hasher.
    seed     -- seed passed to the murmur3 hash of every item.
    num_perm -- number of permutations for each MinHash sketch.
    b        -- bit width for the b-bit MinHash estimate.

    Returns [full-sketch Jaccard estimate, b-bit MinHash estimate].
    """
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    # `hashobj=Hash` lets update() accept the raw murmur3 integer directly.
    m1 = MinHash(num_perm=num_perm, hashobj=Hash)
    m2 = MinHash(num_perm=num_perm, hashobj=Hash)
    # BUG FIX: `xrange` is Python-2-only and raises NameError on Python 3,
    # which the rest of this file (print() with multiple args) targets.
    for i in range(a_start, a_end):
        m1.update(hasher(data[i], seed=seed))
    for i in range(b_start, b_end):
        m2.update(hasher(data[i], seed=seed))
    return [m1.jaccard(m2), _b_bit_minhash_jaccard(m1, m2, b)]
def eg1():
    """Print MinHash-estimated vs. exact Jaccard for the module-level
    token lists `data1` and `data2`.
    """
    m1 = MinHash()
    m2 = MinHash()
    for d in data1:
        m1.update(d.encode('utf8'))
    for d in data2:
        m2.update(d.encode('utf8'))
    print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))
    s1 = set(data1)
    s2 = set(data2)
    # BUG FIX: the original carried a stray line-continuation backslash
    # ("/\") inside this expression, which is a syntax error; restore the
    # plain division. Exact Jaccard = |intersection| / |union|.
    actual_jaccard = float(len(s1.intersection(s2))) / float(len(s1.union(s2)))
    print("Actual Jaccard for data1 and data2 is", actual_jaccard)
def main():
    """Time the MinHash similarity of two token files.

    Reads one token per line from plato1.txt and plato2.txt, builds two
    64-permutation sketches that share one permutation set, and prints the
    estimated similarity plus elapsed milliseconds.
    """
    with open('plato1.txt', 'r') as f:
        tokens1 = [l for l in f]
    with open('plato2.txt', 'r') as f:
        tokens2 = [l for l in f]
    start = time.time()
    m1 = MinHash(num_perm=64, seed=0)
    for t in tokens1:
        m1.update(t.encode('utf8'))
    # Reuse m1's permutations so the sketches are directly comparable
    # without regenerating the permutation tables.
    m2 = MinHash(num_perm=64, seed=0, permutations=m1.permutations)
    for t in tokens2:
        m2.update(t.encode('utf8'))
    similarity = m2.jaccard(m1)
    elapsed = time.time() - start
    # BUG FIX: print() does not apply %-formatting; the original passed the
    # format string and the values as separate arguments, printing the raw
    # "%f" placeholders. Format explicitly instead.
    print("Similar %f and Took %f ms" % (similarity, elapsed * 1000))
eg2() >>> >>> from datasketch import MinHash data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for', 'estimating', 'the', 'similarity', 'between', 'datasets'] data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for', 'estimating', 'the', 'similarity', 'between', 'documents'] m1, m2 = MinHash(), MinHash() for d in data1: m1.update(d.encode('utf8')) for d in data2: m2.update(d.encode('utf8')) print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2)) s1 = set(data1) s2 = set(data2) actual_jaccard = float(len(s1.intersection(s2)))/float(len(s1.union(s2))) print("Actual Jaccard for data1 and data2 is", actual_jaccard) >>> >>> m = MinHash(num_perm=256) >>> m.count() 0.0 >>> from sparselsh import LSH from scipy.sparse import csr_matrix X = csr_matrix( [ [ 3, 0, 0, 0, 0, 0, -1], [ 0, 1, 0, 0, 0, 0, 1],
total_correct = 0
# For each sentence in column question2, find similar questions via LSH,
# then confirm candidates with an exact MinHash Jaccard comparison.
# NOTE(review): `return_result`, `total`, and `correct` are assumed to be
# initialized to 0 earlier in the script — confirm against the full file.
for i in range(0, total_questions):
    question_minhash = MinHash()
    question = tokenize_sentence(str(df['question2'][i]))
    for word in question:
        question_minhash.update(word.encode('utf8'))
    candidates = lsh.query(question_minhash)
    result = []
    # Keep only candidates whose MinHash Jaccard clears the threshold.
    for j in range(len(candidates)):
        candidate = df['question1'][int(candidates[j])]
        cand = set(tokenize_sentence(str(candidate)))
        cand_minhash = MinHash()
        for word in cand:
            cand_minhash.update(word.encode('utf8'))
        if cand_minhash.jaccard(question_minhash) >= threshold_jacard:
            result.append(str(candidates[j]))
    # Statistics.
    if df['is_duplicate'][i] == 1:
        total_correct += 1
    if len(result) > 0:
        return_result += 1
    if str(i) in result:
        total += 1
        if df['is_duplicate'][i]:
            correct += 1
# Guard the denominators: with no retrieved results (or no duplicates in
# the data) the original raised ZeroDivisionError.
if return_result:
    print("Precision {}%".format(correct / return_result * 100))
if total_correct:
    print("Recall {}%".format(correct / total_correct * 100))