def _run_acc(size, seed, num_perm):
    """Build a MinHash sketch and the exact set for *size* random integers.

    :param size: number of random draws (values fall in [1, size]).
    :param seed: seed for the random generator, for reproducibility.
    :param num_perm: number of permutation functions for the MinHash.
    :return: tuple of (MinHash sketch, exact set of byte values).
    """
    sketch = MinHash(num_perm=num_perm)
    exact = set()
    random.seed(seed)
    for _ in range(size):
        value = int_bytes(random.randint(1, size))
        sketch.digest(sha1(value))
        exact.add(value)
    return (sketch, exact)
def run_perf(card, num_perm):
    """Benchmark digesting *card* hashes into a single MinHash.

    :param card: number of items to digest.
    :param num_perm: number of permutation functions for the MinHash.
    :return: elapsed time in seconds for the digest loop.
    """
    m = MinHash(num_perm=num_perm)
    # Lazy %-style args so formatting is skipped when the level is disabled.
    logging.info("MinHash using %d permutation functions", num_perm)
    # time.clock() was deprecated since 3.3 and removed in Python 3.8;
    # perf_counter() is the recommended high-resolution benchmark timer.
    start = time.perf_counter()
    for i in range(card):
        m.digest(sha1(int_bytes(i)))
    duration = time.perf_counter() - start
    logging.info("Digested %d hashes in %.4f sec", card, duration)
    return duration
def _run_minhash(A, B, data, seed, p):
    """Estimate inclusion between two data ranges using MinHash sketches.

    :param A: (start, end) index range of the first set within *data*.
    :param B: (start, end) index range of the second set within *data*.
    :param data: indexable collection of hashable items.
    :param seed: seed passed to the murmur3 hasher.
    :param p: log2 of the number of permutations (2**p used).
    :return: result of _minhash_inclusion on the two sketches.
    """
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    num_perm = 2 ** p
    sketch_a = MinHash(num_perm=num_perm)
    sketch_b = MinHash(num_perm=num_perm)
    for idx in xrange(a_start, a_end):
        sketch_a.digest(Hash(hasher(data[idx], seed=seed)))
    for idx in xrange(b_start, b_end):
        sketch_b.digest(Hash(hasher(data[idx], seed=seed)))
    return _minhash_inclusion(sketch_a, sketch_b)
def _run_minhash(A, B, data, seed, num_perm):
    """Estimate Jaccard similarity between two data ranges via MinHash.

    :param A: (start, end) index range of the first set within *data*.
    :param B: (start, end) index range of the second set within *data*.
    :param data: indexable collection of hashable items.
    :param seed: seed passed to the murmur3 hasher.
    :param num_perm: number of permutation functions for each MinHash.
    :return: result of jaccard() over the two sketches.
    """
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    sketch_a = MinHash(num_perm=num_perm)
    sketch_b = MinHash(num_perm=num_perm)
    for idx in xrange(a_start, a_end):
        sketch_a.digest(Hash(hasher(data[idx], seed=seed)))
    for idx in xrange(b_start, b_end):
        sketch_b.digest(Hash(hasher(data[idx], seed=seed)))
    return jaccard([sketch_a, sketch_b])
def _run_minhash(A, B, data, seed, bs, num_perm):
    """Estimate Jaccard similarity plus b-bit variants for two data ranges.

    :param A: (start, end) index range of the first set within *data*.
    :param B: (start, end) index range of the second set within *data*.
    :param data: indexable collection of hashable items.
    :param seed: seed passed to the murmur3 hasher.
    :param bs: iterable of bit counts for the b-bit MinHash estimates.
    :param num_perm: number of permutation functions for each MinHash.
    :return: list [full-precision jaccard, b-bit jaccard for each b in bs].
    """
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    sketch_a = MinHash(num_perm=num_perm)
    sketch_b = MinHash(num_perm=num_perm)
    for idx in xrange(a_start, a_end):
        sketch_a.digest(Hash(hasher(data[idx], seed=seed)))
    for idx in xrange(b_start, b_end):
        sketch_b.digest(Hash(hasher(data[idx], seed=seed)))
    estimates = [sketch_a.jaccard(sketch_b)]
    estimates.extend(_b_bit_minhash_jaccard(sketch_a, sketch_b, b) for b in bs)
    return estimates
def dict_to_minhash(v):
    """
    Generates a Minhash for a dict object
    :param v: dictionary
    :return: minhash
    """
    sketch = MinHash()
    for token in my_tokenizer(v):
        sketch.digest(sha1(token.encode('utf8')))
    return sketch
def test_pickle(self):
    """Round-trip an LSH index through pickle and query the restored copy.

    Builds a small index, pickles and unpickles it, then asserts the
    unpickled index still answers queries for the inserted keys.
    """
    lsh = LSH(threshold=0.5, num_perm=16)
    m1 = MinHash(16)
    m1.digest(sha1("a".encode("utf8")))
    m2 = MinHash(16)
    m2.digest(sha1("b".encode("utf8")))
    lsh.insert("a", m1)
    lsh.insert("b", m2)
    lsh2 = pickle.loads(pickle.dumps(lsh))
    # Bug fix: query the unpickled copy (lsh2), not the original index --
    # the original code queried `lsh`, so the pickle round-trip was never
    # actually verified.
    result = lsh2.query(m1)
    self.assertTrue("a" in result)
    result = lsh2.query(m2)
    self.assertTrue("b" in result)
def eg1():
    """Compare the MinHash Jaccard estimate against the exact Jaccard."""
    m1 = MinHash()
    m2 = MinHash()
    for item in data1:
        m1.digest(sha1(item.encode('utf8')))
    for item in data2:
        m2.digest(sha1(item.encode('utf8')))
    print("Estimated Jaccard for data1 and data2 is", jaccard(m1, m2))
    s1 = set(data1)
    s2 = set(data2)
    # Exact Jaccard: |intersection| / |union| of the raw sets.
    actual_jaccard = float(len(s1 & s2)) / float(len(s1 | s2))
    print("Actual Jaccard for data1 and data2 is", actual_jaccard)
def eg1():
    """Compare the MinHash Jaccard estimate against the exact Jaccard."""
    m1 = MinHash()
    m2 = MinHash()
    for item in data1:
        m1.digest(sha1(item.encode('utf8')))
    for item in data2:
        m2.digest(sha1(item.encode('utf8')))
    print("Estimated Jaccard for data1 and data2 is", jaccard([m1, m2]))
    s1 = set(data1)
    s2 = set(data2)
    # Exact Jaccard: |intersection| / |union| of the raw sets.
    actual_jaccard = float(len(s1 & s2)) / float(len(s1 | s2))
    print("Actual Jaccard for data1 and data2 is", actual_jaccard)
def test_query(self):
    """Query returns inserted keys; mismatched num_perm raises ValueError."""
    lsh = LSH(threshold=0.5, num_perm=16)
    m1 = MinHash(16)
    m1.digest(sha1("a".encode("utf8")))
    m2 = MinHash(16)
    m2.digest(sha1("b".encode("utf8")))
    lsh.insert("a", m1)
    lsh.insert("b", m2)
    self.assertTrue("a" in lsh.query(m1))
    self.assertTrue("b" in lsh.query(m2))
    # A MinHash built with a different permutation count must be rejected.
    m3 = MinHash(18)
    self.assertRaises(ValueError, lsh.query, m3)
def test_insert(self):
    """Inserted keys land in every hashtable; bad num_perm raises ValueError."""
    lsh = LSH(threshold=0.5, num_perm=16)
    m1 = MinHash(16)
    m1.digest(sha1("a".encode("utf8")))
    m2 = MinHash(16)
    m2.digest(sha1("b".encode("utf8")))
    lsh.insert("a", m1)
    lsh.insert("b", m2)
    for table in lsh.hashtables:
        self.assertTrue(len(table) >= 1)
        # Collect every key stored in every bucket of this table.
        stored = []
        for bucket_key in table:
            stored.extend(table[bucket_key])
        self.assertTrue("a" in stored)
        self.assertTrue("b" in stored)
    # A MinHash built with a different permutation count must be rejected.
    m3 = MinHash(18)
    self.assertRaises(ValueError, lsh.insert, "c", m3)
def eg1():
    """Build three MinHashes, index two in an LSH, and query with the third."""
    m1 = MinHash()
    m2 = MinHash()
    m3 = MinHash()
    for item in data1:
        m1.digest(sha1(item.encode('utf8')))
    for item in data2:
        m2.digest(sha1(item.encode('utf8')))
    for item in data3:
        m3.digest(sha1(item.encode('utf8')))
    # Create LSH index
    lsh = LSH(threshold=0.5)
    lsh.insert("m2", m2)
    lsh.insert("m3", m3)
    result = lsh.query(m1)
    print("Approximate neighbours with Jaccard similarity > 0.5", result)
def _run_minhash(data, seed, p):
    """Estimate the cardinality of *data* with a MinHash of 2**p permutations.

    :param data: iterable of hashable items.
    :param seed: seed passed to the murmur3 hasher.
    :param p: log2 of the number of permutations (2**p used).
    :return: MinHash cardinality estimate (m.count()).
    """
    hasher = pyhash.murmur3_32()
    sketch = MinHash(num_perm=2 ** p)
    for item in data:
        sketch.digest(Hash(hasher(item, seed=seed)))
    return sketch.count()