Пример #1
0
def _run_acc(size, seed, num_perm):
    m = MinHash(num_perm=num_perm)
    s = set()
    random.seed(seed)
    for i in range(size):
        v = int_bytes(random.randint(1, size))
        m.digest(sha1(v))
        s.add(v)
    return (m, s)
Пример #2
0
def _run_acc(size, seed, num_perm):
    m = MinHash(num_perm=num_perm)
    s = set()
    random.seed(seed)
    for i in range(size):
        v = int_bytes(random.randint(1, size))
        m.digest(sha1(v))
        s.add(v)
    return (m, s)
Пример #3
0
def run_perf(card, num_perm):
    m = MinHash(num_perm=num_perm)
    logging.info("MinHash using %d permutation functions" % num_perm)
    start = time.clock()
    for i in range(card):
        m.digest(sha1(int_bytes(i)))
    duration = time.clock() - start 
    logging.info("Digested %d hashes in %.4f sec" % (card, duration))
    return duration
Пример #4
0
def run_perf(card, num_perm):
    m = MinHash(num_perm=num_perm)
    logging.info("MinHash using %d permutation functions" % num_perm)
    start = time.clock()
    for i in range(card):
        m.digest(sha1(int_bytes(i)))
    duration = time.clock() - start 
    logging.info("Digested %d hashes in %.4f sec" % (card, duration))
    return duration
Пример #5
0
def _run_minhash(A, B, data, seed, p):
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    m1 = MinHash(num_perm=2**p)
    m2 = MinHash(num_perm=2**p)
    for i in xrange(a_start, a_end):
        m1.digest(Hash(hasher(data[i], seed=seed)))
    for i in xrange(b_start, b_end):
        m2.digest(Hash(hasher(data[i], seed=seed)))
    return _minhash_inclusion(m1, m2)
Пример #6
0
def _run_minhash(A, B, data, seed, num_perm):
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    m1 = MinHash(num_perm=num_perm)
    m2 = MinHash(num_perm=num_perm)
    for i in xrange(a_start, a_end):
        m1.digest(Hash(hasher(data[i], seed=seed)))
    for i in xrange(b_start, b_end):
        m2.digest(Hash(hasher(data[i], seed=seed)))
    return jaccard([m1, m2])
def _run_minhash(A, B, data, seed, bs, num_perm):
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    m1 = MinHash(num_perm=num_perm)
    m2 = MinHash(num_perm=num_perm)
    for i in xrange(a_start, a_end):
        m1.digest(Hash(hasher(data[i], seed=seed)))
    for i in xrange(b_start, b_end):
        m2.digest(Hash(hasher(data[i], seed=seed)))
    return [m1.jaccard(m2)] + \
            [_b_bit_minhash_jaccard(m1, m2, b) for b in bs]
Пример #8
0
def dict_to_minhash(v):
    """
    Generates a Minhash for a dict object
    :param v: dictionary
    :return: minhash
    """
    m_ = MinHash()
    tokens = my_tokenizer(v)
    for t in tokens:
        m_.digest(sha1(t.encode('utf8')))
    return m_
Пример #9
0
 def test_pickle(self):
     lsh = LSH(threshold=0.5, num_perm=16)
     m1 = MinHash(16)
     m1.digest(sha1("a".encode("utf8")))
     m2 = MinHash(16)
     m2.digest(sha1("b".encode("utf8")))
     lsh.insert("a", m1)
     lsh.insert("b", m2)
     lsh2 = pickle.loads(pickle.dumps(lsh))
     result = lsh.query(m1)
     self.assertTrue("a" in result)
     result = lsh.query(m2)
     self.assertTrue("b" in result)
Пример #10
0
def eg1():
    m1 = MinHash()
    m2 = MinHash()
    for d in data1:
        m1.digest(sha1(d.encode('utf8')))
    for d in data2:
        m2.digest(sha1(d.encode('utf8')))
    print("Estimated Jaccard for data1 and data2 is", jaccard(m1, m2))

    s1 = set(data1)
    s2 = set(data2)
    actual_jaccard = float(len(s1.intersection(s2))) /\
            float(len(s1.union(s2)))
    print("Actual Jaccard for data1 and data2 is", actual_jaccard)
Пример #11
0
def eg1():
    m1 = MinHash()
    m2 = MinHash()
    for d in data1:
        m1.digest(sha1(d.encode('utf8')))
    for d in data2:
        m2.digest(sha1(d.encode('utf8')))
    print("Estimated Jaccard for data1 and data2 is", jaccard([m1, m2]))

    s1 = set(data1)
    s2 = set(data2)
    actual_jaccard = float(len(s1.intersection(s2))) /\
            float(len(s1.union(s2)))
    print("Actual Jaccard for data1 and data2 is", actual_jaccard)
Пример #12
0
 def test_query(self):
     lsh = LSH(threshold=0.5, num_perm=16)
     m1 = MinHash(16)
     m1.digest(sha1("a".encode("utf8")))
     m2 = MinHash(16)
     m2.digest(sha1("b".encode("utf8")))
     lsh.insert("a", m1)
     lsh.insert("b", m2)
     result = lsh.query(m1)
     self.assertTrue("a" in result)
     result = lsh.query(m2)
     self.assertTrue("b" in result)
     
     m3 = MinHash(18)
     self.assertRaises(ValueError, lsh.query, m3)
Пример #13
0
    def test_insert(self):
        lsh = LSH(threshold=0.5, num_perm=16)
        m1 = MinHash(16)
        m1.digest(sha1("a".encode("utf8")))
        m2 = MinHash(16)
        m2.digest(sha1("b".encode("utf8")))
        lsh.insert("a", m1)
        lsh.insert("b", m2)
        for t in lsh.hashtables:
            self.assertTrue(len(t) >= 1)
            items = []
            for H in t:
                items.extend(t[H])
            self.assertTrue("a" in items)
            self.assertTrue("b" in items)

        m3 = MinHash(18)
        self.assertRaises(ValueError, lsh.insert, "c", m3)
Пример #14
0
def eg1():
    m1 = MinHash()
    m2 = MinHash()
    m3 = MinHash()
    for d in data1:
        m1.digest(sha1(d.encode('utf8')))
    for d in data2:
        m2.digest(sha1(d.encode('utf8')))
    for d in data3:
        m3.digest(sha1(d.encode('utf8')))

    # Create LSH index
    lsh = LSH(threshold=0.5)
    lsh.insert("m2", m2)
    lsh.insert("m3", m3)
    result = lsh.query(m1)
    print("Approximate neighbours with Jaccard similarity > 0.5", result)
Пример #15
0
def _run_minhash(data, seed, p):
    hasher = pyhash.murmur3_32()
    m = MinHash(num_perm=2**p)
    for d in data:
        m.digest(Hash(hasher(d, seed=seed)))
    return m.count()