예제 #1
0
def run_perf(card, num_perm):
    m = MinHash(num_perm=num_perm)
    logging.info("MinHash using %d permutation functions" % num_perm)
    start = time.clock()
    for i in range(card):
        m.digest(sha1(int_bytes(i)))
    duration = time.clock() - start 
    logging.info("Digested %d hashes in %.4f sec" % (card, duration))
    return duration
예제 #2
0
def _run_acc(size, seed, num_perm):
    m = MinHash(num_perm=num_perm)
    s = set()
    random.seed(seed)
    for i in range(size):
        v = int_bytes(random.randint(1, size))
        m.digest(sha1(v))
        s.add(v)
    return (m, s)
예제 #3
0
def dict_to_minhash(v):
    """
    Generates a Minhash for a dict object
    :param v: dictionary
    :return: minhash
    """
    m_ = MinHash()
    tokens = my_tokenizer(v)
    for t in tokens:
        m_.digest(sha1(t.encode('utf8')))
    return m_
예제 #4
0
def minhash_from_text(text, num_perm, delimiters):
    """Calculate minhash of text.

    Args:
        text: string to calculate minhash of.
        num_perm: number of random permutation functions used by MinHash to
            be indexed.
        delimiters: list of strings used as delimiters for splitting text
            into words.

    Returns:
        A minhash (instance of datasketch.minhash.MinHash)
    """
    minhash = MinHash(num_perm)
    for word in _shingles_from_text(text, delimiters):
        minhash.update(word.encode('utf8'))
    return minhash
예제 #5
0
def eg1():
    m1 = MinHash()
    m2 = MinHash()
    m3 = MinHash()
    for d in data1:
        m1.digest(sha1(d.encode('utf8')))
    for d in data2:
        m2.digest(sha1(d.encode('utf8')))
    for d in data3:
        m3.digest(sha1(d.encode('utf8')))

    # Create LSH index
    lsh = LSH(threshold=0.5)
    lsh.insert("m2", m2)
    lsh.insert("m3", m3)
    result = lsh.query(m1)
    print("Approximate neighbours with Jaccard similarity > 0.5", result)
예제 #6
0
def _run_minhash(A, B, data, seed, num_perm, b):
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    m1 = MinHash(num_perm=num_perm)
    m2 = MinHash(num_perm=num_perm)
    for i in xrange(a_start, a_end):
        m1.digest(Hash(hasher(data[i], seed=seed)))
    for i in xrange(b_start, b_end):
        m2.digest(Hash(hasher(data[i], seed=seed)))
    return [m1.jaccard(m2), _b_bit_minhash_jaccard(m1, m2, b)]
예제 #7
0
 def test_pickle(self):
     forest = MinHashLSHForest()
     m1 = MinHash()
     m1.update("a".encode("utf8"))
     m2 = MinHash()
     m2.update("b".encode("utf8"))
     forest.add("a", m1)
     forest.add("b", m2)
     forest.index()
     forest2 = pickle.loads(pickle.dumps(forest))
     result = forest.query(m1, 1)
     self.assertTrue("a" in result)
     result = forest.query(m2, 1)
     self.assertTrue("b" in result)
예제 #8
0
def eg1():
    m1 = MinHash()
    m2 = MinHash()
    for d in data1:
        m1.digest(sha1(d.encode('utf8')))
    for d in data2:
        m2.digest(sha1(d.encode('utf8')))
    print("Estimated Jaccard for data1 and data2 is", jaccard([m1, m2]))

    s1 = set(data1)
    s2 = set(data2)
    actual_jaccard = float(len(s1.intersection(s2))) /\
            float(len(s1.union(s2)))
    print("Actual Jaccard for data1 and data2 is", actual_jaccard)
예제 #9
0
 async def test_get_counts_mongo(self):
     async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                threshold=0.5,
                                num_perm=16) as lsh:
         m1 = MinHash(16)
         m1.update("a".encode("utf8"))
         m2 = MinHash(16)
         m2.update("b".encode("utf8"))
         await lsh.insert("a", m1)
         await lsh.insert("b", m2)
         counts = await lsh.get_counts()
         self.assertEqual(len(counts), lsh.b)
         for table in counts:
             self.assertEqual(sum(table.values()), 2)
예제 #10
0
    async def test_pickle_mongo(self):
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo, threshold=0.5, num_perm=16) as lsh:
            m1 = MinHash(16)
            m1.update("a".encode("utf8"))
            m2 = MinHash(16)
            m2.update("b".encode("utf8"))
            await lsh.insert("a", m1)
            await lsh.insert("b", m2)
            pickled = pickle.dumps(lsh)

        async with pickle.loads(pickled) as lsh2:
            result = await lsh2.query(m1)
            self.assertTrue("a" in result)
            result = await lsh2.query(m2)
            self.assertTrue("b" in result)
            await lsh2.close()
예제 #11
0
    def test_query(self):
        m1 = MinHash()
        m1.update("a".encode("utf8"))
        m1.update("b".encode("utf8"))
        m1.update("c".encode("utf8"))
        forest = self._setup()
        result = forest.query(m1, 3)
        self.assertTrue("a" in result)
        self.assertTrue("b" in result)
        self.assertTrue("c" in result)

        m3 = MinHash(18)
        self.assertRaises(ValueError, forest.query, m3, 1)
예제 #12
0
    def _get_minhash_from_domain(domain):
        """Get the Minhash value from a domain name.

        This function takes a domain, removes the TLD extension
        from it and then creates a MinHash object from every
        remaining character in the domain.

        If a domain starts with www., it will be stripped of the
        domain before the Minhash is calculated.

        Args:
          domain: string with a full domain, eg. www.google.com

        Returns:
            A minhash (instance of datasketch.minhash.MinHash)
        """
        domain_items = domain.split('.')
        domain_part = '.'.join(domain_items[:-1])

        minhash = MinHash(similarity.DEFAULT_PERMUTATIONS)
        for char in domain_part:
            minhash.update(char.encode('utf8'))

        return minhash
예제 #13
0
    def _get_minhash_from_domain(domain):
        """Get the Minhash value from a domain name.

        This function takes a domain, removes the TLD extension
        from it and then creates a MinHash object from every
        remaining character in the domain.

        If a domain starts with www., it will be stripped of the
        domain before the Minhash is calculated.

        Args:
          domain: string with a full domain, eg. www.google.com

        Returns:
            A minhash (instance of datasketch.minhash.MinHash)
        """
        domain_items = domain.split('.')
        domain_part = '.'.join(domain_items[:-1])

        minhash = MinHash(similarity.DEFAULT_PERMUTATIONS)
        for char in domain_part:
            minhash.update(char.encode('utf8'))

        return minhash
예제 #14
0
    def test_remove(self):
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        lsh.insert("a", m1)
        lsh.insert("b", m2)

        lsh.remove("a")
        self.assertTrue("a" not in lsh.keys)
        for table in lsh.hashtables:
            for H in table:
                self.assertGreater(len(table[H]), 0)
                self.assertTrue("a" not in table[H])

        self.assertRaises(ValueError, lsh.remove, "c")
예제 #15
0
def _run_minhash(A, B, data, seed, p):
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    m1 = MinHash(num_perm=2**p)
    m2 = MinHash(num_perm=2**p)
    for i in xrange(a_start, a_end):
        m1.digest(Hash(hasher(data[i], seed=seed)))
    for i in xrange(b_start, b_end):
        m2.digest(Hash(hasher(data[i], seed=seed)))
    return _minhash_inclusion(m1, m2)
예제 #16
0
def eg1():
    m1 = MinHash()
    m2 = MinHash()
    for d in data1:
        m1.update(d.encode('utf8'))
    for d in data2:
        m2.update(d.encode('utf8'))
    print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))

    s1 = set(data1)
    s2 = set(data2)
    actual_jaccard = float(len(s1.intersection(s2))) /\
            float(len(s1.union(s2)))
    print("Actual Jaccard for data1 and data2 is", actual_jaccard)
예제 #17
0
 def __init__(self, malware, num_pcap, filename=None):
     """
     :param filename: reads file dumped with dump_data (default: None)
     """
     self.num_pcap = num_pcap
     self.malware = malware
     self.session_collection = {}
     self.malware_feature_list = list()
     if filename is not None:
         dump_file = open(filename, 'r')
         data = dump_file.read()
         data = json.loads(data)
         for key in data:
             hash_val = np.asarray(data[key][5], dtype=np.uint64)
             data[key][_MINHASH_INDEX] = MinHash()
             data[key][_MINHASH_INDEX] = hash_val
예제 #18
0
    def test_insert(self):
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        lsh.insert("a", m1)
        lsh.insert("b", m2)
        for t in lsh.hashtables:
            self.assertTrue(len(t) >= 1)
            items = []
            for H in t:
                items.extend(t[H])
            self.assertTrue("a" in items)
            self.assertTrue("b" in items)
        self.assertTrue("a" in lsh)
        self.assertTrue("b" in lsh)
        for i, H in enumerate(lsh.keys["a"]):
            self.assertTrue("a" in lsh.hashtables[i][H])

        m3 = MinHash(18)
        self.assertRaises(ValueError, lsh.insert, "c", m3)
예제 #19
0
 def test_pickle(self):
     lsh = MinHashLSH(threshold=0.5, num_perm=16)
     m1 = MinHash(16)
     m1.update("a".encode("utf8"))
     m2 = MinHash(16)
     m2.update("b".encode("utf8"))
     lsh.insert("a", m1)
     lsh.insert("b", m2)
     lsh2 = pickle.loads(pickle.dumps(lsh))
     result = lsh.query(m1)
     self.assertTrue("a" in result)
     result = lsh.query(m2)
     self.assertTrue("b" in result)
예제 #20
0
    async def test_remove_mongo(self):
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5,
                                   num_perm=16) as lsh:
            m1 = MinHash(16)
            m1.update("a".encode("utf8"))
            m2 = MinHash(16)
            m2.update("b".encode("utf8"))
            await lsh.insert("a", m1)
            await lsh.insert("b", m2)

            await lsh.remove("a")
            self.assertTrue(not await lsh.has_key("a"))
            for table in lsh.hashtables:
                for H in await table.keys():
                    self.assertGreater(len(await table.get(H)), 0)
                    self.assertTrue("a" not in await table.get(H))

            with self.assertRaises(ValueError):
                await lsh.remove("c")
예제 #21
0
    def test_query_redis(self):
        with patch('redis.Redis', fake_redis) as mock_redis:
            lsh = MinHashLSH(threshold=0.5,
                             num_perm=16,
                             storage_config={
                                 'type': 'redis',
                                 'redis': {
                                     'host': 'localhost',
                                     'port': 6379
                                 }
                             })
            m1 = MinHash(16)
            m1.update("a".encode("utf8"))
            m2 = MinHash(16)
            m2.update("b".encode("utf8"))
            lsh.insert("a", m1)
            lsh.insert("b", m2)
            result = lsh.query(m1)
            self.assertTrue("a" in result)
            result = lsh.query(m2)
            self.assertTrue("b" in result)

            m3 = MinHash(18)
            self.assertRaises(ValueError, lsh.query, m3)
예제 #22
0
 def test_add_index(self):
     forest = MinHashLSHForest()
     m1 = MinHash()
     m1.update("a".encode("utf8"))
     m2 = MinHash()
     m2.update("b".encode("utf8"))
     forest.add("a", m1)
     forest.add("b", m2)
     self.assertTrue(forest.is_empty())
     for t in forest.hashtables:
         self.assertTrue(len(t) >= 1)
         items = []
         for H in t:
             items.extend(t[H])
         self.assertTrue("a" in items)
         self.assertTrue("b" in items)
     self.assertTrue("a" in forest)
     self.assertTrue("b" in forest)
     for i, H in enumerate(forest.keys["a"]):
         self.assertTrue("a" in forest.hashtables[i][H])
     m3 = MinHash(18)
     self.assertRaises(ValueError, forest.add, "c", m3)
     forest.index()
     self.assertFalse(forest.is_empty())
예제 #23
0
 def test_insertion_session(self):
     lsh = MinHashLSH(threshold=0.5, num_perm=16)
     m1 = MinHash(16)
     m1.update("a".encode("utf8"))
     m2 = MinHash(16)
     m2.update("b".encode("utf8"))
     data = [("a", m1), ("b", m2)]
     with lsh.insertion_session() as session:
         for key, minhash in data:
             session.insert(key, minhash)
     for t in lsh.hashtables:
         self.assertTrue(len(t) >= 1)
         items = []
         for H in t:
             items.extend(t[H])
         self.assertTrue("a" in items)
         self.assertTrue("b" in items)
     self.assertTrue("a" in lsh)
     self.assertTrue("b" in lsh)
     for i, H in enumerate(lsh.keys["a"]):
         self.assertTrue("a" in lsh.hashtables[i][H])
예제 #24
0
    def test_query(self):
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        lsh.insert("a", m1)
        lsh.insert("b", m2)
        result = lsh.query(m1)
        self.assertTrue("a" in result)
        result = lsh.query(m2)
        self.assertTrue("b" in result)

        m3 = MinHash(18)
        self.assertRaises(ValueError, lsh.query, m3)
예제 #25
0
    async def test_remove_session_mongo(self):
        def chunk(it, size):
            it = iter(it)
            return iter(lambda: tuple(islice(it, size)), ())

        _chunked_str = chunk((random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
        seq = frozenset(chain((''.join(s) for s in _chunked_str),
                              ('aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg', 'kkd', 'yow', 'ppi', 'eer')))
        objs = [MinHash(16) for _ in range(len(seq))]
        for e, obj in zip(seq, objs):
            for i in e:
                obj.update(i.encode('utf-8'))

        data = [(e, m) for e, m in zip(seq, objs)]
        keys_to_remove = ('aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg', 'kkd', 'yow', 'ppi', 'eer')
        keys_left = frozenset(seq) - frozenset(keys_to_remove)

        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5, num_perm=16) as lsh:
            async with lsh.insertion_session(batch_size=1000) as session:
                fs = (session.insert(key, minhash, check_duplication=False) for key, minhash in data)
                await asyncio.gather(*fs)

            async with lsh.delete_session(batch_size=3) as session:
                fs = (session.remove(key) for key in keys_to_remove)
                await asyncio.gather(*fs)

            for t in lsh.hashtables:
                self.assertTrue(await t.size() >= 1)
                items = []
                for H in await t.keys():
                    items.extend(await t.get(H))
                for key in keys_to_remove:
                    self.assertTrue(key not in items, '{0} in items, but should not be'.format(key))
                for key in keys_left:
                    self.assertTrue(key in items, '{0} not in items, but should be'.format(key))

            for key in keys_to_remove:
                self.assertTrue(not (await lsh.has_key(key)), '<{0}> key should not be in LSH index'.format(key))
            for key in keys_left:
                self.assertTrue(await lsh.has_key(key), '<{0}> key should be in LSH index'.format(key))
예제 #26
0
    def test_remove(self):
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        lsh.insert("a", m1)
        lsh.insert("b", m2)
        
        lsh.remove("a")
        self.assertTrue("a" not in lsh.keys)
        for table in lsh.hashtables:
            for H in table:
                self.assertGreater(len(table[H]), 0)
                self.assertTrue("a" not in table[H])

        self.assertRaises(ValueError, lsh.remove, "c")
예제 #27
0
 async def test_insertion_session_redis(self):
     async with AsyncMinHashLSH(storage_config=self._storage_config_redis,
                                threshold=0.5,
                                num_perm=16) as lsh:
         m1 = MinHash(16)
         m1.update("a".encode("utf8"))
         m2 = MinHash(16)
         m2.update("b".encode("utf8"))
         data = [("a", m1), ("b", m2)]
         async with lsh.insertion_session() as session:
             for key, minhash in data:
                 await session.insert(key, minhash)
         for t in lsh.hashtables:
             self.assertTrue(await t.size() >= 1)
             items = []
             for H in await t.keys():
                 items.extend(await t.get(H))
             self.assertTrue(pickle.dumps("a") in items)
             self.assertTrue(pickle.dumps("b") in items)
         self.assertTrue(await lsh.has_key("a"))
         self.assertTrue(await lsh.has_key("b"))
         for i, H in enumerate(await lsh.keys.get(pickle.dumps("a"))):
             res = await lsh.hashtables[i].get(H)
             self.assertTrue(pickle.dumps("a") in res)
예제 #28
0
    async def test_insertion_session_mongo(self):
        def chunk(it, size):
            it = iter(it)
            return iter(lambda: tuple(islice(it, size)), ())

        _chunked_str = chunk(
            (random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
        seq = frozenset(
            chain((''.join(s) for s in _chunked_str),
                  ('aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg', 'kkd', 'yow',
                   'ppi', 'eer')))
        objs = [MinHash(16) for _ in range(len(seq))]
        for e, obj in zip(seq, objs):
            for i in e:
                obj.update(i.encode('utf-8'))

        data = [(e, m) for e, m in zip(seq, objs)]

        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5,
                                   num_perm=16) as lsh:
            async with lsh.insertion_session(batch_size=1000) as session:
                fs = (session.insert(key, minhash, check_duplication=False)
                      for key, minhash in data)
                await asyncio.gather(*fs)

            for t in lsh.hashtables:
                self.assertTrue(await t.size() >= 1)
                items = []
                for H in await t.keys():
                    items.extend(await t.get(H))
                self.assertTrue('aahhb' in items)
                self.assertTrue('kld' in items)
            self.assertTrue(await lsh.has_key('aahhb'))
            self.assertTrue(await lsh.has_key('kld'))
            for i, H in enumerate(await lsh.keys.get('aahh')):
                self.assertTrue('aahh' in await lsh.hashtables[i].get(H))
예제 #29
0
    def test_insert(self):
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        lsh.insert("a", m1)
        lsh.insert("b", m2)
        for t in lsh.hashtables:
            self.assertTrue(len(t) >= 1)
            items = []
            for H in t:
                items.extend(t[H])
            self.assertTrue("a" in items)
            self.assertTrue("b" in items)

        m3 = MinHash(18)
        self.assertRaises(ValueError, lsh.insert, "c", m3)
예제 #30
0
    async def test_remove_mongo(self):
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5,
                                   num_perm=16) as lsh:
            m1 = MinHash(16)
            m1.update("a".encode("utf8"))
            m2 = MinHash(16)
            m2.update("b".encode("utf8"))
            m3 = MinHash(16)
            m3.update("a".encode("utf8"))
            await lsh.insert("a", m1)
            await lsh.insert("b", m2)
            await lsh.insert("a1", m3)

            await lsh.remove("a")
            self.assertTrue(not await lsh.has_key("a"))
            self.assertTrue(await lsh.has_key('a1'))
            hashtable_correct = False
            for table in lsh.hashtables:
                for H in await table.keys():
                    table_vals = await table.get(H)
                    self.assertGreater(len(table_vals), 0)
                    self.assertTrue("a" not in table_vals)
                    if 'a1' in table_vals:
                        hashtable_correct = True
            self.assertTrue(hashtable_correct, 'Hashtable broken')

            with self.assertRaises(ValueError):
                await lsh.remove("c")
예제 #31
0
    async def test_query_mongo(self):
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5,
                                   num_perm=16) as lsh:
            m1 = MinHash(16)
            m1.update("a".encode("utf8"))
            m2 = MinHash(16)
            m2.update("b".encode("utf8"))
            m3 = MinHash(16)
            m3.update("b".encode("utf8"))
            fs = (lsh.insert("a", m1, check_duplication=False),
                  lsh.insert("b", m2, check_duplication=False),
                  lsh.insert("b", m3, check_duplication=False))
            await asyncio.gather(*fs)
            result = await lsh.query(m1)
            self.assertTrue("a" in result)
            result = await lsh.query(m2)
            self.assertTrue("b" in result)

            m3 = MinHash(18)
            with self.assertRaises(ValueError):
                await lsh.query(m3)
예제 #32
0
    result = lsh.query(m1)
    print("Approximate neighbours with weighted Jaccard similarity > 0.1", result)

if __name__ == "__main__":
    eg1()
    eg2()

>>> 
>>> from datasketch import MinHash

data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
        'estimating', 'the', 'similarity', 'between', 'datasets']
data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
        'estimating', 'the', 'similarity', 'between', 'documents']

m1, m2 = MinHash(), MinHash()
for d in data1:
    m1.update(d.encode('utf8'))
for d in data2:
    m2.update(d.encode('utf8'))
print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))

s1 = set(data1)
s2 = set(data2)
actual_jaccard = float(len(s1.intersection(s2)))/float(len(s1.union(s2)))
print("Actual Jaccard for data1 and data2 is", actual_jaccard)
>>> 
>>> m = MinHash(num_perm=256)
>>> m.count()
0.0
>>> from sparselsh import LSH
예제 #33
0
from datasketch.lsh import MinHashLSH
from preprocess import tokenize_sentence
"""
To find similar questions in O(1) we are using jaccard similarity and minHash. 
Question with similar minHash are candidates to be similar. 
To compare if two candidate senteces are similar we are using jaccard similarity  
"""

df = pd.read_csv("proccessed.csv")
total_questions = df.shape[0]
threshold_jacard = 0.30
lsh = MinHashLSH(threshold=threshold_jacard)

#calculate minhash for each sentence in column question1
for index, row in df.iterrows():
    min_Hash = MinHash()
    question = tokenize_sentence(str(row['question1']))
    for word in question:
        min_Hash.update(word.encode('utf8'))
    lsh.insert(str(index), min_Hash)

total = 0
return_result = 0
correct = 0
total_correct = 0
#for each sentense in column question2 find similar questions
for i in range(0, total_questions):
    question_minHash = MinHash()
    question = tokenize_sentence(str(df['question2'][i]))
    for word in question:
        question_minHash.update(word.encode('utf8'))
예제 #34
0
def _run_minhash(data, seed, p):
    hasher = pyhash.murmur3_32()
    m = MinHash(num_perm=2**p, hashobj=Hash)
    for d in data:
        m.update(hasher(d, seed=seed))
    return m.count()
예제 #35
0
" they were al anr oo the bark of the bark of the boos of the boos of the boos of the boos",
" afd the nererland thet thet whre hev back on the barkn of the bors.",
" and they were al anr oo the bark of the bark of the boos of the boos of the boos of the boos afd the nererland thet",
" thet whre hev back on the barkn of the bors. and they were al anr oo the bark of the bark of",
" the boos of the boos of the boos of the boos afd the nererland thet thet whre hev back on the ",
" barkn of the bors. and they were al anr oo the bark of the bark of the boos of the boos of the boos of the",
" boos afd the nererland thet thet whre hev back on the barkn of the bors. and they were al anr oo the bark of the bark ",
" of the boos of the boos of the boos of the boos afd the nererland thet thet whre hev back on the barkn of the bors."
]

array=[0]*len(seq)
for i in range (len (dataX)):   
   for j  in range(len(seq)):
        data1=dataX[i].split()
        data2=seq[j].split()
        m1 = MinHash()
        m2 = MinHash()
        for d in data1:
            m1.update(d.encode('utf8'))
        for d in data2:
            m2.update(d.encode('utf8'))
        s1 = set(data1)
        s2 = set(data2)
        actual_jaccard = float(len(s1.intersection(s2))) /\
            float(len(s1.union(s2)))
        if array[j]<actual_jaccard:
            array[j]=actual_jaccard
            print array[j]


print array 
예제 #36
0
def eg1():
    m1 = MinHash()
    m2 = MinHash()
    m3 = MinHash()
    for d in data1:
        m1.update(d.encode('utf8'))
    for d in data2:
        m2.update(d.encode('utf8'))
    for d in data3:
        m3.update(d.encode('utf8'))

    # Create LSH index
    lsh = MinHashLSH(threshold=0.5)
    lsh.insert("m2", m2)
    lsh.insert("m3", m3)
    result = lsh.query(m1)
    print("Approximate neighbours with Jaccard similarity > 0.5", result)
예제 #37
0
def mh(dd):
    m = MinHash()
    for d in dd:
        m.update(d.encode('utf8'))
    return m
예제 #38
0
def _run_minhash(data, seed, p):
    hasher = pyhash.murmur3_32()
    m = MinHash(num_perm=2**p)
    for d in data:
        m.digest(Hash(hasher(d, seed=seed)))
    return m.count()
예제 #39
0
 def min_hash_text(self, sm_text):
     m = MinHash()
     for d in sm_text:
         m.update(d.encode('utf8'))
     return m