예제 #1
0
    async def test_insert_redis(self):
        """Check that inserted keys show up in every hash table of the index.

        The index is opened with ``self._storage_config_redis``. Stored items
        and key lookups are compared against ``pickle.dumps(key)``, so this
        test expects the storage to hold keys in pickled form. Inserting a
        MinHash built with 18 permutations into the 16-permutation index must
        raise ``ValueError``.
        """
        async with AsyncMinHashLSH(storage_config=self._storage_config_redis,
                                   threshold=0.5,
                                   num_perm=16) as lsh:
            mh_a = MinHash(16)
            mh_a.update("a".encode("utf8"))
            mh_b = MinHash(16)
            mh_b.update("b".encode("utf8"))
            for key, sketch in (("a", mh_a), ("b", mh_b)):
                await lsh.insert(key, sketch)

            pickled_a = pickle.dumps("a")
            pickled_b = pickle.dumps("b")

            # Every hash table must be non-empty and contain both keys.
            for table in lsh.hashtables:
                self.assertTrue(await table.size() >= 1)
                stored = []
                for band_key in await table.keys():
                    stored.extend(await table.get(band_key))
                self.assertTrue(pickled_a in stored)
                self.assertTrue(pickled_b in stored)

            self.assertTrue(await lsh.has_key("a"))
            self.assertTrue(await lsh.has_key("b"))

            # The i-th value recorded for "a" must lead back to "a" in the
            # i-th hash table.
            band_keys = await lsh.keys.get(pickled_a)
            for index, band_key in enumerate(band_keys):
                bucket = await lsh.hashtables[index].get(band_key)
                self.assertTrue(pickled_a in bucket)

            mismatched = MinHash(18)
            with self.assertRaises(ValueError):
                await lsh.insert("c", mismatched)
예제 #2
0
    async def test_insert_mongo(self):
        """Insert ten short strings into the async index and verify the tables.

        The index is opened with ``self._storage_config_mongo``. Each string
        is sketched character by character into a 16-permutation MinHash.
        Stored items are compared against the plain string keys. Inserting a
        MinHash built with 18 permutations must raise ``ValueError``.
        """
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5,
                                   num_perm=16) as lsh:
            words = [
                'aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg', 'kkd', 'yow',
                'ppi', 'eer'
            ]
            data = []
            for word in words:
                sketch = MinHash(16)
                for char in word:
                    sketch.update(char.encode('utf-8'))
                data.append((word, sketch))

            for key, minhash in data:
                await lsh.insert(key, minhash)

            # Every hash table must be non-empty and contain the sample keys.
            for table in lsh.hashtables:
                self.assertTrue(await table.size() >= 1)
                stored = []
                for band_key in await table.keys():
                    stored.extend(await table.get(band_key))
                self.assertTrue('aahh' in stored)
                self.assertTrue('bhg' in stored)

            self.assertTrue(await lsh.has_key('aahh'))
            self.assertTrue(await lsh.has_key('bhg'))

            # The i-th value recorded for 'aahhb' must lead back to it in the
            # i-th hash table.
            band_keys = await lsh.keys.get('aahhb')
            for index, band_key in enumerate(band_keys):
                bucket = await lsh.hashtables[index].get(band_key)
                self.assertTrue('aahhb' in bucket)

            mismatched = MinHash(18)
            with self.assertRaises(ValueError):
                await lsh.insert("c", mismatched)
예제 #3
0
    async def test_remove_mongo(self):
        """Remove one key and check that a key with an identical sketch stays.

        "a" and "a1" are sketched from the same input, so they are inserted
        with equal MinHashes. After removing "a", no bucket may be empty or
        still list "a", and "a1" must remain reachable in at least one
        bucket. Removing the never-inserted key "c" must raise
        ``ValueError``.
        """
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5,
                                   num_perm=16) as lsh:
            mh_a = MinHash(16)
            mh_a.update("a".encode("utf8"))
            mh_b = MinHash(16)
            mh_b.update("b".encode("utf8"))
            mh_a_twin = MinHash(16)
            mh_a_twin.update("a".encode("utf8"))
            for key, sketch in (("a", mh_a), ("b", mh_b), ("a1", mh_a_twin)):
                await lsh.insert(key, sketch)

            await lsh.remove("a")
            self.assertTrue(not await lsh.has_key("a"))
            self.assertTrue(await lsh.has_key('a1'))

            found_twin = False
            for table in lsh.hashtables:
                for band_key in await table.keys():
                    bucket = await table.get(band_key)
                    self.assertGreater(len(bucket), 0)
                    self.assertTrue("a" not in bucket)
                    if 'a1' in bucket:
                        found_twin = True
            self.assertTrue(found_twin, 'Hashtable broken')

            with self.assertRaises(ValueError):
                await lsh.remove("c")
예제 #4
0
    def test_insert_redis(self):
        """Insert two keys into a Redis-configured LSH and inspect its storage.

        ``redis.Redis`` is patched with ``fake_redis`` for the duration of the
        test. Stored items and key lookups are compared against
        ``pickle.dumps(key)``, so this test expects the storage to hold keys
        in pickled form. Inserting a MinHash built with 18 permutations into
        the 16-permutation index must raise ``ValueError``.
        """
        # The patch object itself is never used, so it is not bound to a name.
        with patch('redis.Redis', fake_redis):
            lsh = MinHashLSH(threshold=0.5,
                             num_perm=16,
                             storage_config={
                                 'type': 'redis',
                                 'redis': {
                                     'host': 'localhost',
                                     'port': 6379
                                 }
                             })
            m1 = MinHash(16)
            m1.update("a".encode("utf8"))
            m2 = MinHash(16)
            m2.update("b".encode("utf8"))
            lsh.insert("a", m1)
            lsh.insert("b", m2)

            pickled_a = pickle.dumps("a")
            pickled_b = pickle.dumps("b")

            # Every hash table must be non-empty and contain both keys.
            for table in lsh.hashtables:
                self.assertGreaterEqual(len(table), 1)
                items = []
                for band_key in table:
                    items.extend(table[band_key])
                self.assertIn(pickled_a, items)
                self.assertIn(pickled_b, items)
            self.assertIn("a", lsh)
            self.assertIn("b", lsh)

            # The i-th value recorded for "a" must lead back to "a" in the
            # i-th hash table.
            for i, band_key in enumerate(lsh.keys[pickled_a]):
                self.assertIn(pickled_a, lsh.hashtables[i][band_key])

            # Built outside the assertRaises block so that only insert() is
            # allowed to raise.
            m3 = MinHash(18)
            with self.assertRaises(ValueError):
                lsh.insert("c", m3)
예제 #5
0
def mh2(data1, data2):
    """Return the MinHash Jaccard estimate between two iterables of strings.

    Each element of ``data1`` / ``data2`` is UTF-8 encoded and fed into its
    own default-constructed ``MinHash``; the result is
    ``first.jaccard(second)``.
    """
    sketches = []
    for items in (data1, data2):
        sketch = MinHash()
        for item in items:
            sketch.update(item.encode('utf8'))
        sketches.append(sketch)
    first, second = sketches
    return first.jaccard(second)
예제 #6
0
def _run_minhash(A, B, data, seed, num_perm, b):
    """Sketch two index ranges of ``data`` and compare the sketches.

    ``A`` and ``B`` are ``(start, end)`` pairs; ``data[i]`` for each ``i`` in
    the half-open range is hashed with ``pyhash.murmur3_32`` (using ``seed``)
    before being fed to a ``MinHash`` with ``num_perm`` permutations.

    Returns a two-element list: ``m1.jaccard(m2)`` followed by the result of
    ``_b_bit_minhash_jaccard(m1, m2, b)``.
    """
    # Unpack first so malformed ranges fail before any work is done.
    (lo_a, hi_a), (lo_b, hi_b) = A, B
    hasher = pyhash.murmur3_32()
    sketches = []
    for lo, hi in ((lo_a, hi_a), (lo_b, hi_b)):
        sketch = MinHash(num_perm=num_perm, hashobj=Hash)
        for idx in xrange(lo, hi):
            sketch.update(hasher(data[idx], seed=seed))
        sketches.append(sketch)
    first, second = sketches
    return [first.jaccard(second), _b_bit_minhash_jaccard(first, second, b)]
예제 #7
0
 def test_get_counts(self):
     """Check ``get_counts`` after inserting two keys.

     The result must have one entry per band (``lsh.b``), and the values of
     each entry must sum to 2, the number of inserted keys.
     """
     lsh = MinHashLSH(threshold=0.5, num_perm=16)
     for key in ("a", "b"):
         sketch = MinHash(16)
         sketch.update(key.encode("utf8"))
         lsh.insert(key, sketch)

     counts = lsh.get_counts()
     self.assertEqual(len(counts), lsh.b)
     for bucket_sizes in counts:
         total = sum(bucket_sizes.values())
         self.assertEqual(total, 2)
예제 #8
0
 def test_pickle(self):
     """Check that a pickled and restored ``MinHashLSH`` still answers queries.

     Two keys are inserted, the index is round-tripped through
     ``pickle.dumps`` / ``pickle.loads``, and each sketch is queried against
     the restored copy.
     """
     lsh = MinHashLSH(threshold=0.5, num_perm=16)
     m1 = MinHash(16)
     m1.update("a".encode("utf8"))
     m2 = MinHash(16)
     m2.update("b".encode("utf8"))
     lsh.insert("a", m1)
     lsh.insert("b", m2)
     lsh2 = pickle.loads(pickle.dumps(lsh))
     # Query the restored copy: querying the original index would pass even
     # if pickling lost all data.
     result = lsh2.query(m1)
     self.assertIn("a", result)
     result = lsh2.query(m2)
     self.assertIn("b", result)
예제 #9
0
    def test_query(self):
        """Query the forest built by ``self._setup`` with an "a"/"b"/"c" sketch.

        A top-3 query must return the keys "a", "b" and "c". Querying with a
        MinHash constructed as ``MinHash(18)`` must raise ``ValueError``.
        """
        probe = MinHash()
        for letter in ("a", "b", "c"):
            probe.update(letter.encode("utf8"))

        forest = self._setup()
        result = forest.query(probe, 3)
        for expected in ("a", "b", "c"):
            self.assertTrue(expected in result)

        mismatched = MinHash(18)
        with self.assertRaises(ValueError):
            forest.query(mismatched, 1)
예제 #10
0
 async def test_get_counts_mongo(self):
     """Check ``get_counts`` on the async index after inserting two keys.

     The index is opened with ``self._storage_config_mongo``. The result must
     have one entry per band (``lsh.b``), and the values of each entry must
     sum to 2, the number of inserted keys.
     """
     async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                threshold=0.5, num_perm=16) as lsh:
         mh_a = MinHash(16)
         mh_a.update("a".encode("utf8"))
         mh_b = MinHash(16)
         mh_b.update("b".encode("utf8"))
         for key, sketch in (("a", mh_a), ("b", mh_b)):
             await lsh.insert(key, sketch)

         counts = await lsh.get_counts()
         self.assertEqual(len(counts), lsh.b)
         for bucket_sizes in counts:
             total = sum(bucket_sizes.values())
             self.assertEqual(total, 2)
예제 #11
0
 def test_pickle(self):
     """Check that a pickled and restored forest still answers queries.

     Two keys are added and indexed, the forest is round-tripped through
     ``pickle``, and a top-1 query for each sketch against the restored copy
     must return its own key.
     """
     forest = MinHashLSHForest()
     mh_a = MinHash()
     mh_a.update("a".encode("utf8"))
     mh_b = MinHash()
     mh_b.update("b".encode("utf8"))
     labelled = (("a", mh_a), ("b", mh_b))
     for key, sketch in labelled:
         forest.add(key, sketch)
     forest.index()

     restored = pickle.loads(pickle.dumps(forest))
     for key, sketch in labelled:
         result = restored.query(sketch, 1)
         self.assertTrue(key in result)
예제 #12
0
def eg1():
    """Print the estimated and the exact Jaccard similarity of two datasets.

    Reads the module-level ``data1`` and ``data2``. The estimate comes from
    two default ``MinHash`` sketches; the exact value is
    ``|intersection| / |union|`` of the corresponding sets.
    """
    sketch1 = MinHash()
    sketch2 = MinHash()
    for sketch, items in ((sketch1, data1), (sketch2, data2)):
        for item in items:
            sketch.update(item.encode('utf8'))
    print("Estimated Jaccard for data1 and data2 is", sketch1.jaccard(sketch2))

    set1 = set(data1)
    set2 = set(data2)
    shared = set1.intersection(set2)
    combined = set1.union(set2)
    actual_jaccard = float(len(shared)) / float(len(combined))
    print("Actual Jaccard for data1 and data2 is", actual_jaccard)
예제 #13
0
    def test_query(self):
        """Check that each inserted sketch retrieves its own key.

        Querying with a MinHash built with 18 permutations against the
        16-permutation index must raise ``ValueError``.
        """
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        mh_a = MinHash(16)
        mh_a.update("a".encode("utf8"))
        mh_b = MinHash(16)
        mh_b.update("b".encode("utf8"))
        labelled = (("a", mh_a), ("b", mh_b))
        for key, sketch in labelled:
            lsh.insert(key, sketch)

        for key, sketch in labelled:
            result = lsh.query(sketch)
            self.assertTrue(key in result)

        mismatched = MinHash(18)
        with self.assertRaises(ValueError):
            lsh.query(mismatched)
예제 #14
0
 def _data(self, count):
     """Yield ``count`` random ``(key, minhash, size)`` triples.

     Sizes are drawn with ``np.random.randint(1, 100, count)``. The sketch
     for a given size is fed the decimal strings "0" .. str(size - 1), and
     the key is the position of the size in the drawn array.
     """
     sizes = np.random.randint(1, 100, count)
     for key, size in enumerate(sizes):
         sketch = MinHash()
         for number in range(size):
             sketch.update(str(number).encode("utf8"))
         yield (key, sketch, size)
예제 #15
0
    async def test_pickle_mongo(self):
        """Check that a pickled async LSH can be restored and queried.

        Two keys are inserted through an index opened with
        ``self._storage_config_mongo``; the index is pickled while its
        ``async with`` block is still active. The unpickled object is then
        entered as a new async context and queried with both sketches.
        """
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo, threshold=0.5, num_perm=16) as lsh:
            m1 = MinHash(16)
            m1.update("a".encode("utf8"))
            m2 = MinHash(16)
            m2.update("b".encode("utf8"))
            await lsh.insert("a", m1)
            await lsh.insert("b", m2)
            # Serialize before leaving the context that owns the index.
            pickled = pickle.dumps(lsh)

        # The restored object must itself be usable as an async context.
        async with pickle.loads(pickled) as lsh2:
            result = await lsh2.query(m1)
            self.assertTrue("a" in result)
            result = await lsh2.query(m2)
            self.assertTrue("b" in result)
            # NOTE(review): explicit close() inside the `async with` block;
            # presumably the context exit closes again -- confirm that
            # closing twice is intended/safe.
            await lsh2.close()
예제 #16
0
    async def test_insertion_session_mongo(self):
        """Bulk-insert random and fixed keys through an insertion session.

        The key set is 2500 random 4-letter lowercase chunks (deduplicated by
        the ``frozenset``) plus ten fixed strings. All inserts are issued
        through ``lsh.insertion_session(batch_size=1000)`` with
        ``check_duplication=False`` and awaited together. Afterwards a sample
        of the fixed keys must be present in every hash table and in the key
        index.
        """
        def chunk(it, size):
            # Yield successive ``size``-tuples until the source is exhausted.
            it = iter(it)
            while True:
                piece = tuple(islice(it, size))
                if not piece:
                    return
                yield piece

        random_letters = (random.choice(string.ascii_lowercase) for _ in range(10000))
        random_words = (''.join(letters) for letters in chunk(random_letters, 4))
        fixed_words = ('aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg', 'kkd', 'yow', 'ppi', 'eer')
        seq = frozenset(chain(random_words, fixed_words))

        data = []
        for word in seq:
            sketch = MinHash(16)
            for char in word:
                sketch.update(char.encode('utf-8'))
            data.append((word, sketch))

        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5, num_perm=16) as lsh:
            async with lsh.insertion_session(batch_size=1000) as session:
                pending = (session.insert(key, minhash, check_duplication=False)
                           for key, minhash in data)
                await asyncio.gather(*pending)

            for table in lsh.hashtables:
                self.assertTrue(await table.size() >= 1)
                stored = []
                for band_key in await table.keys():
                    stored.extend(await table.get(band_key))
                self.assertTrue('aahhb' in stored)
                self.assertTrue('kld' in stored)

            self.assertTrue(await lsh.has_key('aahhb'))
            self.assertTrue(await lsh.has_key('kld'))

            band_keys = await lsh.keys.get('aahh')
            for index, band_key in enumerate(band_keys):
                bucket = await lsh.hashtables[index].get(band_key)
                self.assertTrue('aahh' in bucket)
예제 #17
0
    def read_pcap(self, filename):
        """
        Reads a PCAP file using scapy's rdpcap, divides its IP packets into
        sessions and folds every session into ``self.session_collection``.

        A session key seen for the first time gets a fresh record
        ``[hash(protocol), entropy, port1, port2, MinHash()]``. Every session
        in the file, new or already known, is then passed through
        ``calculate_hash`` together with its packets, so a record can be
        updated when the same session appears in subsequent files.

        The session key is split on whitespace: field 0 is the protocol,
        field 1 the first endpoint and field 3 the second endpoint. An
        endpoint without a ``:port`` suffix gets port 0.

        :param filename: PCAP file path
        """
        packets = rdpcap(filename)[IP]
        sessions = packets.sessions()
        for key in sessions:
            session_packets = sessions[key]
            if key not in self.session_collection:
                parts = key.split()
                protocol = parts[0]
                src_fields = parts[1].split(':')
                dst_fields = parts[3].split(':')
                port1 = int(src_fields[1]) if len(src_fields) > 1 else 0
                # Guard on the destination's own fields. Checking the source
                # fields here raised IndexError, or silently produced port 0,
                # whenever only one endpoint carried a port.
                port2 = int(dst_fields[1]) if len(dst_fields) > 1 else 0
                entropy = entropy_domain_names(session_packets)
                # The IP addresses are deliberately not part of the record.
                self.session_collection[key] = [
                    hash(protocol),
                    entropy,
                    port1,
                    port2,
                    MinHash()
                ]
            self.session_collection[key] = calculate_hash(
                self.session_collection[key], session_packets)
예제 #18
0
def eg1():
    """Build an LSH index over two datasets and query it with a third.

    Reads the module-level ``data1``, ``data2`` and ``data3``. ``data2`` and
    ``data3`` are indexed under the keys "m2" and "m3" in a ``MinHashLSH``
    with threshold 0.5; the sketch of ``data1`` is used as the query and the
    result is printed.
    """
    query_sketch = MinHash()
    sketch2 = MinHash()
    sketch3 = MinHash()
    feeds = ((query_sketch, data1), (sketch2, data2), (sketch3, data3))
    for sketch, items in feeds:
        for item in items:
            sketch.update(item.encode('utf8'))

    # Create LSH index
    lsh = MinHashLSH(threshold=0.5)
    for key, sketch in (("m2", sketch2), ("m3", sketch3)):
        lsh.insert(key, sketch)
    result = lsh.query(query_sketch)
    print("Approximate neighbours with Jaccard similarity > 0.5", result)
예제 #19
0
파일: main.py 프로젝트: gosom/go-minhash
def main():
    """Time a 64-permutation MinHash comparison of two text files.

    Each line of ``plato1.txt`` and ``plato2.txt`` is treated as one token.
    The second sketch reuses the first sketch's permutations. The estimated
    Jaccard similarity and the elapsed time in milliseconds are printed.
    """
    with open('plato1.txt', 'r') as f:
        tokens1 = list(f)
    with open('plato2.txt', 'r') as f:
        tokens2 = list(f)

    start = time.time()
    m1 = MinHash(num_perm=64, seed=0)
    for token in tokens1:
        m1.update(token.encode('utf8'))

    m2 = MinHash(num_perm=64, seed=0, permutations=m1.permutations)
    for token in tokens2:
        m2.update(token.encode('utf8'))
    similarity = m2.jaccard(m1)
    elapsed = time.time() - start
    # Apply the format string explicitly: print() does not interpolate
    # %-placeholders from its extra positional arguments.
    print("Similar %f and Took %f ms" % (similarity, elapsed * 1000))
예제 #20
0
    def test_remove(self):
        """Check that removing a key purges it from the key index and tables.

        After removing "a", no bucket may be empty or still list "a".
        Removing the never-inserted key "c" must raise ``ValueError``.
        """
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        mh_a = MinHash(16)
        mh_a.update("a".encode("utf8"))
        mh_b = MinHash(16)
        mh_b.update("b".encode("utf8"))
        for key, sketch in (("a", mh_a), ("b", mh_b)):
            lsh.insert(key, sketch)

        lsh.remove("a")
        self.assertTrue("a" not in lsh.keys)
        for table in lsh.hashtables:
            for band_key in table:
                self.assertGreater(len(table[band_key]), 0)
                self.assertTrue("a" not in table[band_key])

        with self.assertRaises(ValueError):
            lsh.remove("c")
예제 #21
0
def _run_acc(size, seed, num_perm):
    """Build a MinHash and the matching exact set from seeded random values.

    ``size`` integers are drawn with ``random.randint(1, size)`` after
    seeding ``random`` with ``seed``; each is converted with ``int_bytes``
    and added to both the sketch and the set.

    Returns the tuple ``(minhash, set_of_values)``.
    """
    sketch = MinHash(num_perm=num_perm)
    seen = set()
    random.seed(seed)
    for _ in range(size):
        value = int_bytes(random.randint(1, size))
        sketch.update(value)
        seen.add(value)
    return (sketch, seen)
예제 #22
0
def run_perf(card, num_perm):
    """Time how long it takes to feed ``card`` values into a MinHash.

    The values are ``int_bytes(i)`` for ``i`` in ``range(card)``, fed to a
    ``MinHash`` with ``num_perm`` permutations.

    Returns the elapsed time in seconds.
    """
    m = MinHash(num_perm=num_perm)
    logging.info("MinHash using %d permutation functions", num_perm)
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended clock for measuring short durations.
    start = time.perf_counter()
    for i in range(card):
        m.update(int_bytes(i))
    duration = time.perf_counter() - start
    logging.info("Digested %d hashes in %.4f sec", card, duration)
    return duration
예제 #23
0
def _generate_minhash_list(data, shingle_length=2):
    """Return one MinHash per text in ``data``, in the same order.

    Each text is split with ``_extract_shingles(text, shingle_length)`` and
    every shingle is UTF-8 encoded and fed into that text's sketch.
    """
    def sketch_of(text):
        # Build the sketch for a single text.
        sketch = MinHash()
        for shingle in _extract_shingles(text, shingle_length):
            sketch.update(shingle.encode('utf8'))
        return sketch

    return [sketch_of(text) for text in data]
예제 #24
0
def prepare_domain(vals):
    """Return a MinHash sketch of the string values in ``vals``.

    The number of permutations and the text encoding are read from
    ``config.MINHASH_PARAMS`` (keys ``'num_permutations'`` and
    ``'encoding'``).
    """
    params = config.MINHASH_PARAMS
    num_perm = params['num_permutations']
    encoding = params['encoding']

    sketch = MinHash(num_perm=num_perm)
    for value in vals:
        sketch.update(value.encode(encoding))
    return sketch
예제 #25
0
def get_packet_seq_min_hash(packets, packet_range_begin, step):
    """Return the MinHash hash values of up to ``step`` consecutive payloads.

    Payloads are read with ``get_payload`` starting at index
    ``packet_range_begin``. The first ``IndexError`` ends the window early,
    and whatever was sketched so far is returned.
    """
    sketch = MinHash()
    window_end = packet_range_begin + step
    for index in range(packet_range_begin, window_end):
        try:
            payload = get_payload(packets[index])
        except IndexError:
            # Ran past the available packets: stop with a shorter window.
            break
        else:
            sketch.update(payload.encode('utf-8'))
    return sketch.hashvalues
예제 #26
0
    async def test_query_mongo(self):
        """Insert concurrently, then check that each sketch finds its key.

        The key "b" is inserted twice with equal sketches and
        ``check_duplication=False``; all three inserts are awaited together.
        Querying with a MinHash built with 18 permutations against the
        16-permutation index must raise ``ValueError``.
        """
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5, num_perm=16) as lsh:
            mh_a = MinHash(16)
            mh_a.update("a".encode("utf8"))
            mh_b = MinHash(16)
            mh_b.update("b".encode("utf8"))
            mh_b_again = MinHash(16)
            mh_b_again.update("b".encode("utf8"))

            pending = (
                lsh.insert("a", mh_a, check_duplication=False),
                lsh.insert("b", mh_b, check_duplication=False),
                lsh.insert("b", mh_b_again, check_duplication=False),
            )
            await asyncio.gather(*pending)

            for key, sketch in (("a", mh_a), ("b", mh_b)):
                result = await lsh.query(sketch)
                self.assertTrue(key in result)

            mismatched = MinHash(18)
            with self.assertRaises(ValueError):
                await lsh.query(mismatched)
예제 #27
0
    async def test_remove_mongo(self):
        """Check that removing a key purges it from the async index.

        After removing "a", ``has_key("a")`` must be false and no bucket may
        be empty or still list "a". Removing the never-inserted key "c" must
        raise ``ValueError``.
        """
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5,
                                   num_perm=16) as lsh:
            mh_a = MinHash(16)
            mh_a.update("a".encode("utf8"))
            mh_b = MinHash(16)
            mh_b.update("b".encode("utf8"))
            for key, sketch in (("a", mh_a), ("b", mh_b)):
                await lsh.insert(key, sketch)

            await lsh.remove("a")
            self.assertTrue(not await lsh.has_key("a"))
            for table in lsh.hashtables:
                for band_key in await table.keys():
                    # The bucket is fetched once per assertion.
                    self.assertGreater(len(await table.get(band_key)), 0)
                    self.assertTrue("a" not in await table.get(band_key))

            with self.assertRaises(ValueError):
                await lsh.remove("c")
예제 #28
0
def minHashing(splitedString):
    """Return the MinHash hash values of consecutive 5-character slices.

    The number of slices is ``int(round(len(splitedString) / 5))``, so the
    last slice may be shorter than 5 characters, or a short tail may be left
    out, depending on how the division rounds. The sketch uses the
    module-level ``minHashPermmutations`` as its permutation count.
    """
    shingle_len = 5
    sketch = MinHash(num_perm=minHashPermmutations)

    shingle_count = int(round(len(splitedString) / shingle_len))
    for start in range(0, shingle_count * shingle_len, shingle_len):
        piece = splitedString[start:start + shingle_len]
        sketch.update(piece.encode('utf8'))

    return sketch.hashvalues
예제 #29
0
    async def test_remove_session_mongo(self):
        """Bulk-insert keys, bulk-remove ten of them and verify both groups.

        The key set is random 4-letter lowercase chunks (deduplicated by the
        ``frozenset``) plus ten fixed strings. All keys are inserted through
        an insertion session; the ten fixed strings are then removed through
        a delete session. Afterwards the removed keys must be absent from
        every hash table and from the key index, while every remaining key
        must still be present in both.
        """
        def chunk(it, size):
            # Yield successive ``size``-tuples; the two-argument iter() stops
            # as soon as islice returns the empty tuple.
            it = iter(it)
            return iter(lambda: tuple(islice(it, size)), ())

        _chunked_str = chunk(
            (random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
        seq = frozenset(
            chain((''.join(s) for s in _chunked_str),
                  ('aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg', 'kkd', 'yow',
                   'ppi', 'eer')))
        # One 16-permutation sketch per key, fed character by character.
        objs = [MinHash(16) for _ in range(len(seq))]
        for e, obj in zip(seq, objs):
            for i in e:
                obj.update(i.encode('utf-8'))

        data = [(e, m) for e, m in zip(seq, objs)]
        keys_to_remove = ('aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg', 'kkd',
                          'yow', 'ppi', 'eer')
        keys_left = frozenset(seq) - frozenset(keys_to_remove)

        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5,
                                   num_perm=16) as lsh:
            # The insertion session is exited before the delete session
            # starts.
            async with lsh.insertion_session(batch_size=1000) as session:
                fs = (session.insert(key, minhash, check_duplication=False)
                      for key, minhash in data)
                await asyncio.gather(*fs)

            # batch_size=3 with ten removals does not divide evenly.
            async with lsh.delete_session(batch_size=3) as session:
                fs = (session.remove(key) for key in keys_to_remove)
                await asyncio.gather(*fs)

            # Every hash table: removed keys gone, remaining keys present.
            for t in lsh.hashtables:
                self.assertTrue(await t.size() >= 1)
                items = []
                for H in await t.keys():
                    items.extend(await t.get(H))
                for key in keys_to_remove:
                    self.assertTrue(
                        key not in items,
                        '{0} in items, but should not be'.format(key))
                for key in keys_left:
                    self.assertTrue(
                        key in items,
                        '{0} not in items, but should be'.format(key))

            # Key index: same expectations via has_key().
            for key in keys_to_remove:
                self.assertTrue(
                    not (await lsh.has_key(key)),
                    '<{0}> key should not be in LSH index'.format(key))
            for key in keys_left:
                self.assertTrue(await lsh.has_key(key),
                                '<{0}> key should be in LSH index'.format(key))
예제 #30
0
 def _setup(self):
     """Return an indexed forest built from sliding windows of the alphabet.

     For every start position that leaves room for a full 3-letter window,
     the window's letters are sketched and added under the window's first
     letter (keys "a" through "x"). ``index()`` is called before returning.
     """
     alphabet = "abcdefghijklmnopqrstuvwxyz"
     forest = MinHashLSHForest()
     for start in range(len(alphabet) - 2):
         window = alphabet[start:start + 3]
         sketch = MinHash()
         for letter in window:
             sketch.update(letter.encode("utf8"))
         forest.add(alphabet[start], sketch)
     forest.index()
     return forest