Exemplo n.º 1
0
    async def test_insert_redis(self):
        """Insert two keys into a Redis-backed async LSH and verify that
        every hash table, the key index, and has_key() all reflect them.
        """
        async with AsyncMinHashLSH(storage_config=self._storage_config_redis,
                                   threshold=0.5,
                                   num_perm=16) as lsh:
            m1 = MinHash(16)
            m1.update("a".encode("utf8"))
            m2 = MinHash(16)
            m2.update("b".encode("utf8"))
            await lsh.insert("a", m1)
            await lsh.insert("b", m2)
            # Both keys must appear (in pickled form) in every band's table.
            for t in lsh.hashtables:
                self.assertTrue(await t.size() >= 1)
                items = []
                for H in await t.keys():
                    items.extend(await t.get(H))
                self.assertTrue(pickle.dumps("a") in items)
                self.assertTrue(pickle.dumps("b") in items)
            self.assertTrue(await lsh.has_key("a"))
            self.assertTrue(await lsh.has_key("b"))
            # Every stored bucket key for "a" must resolve back to "a".
            for i, H in enumerate(await lsh.keys.get(pickle.dumps("a"))):
                res = await lsh.hashtables[i].get(H)
                self.assertTrue(pickle.dumps("a") in res)

            # A sketch with a mismatched permutation count is rejected.
            m3 = MinHash(18)
            with self.assertRaises(ValueError):
                await lsh.insert("c", m3)
Exemplo n.º 2
0
 def _data(self, count):
     """Yield `count` triples of (key, MinHash, set size), one per
     randomly sized synthetic set of integer tokens."""
     sizes = np.random.randint(1, 100, count)
     for key in range(count):
         size = sizes[key]
         sketch = MinHash()
         for j in range(size):
             sketch.update(str(j).encode("utf8"))
         yield (key, sketch, size)
Exemplo n.º 3
0
    def test_insert_redis(self):
        """Insert into a (mocked) Redis-backed LSH and verify every
        hash table, the key index, and membership checks."""
        with patch('redis.Redis', fake_redis) as mock_redis:
            storage = {
                'type': 'redis',
                'redis': {
                    'host': 'localhost',
                    'port': 6379
                }
            }
            lsh = MinHashLSH(threshold=0.5, num_perm=16,
                             storage_config=storage)
            sketches = {}
            for key in ("a", "b"):
                sketch = MinHash(16)
                sketch.update(key.encode("utf8"))
                sketches[key] = sketch
                lsh.insert(key, sketch)
            # Both keys must appear (pickled) in every band's table.
            for table in lsh.hashtables:
                self.assertTrue(len(table) >= 1)
                stored = []
                for bucket in table:
                    stored.extend(table[bucket])
                self.assertTrue(pickle.dumps("a") in stored)
                self.assertTrue(pickle.dumps("b") in stored)
            self.assertTrue("a" in lsh)
            self.assertTrue("b" in lsh)
            # The recorded bucket keys for "a" must resolve back to "a".
            for band, bucket in enumerate(lsh.keys[pickle.dumps("a")]):
                self.assertTrue(
                    pickle.dumps("a") in lsh.hashtables[band][bucket])

            # Mismatched permutation count must be rejected on insert.
            bad = MinHash(18)
            self.assertRaises(ValueError, lsh.insert, "c", bad)
Exemplo n.º 4
0
def mh2(data1, data2):
    """Estimate the Jaccard similarity of two token iterables via MinHash."""
    sketch_a, sketch_b = MinHash(), MinHash()
    for token in data1:
        sketch_a.update(token.encode('utf8'))
    for token in data2:
        sketch_b.update(token.encode('utf8'))
    return sketch_a.jaccard(sketch_b)
def _run_acc(size, seed, num_perm):
    """Build a MinHash sketch and the exact set over `size` seeded
    random byte values; return both for accuracy comparison."""
    sketch = MinHash(num_perm=num_perm)
    exact = set()
    random.seed(seed)
    for _ in range(size):
        value = int_bytes(random.randint(1, size))
        sketch.update(value)
        exact.add(value)
    return (sketch, exact)
Exemplo n.º 6
0
def _generate_minhash_list(data, shingle_length=2):
    """Return one MinHash per text in `data`, built from its shingles."""
    result = []
    for text in data:
        sketch = MinHash()
        for shingle in _extract_shingles(text, shingle_length):
            sketch.update(shingle.encode('utf8'))
        result.append(sketch)
    return result
Exemplo n.º 7
0
def prepare_domain(vals):
    """Encode every element of `vals` into a MinHash configured from
    the module-level MINHASH_PARAMS."""
    params = config.MINHASH_PARAMS
    encoding = params['encoding']
    sketch = MinHash(num_perm=params['num_permutations'])
    for element in vals:
        sketch.update(element.encode(encoding))
    return sketch
Exemplo n.º 8
0
def get_packet_seq_min_hash(packets, packet_range_begin, step):
    """MinHash the payloads of up to `step` packets starting at
    `packet_range_begin`; stops early when packets run out.

    Returns the sketch's raw hash values.
    """
    sketch = MinHash()
    index = packet_range_begin
    end = packet_range_begin + step
    while index < end:
        try:
            payload = get_payload(packets[index])
        except IndexError:
            # Fewer than `step` packets remained -- hash what we have.
            break
        sketch.update(payload.encode('utf-8'))
        index += 1
    return sketch.hashvalues
Exemplo n.º 9
0
def run_perf(card, num_perm):
    """Time how long it takes to digest `card` hashes into a MinHash.

    Args:
        card: number of items to hash.
        num_perm: number of MinHash permutation functions.

    Returns:
        Wall-clock duration of the update loop, in seconds.
    """
    m = MinHash(num_perm=num_perm)
    logging.info("MinHash using %d permutation functions" % num_perm)
    # time.clock() was deprecated in 3.3 and removed in Python 3.8;
    # perf_counter() is the documented replacement for benchmarking.
    start = time.perf_counter()
    for i in range(card):
        m.update(int_bytes(i))
    duration = time.perf_counter() - start
    logging.info("Digested %d hashes in %.4f sec" % (card, duration))
    return duration
Exemplo n.º 10
0
def _run_acc(size, seed, num_perm):
    """Return (MinHash sketch, exact set) over `size` seeded random
    byte values, for estimating sketch accuracy."""
    sketch = MinHash(num_perm=num_perm)
    exact = set()
    random.seed(seed)
    for _ in range(size):
        item = int_bytes(random.randint(1, size))
        sketch.update(item)
        exact.add(item)
    return (sketch, exact)
Exemplo n.º 11
0
def run_perf(card, num_perm):
    """Time how long it takes to digest `card` hashes into a MinHash.

    Args:
        card: number of items to hash.
        num_perm: number of MinHash permutation functions.

    Returns:
        Wall-clock duration of the update loop, in seconds.
    """
    m = MinHash(num_perm=num_perm)
    logging.info("MinHash using %d permutation functions" % num_perm)
    # time.clock() was deprecated in 3.3 and removed in Python 3.8;
    # perf_counter() is the documented replacement for benchmarking.
    start = time.perf_counter()
    for i in range(card):
        m.update(int_bytes(i))
    duration = time.perf_counter() - start
    logging.info("Digested %d hashes in %.4f sec" % (card, duration))
    return duration
Exemplo n.º 12
0
def _run_minhash(A, B, data, seed, num_perm, b):
    """Estimate the Jaccard similarity between two slices of `data`.

    Args:
        A, B: (start, end) index ranges into `data`.
        data: sequence of items accepted by the murmur3 hasher.
        seed: seed forwarded to the murmur3 hasher.
        num_perm: number of MinHash permutations.
        b: bit width for the b-bit MinHash estimate.

    Returns:
        [full MinHash Jaccard estimate, b-bit MinHash Jaccard estimate].
    """
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    m1 = MinHash(num_perm=num_perm, hashobj=Hash)
    m2 = MinHash(num_perm=num_perm, hashobj=Hash)
    # `range` replaces the Python-2-only `xrange`; iteration behaviour
    # is identical and the function now also runs under Python 3.
    for i in range(a_start, a_end):
        m1.update(hasher(data[i], seed=seed))
    for i in range(b_start, b_end):
        m2.update(hasher(data[i], seed=seed))
    return [m1.jaccard(m2), _b_bit_minhash_jaccard(m1, m2, b)]
Exemplo n.º 13
0
def minHashing(splitedString):
    """Hash consecutive fixed-length shingles of `splitedString` into a
    MinHash and return its raw hash values.

    Note: any trailing characters shorter than a full shingle are
    dropped, matching the rounding of the shingle count.
    """
    shingle_len = 5
    sketch = MinHash(num_perm=minHashPermmutations)

    n_shingles = int(round(len(splitedString) / shingle_len))
    for k in range(n_shingles):
        start = k * shingle_len
        chunk = splitedString[start:start + shingle_len]
        sketch.update(chunk.encode('utf8'))

    return sketch.hashvalues
Exemplo n.º 14
0
 def _setup(self):
     """Build an indexed LSH forest of 3-character sliding windows over
     the alphabet, keyed by each window's first character."""
     alphabet = "abcdefghijklmnopqrstuvwxyz"
     forest = MinHashLSHForest()
     for start in range(len(alphabet) - 2):
         sketch = MinHash()
         for ch in alphabet[start:start + 3]:
             sketch.update(ch.encode("utf8"))
         forest.add(alphabet[start], sketch)
     forest.index()
     return forest
Exemplo n.º 15
0
 def test_get_counts(self):
     """get_counts() yields one counter per band, each summing to the
     number of inserted keys."""
     lsh = MinHashLSH(threshold=0.5, num_perm=16)
     for key in ("a", "b"):
         sketch = MinHash(16)
         sketch.update(key.encode("utf8"))
         lsh.insert(key, sketch)
     counts = lsh.get_counts()
     self.assertEqual(len(counts), lsh.b)
     for table in counts:
         self.assertEqual(sum(table.values()), 2)
Exemplo n.º 16
0
 def test__H(self):
     """_H must produce byte keys of one consistent length for any given
     concatenated hash value size `l`."""
     for l in range(2, 128+1, 16):
         forest = MinHashLSHForest(num_perm=128, l=l)
         sketch = MinHash()
         for token in ("abcdefg", "1234567"):
             sketch.update(token.encode("utf8"))
         forest.add("m", sketch)
         lengths = [len(key) for table in forest.hashtables for key in table]
         self.assertTrue(all(length == lengths[0] for length in lengths))
Exemplo n.º 17
0
    def _minhash_from_text(self, text):
        """Calculate minhash of text.

        Args:
            text: String to calculate minhash of.

        Returns:
            A minhash (instance of datasketch.minhash.MinHash)
        """
        sketch = MinHash(self._config.num_perm)
        for shingle in self._shingles_from_text(text):
            sketch.update(shingle.encode('utf8'))
        return sketch
Exemplo n.º 18
0
 def test_pickle(self):
     """An LSH survives a pickle round-trip and the *unpickled* copy
     still answers queries for both inserted keys."""
     lsh = MinHashLSH(threshold=0.5, num_perm=16)
     m1 = MinHash(16)
     m1.update("a".encode("utf8"))
     m2 = MinHash(16)
     m2.update("b".encode("utf8"))
     lsh.insert("a", m1)
     lsh.insert("b", m2)
     lsh2 = pickle.loads(pickle.dumps(lsh))
     # Query the unpickled copy -- the original queried `lsh`, leaving
     # the round-tripped object completely untested.
     result = lsh2.query(m1)
     self.assertTrue("a" in result)
     result = lsh2.query(m2)
     self.assertTrue("b" in result)
Exemplo n.º 19
0
 async def test_get_counts_mongo(self):
     """get_counts() on a MongoDB-backed async LSH returns one counter
     per band, each summing to the number of inserted keys.
     """
     async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                threshold=0.5, num_perm=16) as lsh:
         m1 = MinHash(16)
         m1.update("a".encode("utf8"))
         m2 = MinHash(16)
         m2.update("b".encode("utf8"))
         await lsh.insert("a", m1)
         await lsh.insert("b", m2)
         counts = await lsh.get_counts()
         # One count table per band (`lsh.b`), covering both keys.
         self.assertEqual(len(counts), lsh.b)
         for table in counts:
             self.assertEqual(sum(table.values()), 2)
Exemplo n.º 20
0
    def test_query(self):
        """A top-3 forest query with a combined sketch returns all three
        indexed keys; a mismatched sketch size raises ValueError."""
        sketch = MinHash()
        for token in ("a", "b", "c"):
            sketch.update(token.encode("utf8"))
        forest = self._setup()
        result = forest.query(sketch, 3)
        for expected in ("a", "b", "c"):
            self.assertTrue(expected in result)

        # Sketch with a different permutation count must be rejected.
        bad = MinHash(18)
        self.assertRaises(ValueError, forest.query, bad, 1)
Exemplo n.º 21
0
 def test_pickle(self):
     """An LSH survives a pickle round-trip and the *unpickled* copy
     still answers queries for both inserted keys."""
     lsh = MinHashLSH(threshold=0.5, num_perm=16)
     m1 = MinHash(16)
     m1.update("a".encode("utf8"))
     m2 = MinHash(16)
     m2.update("b".encode("utf8"))
     lsh.insert("a", m1)
     lsh.insert("b", m2)
     lsh2 = pickle.loads(pickle.dumps(lsh))
     # Query the unpickled copy -- the original queried `lsh`, leaving
     # the round-tripped object completely untested.
     result = lsh2.query(m1)
     self.assertTrue("a" in result)
     result = lsh2.query(m2)
     self.assertTrue("b" in result)
Exemplo n.º 22
0
 def test_pickle(self):
     """An indexed LSH forest survives a pickle round-trip and the
     restored copy still answers top-1 queries."""
     forest = MinHashLSHForest()
     sketches = {}
     for key in ("a", "b"):
         sketch = MinHash()
         sketch.update(key.encode("utf8"))
         sketches[key] = sketch
         forest.add(key, sketch)
     forest.index()
     restored = pickle.loads(pickle.dumps(forest))
     for key in ("a", "b"):
         self.assertTrue(key in restored.query(sketches[key], 1))
Exemplo n.º 23
0
def eg1():
    """Print the MinHash-estimated and the exact Jaccard similarity of
    the module-level data1/data2 token lists."""
    m1, m2 = MinHash(), MinHash()
    for token in data1:
        m1.update(token.encode('utf8'))
    for token in data2:
        m2.update(token.encode('utf8'))
    print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))

    s1, s2 = set(data1), set(data2)
    actual_jaccard = float(len(s1 & s2)) / float(len(s1 | s2))
    print("Actual Jaccard for data1 and data2 is", actual_jaccard)
Exemplo n.º 24
0
def eg1():
    """Compare the sketch-estimated Jaccard of data1/data2 against the
    exact set-based value, printing both."""
    sketch_a, sketch_b = MinHash(), MinHash()
    for token in data1:
        sketch_a.update(token.encode('utf8'))
    for token in data2:
        sketch_b.update(token.encode('utf8'))
    print("Estimated Jaccard for data1 and data2 is", sketch_a.jaccard(sketch_b))

    s1, s2 = set(data1), set(data2)
    actual_jaccard = float(len(s1.intersection(s2))) / float(len(s1.union(s2)))
    print("Actual Jaccard for data1 and data2 is", actual_jaccard)
Exemplo n.º 25
0
def get_hash_values(atomic_table, vectorization_type="simple"):
    """Vectorize an atomic table and return the MinHash of the vector.

    Args:
        atomic_table: the table to vectorize and hash.
        vectorization_type: one of "simple", "lemmatize" or "categorize";
            any other value hashes an empty vector (original behaviour).

    Returns:
        The MinHash hash values of the vectorized table.
    """
    # Dispatch table keeps the vectorizer selection in one place.
    vectorizers = {
        "simple": vectorize_atomic_table,
        "lemmatize": lemmatize_atomic_table,
        "categorize": categorize_atomic_table,
    }
    vectorizer = vectorizers.get(vectorization_type)
    table_vector = vectorizer(atomic_table) if vectorizer else []
    _hash = MinHash()
    for item in table_vector:
        # isinstance() instead of `type(item) == str`: idiomatic, and it
        # also handles str subclasses.
        if isinstance(item, str):
            item = item.encode()
        _hash.update(item)
    return _hash.hashvalues
Exemplo n.º 26
0
def prepare_query(filename):
    """Build a MinHash query from the first line of a TSV file.

    The first line is expected to contain a tab-separated record whose
    second field is a comma-separated value list.

    Args:
        filename: path to the file to read.

    Returns:
        [MinHash of the values, number of values].
    """
    # `with` closes the handle; the original leaked an open file.
    # readline() fetches just the first line instead of reading the
    # whole file via readlines()[0].
    with open(filename) as file:
        first_line = file.readline()
    vals = first_line.split('\t')[1].split(',')

    length = len(vals)
    permutations = config.MINHASH_PARAMS['num_permutations']
    encoding = config.MINHASH_PARAMS['encoding']
    m_query = MinHash(num_perm=permutations)

    for elem in vals:
        m_query.update(elem.encode(encoding))
    query_set = [m_query, length]
    return query_set
Exemplo n.º 27
0
    def test_query(self):
        """Each inserted key is returned when queried with its own
        sketch; a wrong-sized sketch raises ValueError."""
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        sketches = {}
        for key in ("a", "b"):
            sketch = MinHash(16)
            sketch.update(key.encode("utf8"))
            sketches[key] = sketch
            lsh.insert(key, sketch)
        for key in ("a", "b"):
            self.assertTrue(key in lsh.query(sketches[key]))

        # Mismatched permutation count must be rejected.
        bad = MinHash(18)
        self.assertRaises(ValueError, lsh.query, bad)
Exemplo n.º 28
0
    def test_query(self):
        """Querying with an inserted key's sketch returns that key;
        querying with a wrong-sized sketch raises ValueError."""
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        sketch_a = MinHash(16)
        sketch_a.update("a".encode("utf8"))
        sketch_b = MinHash(16)
        sketch_b.update("b".encode("utf8"))
        lsh.insert("a", sketch_a)
        lsh.insert("b", sketch_b)
        self.assertTrue("a" in lsh.query(sketch_a))
        self.assertTrue("b" in lsh.query(sketch_b))

        # Mismatched permutation count must be rejected.
        bad = MinHash(18)
        self.assertRaises(ValueError, lsh.query, bad)
Exemplo n.º 29
0
    async def test_pickle_mongo(self):
        """A MongoDB-backed async LSH survives a pickle round-trip: the
        restored index still answers queries for both inserted keys.
        """
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo, threshold=0.5, num_perm=16) as lsh:
            m1 = MinHash(16)
            m1.update("a".encode("utf8"))
            m2 = MinHash(16)
            m2.update("b".encode("utf8"))
            await lsh.insert("a", m1)
            await lsh.insert("b", m2)
            # Serialize while the index is still open; queried after.
            pickled = pickle.dumps(lsh)

        async with pickle.loads(pickled) as lsh2:
            result = await lsh2.query(m1)
            self.assertTrue("a" in result)
            result = await lsh2.query(m2)
            self.assertTrue("b" in result)
            # NOTE(review): explicit close() inside `async with` looks
            # redundant -- the context manager presumably closes on exit.
            # Confirm a double close is harmless for this storage layer.
            await lsh2.close()
Exemplo n.º 30
0
def minhash_from_text(text, num_perm, delimiters):
    """Calculate minhash of text.

    Args:
        text: string to calculate minhash of.
        num_perm: number of random permutation functions used by MinHash to
            be indexed.
        delimiters: list of strings used as delimiters for splitting text
            into words.

    Returns:
        A minhash (instance of datasketch.minhash.MinHash)
    """
    sketch = MinHash(num_perm)
    for shingle in _shingles_from_text(text, delimiters):
        sketch.update(shingle.encode('utf8'))
    return sketch
Exemplo n.º 31
0
 async def test__H_redis(self):
     """
     Check _H output consistent bytes length given
     the same concatenated hash value size
     """
     # The loop variable is unused: the same insertion is repeated for
     # each candidate size, checking key-length consistency every time.
     for _ in range(2, 128 + 1, 16):
         m = MinHash()
         m.update("abcdefg".encode("utf8"))
         m.update("1234567".encode("utf8"))
         async with AsyncMinHashLSH(
                 storage_config=self._storage_config_redis,
                 num_perm=128) as lsh:
             await lsh.insert("m", m)
             # Every bucket key across every band must share one length.
             sizes = [
                 len(H) for ht in lsh.hashtables for H in await ht.keys()
             ]
             self.assertTrue(all(sizes[0] == s for s in sizes))
Exemplo n.º 32
0
    def test_remove(self):
        """remove() deletes a key from the key index and from every hash
        table; removing an unknown key raises ValueError."""
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        for key in ("a", "b"):
            sketch = MinHash(16)
            sketch.update(key.encode("utf8"))
            lsh.insert(key, sketch)

        lsh.remove("a")
        self.assertTrue("a" not in lsh.keys)
        for table in lsh.hashtables:
            for bucket in table:
                # Buckets are never left empty, and "a" is gone from all.
                self.assertGreater(len(table[bucket]), 0)
                self.assertTrue("a" not in table[bucket])

        self.assertRaises(ValueError, lsh.remove, "c")
Exemplo n.º 33
0
    def _create_min_hashes(self):
        """Build one MinHash per stacktrace in self.data and index them.

        Entries with a missing stacktrace are skipped.

        Returns:
            (min_hashes, lsh): list of (event_id, MinHash) pairs and the
            populated MinHashLSH index.
        """
        print_now('Start creating min hashes')
        min_hashes = []
        for (event_id, _, stacktrace) in self.data:
            if stacktrace is None:
                continue
            # Lower-case and split on whitespace/commas to get tokens.
            tokens = set(stacktrace.lower().replace(',', ' ').split())
            sketch = MinHash(num_perm=NUM_PERM)
            for token in tokens:
                sketch.update(token.encode('utf8'))
            min_hashes.append((event_id, sketch))

        lsh = MinHashLSH(threshold=0.5, num_perm=NUM_PERM)
        for event_id, sketch in min_hashes:
            lsh.insert(event_id, sketch)

        return (min_hashes, lsh)
Exemplo n.º 34
0
    def test_remove(self):
        """After remove("a"), the key is absent everywhere and buckets
        stay non-empty; removing a missing key raises ValueError."""
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        sketch_a = MinHash(16)
        sketch_a.update("a".encode("utf8"))
        sketch_b = MinHash(16)
        sketch_b.update("b".encode("utf8"))
        lsh.insert("a", sketch_a)
        lsh.insert("b", sketch_b)

        lsh.remove("a")
        self.assertTrue("a" not in lsh.keys)
        for table in lsh.hashtables:
            for bucket in table:
                self.assertGreater(len(table[bucket]), 0)
                self.assertTrue("a" not in table[bucket])

        self.assertRaises(ValueError, lsh.remove, "c")
Exemplo n.º 35
0
def main():
    """Compute the MinHash Jaccard similarity of two text files and
    print the result along with the elapsed time."""
    with open('plato1.txt', 'r') as f:
        tokens1 = [l for l in f]
    with open('plato2.txt', 'r') as f:
        tokens2 = [l for l in f]

    start = time.time()
    m1 = MinHash(num_perm=64, seed=0)
    for t in tokens1:
        m1.update(t.encode('utf8'))

    # Re-using m1's permutations guarantees both sketches are comparable.
    m2 = MinHash(num_perm=64, seed=0, permutations=m1.permutations)
    for t in tokens2:
        m2.update(t.encode('utf8'))
    similarity = m2.jaccard(m1)
    elapsed = time.time() - start
    # The original passed the format string and the values as separate
    # print() arguments, so the placeholders were never substituted --
    # apply the % operator to actually format the message.
    print("Similar %f and Took %f ms" % (similarity, elapsed * 1000))
Exemplo n.º 36
0
def minhash_from_text(text, num_perm, delimiters):
    """Calculate minhash of text.

    Args:
        text: string to calculate minhash of.
        num_perm: number of random permutation functions used by MinHash to
            be indexed.
        delimiters: list of strings used as delimiters for splitting text
            into words.

    Returns:
        A minhash (instance of datasketch.minhash.MinHash)
    """
    sketch = MinHash(num_perm)
    for shingle in _shingles_from_text(text, delimiters):
        sketch.update(shingle.encode("utf8"))
    return sketch
Exemplo n.º 37
0
    def test_insert(self):
        """Both inserted keys land in every hash table; a wrong-sized
        sketch is rejected with ValueError."""
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        for key in ("a", "b"):
            sketch = MinHash(16)
            sketch.update(key.encode("utf8"))
            lsh.insert(key, sketch)
        for table in lsh.hashtables:
            self.assertTrue(len(table) >= 1)
            contents = []
            for bucket in table:
                contents.extend(table[bucket])
            self.assertTrue("a" in contents)
            self.assertTrue("b" in contents)

        # Mismatched permutation count must be rejected on insert.
        bad = MinHash(18)
        self.assertRaises(ValueError, lsh.insert, "c", bad)
Exemplo n.º 38
0
    async def test_arbitrary_collection(self):
        """An arbitrary MongoDB collection name in the storage config is
        honoured: inserted data ends up in that collection.
        """
        # Point the storage layer at a custom collection for this test.
        self._storage_config_mongo["mongo"][
            "collection_name"] = "unit_test_collection"
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5,
                                   num_perm=16) as lsh:
            m1 = MinHash(16)
            m1.update(b"a")
            await lsh.insert("a", m1)

        # Connect directly to Mongo and confirm documents were written
        # to the custom collection.
        dsn = MONGO_URL or "mongodb://{host}:{port}/{db}".format(
            **self._storage_config_mongo["mongo"])
        collection = AsyncIOMotorClient(dsn).get_default_database(
            "lsh_test").get_collection("unit_test_collection")
        count = await collection.count_documents({})

        self.assertGreaterEqual(count, 1)
        # Restore the shared config so other tests are unaffected.
        del self._storage_config_mongo["mongo"]["collection_name"]
Exemplo n.º 39
0
    def _get_minhash_from_domain(domain):
        """Get the Minhash value from a domain name.

        The last dot-separated label (the TLD) is removed, and a MinHash
        is built from every remaining character of the domain.

        NOTE(review): the original docstring claimed that a leading
        "www." is stripped before hashing, but the code below does no
        such stripping -- the "www" characters are included in the
        sketch. Confirm which behaviour is intended.

        Args:
          domain: string with a full domain, eg. www.google.com

        Returns:
            A minhash (instance of datasketch.minhash.MinHash)
        """
        # Drop only the final label; everything else is kept verbatim.
        domain_items = domain.split('.')
        domain_part = '.'.join(domain_items[:-1])

        minhash = MinHash(similarity.DEFAULT_PERMUTATIONS)
        for char in domain_part:
            minhash.update(char.encode('utf8'))

        return minhash
Exemplo n.º 40
0
def eg1():
    """Index the sketches of data2 and data3 in an LSH and print the
    approximate neighbours of data1's sketch."""
    m1, m2, m3 = MinHash(), MinHash(), MinHash()
    for token in data1:
        m1.update(token.encode('utf8'))
    for token in data2:
        m2.update(token.encode('utf8'))
    for token in data3:
        m3.update(token.encode('utf8'))

    # Create LSH index
    lsh = MinHashLSH(threshold=0.5)
    lsh.insert("m2", m2)
    lsh.insert("m3", m3)
    result = lsh.query(m1)
    print("Approximate neighbours with Jaccard similarity > 0.5", result)
Exemplo n.º 41
0
" thet whre hev back on the barkn of the bors. and they were al anr oo the bark of the bark of",
" the boos of the boos of the boos of the boos afd the nererland thet thet whre hev back on the ",
" barkn of the bors. and they were al anr oo the bark of the bark of the boos of the boos of the boos of the",
" boos afd the nererland thet thet whre hev back on the barkn of the bors. and they were al anr oo the bark of the bark ",
" of the boos of the boos of the boos of the boos afd the nererland thet thet whre hev back on the barkn of the bors."
]

# Python 2 script fragment (uses `print` statements): for every text in
# dataX, compute the exact Jaccard similarity against every entry of
# `seq`, keeping the best score per `seq` entry in `array`.
# NOTE(review): m1/m2 MinHash sketches are built each iteration but
# never used -- only the exact set-based Jaccard is computed. Presumably
# leftover from an estimated-vs-exact comparison; confirm before reuse.
array=[0]*len(seq)
for i in range (len (dataX)):   
   for j  in range(len(seq)):
        data1=dataX[i].split()
        data2=seq[j].split()
        m1 = MinHash()
        m2 = MinHash()
        for d in data1:
            m1.update(d.encode('utf8'))
        for d in data2:
            m2.update(d.encode('utf8'))
        s1 = set(data1)
        s2 = set(data2)
        # Exact Jaccard: |intersection| / |union|.
        actual_jaccard = float(len(s1.intersection(s2))) /\
            float(len(s1.union(s2)))
        # Keep the maximum similarity seen so far for this seq entry.
        if array[j]<actual_jaccard:
            array[j]=actual_jaccard
            print array[j]


print array 
#m1 = MinHash()/*
#print dataX[1],len(seq)
#for i in range(len(seq)):
Exemplo n.º 42
0
 def min_hash_text(self, sm_text):
     """Fold every token of `sm_text` into a fresh MinHash and return it."""
     sketch = MinHash()
     for token in sm_text:
         sketch.update(token.encode('utf8'))
     return sketch