Exemplo n.º 1
0
 def test_init(self):
     """A heavier false-negative weight yields more bands and fewer rows."""
     index = MinHashLSH(threshold=0.8)
     self.assertTrue(index.is_empty())
     bands_default, rows_default = index.b, index.r
     index = MinHashLSH(threshold=0.8, weights=(0.2, 0.8))
     bands_weighted, rows_weighted = index.b, index.r
     # Shifting weight toward false negatives should raise b and lower r.
     self.assertTrue(bands_default < bands_weighted)
     self.assertTrue(rows_default > rows_weighted)
Exemplo n.º 2
0
    def test_insert_redis(self):
        """Insertion works through the Redis storage layer (redis.Redis patched)."""
        with patch('redis.Redis', fake_redis) as mock_redis:
            config = {
                'type': 'redis',
                'redis': {
                    'host': 'localhost',
                    'port': 6379
                }
            }
            lsh = MinHashLSH(threshold=0.5, num_perm=16,
                             storage_config=config)
            for name in ("a", "b"):
                mh = MinHash(16)
                mh.update(name.encode("utf8"))
                lsh.insert(name, mh)
            # Keys are pickled before they reach the Redis-backed tables.
            for table in lsh.hashtables:
                self.assertTrue(len(table) >= 1)
                stored = []
                for bucket in table:
                    stored.extend(table[bucket])
                self.assertTrue(pickle.dumps("a") in stored)
                self.assertTrue(pickle.dumps("b") in stored)
            self.assertTrue("a" in lsh)
            self.assertTrue("b" in lsh)
            for i, bucket in enumerate(lsh.keys[pickle.dumps("a")]):
                self.assertTrue(pickle.dumps("a") in lsh.hashtables[i][bucket])

            # A MinHash with a different permutation count is rejected.
            self.assertRaises(ValueError, lsh.insert, "c", MinHash(18))
Exemplo n.º 3
0
 def __init__(self,
              threshold=0.9,
              num_perm=128,
              num_part=16,
              m=8,
              weights=(0.5, 0.5)):
     """Validate parameters and build one MinHashLSH per (partition, r).

     Raises:
         ValueError: if any parameter falls outside its valid range.
     """
     if threshold > 1.0 or threshold < 0.0:
         raise ValueError("threshold must be in [0.0, 1.0]")
     if num_perm < 2:
         raise ValueError("Too few permutation functions")
     if num_part < 1:
         raise ValueError("num_part must be at least 1")
     if m < 2 or m > num_perm:
         raise ValueError("m must be in the range of [2, num_perm]")
     if any(w < 0.0 or w > 1.0 for w in weights):
         raise ValueError("Weight must be in [0.0, 1.0]")
     if sum(weights) != 1.0:
         raise ValueError("Weights must sum to 1.0")
     self.threshold = threshold
     self.h = num_perm
     self.m = m
     rs = self._init_optimal_params(weights)
     # One group of LSH indexes per partition; within a partition, one
     # MinHashLSH per candidate row count r with num_perm // r bands.
     self.indexes = []
     for _ in range(num_part):
         per_partition = {}
         for r in rs:
             per_partition[r] = MinHashLSH(num_perm=self.h,
                                           params=(int(self.h / r), r))
         self.indexes.append(per_partition)
     self.lowers = [None] * len(self.indexes)
     self.uppers = [None] * len(self.indexes)
Exemplo n.º 4
0
    def _new_lsh_index(self):
        """Create a new LSH from a set of Timesketch events.

        Returns:
            A tuple with an LSH (instance of datasketch.lsh.LSH) and a
            dictionary with event ID as key and minhash as value.
        """
        minhashes = {}
        lsh = MinHashLSH(self._config.threshold, self._config.num_perm)

        # Stream matching events out of Elasticsearch.
        event_stream = self._datastore.search_stream(
            query_string=self._config.query,
            query_filter={},
            indices=[self._config.index],
            return_fields=[self._config.field])

        with lsh.insertion_session() as session:
            for event in event_stream:
                text = event['_source'][self._config.field]
                # Key each minhash by (id, type, index) so LSH hits can be
                # mapped back to their originating document.
                key = (event['_id'], event['_type'], event['_index'])
                minhash = self._minhash_from_text(text)
                minhashes[key] = minhash
                session.insert(key, minhash)

        return lsh, minhashes
Exemplo n.º 5
0
 def test__H(self):
     """_H must emit band keys of one consistent byte length per index."""
     generator = WeightedMinHashGenerator(100, sample_size=128)
     for _ in range(2, generator.sample_size + 1, 16):
         wmh = generator.minhash(np.random.randint(1, 99999999, 100))
         index = MinHashLSH(num_perm=128)
         index.insert("m", wmh)
         key_sizes = [len(key) for table in index.hashtables for key in table]
         # Every serialized band key must share the same length.
         self.assertTrue(all(key_sizes[0] == size for size in key_sizes))
Exemplo n.º 6
0
    def test_pickle(self):
        """A pickled-and-restored LSH must answer queries like the original.

        Bug fix: despite its name, this test never exercised pickling — it
        only queried the live index.  It now round-trips the index through
        pickle and runs the queries against the restored copy.
        """
        import pickle

        lsh = MinHashLSH(threshold=0.5, num_perm=4)
        mg = WeightedMinHashGenerator(10, 4)
        m1 = mg.minhash(np.random.uniform(1, 10, 10))
        m2 = mg.minhash(np.random.uniform(1, 10, 10))
        lsh.insert("a", m1)
        lsh.insert("b", m2)

        # Round-trip through pickle, then query the restored index.
        lsh2 = pickle.loads(pickle.dumps(lsh))
        result = lsh2.query(m1)
        self.assertTrue("a" in result)
        result = lsh2.query(m2)
        self.assertTrue("b" in result)
Exemplo n.º 7
0
 def test_get_counts(self):
     """get_counts returns one histogram per band, each covering all keys."""
     index = MinHashLSH(threshold=0.5, num_perm=16)
     for name in ("a", "b"):
         mh = MinHash(16)
         mh.update(name.encode("utf8"))
         index.insert(name, mh)
     counts = index.get_counts()
     # One count table per band, and each table accounts for both keys.
     self.assertEqual(len(counts), index.b)
     for table in counts:
         self.assertEqual(sum(table.values()), 2)
Exemplo n.º 8
0
 def test__H(self):
     """_H band keys share one byte length for a fixed hash value size."""
     for _ in range(2, 128 + 1, 16):
         index = MinHashLSH(num_perm=128)
         mh = MinHash()
         mh.update("abcdefg".encode("utf8"))
         mh.update("1234567".encode("utf8"))
         index.insert("m", mh)
         key_sizes = [len(key) for table in index.hashtables for key in table]
         self.assertTrue(all(key_sizes[0] == size for size in key_sizes))
Exemplo n.º 9
0
 def test_pickle(self):
     """Pickle round-trip must preserve query results.

     Bug fix: the original built the unpickled copy (lsh2) but then queried
     the source index, so (de)serialization was never actually verified.
     The queries now run against the restored copy.
     """
     lsh = MinHashLSH(threshold=0.5, num_perm=16)
     m1 = MinHash(16)
     m1.update("a".encode("utf8"))
     m2 = MinHash(16)
     m2.update("b".encode("utf8"))
     lsh.insert("a", m1)
     lsh.insert("b", m2)
     # Query the restored index, not the original.
     lsh2 = pickle.loads(pickle.dumps(lsh))
     result = lsh2.query(m1)
     self.assertTrue("a" in result)
     result = lsh2.query(m2)
     self.assertTrue("b" in result)
Exemplo n.º 10
0
def eg2():
    """Weighted MinHash demo: index m2 and m3, query with m1 at threshold 0.1."""
    generator = WeightedMinHashGenerator(10, 5)
    m1, m2, m3 = (generator.minhash(v) for v in (v1, v2, v3))
    print("Estimated Jaccard m1, m2", m1.jaccard(m2))
    print("Estimated Jaccard m1, m3", m1.jaccard(m3))
    # Build the LSH index over m2/m3, then look up neighbours of m1.
    lsh = MinHashLSH(threshold=0.1, num_perm=5)
    lsh.insert("m2", m2)
    lsh.insert("m3", m3)
    result = lsh.query(m1)
    print("Approximate neighbours with weighted Jaccard similarity > 0.1",
          result)
Exemplo n.º 11
0
    def test_query(self):
        """Each indexed key is returned when queried with its own MinHash."""
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        hashes = {}
        for name in ("a", "b"):
            mh = MinHash(16)
            mh.update(name.encode("utf8"))
            hashes[name] = mh
            lsh.insert(name, mh)
        for name, mh in hashes.items():
            self.assertTrue(name in lsh.query(mh))

        # Mismatched permutation count raises ValueError.
        self.assertRaises(ValueError, lsh.query, MinHash(18))
Exemplo n.º 12
0
    def test_remove(self):
        """remove() erases a key from .keys and from every hash table."""
        lsh = MinHashLSH(threshold=0.5, num_perm=4)
        generator = WeightedMinHashGenerator(10, 4)
        lsh.insert("a", generator.minhash(np.random.uniform(1, 10, 10)))
        lsh.insert("b", generator.minhash(np.random.uniform(1, 10, 10)))

        lsh.remove("a")
        self.assertTrue("a" not in lsh.keys)
        for table in lsh.hashtables:
            for bucket in table:
                # Emptied buckets are pruned rather than left hollow.
                self.assertGreater(len(table[bucket]), 0)
                self.assertTrue("a" not in table[bucket])

        # Removing an unknown key is an error.
        self.assertRaises(ValueError, lsh.remove, "c")
Exemplo n.º 13
0
    def test_remove(self):
        """Removed keys disappear from the key map and all hash tables."""
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        for name in ("a", "b"):
            mh = MinHash(16)
            mh.update(name.encode("utf8"))
            lsh.insert(name, mh)

        lsh.remove("a")
        self.assertTrue("a" not in lsh.keys)
        for table in lsh.hashtables:
            for bucket in table:
                self.assertGreater(len(table[bucket]), 0)
                self.assertTrue("a" not in table[bucket])

        # Unknown keys cannot be removed.
        self.assertRaises(ValueError, lsh.remove, "c")
Exemplo n.º 14
0
    def _create_min_hashes(self):
        """Build (event_id, MinHash) pairs from stacktraces and index them.

        Returns:
            A tuple (min_hashes, lsh): min_hashes is a list of
            (event_id, MinHash) pairs; lsh is a MinHashLSH over them.
        """
        print_now('Start creating min hashes')
        min_hashes = []
        for event_id, _, stacktrace in self.data:
            if stacktrace is None:
                continue
            # Tokenize: lowercase, treat commas as whitespace, dedupe words.
            tokens = set(stacktrace.lower().replace(',', ' ').split())
            minhash = MinHash(num_perm=NUM_PERM)
            for token in tokens:
                minhash.update(token.encode('utf8'))
            min_hashes.append((event_id, minhash))

        lsh = MinHashLSH(threshold=0.5, num_perm=NUM_PERM)
        for event_id, minhash in min_hashes:
            lsh.insert(event_id, minhash)

        return (min_hashes, lsh)
Exemplo n.º 15
0
def eg1():
    """Plain MinHash demo: index m2 and m3, query with m1 at threshold 0.5."""
    m1, m2, m3 = MinHash(), MinHash(), MinHash()
    for source, sink in ((data1, m1), (data2, m2), (data3, m3)):
        for d in source:
            sink.update(d.encode('utf8'))

    # Build the LSH index over m2/m3, then look up neighbours of m1.
    lsh = MinHashLSH(threshold=0.5)
    lsh.insert("m2", m2)
    lsh.insert("m3", m3)
    result = lsh.query(m1)
    print("Approximate neighbours with Jaccard similarity > 0.5", result)
Exemplo n.º 16
0
def new_lsh_index(events,
                  field,
                  delimiters=None,
                  num_perm=None,
                  threshold=None):
    """Create a new LSH from a set of Timesketch events.

    Args:
        events: list or an iterator of Event objects.
        field: string denoting the event field to use for the LSH.
        delimiters: list of strings used as delimiters for splitting text
            into words.
        num_perm: number of random permutation functions used by MinHash to
            be indexed.
        threshold: a float for the Jaccard similarity threshold between 0.0
            and 1.0. The initialized MinHash LSH will be optimized for the
            threshold by minimizing the false positive and false negative.

    Returns:
        A tuple with an LSH (instance of datasketch.lsh.LSH) and a
        dictionary with event ID as key and minhash as value.
    """
    # Fall back to module-level defaults for any unspecified parameter.
    delimiters = DEFAULT_DELIMITERS if delimiters is None else delimiters
    num_perm = DEFAULT_PERMUTATIONS if num_perm is None else num_perm
    threshold = DEFAULT_THRESHOLD if threshold is None else threshold

    minhashes = {}
    lsh = MinHashLSH(threshold, num_perm)

    with lsh.insertion_session() as session:
        for event in events:
            # Key each minhash by (id, type, index) and insert it.
            key = (event.event_id, event.event_type, event.index_name)
            minhash = minhash_from_text(event.source[field], num_perm,
                                        delimiters)
            minhashes[key] = minhash
            session.insert(key, minhash)

    return lsh, minhashes
Exemplo n.º 17
0
 def test_insertion_session(self):
     """Keys inserted via an insertion session land in every hash table."""
     index = MinHashLSH(threshold=0.5, num_perm=16)
     pairs = []
     for name in ("a", "b"):
         mh = MinHash(16)
         mh.update(name.encode("utf8"))
         pairs.append((name, mh))
     with index.insertion_session() as session:
         for name, mh in pairs:
             session.insert(name, mh)
     for table in index.hashtables:
         self.assertTrue(len(table) >= 1)
         contents = []
         for bucket in table:
             contents.extend(table[bucket])
         self.assertTrue("a" in contents)
         self.assertTrue("b" in contents)
     self.assertTrue("a" in index)
     self.assertTrue("b" in index)
     for i, bucket in enumerate(index.keys["a"]):
         self.assertTrue("a" in index.hashtables[i][bucket])
Exemplo n.º 18
0
 def __init__(self,
              threshold=0.9,
              num_perm=128,
              num_part=16,
              m=8,
              weights=(0.5, 0.5),
              storage_config=None,
              prepickle=None):
     """Validate parameters and build per-partition MinHashLSH indexes.

     The storage backend defaults to in-memory dicts when no
     storage_config is given; each sub-index receives its own derived
     storage configuration.

     Raises:
         ValueError: if any parameter falls outside its valid range.
     """
     if threshold > 1.0 or threshold < 0.0:
         raise ValueError("threshold must be in [0.0, 1.0]")
     if num_perm < 2:
         raise ValueError("Too few permutation functions")
     if num_part < 1:
         raise ValueError("num_part must be at least 1")
     if m < 2 or m > num_perm:
         raise ValueError("m must be in the range of [2, num_perm]")
     if any(w < 0.0 or w > 1.0 for w in weights):
         raise ValueError("Weight must be in [0.0, 1.0]")
     if sum(weights) != 1.0:
         raise ValueError("Weights must sum to 1.0")
     self.threshold = threshold
     self.h = num_perm
     self.m = m
     rs = self._init_optimal_params(weights)
     # Default to plain in-memory dict storage.
     if not storage_config:
         storage_config = {'type': 'dict'}
     basename = storage_config.get('basename', _random_name(11))
     # One group of LSH indexes per partition; within a partition, one
     # MinHashLSH per candidate row count r with num_perm // r bands.
     self.indexes = []
     for partition in range(num_part):
         per_partition = {}
         for r in rs:
             per_partition[r] = MinHashLSH(
                 num_perm=self.h,
                 params=(int(self.h / r), r),
                 storage_config=self._get_storage_config(
                     basename, storage_config, partition, r),
                 prepickle=prepickle)
         self.indexes.append(per_partition)
     self.lowers = [None] * len(self.indexes)
     self.uppers = [None] * len(self.indexes)
Exemplo n.º 19
0
    def test_insert(self):
        """Weighted MinHash insertion populates every hash table."""
        lsh = MinHashLSH(threshold=0.5, num_perm=4)
        generator = WeightedMinHashGenerator(10, 4)
        lsh.insert("a", generator.minhash(np.random.uniform(1, 10, 10)))
        lsh.insert("b", generator.minhash(np.random.uniform(1, 10, 10)))
        for table in lsh.hashtables:
            self.assertTrue(len(table) >= 1)
            contents = []
            for bucket in table:
                contents.extend(table[bucket])
            self.assertTrue("a" in contents)
            self.assertTrue("b" in contents)
        self.assertTrue("a" in lsh)
        self.assertTrue("b" in lsh)
        for i, bucket in enumerate(lsh.keys["a"]):
            self.assertTrue("a" in lsh.hashtables[i][bucket])

        # A hash produced with a different sample size must be rejected.
        incompatible = WeightedMinHashGenerator(10, 5)
        self.assertRaises(ValueError, lsh.insert, "c",
                          incompatible.minhash(np.random.uniform(1, 10, 10)))
Exemplo n.º 20
0
    def test_insert(self):
        """Inserted keys appear in every hash table and in the key map."""
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        for name in ("a", "b"):
            mh = MinHash(16)
            mh.update(name.encode("utf8"))
            lsh.insert(name, mh)
        for table in lsh.hashtables:
            self.assertTrue(len(table) >= 1)
            contents = []
            for bucket in table:
                contents.extend(table[bucket])
            self.assertTrue("a" in contents)
            self.assertTrue("b" in contents)
        self.assertTrue("a" in lsh)
        self.assertTrue("b" in lsh)
        for i, bucket in enumerate(lsh.keys["a"]):
            self.assertTrue("a" in lsh.hashtables[i][bucket])

        # A MinHash with a different permutation count is rejected.
        self.assertRaises(ValueError, lsh.insert, "c", MinHash(18))
Exemplo n.º 21
0
    def test_query_redis(self):
        """Queries work through the Redis storage layer (redis.Redis patched)."""
        with patch('redis.Redis', fake_redis) as mock_redis:
            config = {
                'type': 'redis',
                'redis': {
                    'host': 'localhost',
                    'port': 6379
                }
            }
            lsh = MinHashLSH(threshold=0.5, num_perm=16,
                             storage_config=config)
            hashes = {}
            for name in ("a", "b"):
                mh = MinHash(16)
                mh.update(name.encode("utf8"))
                hashes[name] = mh
                lsh.insert(name, mh)
            for name, mh in hashes.items():
                self.assertTrue(name in lsh.query(mh))

            # Mismatched permutation count raises ValueError.
            self.assertRaises(ValueError, lsh.query, MinHash(18))
Exemplo n.º 22
0
import pandas as pd
from datasketch.minhash import MinHash
from datasketch.lsh import MinHashLSH
from preprocess import tokenize_sentence
"""
To find similar questions efficiently we use Jaccard similarity and MinHash.
Questions with similar MinHash signatures are candidates to be similar.
To check whether two candidate sentences are similar we use Jaccard similarity.
"""

# Load the preprocessed question pairs and build an LSH index over the
# question1 column, keyed by row position.
df = pd.read_csv("proccessed.csv")
total_questions = df.shape[0]
threshold_jacard = 0.30
lsh = MinHashLSH(threshold=threshold_jacard)

# Calculate a MinHash for each sentence in column question1 and insert it
# into the LSH index under its row index (stringified, since keys must be
# hashable identifiers).
for index, row in df.iterrows():
    min_Hash = MinHash()
    question = tokenize_sentence(str(row['question1']))
    for word in question:
        min_Hash.update(word.encode('utf8'))
    lsh.insert(str(index), min_Hash)

# Counters for the evaluation loop below.
total = 0
return_result = 0
correct = 0
total_correct = 0
# For each sentence in column question2, find similar questions.
for i in range(0, total_questions):
    question_minHash = MinHash()
    question = tokenize_sentence(str(df['question2'][i]))