Exemplo n.º 1
0
 def test_init(self):
     """A heavier false-negative weight yields more bands and fewer rows."""
     index = MinHashLSH(threshold=0.8)
     self.assertTrue(index.is_empty())
     bands_default, rows_default = index.b, index.r
     index = MinHashLSH(threshold=0.8, weights=(0.2, 0.8))
     bands_weighted, rows_weighted = index.b, index.r
     # Shifting weight toward false negatives should raise b and lower r.
     self.assertTrue(bands_default < bands_weighted)
     self.assertTrue(rows_default > rows_weighted)
Exemplo n.º 2
0
    def test_insert_redis(self):
        """Insertion works through the Redis storage layer (redis.Redis patched)."""
        with patch('redis.Redis', fake_redis) as mock_redis:
            config = {
                'type': 'redis',
                'redis': {
                    'host': 'localhost',
                    'port': 6379
                }
            }
            lsh = MinHashLSH(threshold=0.5, num_perm=16,
                             storage_config=config)
            for name in ("a", "b"):
                mh = MinHash(16)
                mh.update(name.encode("utf8"))
                lsh.insert(name, mh)
            # Keys are pickled before they reach the Redis-backed tables.
            for table in lsh.hashtables:
                self.assertTrue(len(table) >= 1)
                stored = []
                for bucket in table:
                    stored.extend(table[bucket])
                self.assertTrue(pickle.dumps("a") in stored)
                self.assertTrue(pickle.dumps("b") in stored)
            self.assertTrue("a" in lsh)
            self.assertTrue("b" in lsh)
            for i, bucket in enumerate(lsh.keys[pickle.dumps("a")]):
                self.assertTrue(pickle.dumps("a") in lsh.hashtables[i][bucket])

            # A MinHash with a different permutation count is rejected.
            self.assertRaises(ValueError, lsh.insert, "c", MinHash(18))
Exemplo n.º 3
0
 def __init__(self,
              threshold=0.9,
              num_perm=128,
              num_part=16,
              m=8,
              weights=(0.5, 0.5)):
     """Validate parameters and build one MinHashLSH per (partition, r).

     Raises:
         ValueError: if any parameter falls outside its valid range.
     """
     if threshold > 1.0 or threshold < 0.0:
         raise ValueError("threshold must be in [0.0, 1.0]")
     if num_perm < 2:
         raise ValueError("Too few permutation functions")
     if num_part < 1:
         raise ValueError("num_part must be at least 1")
     if m < 2 or m > num_perm:
         raise ValueError("m must be in the range of [2, num_perm]")
     if any(w < 0.0 or w > 1.0 for w in weights):
         raise ValueError("Weight must be in [0.0, 1.0]")
     if sum(weights) != 1.0:
         raise ValueError("Weights must sum to 1.0")
     self.threshold = threshold
     self.h = num_perm
     self.m = m
     rs = self._init_optimal_params(weights)
     # One group of LSH indexes per partition; within a partition, one
     # MinHashLSH per candidate row count r with num_perm // r bands.
     self.indexes = []
     for _ in range(num_part):
         per_partition = {}
         for r in rs:
             per_partition[r] = MinHashLSH(num_perm=self.h,
                                           params=(int(self.h / r), r))
         self.indexes.append(per_partition)
     self.lowers = [None] * len(self.indexes)
     self.uppers = [None] * len(self.indexes)
Exemplo n.º 4
0
    def _new_lsh_index(self):
        """Create a new LSH from a set of Timesketch events.

        Returns:
            A tuple with an LSH (instance of datasketch.lsh.LSH) and a
            dictionary with event ID as key and minhash as value.
        """
        minhashes = {}
        lsh = MinHashLSH(self._config.threshold, self._config.num_perm)

        # Stream matching events out of Elasticsearch.
        event_stream = self._datastore.search_stream(
            query_string=self._config.query,
            query_filter={},
            indices=[self._config.index],
            return_fields=[self._config.field])

        with lsh.insertion_session() as session:
            for event in event_stream:
                text = event['_source'][self._config.field]
                # Key each minhash by (id, type, index) so LSH hits can be
                # mapped back to their originating document.
                key = (event['_id'], event['_type'], event['_index'])
                minhash = self._minhash_from_text(text)
                minhashes[key] = minhash
                session.insert(key, minhash)

        return lsh, minhashes
Exemplo n.º 5
0
 def test__H(self):
     """_H must emit band keys of one consistent byte length per index."""
     generator = WeightedMinHashGenerator(100, sample_size=128)
     for _ in range(2, generator.sample_size + 1, 16):
         wmh = generator.minhash(np.random.randint(1, 99999999, 100))
         index = MinHashLSH(num_perm=128)
         index.insert("m", wmh)
         key_sizes = [len(key) for table in index.hashtables for key in table]
         # Every serialized band key must share the same length.
         self.assertTrue(all(key_sizes[0] == size for size in key_sizes))
Exemplo n.º 6
0
    def test_pickle(self):
        """A pickled-and-restored LSH must answer queries like the original.

        Bug fix: despite its name, this test never exercised pickling — it
        only queried the live index.  It now round-trips the index through
        pickle and runs the queries against the restored copy.
        """
        import pickle

        lsh = MinHashLSH(threshold=0.5, num_perm=4)
        mg = WeightedMinHashGenerator(10, 4)
        m1 = mg.minhash(np.random.uniform(1, 10, 10))
        m2 = mg.minhash(np.random.uniform(1, 10, 10))
        lsh.insert("a", m1)
        lsh.insert("b", m2)

        # Round-trip through pickle, then query the restored index.
        lsh2 = pickle.loads(pickle.dumps(lsh))
        result = lsh2.query(m1)
        self.assertTrue("a" in result)
        result = lsh2.query(m2)
        self.assertTrue("b" in result)
Exemplo n.º 7
0
 def test_get_counts(self):
     """get_counts returns one histogram per band, each covering all keys."""
     index = MinHashLSH(threshold=0.5, num_perm=16)
     for name in ("a", "b"):
         mh = MinHash(16)
         mh.update(name.encode("utf8"))
         index.insert(name, mh)
     counts = index.get_counts()
     # One count table per band, and each table accounts for both keys.
     self.assertEqual(len(counts), index.b)
     for table in counts:
         self.assertEqual(sum(table.values()), 2)
Exemplo n.º 8
0
 def test__H(self):
     """_H band keys share one byte length for a fixed hash value size."""
     for _ in range(2, 128 + 1, 16):
         index = MinHashLSH(num_perm=128)
         mh = MinHash()
         mh.update("abcdefg".encode("utf8"))
         mh.update("1234567".encode("utf8"))
         index.insert("m", mh)
         key_sizes = [len(key) for table in index.hashtables for key in table]
         self.assertTrue(all(key_sizes[0] == size for size in key_sizes))
Exemplo n.º 9
0
 def test_pickle(self):
     """Pickle round-trip must preserve query results.

     Bug fix: the original built the unpickled copy (lsh2) but then queried
     the source index, so (de)serialization was never actually verified.
     The queries now run against the restored copy.
     """
     lsh = MinHashLSH(threshold=0.5, num_perm=16)
     m1 = MinHash(16)
     m1.update("a".encode("utf8"))
     m2 = MinHash(16)
     m2.update("b".encode("utf8"))
     lsh.insert("a", m1)
     lsh.insert("b", m2)
     # Query the restored index, not the original.
     lsh2 = pickle.loads(pickle.dumps(lsh))
     result = lsh2.query(m1)
     self.assertTrue("a" in result)
     result = lsh2.query(m2)
     self.assertTrue("b" in result)
Exemplo n.º 10
0
def eg2():
    """Weighted MinHash demo: index m2 and m3, query with m1 at threshold 0.1."""
    generator = WeightedMinHashGenerator(10, 5)
    m1, m2, m3 = (generator.minhash(v) for v in (v1, v2, v3))
    print("Estimated Jaccard m1, m2", m1.jaccard(m2))
    print("Estimated Jaccard m1, m3", m1.jaccard(m3))
    # Build the LSH index over m2/m3, then look up neighbours of m1.
    lsh = MinHashLSH(threshold=0.1, num_perm=5)
    lsh.insert("m2", m2)
    lsh.insert("m3", m3)
    result = lsh.query(m1)
    print("Approximate neighbours with weighted Jaccard similarity > 0.1",
          result)
Exemplo n.º 11
0
    def test_query(self):
        """Each indexed key is returned when queried with its own MinHash."""
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        hashes = {}
        for name in ("a", "b"):
            mh = MinHash(16)
            mh.update(name.encode("utf8"))
            hashes[name] = mh
            lsh.insert(name, mh)
        for name, mh in hashes.items():
            self.assertTrue(name in lsh.query(mh))

        # Mismatched permutation count raises ValueError.
        self.assertRaises(ValueError, lsh.query, MinHash(18))
Exemplo n.º 12
0
    def test_remove(self):
        """remove() erases a key from .keys and from every hash table."""
        lsh = MinHashLSH(threshold=0.5, num_perm=4)
        generator = WeightedMinHashGenerator(10, 4)
        lsh.insert("a", generator.minhash(np.random.uniform(1, 10, 10)))
        lsh.insert("b", generator.minhash(np.random.uniform(1, 10, 10)))

        lsh.remove("a")
        self.assertTrue("a" not in lsh.keys)
        for table in lsh.hashtables:
            for bucket in table:
                # Emptied buckets are pruned rather than left hollow.
                self.assertGreater(len(table[bucket]), 0)
                self.assertTrue("a" not in table[bucket])

        # Removing an unknown key is an error.
        self.assertRaises(ValueError, lsh.remove, "c")
Exemplo n.º 13
0
    def test_remove(self):
        """Removed keys disappear from the key map and all hash tables."""
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        for name in ("a", "b"):
            mh = MinHash(16)
            mh.update(name.encode("utf8"))
            lsh.insert(name, mh)

        lsh.remove("a")
        self.assertTrue("a" not in lsh.keys)
        for table in lsh.hashtables:
            for bucket in table:
                self.assertGreater(len(table[bucket]), 0)
                self.assertTrue("a" not in table[bucket])

        # Unknown keys cannot be removed.
        self.assertRaises(ValueError, lsh.remove, "c")
Exemplo n.º 14
0
    def _create_min_hashes(self):
        """Build (event_id, MinHash) pairs from stacktraces and index them.

        Returns:
            A tuple (min_hashes, lsh): min_hashes is a list of
            (event_id, MinHash) pairs; lsh is a MinHashLSH over them.
        """
        print_now('Start creating min hashes')
        min_hashes = []
        for event_id, _, stacktrace in self.data:
            if stacktrace is None:
                continue
            # Tokenize: lowercase, treat commas as whitespace, dedupe words.
            tokens = set(stacktrace.lower().replace(',', ' ').split())
            minhash = MinHash(num_perm=NUM_PERM)
            for token in tokens:
                minhash.update(token.encode('utf8'))
            min_hashes.append((event_id, minhash))

        lsh = MinHashLSH(threshold=0.5, num_perm=NUM_PERM)
        for event_id, minhash in min_hashes:
            lsh.insert(event_id, minhash)

        return (min_hashes, lsh)
Exemplo n.º 15
0
def eg1():
    """Plain MinHash demo: index m2 and m3, query with m1 at threshold 0.5."""
    m1, m2, m3 = MinHash(), MinHash(), MinHash()
    for source, sink in ((data1, m1), (data2, m2), (data3, m3)):
        for d in source:
            sink.update(d.encode('utf8'))

    # Build the LSH index over m2/m3, then look up neighbours of m1.
    lsh = MinHashLSH(threshold=0.5)
    lsh.insert("m2", m2)
    lsh.insert("m3", m3)
    result = lsh.query(m1)
    print("Approximate neighbours with Jaccard similarity > 0.5", result)
Exemplo n.º 16
0
def new_lsh_index(events,
                  field,
                  delimiters=None,
                  num_perm=None,
                  threshold=None):
    """Create a new LSH from a set of Timesketch events.

    Args:
        events: list or an iterator of Event objects.
        field: string denoting the event field to use for the LSH.
        delimiters: list of strings used as delimiters for splitting text
            into words.
        num_perm: number of random permutation functions used by MinHash to
            be indexed.
        threshold: a float for the Jaccard similarity threshold between 0.0
            and 1.0. The initialized MinHash LSH will be optimized for the
            threshold by minimizing the false positive and false negative.

    Returns:
        A tuple with an LSH (instance of datasketch.lsh.LSH) and a
        dictionary with event ID as key and minhash as value.
    """
    # Fall back to module-level defaults for any unspecified parameter.
    delimiters = DEFAULT_DELIMITERS if delimiters is None else delimiters
    num_perm = DEFAULT_PERMUTATIONS if num_perm is None else num_perm
    threshold = DEFAULT_THRESHOLD if threshold is None else threshold

    minhashes = {}
    lsh = MinHashLSH(threshold, num_perm)

    with lsh.insertion_session() as session:
        for event in events:
            # Key each minhash by (id, type, index) and insert it.
            key = (event.event_id, event.event_type, event.index_name)
            minhash = minhash_from_text(event.source[field], num_perm,
                                        delimiters)
            minhashes[key] = minhash
            session.insert(key, minhash)

    return lsh, minhashes
Exemplo n.º 17
0
 def test_insertion_session(self):
     """Keys inserted via an insertion session land in every hash table."""
     index = MinHashLSH(threshold=0.5, num_perm=16)
     pairs = []
     for name in ("a", "b"):
         mh = MinHash(16)
         mh.update(name.encode("utf8"))
         pairs.append((name, mh))
     with index.insertion_session() as session:
         for name, mh in pairs:
             session.insert(name, mh)
     for table in index.hashtables:
         self.assertTrue(len(table) >= 1)
         contents = []
         for bucket in table:
             contents.extend(table[bucket])
         self.assertTrue("a" in contents)
         self.assertTrue("b" in contents)
     self.assertTrue("a" in index)
     self.assertTrue("b" in index)
     for i, bucket in enumerate(index.keys["a"]):
         self.assertTrue("a" in index.hashtables[i][bucket])
Exemplo n.º 18
0
 def __init__(self,
              threshold=0.9,
              num_perm=128,
              num_part=16,
              m=8,
              weights=(0.5, 0.5),
              storage_config=None,
              prepickle=None):
     """Validate parameters and build per-partition MinHashLSH indexes.

     The storage backend defaults to in-memory dicts when no
     storage_config is given; each sub-index receives its own derived
     storage configuration.

     Raises:
         ValueError: if any parameter falls outside its valid range.
     """
     if threshold > 1.0 or threshold < 0.0:
         raise ValueError("threshold must be in [0.0, 1.0]")
     if num_perm < 2:
         raise ValueError("Too few permutation functions")
     if num_part < 1:
         raise ValueError("num_part must be at least 1")
     if m < 2 or m > num_perm:
         raise ValueError("m must be in the range of [2, num_perm]")
     if any(w < 0.0 or w > 1.0 for w in weights):
         raise ValueError("Weight must be in [0.0, 1.0]")
     if sum(weights) != 1.0:
         raise ValueError("Weights must sum to 1.0")
     self.threshold = threshold
     self.h = num_perm
     self.m = m
     rs = self._init_optimal_params(weights)
     # Default to plain in-memory dict storage.
     if not storage_config:
         storage_config = {'type': 'dict'}
     basename = storage_config.get('basename', _random_name(11))
     # One group of LSH indexes per partition; within a partition, one
     # MinHashLSH per candidate row count r with num_perm // r bands.
     self.indexes = []
     for partition in range(num_part):
         per_partition = {}
         for r in rs:
             per_partition[r] = MinHashLSH(
                 num_perm=self.h,
                 params=(int(self.h / r), r),
                 storage_config=self._get_storage_config(
                     basename, storage_config, partition, r),
                 prepickle=prepickle)
         self.indexes.append(per_partition)
     self.lowers = [None] * len(self.indexes)
     self.uppers = [None] * len(self.indexes)
Exemplo n.º 19
0
    def test_insert(self):
        """Weighted MinHash insertion populates every hash table."""
        lsh = MinHashLSH(threshold=0.5, num_perm=4)
        generator = WeightedMinHashGenerator(10, 4)
        lsh.insert("a", generator.minhash(np.random.uniform(1, 10, 10)))
        lsh.insert("b", generator.minhash(np.random.uniform(1, 10, 10)))
        for table in lsh.hashtables:
            self.assertTrue(len(table) >= 1)
            contents = []
            for bucket in table:
                contents.extend(table[bucket])
            self.assertTrue("a" in contents)
            self.assertTrue("b" in contents)
        self.assertTrue("a" in lsh)
        self.assertTrue("b" in lsh)
        for i, bucket in enumerate(lsh.keys["a"]):
            self.assertTrue("a" in lsh.hashtables[i][bucket])

        # A hash produced with a different sample size must be rejected.
        incompatible = WeightedMinHashGenerator(10, 5)
        self.assertRaises(ValueError, lsh.insert, "c",
                          incompatible.minhash(np.random.uniform(1, 10, 10)))
Exemplo n.º 20
0
    def test_insert(self):
        """Inserted keys appear in every hash table and in the key map."""
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        for name in ("a", "b"):
            mh = MinHash(16)
            mh.update(name.encode("utf8"))
            lsh.insert(name, mh)
        for table in lsh.hashtables:
            self.assertTrue(len(table) >= 1)
            contents = []
            for bucket in table:
                contents.extend(table[bucket])
            self.assertTrue("a" in contents)
            self.assertTrue("b" in contents)
        self.assertTrue("a" in lsh)
        self.assertTrue("b" in lsh)
        for i, bucket in enumerate(lsh.keys["a"]):
            self.assertTrue("a" in lsh.hashtables[i][bucket])

        # A MinHash with a different permutation count is rejected.
        self.assertRaises(ValueError, lsh.insert, "c", MinHash(18))
Exemplo n.º 21
0
    def test_query_redis(self):
        """Queries work through the Redis storage layer (redis.Redis patched)."""
        with patch('redis.Redis', fake_redis) as mock_redis:
            config = {
                'type': 'redis',
                'redis': {
                    'host': 'localhost',
                    'port': 6379
                }
            }
            lsh = MinHashLSH(threshold=0.5, num_perm=16,
                             storage_config=config)
            hashes = {}
            for name in ("a", "b"):
                mh = MinHash(16)
                mh.update(name.encode("utf8"))
                hashes[name] = mh
                lsh.insert(name, mh)
            for name, mh in hashes.items():
                self.assertTrue(name in lsh.query(mh))

            # Mismatched permutation count raises ValueError.
            self.assertRaises(ValueError, lsh.query, MinHash(18))
Exemplo n.º 22
0
import pandas as pd
from datasketch.minhash import MinHash
from datasketch.lsh import MinHashLSH
from preprocess import tokenize_sentence
"""
To find similar questions efficiently we use Jaccard similarity and MinHash.
Questions with similar MinHash signatures are candidates to be similar.
To check whether two candidate sentences are similar we use Jaccard similarity.
"""

# Load the preprocessed question pairs and build an LSH index over the
# question1 column, keyed by row position.
df = pd.read_csv("proccessed.csv")
total_questions = df.shape[0]
threshold_jacard = 0.30
lsh = MinHashLSH(threshold=threshold_jacard)

# Calculate a MinHash for each sentence in column question1 and insert it
# into the LSH index under its row index (stringified, since keys must be
# hashable identifiers).
for index, row in df.iterrows():
    min_Hash = MinHash()
    question = tokenize_sentence(str(row['question1']))
    for word in question:
        min_Hash.update(word.encode('utf8'))
    lsh.insert(str(index), min_Hash)

# Counters for the evaluation loop below.
total = 0
return_result = 0
correct = 0
total_correct = 0
# For each sentence in column question2, find similar questions.
for i in range(0, total_questions):
    question_minHash = MinHash()
    question = tokenize_sentence(str(df['question2'][i]))