def _new_lsh_index(self):
    """Build a MinHash LSH index from events streamed out of the datastore.

    The events are fetched with the query, index and field taken from
    self._config, and each event's field text is hashed with
    self._minhash_from_text before being inserted into the index.

    Returns:
        A tuple (lsh, minhashes): the populated MinHashLSH instance and a
        dictionary mapping (event_id, event_type, index_name) tuples to the
        minhash computed for that event.
    """
    signatures = {}
    index = MinHashLSH(self._config.threshold, self._config.num_perm)

    # search_stream yields results lazily, so events are hashed as they
    # arrive.
    hits = self._datastore.search_stream(
        query_string=self._config.query,
        query_filter={},
        indices=[self._config.index],
        return_fields=[self._config.field])

    with index.insertion_session() as session:
        for hit in hits:
            doc_id = hit['_id']
            doc_index = hit['_index']
            doc_type = hit['_type']
            text = hit['_source'][self._config.field]

            # The same tuple identifies the event in the LSH index and in
            # the returned dictionary.
            event_key = (doc_id, doc_type, doc_index)
            signature = self._minhash_from_text(text)
            signatures[event_key] = signature
            session.insert(event_key, signature)

    return index, signatures
def new_lsh_index(events, field, delimiters=None, num_perm=None,
                  threshold=None):
    """Build a MinHash LSH index over one field of Timesketch events.

    Args:
        events: list or an iterator of Event objects.
        field: string naming the event field whose text is hashed.
        delimiters: list of strings used as delimiters for splitting text
            into words. Defaults to DEFAULT_DELIMITERS when None.
        num_perm: number of random permutation functions used by MinHash
            to be indexed. Defaults to DEFAULT_PERMUTATIONS when None.
        threshold: a float for the Jaccard similarity threshold between
            0.0 and 1.0. The initialized MinHash LSH will be optimized for
            the threshold by minimizing the false positives and false
            negatives. Defaults to DEFAULT_THRESHOLD when None.

    Returns:
        A tuple (lsh, minhashes): the populated MinHashLSH instance and a
        dictionary mapping (event_id, event_type, index_name) tuples to the
        minhash computed for that event.
    """
    delimiters = DEFAULT_DELIMITERS if delimiters is None else delimiters
    num_perm = DEFAULT_PERMUTATIONS if num_perm is None else num_perm
    threshold = DEFAULT_THRESHOLD if threshold is None else threshold

    signatures = {}
    index = MinHashLSH(threshold, num_perm)

    with index.insertion_session() as session:
        for item in events:
            # The same tuple identifies the event in the LSH index and in
            # the returned dictionary.
            event_key = (item.event_id, item.event_type, item.index_name)
            text = item.source[field]
            signature = minhash_from_text(text, num_perm, delimiters)
            signatures[event_key] = signature
            session.insert(event_key, signature)

    return index, signatures
def test_insertion_session(self):
    # Inserts two single-token MinHashes through an insertion session and
    # checks that both keys are present in the index afterwards.
    lsh = MinHashLSH(threshold=0.5, num_perm=16)

    entries = []
    for label in ("a", "b"):
        signature = MinHash(16)
        signature.update(label.encode("utf8"))
        entries.append((label, signature))

    with lsh.insertion_session() as session:
        for key, minhash in entries:
            session.insert(key, minhash)

    # Every hashtable must be non-empty and hold both keys somewhere
    # among its buckets.
    for table in lsh.hashtables:
        self.assertTrue(len(table) >= 1)
        stored = [member for bucket in table for member in table[bucket]]
        self.assertTrue("a" in stored)
        self.assertTrue("b" in stored)

    self.assertTrue("a" in lsh)
    self.assertTrue("b" in lsh)

    # The i-th hash value recorded for "a" must locate it in the i-th
    # hashtable.
    for position, bucket in enumerate(lsh.keys["a"]):
        self.assertTrue("a" in lsh.hashtables[position][bucket])
def new_lsh_index(events, field, delimiters=None, num_perm=None,
                  threshold=None):
    """Create a MinHash LSH index from the given field of Timesketch events.

    Args:
        events: list or an iterator of Event objects.
        field: string denoting the event field to use for the LSH.
        delimiters: list of strings used as delimiters for splitting text
            into words. DEFAULT_DELIMITERS is used when None.
        num_perm: number of random permutation functions used by MinHash
            to be indexed. DEFAULT_PERMUTATIONS is used when None.
        threshold: a float for the Jaccard similarity threshold between
            0.0 and 1.0. The initialized MinHash LSH will be optimized for
            the threshold by minimizing the false positives and false
            negatives. DEFAULT_THRESHOLD is used when None.

    Returns:
        A tuple (lsh, minhashes): the populated MinHashLSH instance and a
        dictionary keyed by (event_id, event_type, index_name) tuples whose
        values are the corresponding minhashes.
    """
    if delimiters is None:
        delimiters = DEFAULT_DELIMITERS
    if num_perm is None:
        num_perm = DEFAULT_PERMUTATIONS
    if threshold is None:
        threshold = DEFAULT_THRESHOLD

    hashes_by_key = {}
    lsh_index = MinHashLSH(threshold, num_perm)

    with lsh_index.insertion_session() as writer:
        for entry in events:
            # One tuple serves as the key in both the LSH index and the
            # returned dictionary.
            entry_key = (entry.event_id, entry.event_type, entry.index_name)
            entry_hash = minhash_from_text(
                entry.source[field], num_perm, delimiters)
            hashes_by_key[entry_key] = entry_hash
            writer.insert(entry_key, entry_hash)

    return lsh_index, hashes_by_key