def run(self):
    """Entry point for the SimilarityScorer.

    Returns:
        A dict with metadata about the processed data set or None if no
        data_type has been configured.
    """
    # Exit early if there is no data_type to process.
    if not self._config:
        return

    # Event generator for streaming results.
    events = self.event_stream(
        query_string=self._config.query,
        return_fields=[self._config.field])

    lsh, minhashes = similarity.new_lsh_index(
        events, field=self._config.field,
        delimiters=self._config.delimiters, num_perm=self._config.num_perm,
        threshold=self._config.threshold)
    total_num_events = len(minhashes)

    for key, minhash in minhashes.items():
        event_id, event_type, index_name = key
        event_dict = dict(_id=event_id, _type=event_type, _index=index_name)
        event = interface.Event(event_dict, self.datastore)
        score = similarity.calculate_score(lsh, minhash, total_num_events)
        attributes_to_add = {'similarity_score': score}
        event.add_attributes(attributes_to_add)

    return dict(
        index=self._config.index_name,
        data_type=self._config.data_type,
        num_events_processed=total_num_events)
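# A minimal, hedged sketch of the MinHash/LSH indexing step that
# similarity.new_lsh_index is assumed to perform, written against the
# datasketch library. The tokenization, the example texts and the helper
# name build_lsh_index are illustrative assumptions, not the analyzer's
# actual implementation.
import re

from datasketch import MinHash, MinHashLSH


def build_lsh_index(texts, delimiters=None, num_perm=128, threshold=0.5):
    """Build an LSH index and one MinHash per input text."""
    delimiters = delimiters or [' ', ',']
    splitter = '|'.join(re.escape(d) for d in delimiters)
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    minhashes = {}
    for key, text in enumerate(texts):
        minhash = MinHash(num_perm=num_perm)
        for token in re.split(splitter, text):
            minhash.update(token.encode('utf8'))
        lsh.insert(key, minhash)
        minhashes[key] = minhash
    return lsh, minhashes


# The two example texts share most tokens, so the LSH query is expected to
# return both keys at the configured threshold.
lsh, minhashes = build_lsh_index(
    ['user logged in from 10.0.0.1', 'user logged in from 10.0.0.2'])
print(lsh.query(minhashes[0]))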
def run(self):
    """Entry point for the SimilarityScorer.

    Returns:
        A message string summarizing the processed data set, or a notice
        that no data_type has been configured.
    """
    if not self._config:
        return 'No data_type specified.'

    # Event generator for streaming results.
    events = self.event_stream(
        query_string=self._config.query,
        return_fields=[self._config.field])

    lsh, minhashes = similarity.new_lsh_index(
        events, field=self._config.field,
        delimiters=self._config.delimiters, num_perm=self._config.num_perm,
        threshold=self._config.threshold)
    total_num_events = len(minhashes)

    for key, minhash in minhashes.items():
        event_id, event_type, index_name = key
        event_dict = dict(_id=event_id, _type=event_type, _index=index_name)
        event = interface.Event(event_dict, self.datastore)
        score = similarity.calculate_score(lsh, minhash, total_num_events)
        attributes_to_add = {'similarity_score': score}
        event.add_attributes(attributes_to_add)
        # Commit the event to the datastore.
        event.commit()

    msg = 'Similarity scorer processed {0:d} events for data_type {1:s}'
    return msg.format(total_num_events, self._config.data_type)
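# A hedged sketch of the scoring idea behind similarity.calculate_score:
# the score is taken here as the share of all events that the LSH index
# reports as near-duplicates of the given MinHash. This is an assumed
# formula for illustration, not necessarily the module's exact one.
def calculate_score(lsh, minhash, total_num_events):
    """Return the fraction of events the LSH index considers similar."""
    if not total_num_events:
        return 0.0
    neighbours = lsh.query(minhash)
    return float(len(neighbours)) / total_num_events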
def run(self):
    """Entry point for the SimilarityScorer.

    Returns:
        A message string summarizing the processed data set, or None if no
        data_type has been configured.
    """
    # Exit early if there is no data_type to process.
    if not self._config:
        return None

    # Event generator for streaming results.
    events = self.event_stream(
        query_string=self._config.query,
        return_fields=[self._config.field]
    )

    lsh, minhashes = similarity.new_lsh_index(
        events, field=self._config.field,
        delimiters=self._config.delimiters, num_perm=self._config.num_perm,
        threshold=self._config.threshold)
    total_num_events = len(minhashes)

    for key, minhash in minhashes.items():
        event_id, event_type, index_name = key
        event_dict = dict(_id=event_id, _type=event_type, _index=index_name)
        event = interface.Event(event_dict, self.datastore)
        score = similarity.calculate_score(lsh, minhash, total_num_events)
        attributes_to_add = {'similarity_score': score}
        event.add_attributes(attributes_to_add)
        # Commit the event to the datastore.
        event.commit()

    msg = 'Similarity scorer processed {0:d} events for data_type {1:s}'
    return msg.format(total_num_events, self._config.data_type)
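# End-to-end illustration tying the sketches above together: index a few
# example messages with build_lsh_index, score each one with the assumed
# calculate_score, and print a summary line in the same style as run().
# The message texts and the data_type label are made-up examples.
lsh, minhashes = build_lsh_index(
    ['ssh login for root', 'ssh login for admin', 'kernel panic'],
    delimiters=[' '], num_perm=128, threshold=0.5)
total_num_events = len(minhashes)
for key, minhash in minhashes.items():
    score = calculate_score(lsh, minhash, total_num_events)
    print('event {0:d}: similarity_score={1:.2f}'.format(key, score))

msg = 'Similarity scorer processed {0:d} events for data_type {1:s}'
print(msg.format(total_num_events, 'example:ssh:login'))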