Exemplo n.º 1
0
    def run(self):
        """Score every streamed event and attach the score as an attribute.

        The configured query is streamed through self.event_stream, the
        results are handed to similarity.new_lsh_index, and each returned
        minhash is scored with similarity.calculate_score. The score is added
        to the matching event under the 'similarity_score' attribute.

        Returns:
            None when no configuration is set; otherwise a dict with the keys
            index, data_type and num_events_processed.
        """
        config = self._config

        # Without a configuration there is no data_type to work on.
        if not config:
            return

        # Only the configured field is requested from the event stream.
        events = self.event_stream(
            query_string=config.query,
            return_fields=[config.field],
        )

        lsh, minhashes = similarity.new_lsh_index(
            events,
            field=config.field,
            delimiters=config.delimiters,
            num_perm=config.num_perm,
            threshold=config.threshold,
        )
        total_num_events = len(minhashes)

        # Each key is an (event id, event type, index name) triple.
        for (event_id, event_type, index_name), minhash in minhashes.items():
            event = interface.Event(
                {'_id': event_id, '_type': event_type, '_index': index_name},
                self.datastore,
            )
            score = similarity.calculate_score(lsh, minhash, total_num_events)
            event.add_attributes({'similarity_score': score})

        return {
            'index': config.index_name,
            'data_type': config.data_type,
            'num_events_processed': total_num_events,
        }
Exemplo n.º 2
0
    def run(self):
        """Score every streamed event, store the score and commit the event.

        The configured query is streamed through self.event_stream, the
        results are handed to similarity.new_lsh_index, and each returned
        minhash is scored with similarity.calculate_score. The score is added
        to the matching event under the 'similarity_score' attribute and the
        event is committed.

        Returns:
            A string: 'No data_type specified.' when no configuration is set,
            otherwise a summary naming the number of processed events and the
            configured data_type.
        """
        config = self._config
        if not config:
            return 'No data_type specified.'

        # Only the configured field is requested from the event stream.
        events = self.event_stream(
            query_string=config.query, return_fields=[config.field])

        index_options = {
            'field': config.field,
            'delimiters': config.delimiters,
            'num_perm': config.num_perm,
            'threshold': config.threshold,
        }
        lsh, minhashes = similarity.new_lsh_index(events, **index_options)
        total_num_events = len(minhashes)

        for key, minhash in minhashes.items():
            # Each key is an (event id, event type, index name) triple.
            event_id, event_type, index_name = key
            event = interface.Event(
                {'_id': event_id, '_type': event_type, '_index': index_name},
                self.datastore)
            score = similarity.calculate_score(lsh, minhash, total_num_events)
            event.add_attributes({'similarity_score': score})
            # Commit the event to the datastore.
            event.commit()

        return (
            'Similarity scorer processed {0:d} events for data_type {1:s}'
        ).format(total_num_events, config.data_type)
Exemplo n.º 3
0
    def run(self):
        """Score every streamed event, store the score and commit the event.

        The configured query is streamed through self.event_stream, the
        results are handed to similarity.new_lsh_index, and each returned
        minhash is scored with similarity.calculate_score. The score is added
        to the matching event under the 'similarity_score' attribute and the
        event is committed.

        Returns:
            None when no configuration is set; otherwise a string summarising
            the number of processed events and the configured data_type.
        """
        settings = self._config

        # Exit early if there is no data_type to process.
        if not settings:
            return None

        # Only the configured field is requested from the event stream.
        query = settings.query
        wanted_fields = [settings.field]
        events = self.event_stream(
            query_string=query, return_fields=wanted_fields)

        lsh, minhashes = similarity.new_lsh_index(
            events,
            field=settings.field,
            delimiters=settings.delimiters,
            num_perm=settings.num_perm,
            threshold=settings.threshold,
        )
        total_num_events = len(minhashes)

        # Each key is an (event id, event type, index name) triple.
        for (event_id, event_type, index_name), minhash in minhashes.items():
            event_dict = {
                '_id': event_id,
                '_type': event_type,
                '_index': index_name,
            }
            event = interface.Event(event_dict, self.datastore)
            score = similarity.calculate_score(lsh, minhash, total_num_events)
            event.add_attributes({'similarity_score': score})
            # Commit the event to the datastore.
            event.commit()

        template = 'Similarity scorer processed {0:d} events for data_type {1:s}'
        return template.format(total_num_events, settings.data_type)