Example #1
class EsDataClassification(object):

    def __init__(self, es_index, es_mapping, field, query):
        # Dataset info
        self.es_index = es_index
        self.es_mapping = es_mapping
        self.field = field
        # Build ES manager
        self.es_m = ES_Manager(es_index, es_mapping)
        self.es_m.load_combined_query(query)

    def get_total_documents(self):
        return self.es_m.get_total_documents()

    def get_tags_by_id(self, doc_id):
        request_url = '{0}/{1}/{2}/{3}'.format(self.es_m.es_url, self.es_index, self.es_mapping, doc_id)
        response = ES_Manager.plain_get(request_url)
        if 'texta_tags' in response['_source']:
            tags = response['_source']['texta_tags']
        else:
            tags = ""
        return tags.split()

    def apply_classifiers(self, classifiers, classifier_tags):
        if not isinstance(classifiers, list):
            classifiers = [classifiers]

        if not isinstance(classifier_tags, list):
            classifier_tags = [classifier_tags]

        response = self.es_m.scroll()
        scroll_id = response['_scroll_id']
        total_hits = response['hits']['total']
        total_processed = 0
        positive_docs_batch = []
        # Per-classifier positive counts, accumulated across batches
        positive_docs_per_classifier = [0] * len(classifiers)
        batch_size = 1000

        # Get all positive documents
        while total_hits > 0:

            # Check errors in the database request
            if (response['_shards']['total'] > 0 and response['_shards']['successful'] == 0) or response['timed_out']:
                msg = 'Elasticsearch failed to retrieve documents: ' \
                      '*** Shards: {0} *** Timeout: {1} *** Took: {2}'.format(response['_shards'],
                                                                              response['timed_out'], response['took'])
                raise EsIteratorError(msg)

            for hit in response['hits']['hits']:
                positive_docs_batch.append((str(hit['_id']), hit['_source']))

                if len(positive_docs_batch) >= batch_size:
                    batch_positives = self._apply_classifiers_to_documents(positive_docs_batch, classifiers, classifier_tags)
                    positive_docs_per_classifier = [total + count for total, count in zip(positive_docs_per_classifier, batch_positives)]
                    # Count the batch before clearing it
                    total_processed += len(positive_docs_batch)
                    positive_docs_batch = []

            # New scroll request
            response = self.es_m.scroll(scroll_id=scroll_id)
            total_hits = len(response['hits']['hits'])

        if positive_docs_batch:
            batch_positives = self._apply_classifiers_to_documents(positive_docs_batch, classifiers, classifier_tags)
            positive_docs_per_classifier = [total + count for total, count in zip(positive_docs_per_classifier, batch_positives)]
            total_processed += len(positive_docs_batch)

        data = {}
        data['total_processed'] = total_processed
        data['total_positive'] = positive_docs_per_classifier[0] if len(classifiers) == 1 else positive_docs_per_classifier
        if len(classifiers) == 1:
            data['total_negative'] = total_processed - positive_docs_per_classifier[0]
        else:
            data['total_negative'] = [
                total_processed - positive_docs_count for positive_docs_count in positive_docs_per_classifier
            ]
        data['total_documents'] = self.get_total_documents()

        return data

    def _apply_classifiers_to_documents(self, documents, classifiers, classifier_tags):
        """
        :param documents: list of (doc_id, document) entries
        :return: list with the number of positively classified documents per classifier
        """
        field_path_components = self.field.split('.')
        fields_data = []

        for document in documents:
            # Traverse the nested fields to reach the sought input text/data for the classifier
            field_data = document[1]
            for field_path_component in field_path_components:
                field_data = field_data[field_path_component]
            fields_data.append(field_data)

        positive_docs = []
        classifiers_predictions = []

        for classifier in classifiers:
            predictions = classifier.predict(fields_data)
            classifiers_predictions.append(predictions)
            positive_docs.append(sum(predictions))

        bulk_update_content = []
        for document_idx, document in enumerate(documents):
            document_id, document = document
            if 'texta_tags' in document:
                tags = set([tag.strip() for tag in document['texta_tags'].split('\n')])
            else:
                tags = set()

            new_tags = False
            for classifier_idx, classifier_predictions in enumerate(classifiers_predictions):
                if classifier_predictions[document_idx] == 1:
                    tag_count_before = len(tags)
                    tags.add(classifier_tags[classifier_idx])
                    new_tags = new_tags or len(tags) > tag_count_before

            if new_tags:
                bulk_update_content.append(json.dumps({
                    'update': {
                        '_id':    document_id,
                        '_index': self.es_index,
                        '_type':  self.es_mapping
                    }
                }))
                bulk_update_content.append(json.dumps({
                    'doc': {
                        'texta_tags': '\n'.join(sorted(tags))
                    }
                }))

        bulk_update_content.append('')
        bulk_update_content = '\n'.join(bulk_update_content)

        self.es_m.plain_post_bulk(self.es_m.es_url, bulk_update_content)

        return positive_docs
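
A minimal usage sketch (not part of the original example): it assumes the project's ES_Manager is reachable, that the combined query has the usual {"main": {...}} shape, and that each classifier exposes a scikit-learn-style predict(list_of_texts) interface returning 0/1 per text, which is all apply_classifiers relies on. The index, mapping, field path and keyword below are hypothetical stand-ins.

class KeywordClassifier(object):
    """Hypothetical stand-in; any object whose predict(texts) returns a 0/1 per text works."""

    def __init__(self, keyword):
        self.keyword = keyword

    def predict(self, texts):
        # 1 marks a positive (to-be-tagged) document, 0 a negative one
        return [1 if self.keyword in text.lower() else 0 for text in texts]


query = {"main": {"query": {"match_all": {}}}}            # assumed combined-query shape
es_classification = EsDataClassification('articles',      # hypothetical index
                                          'article',      # hypothetical mapping
                                          'text.lemmas',  # dotted path to the classified field
                                          query)
result = es_classification.apply_classifiers(KeywordClassifier('court'), 'court_case')
print(result['total_processed'], result['total_positive'], result['total_negative'])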
Example #2
class FactManager:
    """ Manage Searcher facts, like deleting/storing, adding facts.
    """
    def __init__(self, request):
        self.es_params = request.POST
        self.ds = Datasets().activate_dataset(request.session)
        self.index = self.ds.get_index()
        self.mapping = self.ds.get_mapping()
        self.es_m = ES_Manager(self.index, self.mapping)
        self.field = 'texta_facts'

    def remove_facts_from_document(self, rm_facts_dict, bs=7500):
        '''Remove the given facts from all matching documents; rm_facts_dict maps fact names to the str_val values to delete.'''
        logger = LogManager(__name__, 'FACT MANAGER REMOVE FACTS')

        try:
            # Clears readonly block just in case the index has been set to read only
            self.es_m.clear_readonly_block()

            query = self._fact_deletion_query(rm_facts_dict)
            self.es_m.load_combined_query(query)
            response = self.es_m.scroll(size=bs, field_scroll=self.field)
            scroll_id = response['_scroll_id']
            total_docs = response['hits']['total']
            docs_left = total_docs  # DEBUG
            print('Starting.. Total docs - ', total_docs)  # DEBUG
            batch = 0
            while total_docs > 0:
                print('Docs left:', docs_left)  # DEBUG
                data = ''
                for document in response['hits']['hits']:
                    new_field = []  # The new facts field
                    for fact in document['_source'][self.field]:
                        # If the fact name is in rm_facts_dict keys
                        if fact["fact"] in rm_facts_dict:
                            # If the fact value is not in the delete key values
                            if fact['str_val'] not in rm_facts_dict.getlist(
                                    fact["fact"]):
                                new_field.append(fact)
                        else:
                            new_field.append(fact)
                    # Update dataset
                    data += json.dumps({
                        "update": {
                            "_id": document['_id'],
                            "_type": document['_type'],
                            "_index": document['_index']
                        }
                    }) + '\n'
                    document = {'doc': {self.field: new_field}}
                    data += json.dumps(document) + '\n'
                response = self.es_m.scroll(scroll_id=scroll_id,
                                            size=bs,
                                            field_scroll=self.field)
                total_docs = len(response['hits']['hits'])
                docs_left -= bs  # DEBUG
                scroll_id = response['_scroll_id']
                self.es_m.plain_post_bulk(self.es_m.es_url, data)
                batch += 1
            print('DONE')  # DEBUG

            logger.set_context('docs_left', total_docs)
            logger.set_context('batch', batch)
            logger.info('remove_facts_from_document')
        except Exception:
            print(traceback.format_exc())
            logger.set_context('es_params', self.es_params)
            logger.exception('remove_facts_from_document_failed')

    def tag_documents_with_fact(self, es_params, tag_name, tag_value,
                                tag_field):
        '''Used to tag all documents in the current search with a certain fact'''

        self.es_m.build(es_params)
        self.es_m.load_combined_query(self.es_m.combined_query)

        response = self.es_m.scroll()

        data = ''
        for document in response['hits']['hits']:
            if 'mlp' in tag_field:
                split_field = tag_field.split('.')
                span = [
                    0,
                    len(document['_source'][split_field[0]][split_field[1]])
                ]
            else:
                span = [0, len(document['_source'][tag_field].strip())]
            document['_source'][self.field].append({
                "str_val": tag_value,
                "spans": str([span]),
                "fact": tag_name,
                "doc_path": tag_field
            })

            data += json.dumps({
                "update": {
                    "_id": document['_id'],
                    "_type": document['_type'],
                    "_index": document['_index']
                }
            }) + '\n'
            document = {'doc': {self.field: document['_source'][self.field]}}
            data += json.dumps(document) + '\n'
        self.es_m.plain_post_bulk(self.es_m.es_url, data)
        response = requests.post(
            '{0}/{1}/_update_by_query?refresh&conflicts=proceed'.format(
                self.es_m.es_url, self.index),
            headers=self.es_m.HEADERS)

    def count_cooccurrences(self, fact_pairs):
        """Finds the counts of cooccuring facts

        Arguments:
            fact_pairs {list of tuples of tuples} -- Example:[(('ORG', 'Riigikohus'),('PER', 'Jaan')), (('ORG', 'Riigikohus'),('PER', 'Peeter'))]

        Returns:
            [int list] -- Occurances of the given facts
        """
        queries = []
        for fact_pair in fact_pairs:
            fact_constraints = []

            for fact in fact_pair:
                constraint = {
                    "nested": {
                        "path": "texta_facts",
                        "query": {
                            "bool": {
                                "must": [{
                                    "term": {
                                        "texta_facts.fact": fact[0]
                                    }
                                }, {
                                    "term": {
                                        "texta_facts.str_val": fact[1]
                                    }
                                }]
                            }
                        }
                    }
                }
                fact_constraints.append(constraint)

            query = {"query": {"bool": {"must": fact_constraints}}, "size": 0}
            queries.append(json.dumps(query))

        header = json.dumps({"index": self.index})
        data = "\n".join(["{0}\n{1}".format(header, q)
                          for q in queries]) + "\n"

        responses = requests.post("{0}/{1}/_msearch".format(
            self.es_m.es_url, self.index),
                                  data=data,
                                  headers={"Content-Type": "application/json"})
        counts = [
            response["hits"]["total"]
            for response in responses.json()['responses']
        ]

        return counts
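
    # For reference (illustrative, not captured output): the _msearch payload built
    # above is newline-delimited JSON, one header line plus one query line per fact
    # pair, roughly:
    #   {"index": "<index name>"}
    #   {"query": {"bool": {"must": [<nested constraint for fact 1>, <nested constraint for fact 2>]}}, "size": 0}
    # "size": 0 keeps the responses small because only hits.total is read back.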

    def facts_via_aggregation(self, size=15):
        """Finds all facts from current search.
        Parameters:
            size - [int=15] -- Amount of fact values per fact name to search in query
        Returns:
            facts - [dict] -- Details for each fact, ex: {'PER - kostja': {'id': 0, 'name': 'PER', 'value': 'kostja', 'doc_count': 44}}
            fact_combinations - [list of tuples] -- All possible combinations of all facts: [(('FIRST_FACTNAME', 'FIRST_FACTVAL'), ('SECOND_FACTNAME', 'SECOND_FACTVAL'))]
            unique_fact_names - [list of string] -- All unique fact names
        """

        aggs = {
            "facts": {
                "nested": {
                    "path": "texta_facts"
                },
                "aggs": {
                    "fact_names": {
                        "terms": {
                            "field": "texta_facts.fact"
                        },
                        "aggs": {
                            "fact_values": {
                                "terms": {
                                    "field": "texta_facts.str_val",
                                    "size": size
                                }
                            }
                        }
                    }
                }
            }
        }
        self.es_m.build(self.es_params)
        self.es_m.set_query_parameter('aggs', aggs)

        response = self.es_m.search()

        response_aggs = response['aggregations']['facts']['fact_names'][
            'buckets']

        facts = {}
        fact_combinations = []
        fact_count = 0
        unique_fact_names = []
        for bucket in response_aggs:
            unique_fact_names.append(bucket['key'])
            for fact in bucket['fact_values']['buckets']:
                facts[bucket['key'] + " - " + fact['key']] = {
                    'id': fact_count,
                    'name': bucket['key'],
                    'value': fact['key'],
                    'doc_count': fact['doc_count']
                }
                fact_combinations.append((bucket['key'], fact['key']))
                fact_count += 1

        fact_combinations = list(itertools.combinations(fact_combinations, 2))
        return (facts, fact_combinations, unique_fact_names)

    def fact_graph(self, search_size):
        facts, fact_combinations, unique_fact_names = self.facts_via_aggregation(
            size=search_size)
        # Get cooccurrences and remove values with 0
        fact_combinations = {
            k: v
            for k, v in dict(
                zip(fact_combinations,
                    self.count_cooccurrences(fact_combinations))).items()
            if v != 0
        }
        shapes = [
            "circle", "cross", "diamond", "square", "triangle-down",
            "triangle-up"
        ]
        types = dict(zip(unique_fact_names, itertools.cycle(shapes)))

        nodes = []
        # Defaults so the graph still returns sizes when there are no facts
        max_node_size = 0
        min_node_size = 0
        for i, fact in enumerate(facts):
            nodes.append({
                "source": facts[fact]['id'],
                "size": facts[fact]['doc_count'],
                "score": facts[fact]['doc_count'],
                "name": facts[fact]['name'],
                "id": facts[fact]['value'],
                "type": types[facts[fact]['name']]
            })
            # Track max/min count
            count = facts[fact]['doc_count']
            if i == 0:
                max_node_size = count
                min_node_size = count
            max_node_size = max(max_node_size, count)
            min_node_size = min(min_node_size, count)

        links = []
        max_link_size = 0
        for fact in fact_combinations.keys():
            max_link_size = max(max_link_size, fact_combinations[fact])
            links.append({
                "source": facts[fact[0][0] + " - " + fact[0][1]]['id'],
                "target": facts[fact[1][0] + " - " + fact[1][1]]['id'],
                "count": fact_combinations[fact]
            })

        graph_data = json.dumps({"nodes": nodes, "links": links})
        return (graph_data, unique_fact_names, max_node_size, max_link_size,
                min_node_size)

    def _fact_deletion_query(self, rm_facts_dict):
        '''Create the fact deletion query from a dict of facts {fact_name: [values]}.'''
        fact_queries = []
        for key in rm_facts_dict:
            for val in rm_facts_dict.getlist(key):
                fact_queries.append({
                    "bool": {
                        "must": [{
                            "match": {
                                self.field + ".fact": key
                            }
                        }, {
                            "match": {
                                self.field + ".str_val": val
                            }
                        }]
                    }
                })

        query = {
            "main": {
                "query": {
                    "nested": {
                        "path": self.field,
                        "query": {
                            "bool": {
                                "should": fact_queries
                            }
                        }
                    }
                },
                "_source": [self.field]
            }
        }

        return query
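
A short, self-contained sketch (example values assumed, not taken from the project) of the bulk NDJSON payload that remove_facts_from_document and tag_documents_with_fact assemble before calling plain_post_bulk: one update action line per document, followed by a partial 'doc' line, terminated by a trailing newline.

import json

# Hypothetical document id, mapping and index; the fact entry mirrors the
# structure appended in tag_documents_with_fact.
action = {"update": {"_id": "42", "_type": "article", "_index": "articles"}}
partial_doc = {"doc": {"texta_facts": [
    {"str_val": "Jaan", "spans": "[[0, 4]]", "fact": "PER", "doc_path": "text"}
]}}
bulk_body = json.dumps(action) + '\n' + json.dumps(partial_doc) + '\n'
print(bulk_body)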