import itertools
import json
import traceback

import requests

# Project-local dependencies used below; the module paths are assumptions and may
# need adjusting to this package's actual layout. EsIteratorError (raised on failed
# scroll requests) should likewise be imported from wherever the project defines it.
from utils.datasets import Datasets
from utils.es_manager import ES_Manager
from utils.log_manager import LogManager


class EsDataClassification(object):

    def __init__(self, es_index, es_mapping, field, query):
        # Dataset info
        self.es_index = es_index
        self.es_mapping = es_mapping
        self.field = field
        # Build ES manager
        self.es_m = ES_Manager(es_index, es_mapping)
        self.es_m.load_combined_query(query)

    def get_total_documents(self):
        return self.es_m.get_total_documents()

    def get_tags_by_id(self, doc_id):
        request_url = '{0}/{1}/{2}/{3}'.format(self.es_m.es_url, self.es_index, self.es_mapping, doc_id)
        response = ES_Manager.plain_get(request_url)

        if 'texta_tags' in response['_source']:
            tags = response['_source']['texta_tags']
        else:
            tags = ""

        return tags.split()

    def apply_classifiers(self, classifiers, classifier_tags):
        if not isinstance(classifiers, list):
            classifiers = [classifiers]
        if not isinstance(classifier_tags, list):
            classifier_tags = [classifier_tags]

        response = self.es_m.scroll()
        scroll_id = response['_scroll_id']
        total_hits = response['hits']['total']

        total_processed = 0
        positive_docs_per_classifier = [0] * len(classifiers)
        positive_docs_batch = []
        batch_size = 1000

        # Get all positive documents
        while total_hits > 0:
            # Check errors in the database request
            if (response['_shards']['total'] > 0 and response['_shards']['successful'] == 0) or response['timed_out']:
                msg = 'Elasticsearch failed to retrieve documents: ' \
                      '*** Shards: {0} *** Timeout: {1} *** Took: {2}'.format(
                          response['_shards'], response['timed_out'], response['took'])
                raise EsIteratorError(msg)

            for hit in response['hits']['hits']:
                positive_docs_batch.append((str(hit['_id']), hit['_source']))

                if len(positive_docs_batch) >= batch_size:
                    batch_positives = self._apply_classifiers_to_documents(positive_docs_batch, classifiers, classifier_tags)
                    # Accumulate per-classifier positives and count the batch before clearing it
                    positive_docs_per_classifier = [total + batch for total, batch in zip(positive_docs_per_classifier, batch_positives)]
                    total_processed += len(positive_docs_batch)
                    positive_docs_batch = []

            # New scroll request
            response = self.es_m.scroll(scroll_id=scroll_id)
            total_hits = len(response['hits']['hits'])

        if positive_docs_batch:
            batch_positives = self._apply_classifiers_to_documents(positive_docs_batch, classifiers, classifier_tags)
            positive_docs_per_classifier = [total + batch for total, batch in zip(positive_docs_per_classifier, batch_positives)]
            total_processed += len(positive_docs_batch)

        data = {}
        data['total_processed'] = total_processed
        data['total_positive'] = positive_docs_per_classifier[0] if len(classifiers) == 1 else positive_docs_per_classifier

        if len(classifiers) == 1:
            data['total_negative'] = total_processed - positive_docs_per_classifier[0]
        else:
            data['total_negative'] = [
                total_processed - positive_docs_count
                for positive_docs_count in positive_docs_per_classifier
            ]

        data['total_documents'] = self.get_total_documents()

        return data

    def _apply_classifiers_to_documents(self, documents, classifiers, classifier_tags):
        """
        :param documents: list of (doc_id, document) entries
        :return: list with the number of positively classified documents per classifier
        """
        field_path_components = self.field.split('.')

        fields_data = []
        for document in documents:
            # Traverse the nested fields to reach the sought input text/data for the classifier
            field_data = document[1]
            for field_path_component in field_path_components:
                field_data = field_data[field_path_component]
            fields_data.append(field_data)

        positive_docs = []
        classifiers_predictions = []

        for classifier in classifiers:
            predictions = classifier.predict(fields_data)
            classifiers_predictions.append(predictions)
            positive_docs.append(sum(predictions))

        bulk_update_content = []

        for document_idx, document in enumerate(documents):
            document_id, document = document

            if 'texta_tags' in document:
                tags = set(tag.strip() for tag in document['texta_tags'].split('\n'))
            else:
                tags = set()

            new_tags = False
            for classifier_idx, classifier_predictions in enumerate(classifiers_predictions):
                if classifier_predictions[document_idx] == 1:
                    tag_count_before = len(tags)
                    tags.add(classifier_tags[classifier_idx])
                    # Keep the flag raised if any classifier added a previously missing tag
                    new_tags = new_tags or len(tags) > tag_count_before

            if new_tags:
                bulk_update_content.append(json.dumps({
                    'update': {
                        '_id': document_id,
                        '_index': self.es_index,
                        '_type': self.es_mapping
                    }
                }))
                bulk_update_content.append(json.dumps({
                    'doc': {
                        'texta_tags': '\n'.join(sorted(tags))
                    }
                }))

        # The bulk API expects newline-delimited JSON with a trailing newline
        bulk_update_content.append('')
        bulk_update_content = '\n'.join(bulk_update_content)

        self.es_m.plain_post_bulk(self.es_m.es_url, bulk_update_content)

        return positive_docs
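# --- Usage sketch (not part of the original module) ---------------------------
# A minimal, hedged example of driving EsDataClassification. The index, mapping,
# field path, query and tag name below are placeholders, and `classifier` stands
# for any object exposing predict(list_of_texts) -> list of 0/1 labels.
def _example_apply_classifiers(classifier):
    """Run a single classifier over every document matched by a match_all query."""
    query = {'main': {'query': {'match_all': {}}}}  # combined-query format, as built elsewhere in this module
    runner = EsDataClassification('my_index', 'my_mapping', 'text.lemmas', query)
    # Returns a dict with total_processed / total_positive / total_negative / total_documents
    return runner.apply_classifiers(classifier, 'example_tag')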
class FactManager:
    """Manage Searcher facts: adding, storing and deleting them."""

    def __init__(self, request):
        self.es_params = request.POST
        self.ds = Datasets().activate_dataset(request.session)
        self.index = self.ds.get_index()
        self.mapping = self.ds.get_mapping()
        self.es_m = ES_Manager(self.index, self.mapping)
        self.field = 'texta_facts'

    def remove_facts_from_document(self, rm_facts_dict, bs=7500):
        '''Remove the given facts from all matching documents, given a dict of {fact name: fact value(s)}'''
        logger = LogManager(__name__, 'FACT MANAGER REMOVE FACTS')

        try:
            # Clears readonly block just in case the index has been set to read only
            self.es_m.clear_readonly_block()

            query = self._fact_deletion_query(rm_facts_dict)
            self.es_m.load_combined_query(query)
            response = self.es_m.scroll(size=bs, field_scroll=self.field)
            scroll_id = response['_scroll_id']
            total_docs = response['hits']['total']
            docs_left = total_docs  # DEBUG
            print('Starting.. Total docs - ', total_docs)  # DEBUG
            batch = 0

            while total_docs > 0:
                print('Docs left:', docs_left)  # DEBUG
                data = ''

                for document in response['hits']['hits']:
                    new_field = []  # The new facts field
                    for fact in document['_source'][self.field]:
                        # If the fact name is in rm_facts_dict keys
                        if fact["fact"] in rm_facts_dict:
                            # Keep the fact only if its value is not among the values marked for deletion
                            if fact['str_val'] not in rm_facts_dict.getlist(fact["fact"]):
                                new_field.append(fact)
                        else:
                            new_field.append(fact)

                    # Update dataset
                    data += json.dumps({"update": {"_id": document['_id'], "_type": document['_type'], "_index": document['_index']}}) + '\n'
                    document = {'doc': {self.field: new_field}}
                    data += json.dumps(document) + '\n'

                response = self.es_m.scroll(scroll_id=scroll_id, size=bs, field_scroll=self.field)
                total_docs = len(response['hits']['hits'])
                docs_left -= bs  # DEBUG
                scroll_id = response['_scroll_id']
                # Post the bulk update for this batch before moving on to the next one
                self.es_m.plain_post_bulk(self.es_m.es_url, data)
                batch += 1

            print('DONE')  # DEBUG

            logger.set_context('docs_left', total_docs)
            logger.set_context('batch', batch)
            logger.info('remove_facts_from_document')
        except:
            print(traceback.format_exc())
            logger.set_context('es_params', self.es_params)
            logger.exception('remove_facts_from_document_failed')

    def tag_documents_with_fact(self, es_params, tag_name, tag_value, tag_field):
        '''Tag all documents in the current search with a certain fact'''
        self.es_m.build(es_params)
        self.es_m.load_combined_query(self.es_m.combined_query)

        response = self.es_m.scroll()

        data = ''
        for document in response['hits']['hits']:
            if 'mlp' in tag_field:
                split_field = tag_field.split('.')
                span = [0, len(document['_source'][split_field[0]][split_field[1]])]
            else:
                span = [0, len(document['_source'][tag_field].strip())]

            document['_source'][self.field].append({
                "str_val": tag_value,
                "spans": str([span]),
                "fact": tag_name,
                "doc_path": tag_field
            })

            data += json.dumps({"update": {"_id": document['_id'], "_type": document['_type'], "_index": document['_index']}}) + '\n'
            document = {'doc': {self.field: document['_source'][self.field]}}
            data += json.dumps(document) + '\n'

        self.es_m.plain_post_bulk(self.es_m.es_url, data)
        response = requests.post(
            '{0}/{1}/_update_by_query?refresh&conflicts=proceed'.format(self.es_m.es_url, self.index),
            headers=self.es_m.HEADERS)

    def count_cooccurrences(self, fact_pairs):
        """Finds the counts of co-occurring facts

        Arguments:
            fact_pairs {list of tuples of tuples} -- Example: [(('ORG', 'Riigikohus'), ('PER', 'Jaan')), (('ORG', 'Riigikohus'), ('PER', 'Peeter'))]

        Returns:
            [list of int] -- Occurrences of the given fact pairs
        """
        queries = []
        for fact_pair in fact_pairs:
            fact_constraints = []

            for fact in fact_pair:
                constraint = {
                    "nested": {
                        "path": "texta_facts",
                        "query": {
                            "bool": {
                                "must": [
                                    {"term": {"texta_facts.fact": fact[0]}},
                                    {"term": {"texta_facts.str_val": fact[1]}}
                                ]
                            }
                        }
                    }
                }
                fact_constraints.append(constraint)

            query = {"query": {"bool": {"must": fact_constraints}}, "size": 0}
            queries.append(json.dumps(query))

        header = json.dumps({"index": self.index})
        data = "\n".join(["{0}\n{1}".format(header, query) for query in queries]) + "\n"

        responses = requests.post(
            "{0}/{1}/_msearch".format(self.es_m.es_url, self.index),
            data=data,
            headers={"Content-Type": "application/json"})
        counts = [response["hits"]["total"] for response in responses.json()['responses']]

        return counts

    def facts_via_aggregation(self, size=15):
        """Finds all facts from the current search.

        Parameters:
            size - [int=15] -- Amount of fact values per fact name to search in the query

        Returns:
            facts - [dict] -- Details for each fact, ex: {'PER - kostja': {'id': 0, 'name': 'PER', 'value': 'kostja', 'doc_count': 44}}
            fact_combinations - [list of tuples] -- All possible combinations of all facts: [(('FIRST_FACTNAME', 'FIRST_FACTVAL'), ('SECOND_FACTNAME', 'SECOND_FACTVAL'))]
            unique_fact_names - [list of strings] -- All unique fact names
        """
        aggs = {
            "facts": {
                "nested": {"path": "texta_facts"},
                "aggs": {
                    "fact_names": {
                        "terms": {"field": "texta_facts.fact"},
                        "aggs": {
                            "fact_values": {
                                "terms": {"field": "texta_facts.str_val", "size": size}
                            }
                        }
                    }
                }
            }
        }
        self.es_m.build(self.es_params)
        self.es_m.set_query_parameter('aggs', aggs)

        response = self.es_m.search()
        response_aggs = response['aggregations']['facts']['fact_names']['buckets']

        facts = {}
        fact_combinations = []
        fact_count = 0
        unique_fact_names = []

        for bucket in response_aggs:
            unique_fact_names.append(bucket['key'])
            for fact in bucket['fact_values']['buckets']:
                facts[bucket['key'] + " - " + fact['key']] = {
                    'id': fact_count,
                    'name': bucket['key'],
                    'value': fact['key'],
                    'doc_count': fact['doc_count']
                }
                fact_combinations.append((bucket['key'], fact['key']))
                fact_count += 1

        fact_combinations = [x for x in itertools.combinations(fact_combinations, 2)]

        return (facts, fact_combinations, unique_fact_names)

    def fact_graph(self, search_size):
        facts, fact_combinations, unique_fact_names = self.facts_via_aggregation(size=search_size)
        # Get co-occurrences and remove pairs that never occur together
        fact_combinations = {
            k: v for k, v in dict(zip(fact_combinations, self.count_cooccurrences(fact_combinations))).items() if v != 0
        }
        shapes = ["circle", "cross", "diamond", "square", "triangle-down", "triangle-up"]
        types = dict(zip(unique_fact_names, itertools.cycle(shapes)))

        nodes = []
        max_node_size = 0
        min_node_size = 0
        for i, fact in enumerate(facts):
            nodes.append({
                "source": facts[fact]['id'],
                "size": facts[fact]['doc_count'],
                "score": facts[fact]['doc_count'],
                "name": facts[fact]['name'],
                "id": facts[fact]['value'],
                "type": types[facts[fact]['name']]
            })
            # Track max/min count
            count = facts[fact]['doc_count']
            if i == 0:
                max_node_size = count
                min_node_size = count
            max_node_size = max(max_node_size, count)
            min_node_size = min(min_node_size, count)

        links = []
        max_link_size = 0
        for fact in fact_combinations.keys():
            max_link_size = max(max_link_size, fact_combinations[fact])
            links.append({
                "source": facts[fact[0][0] + " - " + fact[0][1]]['id'],
                "target": facts[fact[1][0] + " - " + fact[1][1]]['id'],
                "count": fact_combinations[fact]
            })

        graph_data = json.dumps({"nodes": nodes, "links": links})

        return (graph_data, unique_fact_names, max_node_size, max_link_size, min_node_size)

    def _fact_deletion_query(self, rm_facts_dict):
        '''Creates the query for fact deletion based on a dict of facts {name: val}'''
        fact_queries = []
        for key in rm_facts_dict:
            for val in rm_facts_dict.getlist(key):
                fact_queries.append({
                    "bool": {
                        "must": [
                            {"match": {self.field + ".fact": key}},
                            {"match": {self.field + ".str_val": val}}
                        ]
                    }
                })

        query = {
            "main": {
                "query": {
                    "nested": {
                        "path": self.field,
                        "query": {"bool": {"should": fact_queries}}
                    }
                },
                "_source": [self.field]
            }
        }

        return query
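# --- Usage sketch (not part of the original module) ---------------------------
# A hedged example of building the fact co-occurrence graph from a Django view.
# `request` is assumed to carry the search parameters in request.POST and an
# activated dataset in request.session, as FactManager.__init__ expects.
def _example_fact_graph(request, search_size=15):
    """Return the graph JSON (nodes and links) for facts in the current search."""
    fact_manager = FactManager(request)
    graph_data, fact_names, max_node, max_link, min_node = fact_manager.fact_graph(search_size)
    return graph_data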