def open_close_dataset(request):
    dataset_id = request.POST['dataset_id']
    dataset = Dataset.objects.get(pk=dataset_id)

    if request.POST['open_close'] == 'open':
        ES_Manager.open_index(dataset.index)
    else:
        ES_Manager.close_index(dataset.index)

    return HttpResponse()

def delete_index(request):
    dataset_ids = request.POST.getlist('dataset_ids[]')
    for dataset_id in dataset_ids:
        index_to_delete = Dataset.objects.get(pk=dataset_id)
        content_type = ContentType.objects.get_for_model(Dataset)
        Permission.objects.get(
            codename='can_access_dataset_' + str(index_to_delete.id),
            content_type=content_type,
        ).delete()
        ES_Manager.delete_index(index_to_delete.index)
        index_to_delete.delete()
    return HttpResponseRedirect(URL_PREFIX + '/permission_admin/')

def index(request):
    template = loader.get_template('dataset_importer.html')
    jobs = DatasetImport.objects.all()

    archive_formats = collect_map_entries(extractor_map)
    single_document_formats = collect_map_entries(entity_reader_map)
    document_collection_formats = collect_map_entries(collection_reader_map)
    database_formats = collect_map_entries(database_reader_map)
    # preprocessors = collect_map_entries(preprocessor_map)
    # enabled_preprocessors = [preprocessor for preprocessor in preprocessors if preprocessor['is_enabled'] is True]

    datasets = Datasets().get_allowed_datasets(request.user)
    language_models = Task.objects.filter(task_type=TaskTypes.TRAIN_MODEL.value).filter(status__iexact=Task.STATUS_COMPLETED).order_by('-pk')
    analyzers = ES_Manager.get_analyzers()

    context = {
        # 'enabled_input_types': DATASET_IMPORTER_CONF['enabled_input_types'],
        'archive_formats': archive_formats,
        'single_document_formats': single_document_formats,
        'document_collection_formats': document_collection_formats,
        'database_formats': database_formats,
        'language_models': language_models,
        'allowed_datasets': datasets,
        'jobs': jobs,
        'analyzers': analyzers
        # 'enabled_preprocessors': enabled_preprocessors
    }

    return HttpResponse(template.render(context, request))

def get_example_texts(request, field, value):
    ds = Datasets().activate_dataset(request.session)
    dataset = ds.get_index()
    mapping = ds.get_mapping()

    query = json.dumps({
        "size": 10,
        "highlight": {"fields": {field: {}}},
        "query": {"match": {field: value}}
    })

    response = ES_Manager.plain_scroll(es_url, dataset, mapping, query)

    matched_sentences = []
    for hit in response['hits']['hits']:
        for match in hit['highlight'].values():
            matched_sentences.append(match[0])

    return matched_sentences

def index(request):
    indices = ES_Manager.get_indices()
    indices = sorted(indices, key=lambda x: x['index'])  # sort alphabetically
    datasets = get_datasets(indices=indices)
    users = User.objects.all()
    users = annotate_users_with_permissions(users, datasets)
    template = loader.get_template('permission_admin.html')
    allowed_datasets = Datasets().get_allowed_datasets(request.user)
    language_models = Task.objects.filter(task_type=TaskTypes.TRAIN_MODEL.value).filter(status__iexact=Task.STATUS_COMPLETED).order_by('-pk')

    return HttpResponse(template.render({
        'users': users,
        'datasets': datasets,
        'indices': indices,
        'STATIC_URL': STATIC_URL,
        'URL_PREFIX': URL_PREFIX,
        'allowed_datasets': allowed_datasets,
        'language_models': language_models
    }, request))

def add_dataset(request):
    daterange = ""
    dataset = Dataset(author=request.user,
                      index=request.POST['index'],
                      mapping=request.POST['mapping'],
                      daterange=daterange,
                      access=(request.POST['access']))
    dataset.save()
    create_dataset_access_permission_and_propagate(dataset, request.POST['access'])

    indices = ES_Manager.get_indices()
    ds_out = dataset.__dict__
    for index in indices:
        if index['index'] == ds_out['index']:
            ds_out['status'] = index['status']
            ds_out['docs_count'] = index['docs_count']
            ds_out['store_size'] = index['store_size']
            break
        elif '*' in ds_out['index']:
            ds_out['status'] = 'open'
            ds_out['docs_count'] = 'multiindex'
            ds_out['store_size'] = 'multiindex'

    ds_out['_state'] = ''
    ds_out['author'] = request.user.get_username()

    return JsonResponse(ds_out)

def delete_index(request):
    index_to_delete = request.POST['index']
    index_name = Dataset.objects.get(pk=index_to_delete).index
    remove_dataset(index_to_delete)
    es_m = ES_Manager.delete_index(index_name)
    return HttpResponseRedirect(URL_PREFIX + '/permission_admin/')

def check_if_analyzer_exists(self):
    ELASTICSEARCH_ANALYZERS = ES_Manager.get_analyzers()
    user_sent_analyzer = self.post_dict["analyzer"]
    available_analyzer_names = list(map(lambda x: x["analyzer"], ELASTICSEARCH_ANALYZERS))

    if user_sent_analyzer not in available_analyzer_names:
        raise ValueError("Analyzer '{0}' not available. Available analyzers are: '{1}'".format(user_sent_analyzer, available_analyzer_names))

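The check above only relies on the "analyzer" key of whatever ES_Manager.get_analyzers() returns. A minimal standalone sketch of the same validation, with a hypothetical analyzer list for illustration:

def check_analyzer(user_sent_analyzer, elasticsearch_analyzers):
    # elasticsearch_analyzers is assumed to be a list of {"analyzer": <name>} dicts,
    # mirroring what the method above reads from ES_Manager.get_analyzers().
    available_analyzer_names = [entry["analyzer"] for entry in elasticsearch_analyzers]
    if user_sent_analyzer not in available_analyzer_names:
        raise ValueError("Analyzer '{0}' not available. Available analyzers are: '{1}'".format(user_sent_analyzer, available_analyzer_names))


check_analyzer("standard", [{"analyzer": "standard"}, {"analyzer": "estonian"}])  # passes silently
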
def get_allowed_datasets(self, user):
    indices = ES_Manager.get_indices()
    datasets = self.sort_datasets(indices)
    # print(datasets)
    return [dataset for dataset in datasets if user.has_perm('permission_admin.can_access_dataset_' + str(dataset['id']))]

def more_like_this(request):
    if request.method == "POST":
        try:
            utf8_post_payload = json.loads(request.body.decode("utf-8"))
        except json.JSONDecodeError as e:
            return JsonResponse({"json": str(e)}, status=400)

        valid_request = ValidateFormSerializer(data=utf8_post_payload)
        if valid_request.is_valid():
            post_data = valid_request.validated_data

            fields = [field for field in post_data["fields"]]
            size = post_data.get("size", 10)
            returned_fields = post_data.get("returned_fields", None)
            if_agg_only = post_data.get("if_agg_only", False)

            like = []
            for document in post_data["like"]:
                dataset = Dataset.objects.get(pk=document["dataset_id"])
                doc = {"_id": document["document_id"], "_index": dataset.index, "_type": dataset.mapping}
                like.append(doc)

            hits = ES_Manager.more_like_this(
                elastic_url=es_url,
                fields=fields,
                like=like,
                size=size,
                dataset=dataset,
                return_fields=returned_fields,
                filters=post_data.get("filters", []),
                aggregations=post_data.get("aggregations", []),
                include=post_data.get("include", False),
                if_agg_only=if_agg_only,
            )

            return JsonResponse(hits, status=200) if "elasticsearch" not in hits else JsonResponse(hits, status=400)
        else:
            logging.getLogger(ERROR_LOGGER).error("Request: {}, Response: {}".format(request.POST, valid_request.errors))
            return JsonResponse(valid_request.errors, status=400)

def more_like_this(request):
    if request.method == "POST":
        try:
            utf8_post_payload = json.loads(request.body.decode("utf-8"))
        except json.JSONDecodeError as e:
            return JsonResponse({"json": str(e)}, status=400)

        valid_request = ValidateFormSerializer(data=utf8_post_payload)
        if valid_request.is_valid():
            post_data = valid_request.validated_data

            fields = ["{}.keyword".format(field) for field in post_data["fields"]]
            size = post_data.get("size", 10)
            returned_fields = post_data.get("returned_fields", None)
            if_agg_only = post_data.get("if_agg_only", False)

            like = []
            for document in post_data["like"]:
                dataset = Dataset.objects.get(pk=document["dataset_id"])
                doc = {"_id": document["document_id"], "_index": dataset.index, "_type": dataset.mapping}
                like.append(doc)

            hits = ES_Manager.more_like_this(
                elastic_url=es_url,
                fields=fields,
                like=like,
                size=size,
                dataset=dataset,
                return_fields=returned_fields,
                filters=post_data.get("filters", []),
                aggregations=post_data.get("aggregations", []),
                if_agg_only=if_agg_only,
            )

            return JsonResponse(hits, status=200) if "elasticsearch" not in hits else JsonResponse(hits, status=400)
        else:
            logging.getLogger(ERROR_LOGGER).error("Request: {}, Response: {}".format(request.POST, valid_request.errors))
            return JsonResponse(valid_request.errors, status=400)

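An illustrative request body for the two views above, using only the keys the views read from the validated data (values are made up; the exact schema is defined by ValidateFormSerializer, which is not shown here):

example_payload = {
    "fields": ["text"],                                    # fields the similarity is computed on
    "like": [{"dataset_id": 1, "document_id": "abc123"}],  # documents to find similar documents for
    "size": 10,
    "returned_fields": ["text"],
    "filters": [],
    "aggregations": [],
    "if_agg_only": False,
}
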
def get_datasets(indices=None):
    datasets = Dataset.objects.all()
    datasets_out = []

    for dataset in datasets:
        ds_out = dataset.__dict__
        if indices:
            for index in indices:
                if index['index'] == ds_out['index']:
                    ds_out['status'] = index['status']
                    ds_out['docs_count'] = ES_Manager.single_index_count(index["index"])  # Passed value from indices is wrong.
                    ds_out['store_size'] = index['store_size']
                elif '*' in ds_out['index']:
                    ds_out['status'] = 'open'
                    ds_out['docs_count'] = 'multiindex'
                    ds_out['store_size'] = 'multiindex'
        datasets_out.append(ds_out)

    return datasets_out

def get_analyzer_names(request):
    ELASTICSEARCH_ANALYZERS = ES_Manager.get_analyzers()
    analyzer_names = list(map(lambda x: x["analyzer"], ELASTICSEARCH_ANALYZERS))
    return JsonResponse({"analyzers": analyzer_names})

def get_mappings(request):
    index = request.GET['index']
    return HttpResponse(json.dumps(ES_Manager.get_mappings(index)))

def find_mappings(request):
    try:
        slop = int(request.POST['slop'])
        max_len = int(request.POST['max_len'])
        min_len = int(request.POST['min_len'])
        min_freq = int(request.POST['min_freq'])
        match_field = request.POST['match_field']
        description = request.POST['description']
        batch_size = 50

        # Define selected mapping
        ds = Datasets().activate_dataset(request.session)
        dataset = ds.get_index()
        mapping = ds.get_mapping()

        lexicon = []
        word_index = {}
        num_lexicons = 0
        for i, lexicon_id in enumerate(request.POST.getlist('lexicons[]')):
            num_lexicons += 1
            for word in Word.objects.filter(lexicon=lexicon_id):
                word = word.wrd
                lexicon.append(word)
                if word not in word_index:
                    word_index[word] = []
                word_index[word].append(i)
        lexicon = list(set(lexicon))

        if min_len > num_lexicons:
            min_len = num_lexicons

        mwe_counter = 0
        group_counter = 0
        phrases = []
        final = {}
        data = []

        new_run = Run(minimum_frequency=min_freq, maximum_length=max_len, minimum_length=min_len,
                      run_status='running', run_started=datetime.now(), run_completed=None,
                      user=request.user, description=description)
        new_run.save()

        logging.getLogger(INFO_LOGGER).info(json.dumps({
            'process': 'MINE MWEs',
            'event': 'mwe_mining_started',
            'args': {'user_name': request.user.username, 'run_id': new_run.id, 'slop': slop,
                     'min_len': min_len, 'max_len': max_len, 'min_freq': min_freq,
                     'match_field': match_field, 'desc': description}
        }))

        for i in range(min_len, max_len + 1):
            print('Permutation len:', i)
            for permutation in itertools.permutations(lexicon, i):
                word_indices = list(flatten([word_index[word] for word in permutation]))
                if len(word_indices) == len(set(word_indices)):
                    permutation = ' '.join(permutation)
                    if slop > 0:
                        query = {"query": {"match_phrase": {match_field: {"query": permutation, "slop": slop}}}}
                    else:
                        query = {"query": {"match_phrase": {match_field: {"query": permutation}}}}
                    data.append(json.dumps({"index": dataset, "mapping": mapping}) + '\n' + json.dumps(query))
                    phrases.append(permutation)

                    if len(data) == batch_size:
                        for j, response in enumerate(ES_Manager.plain_multisearch(es_url, dataset, mapping, data)):
                            try:
                                if response['hits']['total'] >= min_freq:
                                    sorted_phrase = ' '.join(sorted(phrases[j].split(' ')))
                                    sorted_conceptualised_phrase = conceptualise_phrase(sorted_phrase, request.user)
                                    if sorted_conceptualised_phrase not in final:
                                        final[sorted_conceptualised_phrase] = {'total_freq': 0, 'mwes': [],
                                                                               'display_name': {'freq': 0, 'label': False},
                                                                               'id': group_counter}
                                        group_counter += 1
                                    final[sorted_conceptualised_phrase]['total_freq'] += response['hits']['total']
                                    final[sorted_conceptualised_phrase]['mwes'].append({'mwe': phrases[j],
                                                                                        'freq': response['hits']['total'],
                                                                                        'accepted': False,
                                                                                        'id': mwe_counter})
                                    mwe_counter += 1
                                    final[sorted_conceptualised_phrase]['mwes'].sort(reverse=True, key=lambda k: k['freq'])
                                    if response['hits']['total'] > final[sorted_conceptualised_phrase]['display_name']['freq']:
                                        final[sorted_conceptualised_phrase]['display_name']['freq'] = response['hits']['total']
                                        final[sorted_conceptualised_phrase]['display_name']['label'] = phrases[j]
                            except KeyError as e:
                                raise e
                        data = []
                        phrases = []

            logging.getLogger(INFO_LOGGER).info(json.dumps({
                'process': 'MINE MWEs',
                'event': 'mwe_mining_progress',
                'args': {'user_name': request.user.username, 'run_id': new_run.id},
                'data': {'permutations_processed': i + 1 - min_len, 'total_permutations': max_len - min_len + 1}
            }))

        m_response = ES_Manager.plain_multisearch(es_url, dataset, mapping, data)
        for j, response in enumerate(m_response):
            try:
                if response['hits']['total'] >= min_freq:
                    sorted_phrase = ' '.join(sorted(phrases[j].split(' ')))
                    sorted_conceptualised_phrase = conceptualise_phrase(sorted_phrase, request.user)
                    if sorted_conceptualised_phrase not in final:
                        final[sorted_conceptualised_phrase] = {'total_freq': 0, 'mwes': [],
                                                               'display_name': {'freq': 0, 'label': False},
                                                               'id': group_counter}
                        group_counter += 1
                    final[sorted_conceptualised_phrase]['total_freq'] += response['hits']['total']
                    final[sorted_conceptualised_phrase]['mwes'].append({'mwe': phrases[j],
                                                                        'freq': response['hits']['total'],
                                                                        'accepted': False,
                                                                        'id': mwe_counter})
                    mwe_counter += 1
                    final[sorted_conceptualised_phrase]['mwes'].sort(reverse=True, key=lambda k: k['freq'])
                    if response['hits']['total'] > final[sorted_conceptualised_phrase]['display_name']['freq']:
                        final[sorted_conceptualised_phrase]['display_name']['freq'] = response['hits']['total']
                        final[sorted_conceptualised_phrase]['display_name']['label'] = phrases[j]
            except KeyError as e:
                raise e

        for key in final:
            final[key]['concept_name'] = {'freq': -1, 'label': ''}

        r = Run.objects.get(pk=new_run.pk)
        r.run_completed = datetime.now()
        r.run_status = 'completed'
        r.results = json.dumps(final)
        r.save()

        logging.getLogger(INFO_LOGGER).info(json.dumps({
            'process': 'MINE MWEs',
            'event': 'mwe_mining_completed',
            'args': {'user_name': request.user.username, 'run_id': new_run.id}
        }))

    except Exception as e:
        print(e)
        logging.getLogger(ERROR_LOGGER).error(json.dumps({
            'process': 'MINE MWEs',
            'event': 'mwe_mining_failed',
            'args': {'user_name': request.user.username, 'run_id': new_run.id}
        }), exc_info=True)

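To make the multi-search payload concrete: each entry appended to data above is a header line identifying the index/mapping plus a query line, joined by a newline. A hypothetical example entry (index, mapping and phrase are made up):

example_entry = (
    json.dumps({"index": "journal_articles", "mapping": "article"})
    + '\n'
    + json.dumps({"query": {"match_phrase": {"text": {"query": "supreme court", "slop": 2}}}})
)
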
class AggManager:
    """ Manage Searcher aggregations and plotting preparations """

    def __init__(self, request):
        ds = Datasets().activate_dataset(request.session)
        self.dataset = ds.get_index()
        self.mapping = ds.get_mapping()
        self.es_m = ES_Manager(self.dataset, self.mapping)

        # PREPARE AGGREGATION
        self.es_params = request.POST
        interval = self.es_params["interval_1"]
        self.daterange = self._get_daterange(self.es_params)
        self.ranges, self.date_labels = self._get_date_intervals(self.daterange, interval)
        self.agg_query = self.prepare_agg_query()

        # EXECUTE AGGREGATION
        agg_results = self.aggregate()

        # PARSE RESPONSES INTO JSON OBJECT
        self.agg_data = self.parse_responses(agg_results)

    @staticmethod
    def _get_daterange(es_params):
        daterange = {"min": es_params["agg_daterange_from_1"], "max": es_params["agg_daterange_to_1"]}
        return daterange

    @staticmethod
    def _get_date_intervals(daterange, interval):
        if daterange['min'] and daterange['max']:
            frmt = "%Y-%m-%d"
            start_datetime = datetime.strptime(daterange['min'], frmt)
            end_datetime = datetime.strptime(daterange['max'], frmt)

            if interval == 'year':
                rdelta = relativedelta(years=+1)
            elif interval == 'quarter':
                rdelta = relativedelta(months=+3)
            elif interval == 'month':
                rdelta = relativedelta(months=+1)
            elif interval == 'week':
                rdelta = relativedelta(weeks=+1)
            elif interval == 'day':
                rdelta = relativedelta(days=+1)

            next_calculated_datetime = start_datetime + rdelta
            dates = [start_datetime, next_calculated_datetime]
            labels = [start_datetime.strftime(frmt), next_calculated_datetime.strftime(frmt)]

            while next_calculated_datetime < end_datetime:
                next_calculated_datetime += rdelta
                dates.append(next_calculated_datetime)
                labels.append(next_calculated_datetime.strftime(frmt))

            dates.append(end_datetime)
            labels.append(end_datetime.strftime(frmt))

            dates_str = []
            for i, date in enumerate(dates[1:]):
                dates_str.append({'from': dates[i].strftime(frmt), 'to': date.strftime(frmt)})

            return dates_str, labels
        else:
            return [], []

    def prepare_agg_query(self):
        es_params = self.es_params

        agg_field_1 = es_params["agg_field_1"]
        agg_field_1 = json.loads(agg_field_1)
        sort_by_1 = es_params["sort_by_1"]
        agg_field_2 = es_params["agg_field_2"]
        agg_field_2 = json.loads(agg_field_2)
        sort_by_2 = es_params["sort_by_2"]

        try:
            agg_size_1 = int(es_params["agg_size_1"])
            agg_size_2 = int(es_params["agg_size_2"])
        except KeyError:
            agg_size_1 = 10
            agg_size_2 = 10

        field_type_to_name = {'date': 'daterange', 'string': 'string', 'text': 'string', 'keyword': 'string',
                              'facts': 'fact', 'fact_str_val': 'fact_str_val', 'fact_num_val': 'fact_num_val'}

        agg_name_1 = field_type_to_name[agg_field_1['type']]
        agg_name_2 = field_type_to_name[agg_field_2['type']]

        # If aggregating over text field, use .keyword instead
        if agg_field_1['type'] == 'text' and sort_by_1 in ['terms', 'significant_terms']:  # NEW PY REQUIREMENT
            agg_field_1['path'] = '{0}.keyword'.format(agg_field_1['path'])
        if agg_field_2['type'] == 'text' and sort_by_2 in ['terms', 'significant_terms']:  # NEW PY REQUIREMENT
            agg_field_2['path'] = '{0}.keyword'.format(agg_field_2['path'])

        # 1st LEVEL AGGREGATION
        agg = self.create_agg(agg_name_1, sort_by_1, agg_field_1["path"], agg_size_1)
        if agg_name_1 == 'fact' and es_params["agg_field_2_selected"] == 'false':
            agg[agg_name_1]["aggs"][agg_name_1]['aggs']['fact_str_val'] = \
                self.create_agg('fact_str_val', sort_by_1, agg_field_1['path'], agg_size_1)['fact_str_val']['aggs']['fact_str_val']

        # 2nd LEVEL AGGREGATION
        if es_params["agg_field_2_selected"] == 'true':
            agg_2 = self.create_agg(agg_name_2, sort_by_2, agg_field_2["path"], agg_size_2)
            if agg_name_1 == 'fact' and agg_name_2 == 'fact_str_val':
                agg[agg_name_1]['aggs'][agg_name_1]['aggs'] = agg_2[agg_name_2]['aggs']
                agg[agg_name_1]['aggs'][agg_name_1]['aggs']['documents'] = {"reverse_nested": {}}
            elif 'fact' in agg_name_1 and agg_name_2 == 'string':
                agg[agg_name_1]['aggs'][agg_name_1]['aggs']['documents']['aggs'] = agg_2
            else:
                if agg_name_2 == 'fact':
                    agg[agg_name_1]["aggregations"] = agg_2
                    agg[agg_name_1]["aggregations"][agg_name_2]['aggs'][agg_name_2]['aggs'] = \
                        self.create_agg('fact_str_val', sort_by_2, agg_field_2['path'], agg_size_2)['fact_str_val']['aggs']
                else:
                    agg[agg_name_1]["aggregations"] = agg_2

        return agg

    def create_agg(self, agg_name, sort_by, path, size):
        if agg_name == "daterange":
            return {agg_name: {"date_range": {"field": path, "format": date_format, "ranges": self.ranges}}}
        elif agg_name == 'fact':
            return {
                agg_name: {
                    "nested": {"path": "texta_facts"},
                    "aggs": {
                        agg_name: {
                            sort_by: {"field": "texta_facts.fact", "size": size},
                            "aggs": {"documents": {"reverse_nested": {}}}
                        }
                    }
                }
            }
        elif agg_name == 'fact_str_val':
            return {
                agg_name: {
                    "nested": {"path": "texta_facts"},
                    "aggs": {
                        agg_name: {
                            sort_by: {"field": "texta_facts.str_val", "size": size, 'order': {'documents.doc_count': 'desc'}},
                            "aggs": {"documents": {"reverse_nested": {}}}
                        }
                    }
                }
            }
        elif agg_name == 'fact_num_val':
            return {
                agg_name: {
                    "nested": {"path": "texta_facts"},
                    "aggs": {
                        agg_name: {
                            sort_by: {"field": "texta_facts.num_val", "size": size, 'order': {'documents.doc_count': 'desc'}},
                            "aggs": {"documents": {"reverse_nested": {}}}
                        }
                    }
                }
            }
        else:
            return {agg_name: {sort_by: {"field": path, "size": size}}}

    def aggregate(self):
        responses = []
        out = {}

        # EXECUTE SAVED SEARCHES
        for item in self.es_params:
            if 'saved_search' in item:
                s = Search.objects.get(pk=self.es_params[item])
                name = s.description
                saved_query = json.loads(s.query)
                self.es_m.load_combined_query(saved_query)
                self.es_m.set_query_parameter("aggs", self.agg_query)
                response = self.es_m.search()
                responses.append({"id": "search_" + str(s.pk), "label": name, "response": response})

        # EXECUTE THE LIVE QUERY
        if "ignore_active_search" not in self.es_params:
            self.es_m.build(self.es_params)
            self.es_m.set_query_parameter("aggs", self.agg_query)
            self.es_m.set_query_parameter("size", 0)
            response = self.es_m.search()
            # raise Exception(self.es_m.combined_query['main']['aggs'])
            responses.append({"id": "query", "label": "Current Search", "response": response})

        out["responses"] = responses

        # EXECUTE EMPTY TIMELINE QUERY IF RELATIVE FREQUENCY SELECTED
        if json.loads(self.es_params["agg_field_1"])["type"] == "date" and self.es_params["freq_norm_1"] == "relative_frequency":
            empty_params = {}
            self.es_m.build(empty_params)
            self.es_m.set_query_parameter("aggs", self.agg_query)
            response = self.es_m.search()
            out["empty_timeline_response"] = response

        return out

    def parse_responses(self, agg_results):
        """ Parses ES responses into JSON structure and normalises daterange frequencies if necessary """
        total_freqs = {}
        agg_data = []

        if "empty_timeline_response" in agg_results:
            for bucket in agg_results["empty_timeline_response"]["aggregations"]["daterange"]["buckets"]:
                total_freqs[bucket["from_as_string"]] = bucket["doc_count"]

        for i, response in enumerate(agg_results["responses"]):
            aggs = response["response"]["aggregations"]
            output_type = None
            response_out = []

            for agg_name, agg_results in aggs.items():
                output_type = agg_name
                if agg_name == 'daterange':
                    response_out.extend(self._parse_daterange_buckets(agg_results['buckets'], total_freqs, self.es_params['freq_norm_1']))
                elif agg_name == 'string':
                    response_out.extend(self._parse_string_buckets(agg_results['buckets']))
                elif agg_name == 'fact':
                    response_out.extend(self._parse_fact_buckets(agg_results['fact']['buckets']))
                elif agg_name == 'fact_str_val' or agg_name == 'fact_num_val':
                    response_out.extend(self._parse_fact_buckets(agg_results[agg_name]['buckets']))

            agg_data.append({"data": response_out, "type": output_type, "label": response["label"]})

        return agg_data

    def _parse_daterange_buckets(self, buckets, total_freqs, freq_norm_1):
        results = []
        for bucket in buckets:
            new = {"children": []}
            new["key"] = bucket["from_as_string"]
            # Normalises frequencies
            if freq_norm_1 == "relative_frequency":
                try:
                    new["val"] = str(round(float(bucket["doc_count"]) / float(total_freqs[bucket["from_as_string"]]), 5))
                except ZeroDivisionError:
                    new["val"] = 0
            else:
                new["val"] = bucket["doc_count"]

            if "string" in bucket:
                for bucket_2 in bucket["string"]["buckets"]:
                    child = {}
                    child["key"] = bucket_2["key"]
                    child["val"] = bucket_2["doc_count"]
                    new["children"].append(child)
            elif 'fact' in bucket:
                for inner_bucket in bucket['fact']['fact']['buckets']:
                    child = {'key': inner_bucket['key'], 'val': inner_bucket['doc_count']}
                    grandchildren = []
                    for super_inner_bucket in inner_bucket['fact_str_val']['buckets']:
                        grandchildren.append({'key': super_inner_bucket['key'], 'val': super_inner_bucket['documents']['doc_count']})
                    child['children'] = grandchildren
                    new['children'].append(child)
            elif 'fact_str_val' in bucket:
                for inner_bucket in bucket['fact_str_val']['fact_str_val']['buckets']:
                    new['children'].append({'key': inner_bucket['key'], 'val': inner_bucket['documents']['doc_count']})

            results.append(new)
        return results

    def _parse_string_buckets(self, buckets):
        results = []
        for bucket in buckets:
            new = {"children": []}
            new["key"] = bucket["key"]
            new["val"] = bucket["doc_count"]

            if "string" in bucket:
                for bucket_2 in bucket["string"]["buckets"]:
                    child = {}
                    child["key"] = bucket_2["key"]
                    child["val"] = bucket_2["doc_count"]
                    new["children"].append(child)
            elif 'fact' in bucket:
                for inner_bucket in bucket['fact']['fact']['buckets']:
                    child = {'key': inner_bucket['key'], 'val': inner_bucket['doc_count']}
                    grandchildren = []
                    for super_inner_bucket in inner_bucket['fact_str_val']['buckets']:
                        grandchildren.append({'key': super_inner_bucket['key'], 'val': super_inner_bucket['documents']['doc_count']})
                    child['children'] = grandchildren
                    new['children'].append(child)
            elif 'fact_str_val' in bucket:
                for inner_bucket in bucket['fact_str_val']['fact_str_val']['buckets']:
                    new['children'].append({'key': inner_bucket['key'], 'val': inner_bucket['documents']['doc_count']})

            results.append(new)
        return results

    def _parse_fact_buckets(self, buckets):
        results = []
        for bucket in buckets:
            new = {"children": []}
            new["key"] = bucket["key"]
            new["val"] = bucket["documents"]["doc_count"]

            if 'fact_str_val' in bucket:
                for inner_bucket in bucket['fact_str_val']['buckets']:
                    child = {}
                    child['key'] = inner_bucket['key']
                    child['val'] = inner_bucket['documents']['doc_count']
                    new['children'].append(child)
            elif 'documents' in bucket and 'string' in bucket['documents']:
                for inner_bucket in bucket['documents']['string']['buckets']:
                    child = {}
                    child['key'] = inner_bucket['key']
                    child['val'] = inner_bucket['doc_count']
                    new['children'].append(child)

            results.append(new)
        return results

    def _parse_fact_val_results(self, buckets):
        pass

    def output_to_searcher(self):
        count_dict = defaultdict(defaultdict)
        children_dict = defaultdict(dict)
        i = 0
        data_out = []

        for agg in self.agg_data:
            if agg["type"] == "daterange":
                i += 1
                for row in agg["data"]:
                    count_dict[row["key"]][i] = row["val"]
                    if row["children"]:
                        children_dict[row["key"]][i] = {"data": row["children"], "label": agg["label"]}
            else:
                data_out.append(agg)

        combined_daterange_data = []
        labels = [a["label"] for a in self.agg_data]

        for row in sorted(count_dict.items(), key=lambda l: l[0]):
            new_row = dict(row[1])
            new_row["date"] = row[0]
            combined_daterange_data.append(new_row)

        daterange_data = {"type": "daterange",
                          "data": combined_daterange_data,
                          "ykeys": list(range(1, i + 1)),
                          "labels": labels,
                          "children": dict(children_dict)}

        if daterange_data["data"]:
            data_out.append(daterange_data)

        return data_out

def facts_agg(es_params, request):
    logger = LogManager(__name__, 'FACTS AGGREGATION')

    distinct_values = []
    query_results = []
    lexicon = []
    aggregation_data = es_params['aggregate_over']
    aggregation_data = json.loads(aggregation_data)
    original_aggregation_field = aggregation_data['path']
    aggregation_field = 'texta_link.facts'

    try:
        aggregation_size = 50
        aggregations = {"strings": {es_params['sort_by']: {"field": aggregation_field, 'size': 0}},
                        "distinct_values": {"cardinality": {"field": aggregation_field}}}

        # Define selected mapping
        ds = Datasets().activate_dataset(request.session)
        dataset = ds.get_index()
        mapping = ds.get_mapping()
        date_range = ds.get_date_range()
        es_m = ES_Manager(dataset, mapping, date_range)

        for item in es_params:
            if 'saved_search' in item:
                s = Search.objects.get(pk=es_params[item])
                name = s.description
                saved_query = json.loads(s.query)
                es_m.load_combined_query(saved_query)
                es_m.set_query_parameter('aggs', aggregations)
                response = es_m.search()

                # Filter response
                bucket_filter = '{0}.'.format(original_aggregation_field.lower())
                final_bucket = []
                for b in response['aggregations']['strings']['buckets']:
                    if bucket_filter in b['key']:
                        fact_name = b['key'].split('.')[-1]
                        b['key'] = fact_name
                        final_bucket.append(b)
                final_bucket = final_bucket[:aggregation_size]
                response['aggregations']['distinct_values']['value'] = len(final_bucket)
                response['aggregations']['strings']['buckets'] = final_bucket

                normalised_counts, labels = normalise_agg(response, es_m, es_params, 'strings')
                lexicon = list(set(lexicon + labels))
                query_results.append({'name': name, 'data': normalised_counts, 'labels': labels})
                distinct_values.append({'name': name, 'data': response['aggregations']['distinct_values']['value']})

        es_m.build(es_params)
        # FIXME
        # this is confusing for the user
        if not es_m.is_combined_query_empty():
            es_m.set_query_parameter('aggs', aggregations)
            response = es_m.search()

            # Filter response
            bucket_filter = '{0}.'.format(original_aggregation_field.lower())
            final_bucket = []
            for b in response['aggregations']['strings']['buckets']:
                if bucket_filter in b['key']:
                    fact_name = b['key'].split('.')[-1]
                    b['key'] = fact_name
                    final_bucket.append(b)
            final_bucket = final_bucket[:aggregation_size]
            response['aggregations']['distinct_values']['value'] = len(final_bucket)
            response['aggregations']['strings']['buckets'] = final_bucket

            normalised_counts, labels = normalise_agg(response, es_m, es_params, 'strings')
            lexicon = list(set(lexicon + labels))
            query_results.append({'name': 'Query', 'data': normalised_counts, 'labels': labels})
            distinct_values.append({'name': 'Query', 'data': response['aggregations']['distinct_values']['value']})

        data = [a + zero_list(len(query_results)) for a in map(list, zip(*[lexicon]))]
        data = [['Word'] + [query_result['name'] for query_result in query_results]] + data

        for i, word in enumerate(lexicon):
            for j, query_result in enumerate(query_results):
                for k, label in enumerate(query_result['labels']):
                    if word == label:
                        data[i + 1][j + 1] = query_result['data'][k]

        logger.set_context('user_name', request.user.username)
        logger.info('facts_aggregation_queried')

    except Exception as e:
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('user_name', request.user.username)
        logger.exception('facts_aggregation_query_failed')

    table_height = len(data) * 15
    table_height = table_height if table_height > 500 else 500

    return {'data': [data[0]] + sorted(data[1:], key=lambda x: sum(x[1:]), reverse=True),
            'height': table_height,
            'type': 'bar',
            'distinct_values': json.dumps(distinct_values)}

class Autocomplete:

    def __init__(self):
        self.es_m = None
        self.lookup_type = None
        self.key_constraints = None
        self.content = None
        self.user = None
        self.limit = None

    def parse_request(self, request):
        self.lookup_types = request.POST['lookup_types'].split(',')
        self.key_constraints = request.POST['key_constraints'].split(',')
        self.content = request.POST['content'].split('\n')[-1].strip()
        print(self.content)

        ds = Datasets().activate_dataset(request.session)
        self.dataset = ds.get_index()
        self.mapping = ds.get_mapping()
        self.es_m = ES_Manager(self.dataset, self.mapping)
        self.user = request.user

    def suggest(self, limit=10):
        self.limit = limit
        suggestions = {}

        for i, lookup_type in enumerate(self.lookup_types):
            if lookup_type == 'FACT_NAME':
                suggestions['FACT_NAME'] = self._get_facts('fact', lookup_type)
            elif lookup_type == 'FACT_VAL':
                suggestions['FACT_VAL'] = self._get_facts('str_val', lookup_type, key_constraint=self.key_constraints[i])
            elif lookup_type == 'CONCEPT':
                suggestions['CONCEPT'] = self._get_concepts()
            elif lookup_type == 'LEXICON':
                suggestions['LEXICON'] = self._get_lexicons()

        return suggestions

    def _get_facts(self, agg_subfield, lookup_type, key_constraint=None):
        agg_query = {
            agg_subfield: {
                "nested": {"path": "texta_facts"},
                "aggs": {
                    agg_subfield: {
                        "terms": {"field": "texta_facts.fact"},
                        "aggs": {
                            "fact_values": {
                                "terms": {"field": "texta_facts.str_val",
                                          "size": self.limit,
                                          "include": "{0}.*".format(self.content)}
                            }
                        }
                    }
                }
            }
        }

        self.es_m.build('')
        self.es_m.set_query_parameter("aggs", agg_query)

        if lookup_type == 'FACT_VAL' and key_constraint:
            facts = []
            for bucket in self.es_m.search()["aggregations"][agg_subfield][agg_subfield]["buckets"]:
                if bucket["key"] == key_constraint:
                    facts += [self._format_suggestion(sub_bucket["key"], sub_bucket["key"])
                              for sub_bucket in bucket["fact_values"]["buckets"]]
        elif lookup_type == 'FACT_VAL' and not key_constraint:
            facts = []
            for bucket in self.es_m.search()["aggregations"][agg_subfield][agg_subfield]["buckets"]:
                facts += [self._format_suggestion(sub_bucket["key"], sub_bucket["key"])
                          for sub_bucket in bucket["fact_values"]["buckets"]]
        else:
            facts = [self._format_suggestion(a["key"], a["key"])
                     for a in self.es_m.search()["aggregations"][agg_subfield][agg_subfield]["buckets"]]

        return facts

    def _get_concepts(self):
        concepts = []

        if len(self.content) > 0:
            terms = Term.objects.filter(term__startswith=self.content).filter(author=self.user)
            seen = {}
            for term in terms[:self.limit]:
                for term_concept in TermConcept.objects.filter(term=term.pk):
                    concept = term_concept.concept
                    concept_term = (concept.pk, term.term)
                    if concept_term not in seen:
                        seen[concept_term] = True
                        display_term = term.term.replace(self.content, '<font color="red">' + self.content + '</font>')
                        display_text = '<b>{0}</b>@C{1}-{2}'.format(display_term, concept.pk, concept.descriptive_term.term)
                        suggestion = self._format_suggestion(concept.descriptive_term.term, display_text, resource_id=concept.pk)
                        concepts.append(suggestion)

        return concepts

    def _get_lexicons(self):
        suggested_lexicons = []

        if len(self.content) > 0:
            lexicons = Lexicon.objects.filter(name__startswith=self.content).filter(author=self.user)
            for lexicon in lexicons:
                display_term = lexicon.name.replace(self.content, '<font color="red">' + self.content + '</font>')
                display_text = '<b>{0}</b>@L{1}-{2}'.format(display_term, lexicon.pk, lexicon.name)
                suggestion = self._format_suggestion(lexicon.name, display_text, resource_id=lexicon.pk)
                suggested_lexicons.append(suggestion)

        return suggested_lexicons

    @staticmethod
    def _format_suggestion(entry_text, display_text, resource_id=''):
        return {'entry_text': entry_text, 'display_text': display_text, 'resource_id': resource_id}

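As a usage sketch (hypothetical values, assuming a normal authenticated Django POST request): parse_request expects comma-separated lookup_types and key_constraints plus the search-box content, and suggest() returns suggestions keyed by lookup type in the _format_suggestion shape:

# Hypothetical request.POST payload for Autocomplete.parse_request; values are illustrative only.
post_payload = {
    "lookup_types": "FACT_NAME,FACT_VAL",
    "key_constraints": ",PER",   # empty constraint for FACT_NAME, 'PER' constrains FACT_VAL
    "content": "jaa",            # only the last line of the content is used as the prefix
}
# suggest(limit=10) would then return something like:
# {
#     "FACT_NAME": [{"entry_text": "PER", "display_text": "PER", "resource_id": ""}],
#     "FACT_VAL": [{"entry_text": "Jaan", "display_text": "Jaan", "resource_id": ""}],
# }
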
def get_next_page_data(query, es_from, last_page, query_data, request):
    start_from = None
    end = None
    rows = []
    page_length = query_data['page_length']

    if es_from == None:
        return {'rows': rows, 'from': start_from, 'page': last_page, 'end': None, 'total': None}

    dataset = query_data['dataset']
    mapping = query_data['mapping']
    polarity = query_data['polarity']
    inclusive_instructions = query_data['inclusive_instructions']
    exclusive_instructions = query_data['exclusive_instructions']

    query['from'] = es_from
    response = ES_Manager.plain_search(es_url, dataset, mapping, query)

    try:
        hit = response['hits']['hits'][0]
        feature_dict = {feature_name: hit['_source'][feature_name][0] for feature_name in hit['_source']}
        sorted_feature_names = sorted(feature_dict)
        feature_to_idx_map = {feature: (feature_idx + 1) for feature_idx, feature in enumerate(sorted_feature_names)}
    except:
        pass

    hit_idx = page_length - 1

    while len(rows) < page_length and 'hits' in response and 'hits' in response['hits'] and response['hits']['hits'] and hit_idx + 1 == page_length:
        for hit_idx, hit in enumerate(response['hits']['hits']):
            if len(rows) >= page_length:
                break

            feature_dict = {}
            for field_name in hit['_source']:
                field_value = hit['_source'][field_name]
                if isinstance(field_value, dict):
                    for subfield_name, subfield_value in field_value.items():
                        combined_field_name = '{0}.{1}'.format(field_name, subfield_name)
                        feature_dict[combined_field_name] = subfield_value
                else:
                    feature_dict[field_name] = field_value

            sorted_feature_names = sorted(feature_dict)
            feature_to_idx_map = defaultdict(list)
            for feature_idx, feature in enumerate(sorted_feature_names):
                feature_to_idx_map[feature.split('.')[0]].append(feature_idx + 1)

            row = [hit['_id']]
            row.extend([feature_dict[feature_name] for feature_name in sorted_feature_names])

            layer_dict = matcher.LayerDict(feature_dict)
            inclusive_matches = inclusive_instructions.match(layer_dict)

            if (polarity == 'positive') == bool(inclusive_matches):  # add row if polarity is positive and we have a match or negative and dont
                if len(rows) == 0:
                    start_from = query['from'] + hit_idx
                end = query['from'] + hit_idx
                if inclusive_matches:
                    row = highlight(row, feature_to_idx_map, inclusive_matches)
                rows.append(row)

        query['from'] = query['from'] + hit_idx + 1

        if len(rows) >= page_length:
            break

        response = ES_Manager.plain_search(es_url, dataset, mapping, query)

    if end:
        GrammarPageMapping(search_id=query_data['search_id'],
                           inclusive_grammar=query_data['inclusive_grammar_id'],
                           exclusive_grammar=query_data['exclusive_grammar_id'],
                           page=query_data['requested_page'],
                           polarity=query_data['polarity'],
                           elastic_start=start_from,
                           elastic_end=end + 1,
                           author=request.user).save()

    return {'rows': rows,
            'from': start_from,
            'page': last_page + 1,
            'end': (end + (0 if last_page == 0 else 1)) if end else end,
            'total': response['hits']['total']}

class FactManager:
    """Manage Searcher facts: adding, storing and deleting facts."""

    def __init__(self, request):
        self.es_params = request.POST
        self.ds = Datasets().activate_dataset(request.session)
        self.index = self.ds.get_index()
        self.mapping = self.ds.get_mapping()
        self.es_m = ES_Manager(self.index, self.mapping)
        self.field = 'texta_facts'

    def remove_facts_from_document(self, rm_facts_dict, bs=7500):
        '''Remove the given facts from all documents, given a dict of [str]key and [str]val.'''
        logger = LogManager(__name__, 'FACT MANAGER REMOVE FACTS')

        try:
            # Clear the readonly block in case the index has been set to read-only
            self.es_m.clear_readonly_block()

            query = self._fact_deletion_query(rm_facts_dict)
            self.es_m.load_combined_query(query)
            response = self.es_m.scroll(size=bs, field_scroll=self.field)
            scroll_id = response['_scroll_id']
            total_docs = response['hits']['total']
            docs_left = total_docs  # DEBUG
            print('Starting.. Total docs - ', total_docs)  # DEBUG
            batch = 0
            while total_docs > 0:
                print('Docs left:', docs_left)  # DEBUG
                data = ''
                for document in response['hits']['hits']:
                    new_field = []  # The new facts field
                    for fact in document['_source'][self.field]:
                        # If the fact name is in rm_facts_dict keys
                        if fact["fact"] in rm_facts_dict:
                            # Keep the fact only if its value is not among the values to delete
                            if fact['str_val'] not in rm_facts_dict.getlist(fact["fact"]):
                                new_field.append(fact)
                        else:
                            new_field.append(fact)

                    # Update dataset
                    data += json.dumps({"update": {"_id": document['_id'], "_type": document['_type'], "_index": document['_index']}}) + '\n'
                    document = {'doc': {self.field: new_field}}
                    data += json.dumps(document) + '\n'

                response = self.es_m.scroll(scroll_id=scroll_id, size=bs, field_scroll=self.field)
                total_docs = len(response['hits']['hits'])
                docs_left -= bs  # DEBUG
                scroll_id = response['_scroll_id']
                self.es_m.plain_post_bulk(self.es_m.es_url, data)
                batch += 1
            print('DONE')  # DEBUG

            logger.set_context('docs_left', total_docs)
            logger.set_context('batch', batch)
            logger.info('remove_facts_from_document')
        except:
            print(traceback.format_exc())
            logger.set_context('es_params', self.es_params)
            logger.exception('remove_facts_from_document_failed')

    def tag_documents_with_fact(self, es_params, tag_name, tag_value, tag_field):
        '''Tag all documents in the current search with a certain fact.'''
        self.es_m.build(es_params)
        self.es_m.load_combined_query(self.es_m.combined_query)

        response = self.es_m.scroll()

        data = ''
        for document in response['hits']['hits']:
            if 'mlp' in tag_field:
                split_field = tag_field.split('.')
                span = [0, len(document['_source'][split_field[0]][split_field[1]])]
            else:
                span = [0, len(document['_source'][tag_field].strip())]
            document['_source'][self.field].append({"str_val": tag_value, "spans": str([span]), "fact": tag_name, "doc_path": tag_field})

            data += json.dumps({"update": {"_id": document['_id'], "_type": document['_type'], "_index": document['_index']}}) + '\n'
            document = {'doc': {self.field: document['_source'][self.field]}}
            data += json.dumps(document) + '\n'
        self.es_m.plain_post_bulk(self.es_m.es_url, data)
        response = requests.post('{0}/{1}/_update_by_query?refresh&conflicts=proceed'.format(self.es_m.es_url, self.index), headers=self.es_m.HEADERS)

    def count_cooccurrences(self, fact_pairs):
        """Find the counts of co-occurring facts.

        Arguments:
            fact_pairs {list of tuples of tuples} -- Example: [(('ORG', 'Riigikohus'), ('PER', 'Jaan')), (('ORG', 'Riigikohus'), ('PER', 'Peeter'))]

        Returns:
            [int list] -- Occurrences of the given facts
        """
        queries = []
        for fact_pair in fact_pairs:
            fact_constraints = []

            for fact in fact_pair:
                constraint = {"nested": {"path": "texta_facts", "query": {"bool": {"must": [
                    {"term": {"texta_facts.fact": fact[0]}},
                    {"term": {"texta_facts.str_val": fact[1]}}
                ]}}}}
                fact_constraints.append(constraint)

            query = {"query": {"bool": {"must": fact_constraints}}, "size": 0}
            queries.append(json.dumps(query))

        header = json.dumps({"index": self.index})
        data = "\n".join(["{0}\n{1}".format(header, q) for q in queries]) + "\n"

        responses = requests.post("{0}/{1}/_msearch".format(self.es_m.es_url, self.index), data=data, headers={"Content-Type": "application/json"})
        counts = [response["hits"]["total"] for response in responses.json()['responses']]

        return counts

    def facts_via_aggregation(self, size=15):
        """Find all facts from the current search.

        Parameters:
            size - [int=15] -- Amount of fact values per fact name to search in query

        Returns:
            facts - [dict] -- Details for each fact, ex: {'PER - kostja': {'id': 0, 'name': 'PER', 'value': 'kostja', 'doc_count': 44}}
            fact_combinations - [list of tuples] -- All possible combinations of all facts: [(('FIRST_FACTNAME', 'FIRST_FACTVAL'), ('SECOND_FACTNAME', 'SECOND_FACTVAL'))]
            unique_fact_names - [list of string] -- All unique fact names
        """
        aggs = {"facts": {"nested": {"path": "texta_facts"}, "aggs": {"fact_names": {"terms": {"field": "texta_facts.fact"}, "aggs": {"fact_values": {"terms": {"field": "texta_facts.str_val", "size": size}}}}}}}
        self.es_m.build(self.es_params)
        self.es_m.set_query_parameter('aggs', aggs)
        response = self.es_m.search()

        response_aggs = response['aggregations']['facts']['fact_names']['buckets']

        facts = {}
        fact_combinations = []
        fact_count = 0
        unique_fact_names = []
        for bucket in response_aggs:
            unique_fact_names.append(bucket['key'])
            for fact in bucket['fact_values']['buckets']:
                facts[bucket['key'] + " - " + fact['key']] = {'id': fact_count, 'name': bucket['key'], 'value': fact['key'], 'doc_count': fact['doc_count']}
                fact_combinations.append((bucket['key'], fact['key']))
                fact_count += 1

        fact_combinations = [x for x in itertools.combinations(fact_combinations, 2)]
        return (facts, fact_combinations, unique_fact_names)

    def fact_graph(self, search_size):
        facts, fact_combinations, unique_fact_names = self.facts_via_aggregation(size=search_size)
        # Get co-occurrence counts and remove pairs that never co-occur
        fact_combinations = {k: v for k, v in dict(zip(fact_combinations, self.count_cooccurrences(fact_combinations))).items() if v != 0}

        shapes = ["circle", "cross", "diamond", "square", "triangle-down", "triangle-up"]
        types = dict(zip(unique_fact_names, itertools.cycle(shapes)))

        nodes = []
        # Initialised here so the empty-result case does not raise NameError
        max_node_size = 0
        min_node_size = 0
        for i, fact in enumerate(facts):
            nodes.append({"source": facts[fact]['id'], "size": facts[fact]['doc_count'], "score": facts[fact]['doc_count'], "name": facts[fact]['name'], "id": facts[fact]['value'], "type": types[facts[fact]['name']]})
            # Track max/min count
            count = facts[fact]['doc_count']
            if i == 0:
                max_node_size = count
                min_node_size = count
            max_node_size = max(max_node_size, count)
            min_node_size = min(min_node_size, count)

        links = []
        max_link_size = 0
        for fact in fact_combinations.keys():
            max_link_size = max(max_link_size, fact_combinations[fact])
            links.append({"source": facts[fact[0][0] + " - " + fact[0][1]]['id'], "target": facts[fact[1][0] + " - " + fact[1][1]]['id'], "count": fact_combinations[fact]})

        graph_data = json.dumps({"nodes": nodes, "links": links})
        return (graph_data, unique_fact_names, max_node_size, max_link_size, min_node_size)

    def _fact_deletion_query(self, rm_facts_dict):
        '''Create the query for fact deletion, based on a dict of facts {name: val}.'''
        fact_queries = []
        for key in rm_facts_dict:
            for val in rm_facts_dict.getlist(key):
                fact_queries.append({"bool": {"must": [
                    {"match": {self.field + ".fact": key}},
                    {"match": {self.field + ".str_val": val}}
                ]}})

        query = {"main": {"query": {"nested": {"path": self.field, "query": {"bool": {"should": fact_queries}}}}, "_source": [self.field]}}
        return query
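# --- Usage sketch (not part of the original module) ---
# A minimal example of how FactManager might be driven from a Django view,
# assuming the class above is importable and the request carries the POST
# parameters the Searcher front end would send. The view name
# `fact_graph_view` and the `search_size` parameter name are hypothetical.
import json

from django.http import HttpResponse


def fact_graph_view(request):
    fact_m = FactManager(request)  # reads es_params, dataset and mapping from the request
    search_size = int(request.POST.get('search_size', 15))  # hypothetical parameter name

    graph_data, fact_names, max_node, max_link, min_node = fact_m.fact_graph(search_size)

    return HttpResponse(json.dumps({
        'graph_data': graph_data,           # already JSON-encoded nodes/links
        'unique_fact_names': fact_names,
        'max_node_size': max_node,
        'max_link_size': max_link,
        'min_node_size': min_node,
    }), content_type='application/json')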
class EsDataClassification(object):
    def __init__(self, es_index, es_mapping, field, query):
        # Dataset info
        self.es_index = es_index
        self.es_mapping = es_mapping
        self.field = field
        # Build ES manager
        self.es_m = ES_Manager(es_index, es_mapping)
        self.es_m.load_combined_query(query)

    def get_total_documents(self):
        return self.es_m.get_total_documents()

    def get_tags_by_id(self, doc_id):
        request_url = '{0}/{1}/{2}/{3}'.format(self.es_m.es_url, self.es_index, self.es_mapping, doc_id)
        response = ES_Manager.plain_get(request_url)

        if 'texta_tags' in response['_source']:
            tags = response['_source']['texta_tags']
        else:
            tags = ""

        return tags.split()

    def apply_classifiers(self, classifiers, classifier_tags):
        if not isinstance(classifiers, list):
            classifiers = [classifiers]
        if not isinstance(classifier_tags, list):
            classifier_tags = [classifier_tags]

        response = self.es_m.scroll()

        scroll_id = response['_scroll_id']
        total_hits = response['hits']['total']
        total_processed = 0
        positive_docs = []
        positive_docs_batch = []
        batch_size = 1000

        # Get all positive documents
        while total_hits > 0:
            # Check errors in the database request
            if (response['_shards']['total'] > 0 and response['_shards']['successful'] == 0) or response['timed_out']:
                msg = 'Elasticsearch failed to retrieve documents: ' \
                      '*** Shards: {0} *** Timeout: {1} *** Took: {2}'.format(response['_shards'], response['timed_out'], response['took'])
                raise EsIteratorError(msg)

            for hit in response['hits']['hits']:
                positive_docs_batch.append(((str(hit['_id'])), hit['_source']))

                if len(positive_docs_batch) >= batch_size:
                    positive_docs_per_classifier = self._apply_classifiers_to_documents(positive_docs_batch, classifiers, classifier_tags)
                    # Count the batch before clearing it
                    total_processed += len(positive_docs_batch)
                    positive_docs_batch = []

            # New scroll request
            response = self.es_m.scroll(scroll_id=scroll_id)
            total_hits = len(response['hits']['hits'])

        if positive_docs_batch:
            positive_docs_per_classifier = self._apply_classifiers_to_documents(positive_docs_batch, classifiers, classifier_tags)
            total_processed += len(positive_docs_batch)

        data = {}
        data['total_processed'] = total_processed
        data['total_positive'] = positive_docs_per_classifier[0] if len(classifiers) == 1 else positive_docs_per_classifier

        if len(classifiers) == 1:
            data['total_negative'] = total_processed - positive_docs_per_classifier[0]
        else:
            data['total_negative'] = [total_processed - positive_docs_count for positive_docs_count in positive_docs_per_classifier]

        data['total_documents'] = self.get_total_documents()

        return data

    def _apply_classifiers_to_documents(self, documents, classifiers, classifier_tags):
        """
        :param documents: list of (doc_id, document) entries
        :return: list with the number of positive predictions per classifier
        """
        field_path_components = self.field.split('.')

        fields_data = []
        for document in documents:
            # Traverse the nested fields to reach the sought input text/data for the classifier
            field_data = document[1]
            for field_path_component in field_path_components:
                field_data = field_data[field_path_component]
            fields_data.append(field_data)

        positive_docs = []
        classifiers_predictions = []
        for classifier in classifiers:
            predictions = classifier.predict(fields_data)
            classifiers_predictions.append(predictions)
            positive_docs.append(sum(predictions))

        bulk_update_content = []

        for document_idx, document in enumerate(documents):
            document_id, document = document

            if 'texta_tags' in document:
                tags = set([tag.strip() for tag in document['texta_tags'].split('\n')])
            else:
                tags = set()

            new_tags = False
            for classifier_idx, classifier_predictions in enumerate(classifiers_predictions):
                if classifier_predictions[document_idx] == 1:
                    tag_count_before = len(tags)
                    tags.add(classifier_tags[classifier_idx])
                    if len(tags) > tag_count_before:
                        new_tags = True

            if new_tags:
                bulk_update_content.append(json.dumps({'update': {'_id': document_id, '_index': self.es_index, '_type': self.es_mapping}}))
                bulk_update_content.append(json.dumps({'doc': {'texta_tags': '\n'.join(sorted(tags))}}))

        bulk_update_content.append('')
        bulk_update_content = '\n'.join(bulk_update_content)

        self.es_m.plain_post_bulk(self.es_m.es_url, bulk_update_content)

        return positive_docs
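# --- Usage sketch (not part of the original module) ---
# A minimal sketch of how EsDataClassification.apply_classifiers might be called.
# The only interface the code above relies on is a classifier object exposing
# .predict(list_of_texts) that returns 0/1 labels, so a stub is used here; the
# index, mapping, field and tag values are illustrative placeholders.


class KeywordClassifierStub(object):
    """Hypothetical stand-in for a trained model; predicts 1 if a keyword occurs."""

    def __init__(self, keyword):
        self.keyword = keyword

    def predict(self, texts):
        return [1 if self.keyword in text else 0 for text in texts]


def run_classification_example(es_index, es_mapping, query):
    classifier_runner = EsDataClassification(es_index, es_mapping, 'text.lemmas', query)  # placeholder field path
    result = classifier_runner.apply_classifiers(
        classifiers=KeywordClassifierStub('court'),
        classifier_tags='court_related',
    )
    # result holds total_processed / total_positive / total_negative / total_documents
    return result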
def get_next_page_data(query, es_from, last_page, query_data, request):
    start_from = None
    end = None
    rows = []

    page_length = query_data['page_length']

    if es_from is None:
        return {'rows': rows, 'from': start_from, 'page': last_page, 'end': None, 'total': None}

    dataset = query_data['dataset']
    mapping = query_data['mapping']
    polarity = query_data['polarity']
    inclusive_instructions = query_data['inclusive_instructions']
    exclusive_instructions = query_data['exclusive_instructions']

    query['from'] = es_from

    response = ES_Manager.plain_search(es_url, dataset, mapping, query)

    try:
        hit = response['hits']['hits'][0]
        feature_dict = {feature_name: hit['_source'][feature_name][0] for feature_name in hit['_source']}
        sorted_feature_names = sorted(feature_dict)
        feature_to_idx_map = {feature: (feature_idx + 1) for feature_idx, feature in enumerate(sorted_feature_names)}
    except:
        pass

    hit_idx = page_length - 1

    while len(rows) < page_length and 'hits' in response and 'hits' in response['hits'] and response['hits']['hits'] and hit_idx + 1 == page_length:
        for hit_idx, hit in enumerate(response['hits']['hits']):
            if len(rows) >= page_length:
                break

            feature_dict = {}
            for field_name in hit['_source']:
                field_value = hit['_source'][field_name]
                if isinstance(field_value, dict):
                    for subfield_name, subfield_value in field_value.items():
                        combined_field_name = '{0}.{1}'.format(field_name, subfield_name)
                        feature_dict[combined_field_name] = subfield_value
                else:
                    feature_dict[field_name] = field_value

            sorted_feature_names = sorted(feature_dict)
            feature_to_idx_map = defaultdict(list)
            for feature_idx, feature in enumerate(sorted_feature_names):
                feature_to_idx_map[feature.split('.')[0]].append(feature_idx + 1)

            row = [hit['_id']]
            row.extend([feature_dict[feature_name] for feature_name in sorted_feature_names])

            layer_dict = matcher.LayerDict(feature_dict)
            inclusive_matches = inclusive_instructions.match(layer_dict)

            if (polarity == 'positive') == bool(inclusive_matches):
                # Add the row if polarity is positive and we have a match, or if it is negative and we don't
                if len(rows) == 0:
                    start_from = query['from'] + hit_idx
                end = query['from'] + hit_idx
                if inclusive_matches:
                    row = highlight(row, feature_to_idx_map, inclusive_matches)
                rows.append(row)

        query['from'] = query['from'] + hit_idx + 1

        if len(rows) >= page_length:
            break

        response = ES_Manager.plain_search(es_url, dataset, mapping, query)

    if end:
        GrammarPageMapping(search_id=query_data['search_id'], inclusive_grammar=query_data['inclusive_grammar_id'],
                           exclusive_grammar=query_data['exclusive_grammar_id'], page=query_data['requested_page'],
                           polarity=query_data['polarity'], elastic_start=start_from, elastic_end=end + 1,
                           author=request.user).save()

    return {'rows': rows, 'from': start_from, 'page': last_page + 1,
            'end': (end + (0 if last_page == 0 else 1)) if end else end,
            'total': response['hits']['total']}
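# --- Illustration (not part of the original module) ---
# The keys below are the ones get_next_page_data() actually reads from
# query_data; the concrete values are placeholders. inclusive_instructions is
# expected to expose .match(layer_dict), matching the matcher.LayerDict usage above.
example_query_data = {
    'page_length': 50,
    'dataset': 'journal_articles',        # placeholder index name
    'mapping': 'articles',                # placeholder mapping name
    'polarity': 'positive',               # or 'negative'
    'inclusive_instructions': None,       # grammar/matcher object with .match()
    'exclusive_instructions': None,       # accepted but not used by the function above
    'search_id': 1,
    'inclusive_grammar_id': 1,
    'exclusive_grammar_id': 2,
    'requested_page': 1,
}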
def find_mappings(request):
    try:
        slop = int(request.POST['slop'])
        max_len = int(request.POST['max_len'])
        min_len = int(request.POST['min_len'])
        min_freq = int(request.POST['min_freq'])
        match_field = request.POST['match_field']
        description = request.POST['description']

        batch_size = 50

        # Define selected mapping
        ds = Datasets().activate_dataset(request.session)
        dataset = ds.get_index()
        mapping = ds.get_mapping()

        lexicon = []
        word_index = {}
        num_lexicons = 0

        for i, lexicon_id in enumerate(request.POST.getlist('lexicons[]')):
            num_lexicons += 1
            for word in Word.objects.filter(lexicon=lexicon_id):
                word = word.wrd
                lexicon.append(word)
                if word not in word_index:
                    word_index[word] = []
                word_index[word].append(i)

        lexicon = list(set(lexicon))

        if min_len > num_lexicons:
            min_len = num_lexicons

        mwe_counter = 0
        group_counter = 0

        phrases = []
        final = {}
        data = []

        new_run = Run(minimum_frequency=min_freq, maximum_length=max_len, minimum_length=min_len, run_status='running',
                      run_started=datetime.now(), run_completed=None, user=request.user, description=description)
        new_run.save()

        logging.getLogger(INFO_LOGGER).info(json.dumps({'process': 'MINE MWEs', 'event': 'mwe_mining_started', 'args': {'user_name': request.user.username, 'run_id': new_run.id, 'slop': slop, 'min_len': min_len, 'max_len': max_len, 'min_freq': min_freq, 'match_field': match_field, 'desc': description}}))

        for i in range(min_len, max_len + 1):
            print('Permutation len:', i)
            for permutation in itertools.permutations(lexicon, i):
                word_indices = list(flatten([word_index[word] for word in permutation]))
                if len(word_indices) == len(set(word_indices)):
                    permutation = ' '.join(permutation)

                    if slop > 0:
                        query = {"query": {"match_phrase": {match_field: {"query": permutation, "slop": slop}}}}
                    else:
                        query = {"query": {"match_phrase": {match_field: {"query": permutation}}}}

                    data.append(json.dumps({"index": dataset, "mapping": mapping}) + '\n' + json.dumps(query))
                    phrases.append(permutation)

                    if len(data) == batch_size:
                        for j, response in enumerate(ES_Manager.plain_multisearch(es_url, dataset, mapping, data)):
                            try:
                                if response['hits']['total'] >= min_freq:
                                    sorted_phrase = ' '.join(sorted(phrases[j].split(' ')))
                                    sorted_conceptualised_phrase = conceptualise_phrase(sorted_phrase, request.user)
                                    if sorted_conceptualised_phrase not in final:
                                        final[sorted_conceptualised_phrase] = {'total_freq': 0, 'mwes': [], 'display_name': {'freq': 0, 'label': False}, 'id': group_counter}
                                        group_counter += 1
                                    final[sorted_conceptualised_phrase]['total_freq'] += response['hits']['total']
                                    final[sorted_conceptualised_phrase]['mwes'].append({'mwe': phrases[j], 'freq': response['hits']['total'], 'accepted': False, 'id': mwe_counter})
                                    mwe_counter += 1
                                    final[sorted_conceptualised_phrase]['mwes'].sort(reverse=True, key=lambda k: k['freq'])
                                    if response['hits']['total'] > final[sorted_conceptualised_phrase]['display_name']['freq']:
                                        final[sorted_conceptualised_phrase]['display_name']['freq'] = response['hits']['total']
                                        final[sorted_conceptualised_phrase]['display_name']['label'] = phrases[j]
                            except KeyError as e:
                                raise e
                        data = []
                        phrases = []

            logging.getLogger(INFO_LOGGER).info(json.dumps({'process': 'MINE MWEs', 'event': 'mwe_mining_progress', 'args': {'user_name': request.user.username, 'run_id': new_run.id}, 'data': {'permutations_processed': i + 1 - min_len, 'total_permutations': max_len - min_len + 1}}))

        m_response = ES_Manager.plain_multisearch(es_url, dataset, mapping, data)
        for j, response in enumerate(m_response):
            try:
                if response['hits']['total'] >= min_freq:
                    sorted_phrase = ' '.join(sorted(phrases[j].split(' ')))
                    sorted_conceptualised_phrase = conceptualise_phrase(sorted_phrase, request.user)
                    if sorted_conceptualised_phrase not in final:
                        final[sorted_conceptualised_phrase] = {'total_freq': 0, 'mwes': [], 'display_name': {'freq': 0, 'label': False}, 'id': group_counter}
                        group_counter += 1
                    final[sorted_conceptualised_phrase]['total_freq'] += response['hits']['total']
                    final[sorted_conceptualised_phrase]['mwes'].append({'mwe': phrases[j], 'freq': response['hits']['total'], 'accepted': False, 'id': mwe_counter})
                    mwe_counter += 1
                    final[sorted_conceptualised_phrase]['mwes'].sort(reverse=True, key=lambda k: k['freq'])
                    if response['hits']['total'] > final[sorted_conceptualised_phrase]['display_name']['freq']:
                        final[sorted_conceptualised_phrase]['display_name']['freq'] = response['hits']['total']
                        final[sorted_conceptualised_phrase]['display_name']['label'] = phrases[j]
            except KeyError as e:
                raise e

        for key in final:
            final[key]['concept_name'] = {'freq': -1, 'label': ''}

        r = Run.objects.get(pk=new_run.pk)
        r.run_completed = datetime.now()
        r.run_status = 'completed'
        r.results = json.dumps(final)
        r.save()

        logging.getLogger(INFO_LOGGER).info(json.dumps({'process': 'MINE MWEs', 'event': 'mwe_mining_completed', 'args': {'user_name': request.user.username, 'run_id': new_run.id}}))

    except Exception as e:
        print(e)
        logging.getLogger(ERROR_LOGGER).error(json.dumps({'process': 'MINE MWEs', 'event': 'mwe_mining_failed', 'args': {'user_name': request.user.username, 'run_id': new_run.id}}), exc_info=True)
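# --- Illustration (not part of the original module) ---
# A minimal sketch of the multi-search entry that find_mappings() builds for one
# word permutation: a header line naming the index/mapping, followed by a
# match_phrase query. With slop > 0, the phrase query tolerates that many
# positional gaps between the words. The helper name and the example arguments
# are hypothetical; only the JSON shape mirrors the code above.
import json


def build_mwe_query_entry(dataset, mapping, match_field, permutation, slop):
    if slop > 0:
        query = {"query": {"match_phrase": {match_field: {"query": permutation, "slop": slop}}}}
    else:
        query = {"query": {"match_phrase": {match_field: {"query": permutation}}}}
    # Header line + query line, exactly as appended to `data` in find_mappings()
    return json.dumps({"index": dataset, "mapping": mapping}) + '\n' + json.dumps(query)

# Example:
# build_mwe_query_entry('journal_articles', 'articles', 'text.lemmas', 'supreme court', slop=2)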
def get_allowed_datasets(self, user):
    indices = ES_Manager.get_indices()
    datasets = self.sort_datasets(indices)
    # print(datasets)
    return [dataset for dataset in datasets if user.has_perm('permission_admin.can_access_dataset_' + str(dataset['id']))]
class Autocomplete:

    def __init__(self):
        self.es_m = None
        self.lookup_types = None
        self.key_constraints = None
        self.content = None
        self.user = None
        self.limit = None

    def parse_request(self, request):
        self.lookup_types = request.POST['lookup_types'].split(',')
        self.key_constraints = request.POST['key_constraints'].split(',')
        self.content = request.POST['content'].split('\n')[-1].strip()
        print(self.content)

        ds = Datasets().activate_dataset(request.session)
        self.dataset = ds.get_index()
        self.mapping = ds.get_mapping()
        self.es_m = ES_Manager(self.dataset, self.mapping)
        self.user = request.user

    def suggest(self, limit=10):
        self.limit = limit
        suggestions = {}

        for i, lookup_type in enumerate(self.lookup_types):
            if lookup_type == 'FACT_NAME':
                suggestions['FACT_NAME'] = self._get_facts('fact', lookup_type)
            elif lookup_type == 'FACT_VAL':
                suggestions['FACT_VAL'] = self._get_facts('str_val', lookup_type, key_constraint=self.key_constraints[i])
            elif lookup_type == 'CONCEPT':
                suggestions['CONCEPT'] = self._get_concepts()
            elif lookup_type == 'LEXICON':
                suggestions['LEXICON'] = self._get_lexicons()

        return suggestions

    def _get_facts(self, agg_subfield, lookup_type, key_constraint=None):
        agg_query = {agg_subfield: {"nested": {"path": "texta_facts"}, "aggs": {agg_subfield: {"terms": {"field": "texta_facts.fact"}, "aggs": {"fact_values": {"terms": {"field": "texta_facts.str_val", "size": self.limit, "include": "{0}.*".format(self.content)}}}}}}}
        self.es_m.build('')
        self.es_m.set_query_parameter("aggs", agg_query)

        if lookup_type == 'FACT_VAL' and key_constraint:
            facts = []
            for bucket in self.es_m.search()["aggregations"][agg_subfield][agg_subfield]["buckets"]:
                if bucket["key"] == key_constraint:
                    facts += [self._format_suggestion(sub_bucket["key"], sub_bucket["key"]) for sub_bucket in bucket["fact_values"]["buckets"]]
        elif lookup_type == 'FACT_VAL' and not key_constraint:
            facts = []
            for bucket in self.es_m.search()["aggregations"][agg_subfield][agg_subfield]["buckets"]:
                facts += [self._format_suggestion(sub_bucket["key"], sub_bucket["key"]) for sub_bucket in bucket["fact_values"]["buckets"]]
        else:
            facts = [self._format_suggestion(a["key"], a["key"]) for a in self.es_m.search()["aggregations"][agg_subfield][agg_subfield]["buckets"]]

        return facts

    def _get_concepts(self):
        concepts = []
        if len(self.content) > 0:
            terms = Term.objects.filter(term__startswith=self.content).filter(author=self.user)
            seen = {}
            for term in terms[:self.limit]:
                for term_concept in TermConcept.objects.filter(term=term.pk):
                    concept = term_concept.concept
                    concept_term = (concept.pk, term.term)
                    if concept_term not in seen:
                        seen[concept_term] = True
                        display_term = term.term.replace(self.content, '<font color="red">' + self.content + '</font>')
                        display_text = '<b>{0}</b>@C{1}-{2}'.format(display_term, concept.pk, concept.descriptive_term.term)
                        suggestion = self._format_suggestion(concept.descriptive_term.term, display_text, resource_id=concept.pk)
                        concepts.append(suggestion)
        return concepts

    def _get_lexicons(self):
        suggested_lexicons = []
        if len(self.content) > 0:
            lexicons = Lexicon.objects.filter(name__startswith=self.content).filter(author=self.user)
            for lexicon in lexicons:
                display_term = lexicon.name.replace(self.content, '<font color="red">' + self.content + '</font>')
                display_text = '<b>{0}</b>@L{1}-{2}'.format(display_term, lexicon.pk, lexicon.name)
                suggestion = self._format_suggestion(lexicon.name, display_text, resource_id=lexicon.pk)
                suggested_lexicons.append(suggestion)
        return suggested_lexicons

    @staticmethod
    def _format_suggestion(entry_text, display_text, resource_id=''):
        return {'entry_text': entry_text, 'display_text': display_text, 'resource_id': resource_id}
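# --- Usage sketch (not part of the original module) ---
# A minimal example of wiring Autocomplete into a Django view, assuming the
# class above is importable; the view name `autocomplete_view` is hypothetical.
# The POST parameters mirror the ones parse_request() reads.
import json

from django.http import HttpResponse


def autocomplete_view(request):
    autocomplete = Autocomplete()
    autocomplete.parse_request(request)    # expects lookup_types, key_constraints, content in POST
    suggestions = autocomplete.suggest(limit=10)
    return HttpResponse(json.dumps(suggestions), content_type='application/json')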