def get_all_matched_rows(query, request, inclusive_instructions):
    buffer_ = StringIO()
    writer = csv.writer(buffer_)

    ds = Datasets().activate_dataset(request.session)
    es_m = ds.build_manager(ES_Manager)

    features = sorted([field['path'] for field in es_m.get_mapped_fields()])
    query['size'] = ES_SCROLL_BATCH

    writer.writerow(features)

    request_url = os.path.join(es_url, ds.get_index(), ds.get_mapping(), '_search?scroll=1m')
    response = requests.get(request_url, data=json.dumps(query)).json()

    scroll_id = response['_scroll_id']
    hits = response['hits']['hits']

    while hits:
        for hit in hits:
            feature_dict = {}
            row = []
            for feature_name in features:
                feature_path = feature_name.split('.')
                parent_source = hit['_source']
                # Walk the dotted feature path down the nested _source document.
                for path_component in feature_path:
                    if path_component in parent_source:
                        parent_source = parent_source[path_component]
                    else:
                        parent_source = ""
                        break
                content = parent_source
                row.append(content)
                feature_dict[feature_name] = content
            layer_dict = matcher.LayerDict(feature_dict)
            if inclusive_instructions.match(layer_dict):
                writer.writerow([element.encode('utf-8') if isinstance(element, unicode) else element for element in row])

        # Flush the rows accumulated for this batch and hand them to the caller.
        buffer_.seek(0)
        data = buffer_.read()
        buffer_.seek(0)
        buffer_.truncate()
        yield data

        # Continue the scroll, always using the most recent scroll id.
        scroll_payload = json.dumps({'scroll': '1m', 'scroll_id': scroll_id})
        response = requests.get(os.path.join(es_url, '_search', 'scroll'), data=scroll_payload).json()
        hits = response['hits']['hits']
        scroll_id = response['_scroll_id']
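# Usage sketch (not part of the original module): because get_all_matched_rows() yields CSV
# fragments one scroll batch at a time, it can back a streaming export endpoint. The view name,
# the 'metaquery' GET parameter and the call to generate_instructions() below are assumptions
# made purely for illustration.
from django.http import StreamingHttpResponse

def export_matched_rows_example(request):
    query = {'query': {'match_all': {}}}  # any Elasticsearch query body works here
    instructions = generate_instructions(json.loads(request.GET['metaquery']))  # hypothetical wiring
    rows = get_all_matched_rows(query, request, instructions)  # lazy generator of CSV chunks
    response = StreamingHttpResponse(rows, content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename="matched_rows.csv"'
    return response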
def index(request):
    # Define selected mapping
    ds = Datasets().activate_dataset(request.session)
    dataset = ds.get_index()
    mapping = ds.get_mapping()

    es_m = ds.build_manager(ES_Manager)
    fields = get_fields(es_m)

    searches = [{'id': search.pk, 'desc': search.description}
                for search in Search.objects.filter(author=request.user,
                                                    dataset__index=dataset,
                                                    dataset__mapping=mapping)]

    datasets = Datasets().get_allowed_datasets(request.user)
    language_models = Task.objects.filter(task_type=TaskTypes.TRAIN_MODEL.value).filter(status=Task.STATUS_COMPLETED).order_by('-pk')

    template = loader.get_template('grammar_builder.html')
    return HttpResponse(template.render({'STATIC_URL': STATIC_URL,
                                         'searches': searches,
                                         'features': fields,
                                         'language_models': language_models,
                                         'allowed_datasets': datasets}, request))
def save_grammar(request):
    grammar_dict = json.loads(request.POST['json'])
    grammar_id = grammar_dict[0]['id']

    if grammar_id == 'new':
        name = grammar_dict[0]['text']
        ds = Datasets().activate_dataset(request.session)
        dataset = ds.get_index()
        mapping = ds.get_mapping()
        grammar = Grammar(name=name, json='', author=request.user,
                          dataset=Dataset.objects.filter(index=dataset, mapping=mapping)[0])
        grammar.save()
        grammar_dict[0]['id'] = grammar.id
    else:
        grammar = Grammar.objects.get(id=grammar_id)

    grammar.json = json.dumps(grammar_dict)
    grammar.save()

    return HttpResponse(json.dumps({'id': grammar.id}))
def get_example_texts(request, field, value):
    ds = Datasets().activate_dataset(request.session)
    dataset = ds.get_index()
    mapping = ds.get_mapping()

    query = json.dumps({"size": 10,
                        "highlight": {"fields": {field: {}}},
                        "query": {"match": {field: value}}})

    response = ES_Manager.plain_scroll(es_url, dataset, mapping, query)

    matched_sentences = []
    for hit in response['hits']['hits']:
        for match in hit['highlight'].values():
            matched_sentences.append(match[0])

    return matched_sentences
def get_grammar_listing(request):
    ds = Datasets().activate_dataset(request.session)
    dataset = ds.get_index()
    mapping = ds.get_mapping()

    grammars = Grammar.objects.filter(author=request.user,
                                      dataset__index=dataset,
                                      dataset__mapping=mapping).order_by('-last_modified')

    grammar_json = json.dumps([{'id': grammar.id,
                                'name': grammar.name,
                                'last_modified': grammar.last_modified.strftime("%d/%m/%y %H:%M:%S")}
                               for grammar in grammars])

    return HttpResponse(grammar_json)
def parse_request(self, request):
    self.lookup_types = request.POST['lookup_types'].split(',')
    self.key_constraints = request.POST['key_constraints'].split(',')
    self.content = request.POST['content'].split('\n')[-1].strip()
    print(self.content)

    ds = Datasets().activate_dataset(request.session)
    self.dataset = ds.get_index()
    self.mapping = ds.get_mapping()
    self.es_m = ES_Manager(self.dataset, self.mapping)
    self.user = request.user
def get_table_header(request):
    ds = Datasets().activate_dataset(request.session)
    es_m = ds.build_manager(ES_Manager)

    # Get column names from the ES mapping
    fields = es_m.get_column_names()

    template_params = {'STATIC_URL': STATIC_URL,
                       'URL_PREFIX': URL_PREFIX,
                       'fields': fields,
                       'searches': Search.objects.filter(author=request.user),
                       'columns': [{'index': index, 'name': field_name} for index, field_name in enumerate(fields)],
                       'dataset': ds.get_index(),
                       'mapping': ds.get_mapping()}

    template = loader.get_template('searcher_results.html')
    return HttpResponse(template.render(template_params, request))
def __init__(self, request):
    ds = Datasets().activate_dataset(request.session)
    self.dataset = ds.get_index()
    self.mapping = ds.get_mapping()
    self.es_m = ES_Manager(self.dataset, self.mapping)

    # PREPARE AGGREGATION
    self.es_params = request.POST
    interval = self.es_params["interval_1"]
    self.daterange = self._get_daterange(self.es_params)
    self.ranges, self.date_labels = self._get_date_intervals(self.daterange, interval)
    self.agg_query = self.prepare_agg_query()

    # EXECUTE AGGREGATION
    agg_results = self.aggregate()

    # PARSE RESPONSES INTO JSON OBJECT
    self.agg_data = self.parse_responses(agg_results)
def get_table_data(request):
    query_data = {}
    query_data['search_id'] = request.GET['search_id']
    query_data['polarity'] = request.GET['polarity']
    # Integer division keeps the page index an int on both Python 2 and 3.
    query_data['requested_page'] = int(request.GET['iDisplayStart']) // int(request.GET['iDisplayLength']) + 1
    query_data['page_length'] = int(request.GET['iDisplayLength'])

    if request.GET['is_test'] == 'true':
        query_data['inclusive_metaquery'] = json.loads(request.GET['inclusive_test_grammar'])
        query_data['inclusive_grammar_id'] = -1
        query_data['exclusive_grammar_id'] = -1
        query_data['features'] = sorted(extract_layers(query_data['inclusive_metaquery']))
    else:
        query_data['inclusive_grammar_id'] = request.GET['inclusive_grammar_id']
        query_data['exclusive_grammar_id'] = request.GET['exclusive_grammar_id']
        query_data['inclusive_metaquery'] = generate_metaquery_dict(int(query_data['inclusive_grammar_id']), request.user, component={})
        query_data['exclusive_metaquery'] = generate_metaquery_dict(int(query_data['exclusive_grammar_id']), request.user, component={})
        query_data['features'] = sorted(extract_layers(query_data['inclusive_metaquery']) |
                                        extract_layers(query_data['exclusive_metaquery']))

    GrammarPageMapping.objects.filter(search_id=query_data['search_id'],
                                      inclusive_grammar=query_data['inclusive_grammar_id'],
                                      exclusive_grammar=query_data['exclusive_grammar_id'],
                                      polarity=query_data['polarity'],
                                      author=request.user).delete()

    ds = Datasets().activate_dataset(request.session)
    query_data['dataset'] = ds.get_index()
    query_data['mapping'] = ds.get_mapping()

    component_query = ElasticGrammarQuery(query_data['inclusive_metaquery'], None).generate()

    es_m = ds.build_manager(ES_Manager)

    if query_data['search_id'] != '-1':
        saved_query = json.loads(Search.objects.get(pk=query_data['search_id']).query)
        es_m.load_combined_query(saved_query)
        if query_data['polarity'] == 'positive':
            es_m.merge_combined_query_with_query_dict(component_query)
    else:
        #es_m.combined_query = {"main": {"query": {"bool": {"should": [{"match_all":{}}], "must": [], "must_not": []}}},
        #                       "facts": {"include": [], 'total_include': 0,
        #                                 "exclude": [], 'total_exclude': 0}}
        es_m.combined_query = {"main": {"query": {"match_all": {}}}}
        if query_data['polarity'] == 'positive':
            es_m.combined_query = component_query

    # Add paging data to the query
    #es_m.set_query_parameter('from', request.session['grammar_'+polarity+'_cursor'])
    es_m.set_query_parameter('size', request.GET['iDisplayLength'])
    es_m.set_query_parameter('_source', query_data['features'])

    query_data['inclusive_instructions'] = generate_instructions(query_data['inclusive_metaquery'])
    query_data['exclusive_instructions'] = {}  #generate_instructions(query_data['exclusive_metaquery'])

    data = scroll_data(es_m.combined_query['main'], request, query_data)
    data['sEcho'] = request.GET['sEcho']

    return HttpResponse(json.dumps(data, ensure_ascii=False))
def facts_agg(es_params, request):
    logger = LogManager(__name__, 'FACTS AGGREGATION')

    distinct_values = []
    query_results = []
    lexicon = []
    aggregation_data = es_params['aggregate_over']
    aggregation_data = json.loads(aggregation_data)
    original_aggregation_field = aggregation_data['path']
    aggregation_field = 'texta_link.facts'

    try:
        aggregation_size = 50
        aggregations = {"strings": {es_params['sort_by']: {"field": aggregation_field, 'size': 0}},
                        "distinct_values": {"cardinality": {"field": aggregation_field}}}

        # Define selected mapping
        ds = Datasets().activate_dataset(request.session)
        dataset = ds.get_index()
        mapping = ds.get_mapping()
        date_range = ds.get_date_range()
        es_m = ES_Manager(dataset, mapping, date_range)

        for item in es_params:
            if 'saved_search' in item:
                s = Search.objects.get(pk=es_params[item])
                name = s.description
                saved_query = json.loads(s.query)
                es_m.load_combined_query(saved_query)
                es_m.set_query_parameter('aggs', aggregations)
                response = es_m.search()

                # Filter response
                bucket_filter = '{0}.'.format(original_aggregation_field.lower())
                final_bucket = []
                for b in response['aggregations']['strings']['buckets']:
                    if bucket_filter in b['key']:
                        fact_name = b['key'].split('.')[-1]
                        b['key'] = fact_name
                        final_bucket.append(b)
                final_bucket = final_bucket[:aggregation_size]
                response['aggregations']['distinct_values']['value'] = len(final_bucket)
                response['aggregations']['strings']['buckets'] = final_bucket

                normalised_counts, labels = normalise_agg(response, es_m, es_params, 'strings')
                lexicon = list(set(lexicon + labels))
                query_results.append({'name': name, 'data': normalised_counts, 'labels': labels})
                distinct_values.append({'name': name, 'data': response['aggregations']['distinct_values']['value']})

        es_m.build(es_params)

        # FIXME: this is confusing for the user
        if not es_m.is_combined_query_empty():
            es_m.set_query_parameter('aggs', aggregations)
            response = es_m.search()

            # Filter response
            bucket_filter = '{0}.'.format(original_aggregation_field.lower())
            final_bucket = []
            for b in response['aggregations']['strings']['buckets']:
                if bucket_filter in b['key']:
                    fact_name = b['key'].split('.')[-1]
                    b['key'] = fact_name
                    final_bucket.append(b)
            final_bucket = final_bucket[:aggregation_size]
            response['aggregations']['distinct_values']['value'] = len(final_bucket)
            response['aggregations']['strings']['buckets'] = final_bucket

            normalised_counts, labels = normalise_agg(response, es_m, es_params, 'strings')
            lexicon = list(set(lexicon + labels))
            query_results.append({'name': 'Query', 'data': normalised_counts, 'labels': labels})
            distinct_values.append({'name': 'Query', 'data': response['aggregations']['distinct_values']['value']})

        data = [a + zero_list(len(query_results)) for a in map(list, zip(*[lexicon]))]
        data = [['Word'] + [query_result['name'] for query_result in query_results]] + data

        for i, word in enumerate(lexicon):
            for j, query_result in enumerate(query_results):
                for k, label in enumerate(query_result['labels']):
                    if word == label:
                        data[i + 1][j + 1] = query_result['data'][k]

        logger.set_context('user_name', request.user.username)
        logger.info('facts_aggregation_queried')

    except Exception as e:
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('user_name', request.user.username)
        logger.exception('facts_aggregation_query_failed')

    table_height = len(data) * 15
    table_height = table_height if table_height > 500 else 500

    return {'data': [data[0]] + sorted(data[1:], key=lambda x: sum(x[1:]), reverse=True),
            'height': table_height,
            'type': 'bar',
            'distinct_values': json.dumps(distinct_values)}
def find_mappings(request):
    try:
        slop = int(request.POST['slop'])
        max_len = int(request.POST['max_len'])
        min_len = int(request.POST['min_len'])
        min_freq = int(request.POST['min_freq'])
        match_field = request.POST['match_field']
        description = request.POST['description']
        batch_size = 50

        # Define selected mapping
        ds = Datasets().activate_dataset(request.session)
        dataset = ds.get_index()
        mapping = ds.get_mapping()

        lexicon = []
        word_index = {}
        num_lexicons = 0

        for i, lexicon_id in enumerate(request.POST.getlist('lexicons[]')):
            num_lexicons += 1
            for word in Word.objects.filter(lexicon=lexicon_id):
                word = word.wrd
                lexicon.append(word)
                if word not in word_index:
                    word_index[word] = []
                word_index[word].append(i)

        lexicon = list(set(lexicon))

        if min_len > num_lexicons:
            min_len = num_lexicons

        mwe_counter = 0
        group_counter = 0
        phrases = []
        final = {}
        data = []

        new_run = Run(minimum_frequency=min_freq, maximum_length=max_len, minimum_length=min_len,
                      run_status='running', run_started=datetime.now(), run_completed=None,
                      user=request.user, description=description)
        new_run.save()

        logging.getLogger(INFO_LOGGER).info(json.dumps(
            {'process': 'MINE MWEs',
             'event': 'mwe_mining_started',
             'args': {'user_name': request.user.username, 'run_id': new_run.id, 'slop': slop,
                      'min_len': min_len, 'max_len': max_len, 'min_freq': min_freq,
                      'match_field': match_field, 'desc': description}}))

        for i in range(min_len, max_len + 1):
            print('Permutation len:', i)
            for permutation in itertools.permutations(lexicon, i):
                word_indices = list(flatten([word_index[word] for word in permutation]))
                # Keep only permutations whose words all come from different lexicons.
                if len(word_indices) == len(set(word_indices)):
                    permutation = ' '.join(permutation)
                    if slop > 0:
                        query = {"query": {"match_phrase": {match_field: {"query": permutation, "slop": slop}}}}
                    else:
                        query = {"query": {"match_phrase": {match_field: {"query": permutation}}}}
                    data.append(json.dumps({"index": dataset, "mapping": mapping}) + '\n' + json.dumps(query))
                    phrases.append(permutation)

                    if len(data) == batch_size:
                        for j, response in enumerate(ES_Manager.plain_multisearch(es_url, dataset, mapping, data)):
                            try:
                                if response['hits']['total'] >= min_freq:
                                    sorted_phrase = ' '.join(sorted(phrases[j].split(' ')))
                                    sorted_conceptualised_phrase = conceptualise_phrase(sorted_phrase, request.user)
                                    if sorted_conceptualised_phrase not in final:
                                        final[sorted_conceptualised_phrase] = {'total_freq': 0,
                                                                               'mwes': [],
                                                                               'display_name': {'freq': 0, 'label': False},
                                                                               'id': group_counter}
                                        group_counter += 1
                                    final[sorted_conceptualised_phrase]['total_freq'] += response['hits']['total']
                                    final[sorted_conceptualised_phrase]['mwes'].append({'mwe': phrases[j],
                                                                                        'freq': response['hits']['total'],
                                                                                        'accepted': False,
                                                                                        'id': mwe_counter})
                                    mwe_counter += 1
                                    final[sorted_conceptualised_phrase]['mwes'].sort(reverse=True, key=lambda k: k['freq'])
                                    if response['hits']['total'] > final[sorted_conceptualised_phrase]['display_name']['freq']:
                                        final[sorted_conceptualised_phrase]['display_name']['freq'] = response['hits']['total']
                                        final[sorted_conceptualised_phrase]['display_name']['label'] = phrases[j]
                            except KeyError as e:
                                raise e
                        data = []
                        phrases = []

            logging.getLogger(INFO_LOGGER).info(json.dumps(
                {'process': 'MINE MWEs',
                 'event': 'mwe_mining_progress',
                 'args': {'user_name': request.user.username, 'run_id': new_run.id},
                 'data': {'permutations_processed': i + 1 - min_len,
                          'total_permutations': max_len - min_len + 1}}))

        # Process the final, partially filled batch.
        m_response = ES_Manager.plain_multisearch(es_url, dataset, mapping, data)
        for j, response in enumerate(m_response):
            try:
                if response['hits']['total'] >= min_freq:
                    sorted_phrase = ' '.join(sorted(phrases[j].split(' ')))
                    sorted_conceptualised_phrase = conceptualise_phrase(sorted_phrase, request.user)
                    if sorted_conceptualised_phrase not in final:
                        final[sorted_conceptualised_phrase] = {'total_freq': 0,
                                                               'mwes': [],
                                                               'display_name': {'freq': 0, 'label': False},
                                                               'id': group_counter}
                        group_counter += 1
                    final[sorted_conceptualised_phrase]['total_freq'] += response['hits']['total']
                    final[sorted_conceptualised_phrase]['mwes'].append({'mwe': phrases[j],
                                                                        'freq': response['hits']['total'],
                                                                        'accepted': False,
                                                                        'id': mwe_counter})
                    mwe_counter += 1
                    final[sorted_conceptualised_phrase]['mwes'].sort(reverse=True, key=lambda k: k['freq'])
                    if response['hits']['total'] > final[sorted_conceptualised_phrase]['display_name']['freq']:
                        final[sorted_conceptualised_phrase]['display_name']['freq'] = response['hits']['total']
                        final[sorted_conceptualised_phrase]['display_name']['label'] = phrases[j]
            except KeyError as e:
                raise e

        for key in final:
            final[key]['concept_name'] = {'freq': -1, 'label': ''}

        r = Run.objects.get(pk=new_run.pk)
        r.run_completed = datetime.now()
        r.run_status = 'completed'
        r.results = json.dumps(final)
        r.save()

        logging.getLogger(INFO_LOGGER).info(json.dumps(
            {'process': 'MINE MWEs',
             'event': 'mwe_mining_completed',
             'args': {'user_name': request.user.username, 'run_id': new_run.id}}))

    except Exception as e:
        print(e)
        logging.getLogger(ERROR_LOGGER).error(json.dumps(
            {'process': 'MINE MWEs',
             'event': 'mwe_mining_failed',
             'args': {'user_name': request.user.username, 'run_id': new_run.id}}),
            exc_info=True)
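# Illustrative sketch (not from the original module): the duplicate-index check in find_mappings()
# keeps only permutations whose words all come from different lexicons. The lexicon contents and
# the helper name below are made up for the example; word_index maps each word to the lexicons it
# occurs in, exactly as built in find_mappings().
def _cross_lexicon_permutations_example():
    word_index = {'supreme': [0], 'court': [0], 'ruled': [1], 'decided': [1]}
    lexicon = list(word_index.keys())
    kept = []
    for permutation in itertools.permutations(lexicon, 2):
        word_indices = [i for word in permutation for i in word_index[word]]
        if len(word_indices) == len(set(word_indices)):  # every word from a distinct lexicon
            kept.append(' '.join(permutation))
    # 'supreme court' and 'ruled decided' are dropped; mixed phrases like 'supreme ruled' are kept.
    return kept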
class FactManager:
    """ Manage Searcher facts: adding, storing and deleting them. """

    def __init__(self, request):
        self.es_params = request.POST
        self.ds = Datasets().activate_dataset(request.session)
        self.index = self.ds.get_index()
        self.mapping = self.ds.get_mapping()
        self.es_m = ES_Manager(self.index, self.mapping)
        self.field = 'texta_facts'

    def remove_facts_from_document(self, rm_facts_dict, bs=7500):
        '''Remove a certain fact from all documents given a [str]key and [str]val'''
        logger = LogManager(__name__, 'FACT MANAGER REMOVE FACTS')

        try:
            # Clears readonly block just in case the index has been set to read only
            self.es_m.clear_readonly_block()

            query = self._fact_deletion_query(rm_facts_dict)
            self.es_m.load_combined_query(query)
            response = self.es_m.scroll(size=bs, field_scroll=self.field)
            scroll_id = response['_scroll_id']
            total_docs = response['hits']['total']
            docs_left = total_docs  # DEBUG
            print('Starting.. Total docs - ', total_docs)  # DEBUG
            batch = 0
            while total_docs > 0:
                print('Docs left:', docs_left)  # DEBUG
                data = ''
                for document in response['hits']['hits']:
                    new_field = []  # The new facts field
                    for fact in document['_source'][self.field]:
                        # If the fact name is in rm_facts_dict keys
                        if fact["fact"] in rm_facts_dict:
                            # If the fact value is not in the delete key values
                            if fact['str_val'] not in rm_facts_dict.getlist(fact["fact"]):
                                new_field.append(fact)
                        else:
                            new_field.append(fact)

                    # Update dataset
                    data += json.dumps({"update": {"_id": document['_id'],
                                                   "_type": document['_type'],
                                                   "_index": document['_index']}}) + '\n'
                    document = {'doc': {self.field: new_field}}
                    data += json.dumps(document) + '\n'

                response = self.es_m.scroll(scroll_id=scroll_id, size=bs, field_scroll=self.field)
                total_docs = len(response['hits']['hits'])
                docs_left -= bs  # DEBUG
                scroll_id = response['_scroll_id']
                self.es_m.plain_post_bulk(self.es_m.es_url, data)
            print('DONE')  # DEBUG

            logger.set_context('docs_left', total_docs)
            logger.set_context('batch', batch)
            logger.info('remove_facts_from_document')
        except:
            print(traceback.format_exc())
            logger.set_context('es_params', self.es_params)
            logger.exception('remove_facts_from_document_failed')

    def tag_documents_with_fact(self, es_params, tag_name, tag_value, tag_field):
        '''Used to tag all documents in the current search with a certain fact'''
        self.es_m.build(es_params)
        self.es_m.load_combined_query(self.es_m.combined_query)

        response = self.es_m.scroll()

        data = ''
        for document in response['hits']['hits']:
            if 'mlp' in tag_field:
                split_field = tag_field.split('.')
                span = [0, len(document['_source'][split_field[0]][split_field[1]])]
            else:
                span = [0, len(document['_source'][tag_field].strip())]
            document['_source'][self.field].append({"str_val": tag_value,
                                                    "spans": str([span]),
                                                    "fact": tag_name,
                                                    "doc_path": tag_field})

            data += json.dumps({"update": {"_id": document['_id'],
                                           "_type": document['_type'],
                                           "_index": document['_index']}}) + '\n'
            document = {'doc': {self.field: document['_source'][self.field]}}
            data += json.dumps(document) + '\n'

        self.es_m.plain_post_bulk(self.es_m.es_url, data)
        response = requests.post('{0}/{1}/_update_by_query?refresh&conflicts=proceed'.format(self.es_m.es_url, self.index),
                                 headers=self.es_m.HEADERS)

    def count_cooccurrences(self, fact_pairs):
        """Finds the counts of cooccurring facts

        Arguments:
            fact_pairs {list of tuples of tuples} -- Example: [(('ORG', 'Riigikohus'), ('PER', 'Jaan')), (('ORG', 'Riigikohus'), ('PER', 'Peeter'))]

        Returns:
            [int list] -- Occurrences of the given facts
        """
        queries = []
        for fact_pair in fact_pairs:
            fact_constraints = []
            for fact in fact_pair:
                constraint = {"nested": {"path": "texta_facts",
                                         "query": {"bool": {"must": [{"term": {"texta_facts.fact": fact[0]}},
                                                                     {"term": {"texta_facts.str_val": fact[1]}}]}}}}
                fact_constraints.append(constraint)
            query = {"query": {"bool": {"must": fact_constraints}}, "size": 0}
            queries.append(json.dumps(query))

        header = json.dumps({"index": self.index})
        data = "\n".join(["{0}\n{1}".format(header, q) for q in queries]) + "\n"

        responses = requests.post("{0}/{1}/_msearch".format(self.es_m.es_url, self.index),
                                  data=data,
                                  headers={"Content-Type": "application/json"})
        counts = [response["hits"]["total"] for response in responses.json()['responses']]
        return counts

    def facts_via_aggregation(self, size=15):
        """Finds all facts from current search.

        Parameters:
            size - [int=15] -- Amount of fact values per fact name to search in query

        Returns:
            facts - [dict] -- Details for each fact, ex: {'PER - kostja': {'id': 0, 'name': 'PER', 'value': 'kostja', 'doc_count': 44}}
            fact_combinations - [list of tuples] -- All possible combinations of all facts: [(('FIRST_FACTNAME', 'FIRST_FACTVAL'), ('SECOND_FACTNAME', 'SECOND_FACTVAL'))]
            unique_fact_names - [list of string] -- All unique fact names
        """
        aggs = {"facts": {"nested": {"path": "texta_facts"},
                          "aggs": {"fact_names": {"terms": {"field": "texta_facts.fact"},
                                                  "aggs": {"fact_values": {"terms": {"field": "texta_facts.str_val",
                                                                                     "size": size}}}}}}}
        self.es_m.build(self.es_params)
        self.es_m.set_query_parameter('aggs', aggs)
        response = self.es_m.search()

        response_aggs = response['aggregations']['facts']['fact_names']['buckets']

        facts = {}
        fact_combinations = []
        fact_count = 0
        unique_fact_names = []
        for bucket in response_aggs:
            unique_fact_names.append(bucket['key'])
            for fact in bucket['fact_values']['buckets']:
                facts[bucket['key'] + " - " + fact['key']] = {'id': fact_count,
                                                              'name': bucket['key'],
                                                              'value': fact['key'],
                                                              'doc_count': fact['doc_count']}
                fact_combinations.append((bucket['key'], fact['key']))
                fact_count += 1

        fact_combinations = [x for x in itertools.combinations(fact_combinations, 2)]
        return (facts, fact_combinations, unique_fact_names)

    def fact_graph(self, search_size):
        facts, fact_combinations, unique_fact_names = self.facts_via_aggregation(size=search_size)
        # Get cooccurrences and remove values with 0
        fact_combinations = {k: v for k, v in dict(zip(fact_combinations,
                                                       self.count_cooccurrences(fact_combinations))).items() if v != 0}

        shapes = ["circle", "cross", "diamond", "square", "triangle-down", "triangle-up"]
        types = dict(zip(unique_fact_names, itertools.cycle(shapes)))

        nodes = []
        for i, fact in enumerate(facts):
            nodes.append({"source": facts[fact]['id'],
                          "size": facts[fact]['doc_count'],
                          "score": facts[fact]['doc_count'],
                          "name": facts[fact]['name'],
                          "id": facts[fact]['value'],
                          "type": types[facts[fact]['name']]})
            # Track max/min count
            count = facts[fact]['doc_count']
            if i == 0:
                max_node_size = count
                min_node_size = count
            max_node_size = max(max_node_size, count)
            min_node_size = min(min_node_size, count)

        links = []
        max_link_size = 0
        for fact in fact_combinations.keys():
            max_link_size = max(max_link_size, fact_combinations[fact])
            links.append({"source": facts[fact[0][0] + " - " + fact[0][1]]['id'],
                          "target": facts[fact[1][0] + " - " + fact[1][1]]['id'],
                          "count": fact_combinations[fact]})

        graph_data = json.dumps({"nodes": nodes, "links": links})
        return (graph_data, unique_fact_names, max_node_size, max_link_size, min_node_size)

    def _fact_deletion_query(self, rm_facts_dict):
        '''Creates the query for fact deletion based on dict of facts {name: val}'''
        fact_queries = []
        for key in rm_facts_dict:
            for val in rm_facts_dict.getlist(key):
                fact_queries.append({"bool": {"must": [{"match": {self.field + ".fact": key}},
                                                       {"match": {self.field + ".str_val": val}}]}})

        query = {"main": {"query": {"nested": {"path": self.field,
                                               "query": {"bool": {"should": fact_queries}}}},
                          "_source": [self.field]}}
        return query
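# Minimal usage sketch (an assumption, not code from the original module): FactManager reads the
# active dataset and the POST search parameters from the request, after which fact_graph() returns
# the node/link JSON rendered by the fact co-occurrence graph. The view name and the fixed
# search_size are illustrative only.
def fact_graph_view_example(request):
    fact_m = FactManager(request)
    graph_data, fact_names, max_node_size, max_link_size, min_node_size = fact_m.fact_graph(search_size=15)
    return HttpResponse(json.dumps({'graph_data': json.loads(graph_data),
                                    'unique_fact_names': fact_names,
                                    'max_node_size': max_node_size,
                                    'max_link_size': max_link_size,
                                    'min_node_size': min_node_size}))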