def apply_tagger_to_index(object_id: int, indices: List[str], fields: List[str], fact_name: str, fact_value: str, query: dict, bulk_size: int, max_chunk_bytes: int, es_timeout: int):
    """Apply Torch Tagger to index."""
    tagger_object = TorchTaggerObject.objects.get(pk=object_id)
    try:
        tagger = tagger_object.load_tagger()
        progress = ShowProgress(tagger_object.task)

        ec = ElasticCore()
        [ec.add_texta_facts_mapping(index) for index in indices]

        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            output=ElasticSearcher.OUT_RAW,
            timeout=f"{es_timeout}m",
            callback_progress=progress,
            scroll_size=bulk_size
        )

        actions = update_generator(generator=searcher, ec=ec, fields=fields, fact_name=fact_name, fact_value=fact_value, tagger_object=tagger_object, tagger=tagger)
        for success, info in streaming_bulk(client=ec.es, actions=actions, refresh="wait_for", chunk_size=bulk_size, max_chunk_bytes=max_chunk_bytes, max_retries=3):
            if not success:
                logging.getLogger(ERROR_LOGGER).exception(json.dumps(info))

        tagger_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take the first 100 characters in case the error message is massive.
        tagger_object.task.add_error(error_message)
        tagger_object.task.update_status(Task.STATUS_FAILED)
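# A minimal sketch of the shape `update_generator` above is expected to yield for
# `streaming_bulk`: one partial-update action per scrolled hit, appending a new fact to
# the existing `texta_facts` list. The real generator (defined elsewhere) also runs the
# loaded tagger over the document text; this sketch assumes OUT_RAW yields scroll
# batches of raw hits and hard-codes the fact payload for illustration only.
def _sketch_tagger_update_actions(searcher, fact_name: str, fact_value: str):
    for scroll_batch in searcher:
        for raw_doc in scroll_batch:
            facts = raw_doc["_source"].get("texta_facts", [])
            facts.append({"fact": fact_name, "str_val": fact_value, "doc_path": "text", "spans": "[[0, 0]]"})
            yield {
                "_op_type": "update",        # partial update instead of a full reindex
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "doc": {"texta_facts": facts},
            }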
def test_index_processing(self):
    query_string = "inimene"
    payload = {
        "description": "TestingIndexProcessing",
        "fields": [TEST_FIELD],
        "query": json.dumps({'query': {'match': {'comment_content_lemmas': query_string}}}, ensure_ascii=False)
    }
    response = self.client.post(self.url, data=payload, format="json")
    print_output("test_index_processing:response.data", response.data)

    # Check if MLP was applied to the documents properly.
    s = ElasticSearcher(indices=[self.test_index_name], output=ElasticSearcher.OUT_DOC, query=json.loads(payload["query"]))
    for hit in s:
        self._assert_mlp_contents(hit, TEST_FIELD)
def test_that_split_index_with_nested_field_still_has_nested_field(self):
    payload = {
        "description": "Random index splitting",
        "indices": [{"name": self.test_index_name}],
        "train_index": INDEX_SPLITTING_TRAIN_INDEX,
        "test_index": INDEX_SPLITTING_TEST_INDEX,
        "distribution": "random",
        "test_size": 20
    }
    response = self.client.post(self.url, data=payload, format="json")
    print_output('test_that_split_index_with_nested_field_still_has_nested_field:response.data', response.data)

    at_least_once = False
    # Check both output indices for the nested field.
    es = ElasticSearcher(indices=[INDEX_SPLITTING_TRAIN_INDEX, INDEX_SPLITTING_TEST_INDEX], field_data=[TEST_INDEX_OBJECT_FIELD], flatten=False)
    for item in es:
        data = item.get(TEST_INDEX_OBJECT_FIELD, None)
        if data:
            self.assertTrue(isinstance(data, dict))
            at_least_once = True
    self.assertTrue(at_least_once)
def fact_delete_query_task(self, worker_id: int):
    worker_object = DeleteFactsByQueryTask.objects.get(pk=worker_id)
    try:
        show_progress = ShowProgress(worker_object.task, multiplier=1)
        show_progress.update_step('Scrolling through the indices to delete the facts.')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        target_facts = json.loads(worker_object.facts)
        scroll_size = worker_object.scroll_size

        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=indices,
            field_data=[TEXTA_TAGS_KEY],
            output=ElasticSearcher.OUT_RAW,
            callback_progress=show_progress,
            scroll_size=scroll_size,
            scroll_timeout=f"{worker_object.es_timeout}m"
        )

        ed = ElasticDocument(index=None)
        actions = query_delete_actions_generator(searcher, target_facts)
        ed.bulk_update(actions)

        worker_object.task.complete()
        worker_object.save()
        return worker_id

    except Exception as e:
        worker_object.task.handle_failed_task(e)
        raise e
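# `query_delete_actions_generator` is defined elsewhere; a plausible sketch of it,
# assuming `target_facts` is a list of {"fact": ..., "str_val": ...} dicts: keep only
# the facts that match no target and emit one partial-update action per document.
def _sketch_query_delete_actions(searcher, target_facts):
    targets = {(fact.get("fact"), fact.get("str_val")) for fact in target_facts}
    for scroll_batch in searcher:
        for raw_doc in scroll_batch:
            facts = raw_doc["_source"].get(TEXTA_TAGS_KEY, [])
            kept = [f for f in facts if (f.get("fact"), f.get("str_val")) not in targets]
            yield {
                "_op_type": "update",
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "doc": {TEXTA_TAGS_KEY: kept},
            }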
def _initialize_es(self, project_pk, text_processor, callback_progress, prediction_to_match):
    # Create the Elasticsearch document wrapper.
    es_doc = ElasticDocument(self.feedback_index)
    # If there is no model object, return None for the search and query.
    if not self.model_object:
        return es_doc, None, None
    # Create the matching query.
    query = Query()
    query.add_string_filter(query_string=self.model_object.MODEL_TYPE, fields=["model_type"])
    query.add_string_filter(query_string=str(self.model_object.pk), fields=["model_id"])
    if prediction_to_match:
        query.add_string_filter(query_string=prediction_to_match, fields=["correct_result"])
    # If the index does not exist, don't create the searcher object.
    if not self.check_index_exists():
        return es_doc, None, query.query
    # Create the Elasticsearch searcher.
    es_search = ElasticSearcher(
        indices=self.feedback_index,
        query=query.query,
        text_processor=text_processor,
        output=ElasticSearcher.OUT_DOC_WITH_ID,
        callback_progress=callback_progress
    )
    return es_doc, es_search, query.query
def tag_random_doc(self, request, pk=None, project_pk=None):
    """Returns prediction for a random document in Elasticsearch."""
    # Get the tagger object and check that its model exists.
    tagger_object = self.get_object()
    if not tagger_object.model.path:
        raise NonExistantModelError()

    serializer = TagRandomDocSerializer(data=request.data)
    serializer.is_valid(raise_exception=True)

    indices = [index["name"] for index in serializer.validated_data["indices"]]
    indices = tagger_object.get_available_or_all_indices(indices)

    # Retrieve the tagger fields.
    tagger_fields = json.loads(tagger_object.fields)
    if not ElasticCore().check_if_indices_exist(indices):
        return Response({'error': f'One or more index from {list(indices)} do not exist'}, status=status.HTTP_400_BAD_REQUEST)

    # Retrieve a random document.
    random_doc = ElasticSearcher(indices=indices).random_documents(size=1)[0]
    # Filter out the correct fields from the document.
    random_doc_filtered = {k: v for k, v in random_doc.items() if k in tagger_fields}

    # Apply the tagger.
    tagger_response = apply_tagger(tagger_object.id, random_doc_filtered, input_type='doc')
    response = {"document": random_doc, "prediction": tagger_response}
    return Response(response, status=status.HTTP_200_OK)
def _get_split_documents_by_id(self, id_field, id_value, text_field):
    documents = []
    query = Search().query(Q("term", **{f"{id_field}.keyword": id_value})).to_dict()
    es = ElasticSearcher(query=query, field_data=[id_field, text_field], output=ElasticSearcher.OUT_RAW)
    for hit in es:
        for document in hit:
            documents.append(document)
    return documents
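# For reference, the elasticsearch-dsl chain above serialises into a plain term query,
# so the helper could also build the dict by hand; "document_id" and "42" are example
# placeholder values:
from elasticsearch_dsl import Q, Search

assert Search().query(Q("term", **{"document_id.keyword": "42"})).to_dict() == \
    {"query": {"term": {"document_id.keyword": "42"}}}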
def tag_random_doc(self, request, pk=None, project_pk=None):
    """Returns prediction for a random document in Elasticsearch."""
    # Get the tagger object.
    tagger_object: RegexTaggerGroup = self.get_object()

    serializer = TagRandomDocSerializer(data=request.data)
    serializer.is_valid(raise_exception=True)

    project_object = Project.objects.get(pk=project_pk)
    indices = [index["name"] for index in serializer.validated_data["indices"]]
    indices = project_object.get_available_or_all_project_indices(indices)

    # Retrieve the tagger fields.
    fields = serializer.validated_data["fields"]
    if not ElasticCore().check_if_indices_exist(tagger_object.project.get_indices()):
        return Response({'error': f'One or more index from {list(tagger_object.project.get_indices())} do not exist'}, status=status.HTTP_400_BAD_REQUEST)

    # Retrieve a random document and flatten it.
    random_doc = ElasticSearcher(indices=indices).random_documents(size=1)[0]
    flattened_doc = ElasticCore(check_connection=False).flatten(random_doc)

    # Apply the tagger.
    results = {
        "tagger_group_id": tagger_object.pk,
        "tagger_group_tag": tagger_object.description,
        "result": False,
        "matches": [],
        "document": flattened_doc
    }
    final_matches = []
    for field in fields:
        text = flattened_doc.get(field, None)
        results["document"][field] = text
        matches = tagger_object.match_texts([text], as_texta_facts=True, field=field)
        if matches:
            final_matches.extend(matches)
            results["result"] = True

    results["matches"] = final_matches
    return Response(results, status=status.HTTP_200_OK)
def post(self, request, project_pk: int):
    """Simplified search interface for making Elasticsearch queries."""
    serializer = ProjectSimplifiedSearchSerializer(data=request.data)
    if not serializer.is_valid():
        raise SerializerNotValid(detail=serializer.errors)

    project_object = get_object_or_404(Project, pk=project_pk)
    self.check_object_permissions(request, project_object)

    project_indices = list(project_object.get_indices())
    project_fields = project_object.get_elastic_fields(path_list=True)

    # Test that the project has indices.
    if not project_indices:
        raise ProjectValidationFailed(detail="Project has no indices")

    # Test that the requested indices are valid.
    if serializer.validated_data['match_indices']:
        if not set(serializer.validated_data['match_indices']).issubset(set(project_indices)):
            raise ProjectValidationFailed(detail=f"Index names are not valid for this project. Allowed values are: {project_indices}")

    # Test that the requested fields are valid.
    if serializer.validated_data['match_fields']:
        if not set(serializer.validated_data['match_fields']).issubset(set(project_fields)):
            raise ProjectValidationFailed(detail=f"Field names are not valid for this project. Allowed values are: {project_fields}")

    es = ElasticSearcher(indices=project_indices, output=ElasticSearcher.OUT_DOC)
    q = Query(operator=serializer.validated_data['operator'])

    # If the input is a string, convert it to a list; if it is in an unknown format, return an error.
    match_text = serializer.validated_data['match_text']
    if isinstance(match_text, list):
        match_texts = [str(item) for item in match_text if item]
    elif isinstance(match_text, str):
        match_texts = [match_text]
    else:
        return Response({'error': f'match text is in unknown format: {match_text}'}, status=status.HTTP_400_BAD_REQUEST)

    # Add the query filters.
    for item in match_texts:
        q.add_string_filter(item, match_type=serializer.validated_data["match_type"])

    # Update the query and retrieve the results.
    es.update_query(q.query)
    results = es.search(size=serializer.validated_data["size"])
    return Response(results, status=status.HTTP_200_OK)
def test_create_splitter_object_and_task_signal(self):
    payload = {
        "description": "Random index splitting",
        "indices": [{"name": self.test_index_name}],
        "train_index": INDEX_SPLITTING_TRAIN_INDEX,
        "test_index": INDEX_SPLITTING_TEST_INDEX,
        "distribution": "random",
        "test_size": 20
    }
    response = self.client.post(self.url, json.dumps(payload), content_type='application/json')
    print_output('test_create_splitter_object_and_task_signal:response.data', response.data)

    splitter_obj = IndexSplitter.objects.get(id=response.data['id'])
    print_output("indices:", splitter_obj.get_indices())

    # Check if the IndexSplitter object gets created.
    self.assertEqual(response.status_code, status.HTTP_201_CREATED)
    # Check if the Task gets created.
    self.assertTrue(splitter_obj.task is not None)
    print_output("status of IndexSplitter's Task object", splitter_obj.task.status)
    # Check if the Task gets completed.
    self.assertEqual(splitter_obj.task.status, Task.STATUS_COMPLETED)

    sleep(5)

    original_count = ElasticSearcher(indices=self.test_index_name).count()
    test_count = ElasticSearcher(indices=INDEX_SPLITTING_TEST_INDEX).count()
    train_count = ElasticSearcher(indices=INDEX_SPLITTING_TRAIN_INDEX).count()
    print_output('original_count, test_count, train_count', [original_count, test_count, train_count])
class Autocomplete:

    def __init__(self, project, indices, limit=10):
        self.project = project
        self.limit = limit
        self.es = ElasticSearcher(output=ElasticSearcher.OUT_RAW, indices=indices)

    def get_fact_names(self, startswith):
        query = {"aggs": {"fact": {"nested": {"path": "texta_facts"}, "aggs": {"fact": {"terms": {"field": "texta_facts.fact", "size": self.limit, "include": f"{startswith}.*"}}}}}}
        self.es.update_query(query)
        results = self.es.search()
        facts = [a["key"] for a in results["aggregations"]["fact"]["fact"]["buckets"]]
        return facts

    def get_fact_values(self, startswith, fact_name):
        query = {"aggs": {"str_val": {"nested": {"path": "texta_facts"}, "aggs": {"str_val": {"terms": {"field": "texta_facts.fact"}, "aggs": {"fact_values": {"terms": {"field": "texta_facts.str_val", "size": self.limit, "include": f"{startswith}.*"}}}}}}}}
        self.es.update_query(query)
        results = self.es.search()
        facts = []
        for bucket in results["aggregations"]["str_val"]["str_val"]["buckets"]:
            if bucket["key"] == fact_name:
                facts += [sub_bucket["key"] for sub_bucket in bucket["fact_values"]["buckets"]]
        return facts

    def get_lexicons(self, startswith):
        # TODO
        pass
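# A usage sketch for `Autocomplete`, assuming a Django `project` instance and an
# existing index; "my_index" and the prefixes are placeholder values. Each call issues
# one nested aggregation query capped at `limit` buckets.
def _autocomplete_usage(project):
    ac = Autocomplete(project, indices=["my_index"], limit=5)
    fact_names = ac.get_fact_names(startswith="PER")                 # e.g. ["PER", "PER_TITLE"]
    fact_values = ac.get_fact_values(startswith="Smi", fact_name="PER")
    return fact_names, fact_values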
def __add_meta_to_original_index(indices: List[str], index_fields: List[str], show_progress: ShowProgress, query: dict, scroll_size: int, elastic_wrapper: ElasticCore):
    index_elastic_search = ElasticSearcher(
        indices=indices,
        field_data=index_fields,
        callback_progress=show_progress,
        query=query,
        output=ElasticSearcher.OUT_RAW,
        scroll_size=scroll_size
    )
    index_actions = add_doc_uuid(generator=index_elastic_search)
    for success, info in streaming_bulk(client=elastic_wrapper.es, actions=index_actions, refresh="wait_for", chunk_size=scroll_size, max_retries=3):
        if not success:
            logging.getLogger(ERROR_LOGGER).exception(json.dumps(info))
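# `add_doc_uuid` is defined elsewhere; a minimal sketch of what it plausibly does with
# the OUT_RAW scroll batches: stamp each original document with a stable UUID so that
# derived documents can be traced back to it. The target field name is an assumption.
import uuid

def _sketch_add_doc_uuid(generator):
    for scroll_batch in generator:
        for raw_doc in scroll_batch:
            yield {
                "_op_type": "update",
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "doc": {"doc_uuid": str(uuid.uuid4())},  # hypothetical field name
            }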
def apply_search_fields_tagger_on_index(object_id: int):
    """Apply Search Fields Tagger to index."""
    search_fields_tagger = SearchFieldsTagger.objects.get(pk=object_id)
    task_object = search_fields_tagger.task
    try:
        progress = ShowProgress(task_object)
        progress.update_step('scrolling search fields')

        # Get the necessary fields.
        indices: List[str] = search_fields_tagger.get_indices()
        fields: List[str] = json.loads(search_fields_tagger.fields)
        fact_name: str = search_fields_tagger.fact_name
        scroll_timeout = search_fields_tagger.es_timeout
        scroll_size = search_fields_tagger.bulk_size
        use_breakup = search_fields_tagger.use_breakup
        breakup_character = search_fields_tagger.breakup_character

        ec = ElasticCore()
        [ec.add_texta_facts_mapping(index) for index in indices]

        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=json.loads(search_fields_tagger.query),
            output=ElasticSearcher.OUT_RAW,
            scroll_timeout=f"{scroll_timeout}m",
            callback_progress=progress,
            scroll_size=scroll_size
        )

        actions = update_search_fields_generator(
            generator=searcher,
            ec=ec,
            fields=fields,
            fact_name=fact_name,
            search_field_tagger_object=search_fields_tagger,
            use_breakup=use_breakup,
            breakup_character=breakup_character
        )

        # Send the data towards Elasticsearch.
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)
        return object_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
def apply_rakun_extractor_to_index(self, object_id: int, indices: List[str], fields: List[str], query: dict, es_timeout: int, bulk_size: int, fact_name: str, add_spans: bool):
    """Apply Rakun Keyword Extractor to index."""
    logging.getLogger(INFO_LOGGER).info(f"Starting task 'apply_rakun_extractor_to_index' with ID: {object_id}!")
    rakun_extractor_object = RakunExtractor.objects.get(id=object_id)
    try:
        progress = ShowProgress(rakun_extractor_object.task)

        # Retrieve the fields.
        field_data = fields

        ec = ElasticCore()
        [ec.add_texta_facts_mapping(index) for index in indices]

        searcher = ElasticSearcher(
            indices=indices,
            field_data=field_data + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            timeout=f"{es_timeout}m",
            output=ElasticSearcher.OUT_RAW,
            callback_progress=progress,
            scroll_size=bulk_size
        )

        keyword_detector = rakun_extractor_object.load_rakun_keyword_detector()
        actions = update_generator(
            keyword_detector=keyword_detector,
            generator=searcher,
            ec=ec,
            fields=field_data,
            rakun_extractor_object=rakun_extractor_object,
            fact_name=fact_name,
            fact_value="",
            add_spans=add_spans
        )

        # Send the data towards Elasticsearch.
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)

        rakun_extractor_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take the first 100 characters in case the error message is massive.
        rakun_extractor_object.task.add_error(error_message)
        rakun_extractor_object.task.update_status(Task.STATUS_FAILED)
def test_create_random_split(self):
    payload = {
        "description": "Random index splitting",
        "indices": [{"name": self.test_index_name}],
        "train_index": INDEX_SPLITTING_TRAIN_INDEX,
        "test_index": INDEX_SPLITTING_TEST_INDEX,
        "distribution": "random",
        "test_size": 20
    }
    response = self.client.post(self.url, data=payload)
    print_output('test_create_random_split:response.data', response.data)

    splitter_obj = IndexSplitter.objects.get(id=response.data['id'])

    # Assert that the Task gets completed.
    self.assertEqual(splitter_obj.task.status, Task.STATUS_COMPLETED)
    print_output("Task status", splitter_obj.task.status)

    sleep(5)

    original_count = ElasticSearcher(indices=self.test_index_name).count()
    test_count = ElasticSearcher(indices=INDEX_SPLITTING_TEST_INDEX).count()
    train_count = ElasticSearcher(indices=INDEX_SPLITTING_TRAIN_INDEX).count()
    print_output('original_count, test_count, train_count', [original_count, test_count, train_count])

    # To avoid any inconsistencies caused by rounding, assume the sizes are between small limits.
    self.assertTrue(self.is_between_limits(test_count, original_count, 0.2))
    self.assertTrue(self.is_between_limits(train_count, original_count, 0.8))
def test_applying_lang_detect_with_query(self):
    mlp_field = f"{TEST_FIELD}_mlp"
    query_string = "inimene"
    payload = {
        "description": "TestingIndexProcessing",
        "field": TEST_FIELD,
        "query": json.dumps({'query': {'match': {'comment_content_lemmas': query_string}}}, ensure_ascii=False)
    }
    response = self.client.post(self.url, data=payload, format="json")
    print_output("test_applying_lang_detect_with_query:response.data", response.data)
    self.assertTrue(response.status_code == status.HTTP_201_CREATED)

    s = ElasticSearcher(indices=[self.test_index_name], output=ElasticSearcher.OUT_DOC, query=json.loads(payload["query"]))
    for hit in s:
        if TEST_FIELD in hit:
            self.assertTrue(f"{mlp_field}.language.detected" in hit)
            lang_value = hit[f"{mlp_field}.language.detected"]
            self.assertTrue(lang_value == "et")
def apply_crf_extractor_to_index(object_id: int, indices: List[str], mlp_fields: List[str], label_suffix: str, query: dict, bulk_size: int, max_chunk_bytes: int, es_timeout: int):
    """Applies the CRF Extractor to an ES index."""
    # Load the model.
    crf_object = CRFExtractorObject.objects.get(pk=object_id)
    try:
        extractor = crf_object.load_extractor()
        # Progress tracking.
        progress = ShowProgress(crf_object.task)
        # Add the fact mapping if it is missing.
        ec = ElasticCore()
        [ec.add_texta_facts_mapping(index) for index in indices]
        # Search.
        searcher = ElasticSearcher(
            indices=indices,
            field_data=mlp_fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            output=ElasticSearcher.OUT_RAW,
            timeout=f"{es_timeout}m",
            callback_progress=progress,
            scroll_size=bulk_size
        )
        # Create the update actions.
        actions = update_generator(
            generator=searcher,
            ec=ec,
            mlp_fields=mlp_fields,
            label_suffix=label_suffix,
            object_id=object_id,
            extractor=extractor
        )
        # Perform the updates.
        try:
            # As the indices are already defined in the actions, there is no need to define them again (None).
            ElasticDocument(None).bulk_update(actions)
        except Exception as e:
            logging.getLogger(ERROR_LOGGER).exception(e)
        # All done.
        crf_object.task.complete()
        return True

    except Exception as e:
        crf_object.task.handle_failed_task(e)
        raise e
def extract_from_random_doc(self, request, pk=None, project_pk=None):
    """Returns prediction for a random document in Elasticsearch."""
    # Get the Rakun object.
    rakun_object: RakunExtractor = RakunExtractor.objects.get(pk=pk)

    serializer = RakunExtractorRandomDocSerializer(data=request.data)
    serializer.is_valid(raise_exception=True)

    project_object = Project.objects.get(pk=project_pk)
    indices = [index["name"] for index in serializer.validated_data["indices"]]
    indices = project_object.get_available_or_all_project_indices(indices)

    # Retrieve the Rakun fields and the add_spans parameter.
    fields = serializer.validated_data["fields"]
    add_spans = serializer.validated_data["add_spans"]

    # Retrieve a random document and flatten it.
    random_doc = ElasticSearcher(indices=indices).random_documents(size=1)[0]
    flattened_doc = ElasticCore(check_connection=False).flatten(random_doc)

    # Apply Rakun.
    results = {
        "rakun_id": rakun_object.pk,
        "description": rakun_object.description,
        "result": False,
        "keywords": [],
        "document": flattened_doc
    }
    final_keywords = []
    keyword_detector = rakun_object.load_rakun_keyword_detector()
    for field in fields:
        text = flattened_doc.get(field, "")
        results["document"][field] = text
        keywords = rakun_object.get_rakun_keywords(
            keyword_detector=keyword_detector,
            texts=[text],
            field_path=field,
            fact_name=rakun_object.description,
            fact_value="",
            add_spans=add_spans
        )
        if keywords:
            final_keywords.extend(keywords)
            results["result"] = True

    results["keywords"] = final_keywords
    return Response(results, status=status.HTTP_200_OK)
def test_processing_with_just_tokenizer(self):
    payload = {
        "description": "hello there, kenobi.",
        "fields": [TEST_FIELD],
        "analyzers": ["tokenizer"],
        "indices": [{"name": self.test_index_name}],
        "query": json.dumps(TEST_QUERY, ensure_ascii=False)
    }
    response = self.client.post(self.list_url, data=payload, format="json")
    print_output("test_processing_with_just_tokenizer:response.data", response.data)
    self.assertTrue(response.status_code == status.HTTP_201_CREATED)

    s = ElasticSearcher(indices=[self.test_index_name], query=TEST_QUERY)
    for hit in s:
        new_field = f'{TEST_FIELD}_es.tokenized_text'
        self.assertTrue(new_field in hit)
        self.assertTrue(hit[new_field] != hit[TEST_FIELD])
def _get_negatives(self, size):
    self.show_progress.update_step("scrolling negative sample")
    self.show_progress.update_view(0)
    # Iterator for retrieving negative examples.
    negative_sample_iterator = ElasticSearcher(
        indices=self.indices,
        field_data=self.field_data,
        output=ElasticSearcher.OUT_DOC,
        callback_progress=self.show_progress,
        text_processor=self.text_processor,
        scroll_limit=int(size * float(self.tagger_object.negative_multiplier)),
        ignore_ids=self.ignore_ids,
    )
    # Convert the iterator into a list.
    negative_sample = list(negative_sample_iterator)
    # Convert the documents into value strings if requested.
    if self.join_fields:
        negative_sample = self._join_fields(negative_sample)
    return negative_sample
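# For scale: with size=1000 and negative_multiplier=2.0, the scroll limit above is 2000
# negative candidates. `_join_fields` is defined elsewhere; a plausible sketch of it,
# assuming each document is a dict of field values collapsed into one string per doc:
def _sketch_join_fields(documents):
    return ["\n".join(str(value) for value in doc.values() if value) for doc in documents]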
def test_normal_process_application(self):
    payload = {
        "description": "hello there, kenobi.",
        "analyzers": ["stemmer"],
        "fields": [TEST_FIELD],
        "stemmer_lang": "estonian",
        "indices": [{"name": self.test_index_name}]
    }
    response = self.client.post(self.list_url, data=payload, format="json")
    print_output("test_normal_process_application:response.data", response.data)
    self.assertTrue(response.status_code == status.HTTP_201_CREATED)

    s = ElasticSearcher(indices=[self.test_index_name])
    for hit in s:
        new_field = f'{TEST_FIELD}_es.stems'
        self.assertTrue(new_field in hit)
        self.assertTrue(hit[new_field] != hit[TEST_FIELD])
        break
def apply_lang_on_indices(self, apply_worker_id: int):
    worker_object = ApplyLangWorker.objects.get(pk=apply_worker_id)
    task_object = worker_object.task
    try:
        load_mlp()
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling through the indices to apply lang')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        field = worker_object.field
        scroll_size = 100

        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=indices,
            field_data=[field],
            output=ElasticSearcher.OUT_RAW,
            callback_progress=show_progress,
            scroll_size=scroll_size,
            scroll_timeout="15m"
        )

        for index in indices:
            searcher.core.add_texta_facts_mapping(index=index)

        actions = process_lang_actions(generator=searcher, field=field, worker_id=apply_worker_id, mlp_class=mlp)

        # Send the data towards Elasticsearch.
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)

        worker_object.task.complete()
        return apply_worker_id

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
def _get_class_sample(self, query, class_name):
    """Returns a sample for the given class."""
    # Limit the docs according to the maximum sample size and feedback size.
    limit = int(self.tagger_object.maximum_sample_size)
    if class_name in self.feedback:
        limit = limit - len(self.feedback[class_name])

    logging.getLogger(INFO_LOGGER).info(f"Collecting examples for class {self.class_display_name} (max limit = {limit})...")

    # Iterator for retrieving the positive sample by query.
    positive_sample_iterator = ElasticSearcher(
        query=query,
        indices=self.indices,
        field_data=self.field_data,
        output=ElasticSearcher.OUT_DOC_WITH_ID,
        callback_progress=self.show_progress,
        scroll_limit=limit,
        text_processor=self.text_processor
    )

    positive_sample = []
    # Collect the positive IDs to ignore them while scrolling for negatives.
    for doc in positive_sample_iterator:
        self.ignore_ids.add(doc["_id"])
        # Remove the ID from the document.
        del doc["_id"]
        positive_sample.append(doc)

    logging.getLogger(INFO_LOGGER).info(f"Found {len(positive_sample)} examples for {self.class_display_name}...")

    # If class balancing is enabled, modify the number of required samples.
    if self.balance:
        positive_sample = self._duplicate_examples(positive_sample, class_name, limit)

    # Convert the documents into value strings if requested.
    if self.join_fields:
        positive_sample = self._join_fields(positive_sample)
    return positive_sample
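# `_duplicate_examples` is defined elsewhere; a minimal balancing sketch, assuming the
# intent is to oversample a minority class by cycling through its documents until the
# sample reaches `limit`. The real implementation may weigh classes differently.
import itertools

def _sketch_duplicate_examples(sample, limit):
    if not sample or len(sample) >= limit:
        return sample
    cycled = itertools.cycle(sample)
    return [next(cycled) for _ in range(limit)]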
def apply_summarizer_on_index(self, summarizer_id: int):
    summarizer_object = Summarizer.objects.get(pk=summarizer_id)
    task_object = summarizer_object.task
    try:
        load_sumy()
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling summarizer')

        # Get the necessary fields.
        indices: List[str] = summarizer_object.get_indices()
        field_data: List[str] = json.loads(summarizer_object.fields)
        ratio_data: float = summarizer_object.ratio
        algorithm_data: List[str] = summarizer_object.algorithm
        scroll_size = 100

        searcher = ElasticSearcher(
            query=json.loads(summarizer_object.query),
            indices=indices,
            field_data=field_data,
            output=ElasticSearcher.OUT_RAW,
            callback_progress=show_progress,
            scroll_size=scroll_size,
            scroll_timeout="30m"
        )

        actions = process_actions(searcher, field_data, ratio_data, algorithm=algorithm_data, summarizer_class=sumy, summarizer_id=summarizer_id)

        # Send the data towards Elasticsearch.
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)
        return summarizer_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
def test_that_lang_detect_enters_nan_token_on_bogus_fields(self):
    # Set up the index with the target document that ensures a NaN response.
    ec = ElasticCore()
    query_string = 159784984949
    document_id = "test_that_lang_detect_enters_nan_token_on_bogus_fields"
    ec.es.index(index=self.test_index_name, id=document_id, body={TEST_FIELD: query_string}, refresh="wait_for")

    payload = {
        "description": "TestingIndexProcessing",
        "field": TEST_FIELD,
        "query": json.dumps({'query': {'match': {TEST_FIELD: query_string}}}, ensure_ascii=False)
    }
    response = self.client.post(self.url, data=payload, format="json")
    print_output("test_that_lang_detect_enters_nan_token_on_bogus_fields:response.data", response.data)
    self.assertTrue(response.status_code == status.HTTP_201_CREATED)

    s = ElasticSearcher(indices=[self.test_index_name], output=ElasticSearcher.OUT_DOC, query=json.loads(payload["query"]))
    for hit in s:
        self.assertTrue(hit[f"{TEST_FIELD}_mlp.language.detected"] == NAN_LANGUAGE_TOKEN_KEY)
        break

    # Clean up the document from the index.
    ec.es.delete(index=self.test_index_name, id=document_id, refresh="wait_for")
def test_automatic_lang_detection_process(self):
    payload = {
        "description": "hello there, kenobi.",
        "fields": [TEST_FIELD],
        "analyzers": ["stemmer", "tokenizer"],
        "detect_lang": True,
        "indices": [{"name": self.test_index_name}]
    }
    response = self.client.post(self.list_url, data=payload, format="json")
    print_output("test_automatic_lang_detection_process:response.data", response.data)
    self.assertTrue(response.status_code == status.HTTP_201_CREATED)

    s = ElasticSearcher(indices=[self.test_index_name])
    for hit in s:
        fields = [f'{TEST_FIELD}_es.tokenized_text', f'{TEST_FIELD}_es.stems']
        self.assertTrue(all([field in hit for field in fields]))
        self.assertTrue(all(hit[field] != hit[TEST_FIELD] for field in fields))
        break
def post(self, request, project_pk: int):
    """Executes a **raw** Elasticsearch query on all project indices."""
    project = get_object_or_404(Project, pk=project_pk)
    self.check_object_permissions(request, project)

    serializer = ProjectSearchByQuerySerializer(data=request.data)
    if not serializer.is_valid():
        raise SerializerNotValid(detail=serializer.errors)

    indices = project.get_available_or_all_project_indices(serializer.validated_data["indices"])
    if not indices:
        raise ProjectValidationFailed(detail="No indices supplied and project has no indices")

    if serializer.validated_data["output_type"]:
        es = ElasticSearcher(indices=indices, output=serializer.validated_data["output_type"])
    else:
        es = ElasticSearcher(indices=indices, output=ElasticSearcher.OUT_DOC_WITH_TOTAL_HL_AGGS)

    es.update_query(serializer.validated_data["query"])
    results = es.search()
    return Response(results, status=status.HTTP_200_OK)
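# Equivalent direct use of the searcher outside the view, for the default output type;
# the index name and query body are placeholder values:
def _raw_query_example():
    es = ElasticSearcher(indices=["my_index"], output=ElasticSearcher.OUT_DOC_WITH_TOTAL_HL_AGGS)
    es.update_query({"query": {"match": {"text": "inimene"}}})
    return es.search()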
def evaluate_tags_task(object_id: int, indices: List[str], query: dict, es_timeout: int = 10, scroll_size: int = 100):
    evaluator_object = Evaluator.objects.get(pk=object_id)
    try:
        logging.getLogger(INFO_LOGGER).info(f"Starting evaluator task for Evaluator with ID {object_id}.")
        progress = ShowProgress(evaluator_object.task, multiplier=1)

        # Retrieve the facts and the sklearn averaging function from the model.
        true_fact = evaluator_object.true_fact
        pred_fact = evaluator_object.predicted_fact
        true_fact_value = evaluator_object.true_fact_value
        pred_fact_value = evaluator_object.predicted_fact_value
        average = evaluator_object.average_function
        add_individual_results = evaluator_object.add_individual_results

        searcher = ElasticSearcher(
            indices=indices,
            field_data=["texta_facts"],
            query=query,
            output=ElasticSearcher.OUT_RAW,
            timeout=f"{es_timeout}m",
            callback_progress=progress,
            scroll_size=scroll_size
        )

        # Binary evaluation.
        if true_fact_value and pred_fact_value:
            logging.getLogger(INFO_LOGGER).info(f"Starting binary evaluation. Comparing the following fact and fact value pairs: TRUE: ({true_fact}: {true_fact_value}), PREDICTED: ({pred_fact}: {pred_fact_value}).")
            # Set the evaluation type in the model.
            evaluator_object.evaluation_type = "binary"

            true_set = {true_fact_value, "other"}
            pred_set = {pred_fact_value, "other"}
            classes = ["other", true_fact_value]
            n_total_classes = len(classes)

        # Multilabel/multiclass evaluation.
        else:
            logging.getLogger(INFO_LOGGER).info(f"Starting multilabel evaluation. Comparing facts TRUE: '{true_fact}', PRED: '{pred_fact}'.")
            # Make a deep copy of the query to avoid modifying the Searcher's query.
            es_aggregator = ElasticAggregator(indices=indices, query=deepcopy(query))

            # Get all fact values corresponding to the true and predicted facts to construct the total set of labels
            # needed for the confusion matrix, individual score calculations and memory imprint calculations.
            true_fact_values = es_aggregator.facts(size=choices.DEFAULT_MAX_AGGREGATION_SIZE, filter_by_fact_name=true_fact)
            pred_fact_values = es_aggregator.facts(size=choices.DEFAULT_MAX_AGGREGATION_SIZE, filter_by_fact_name=pred_fact)

            true_set = set(true_fact_values)
            pred_set = set(pred_fact_values)
            classes = list(true_set.union(pred_set))
            n_total_classes = len(classes)

            # Add dummy classes for missing labels.
            classes.extend([choices.MISSING_TRUE_LABEL, choices.MISSING_PRED_LABEL])

            # Set the evaluation type in the model.
            evaluator_object.evaluation_type = "multilabel"
            classes.sort(key=lambda x: x[0].lower())

        # Get the number of documents in the query to estimate the memory imprint.
        n_docs = searcher.count()
        evaluator_object.task.total = n_docs
        evaluator_object.task.save()
        logging.getLogger(INFO_LOGGER).info(f"Number of documents: {n_docs} | Number of classes: {len(classes)}")

        # Get the memory buffer value from the core variables.
        core_memory_buffer_value_gb = get_core_setting("TEXTA_EVALUATOR_MEMORY_BUFFER_GB")
        # Calculate the value based on the given ratio if the core variable is empty.
        memory_buffer_gb = calculate_memory_buffer(memory_buffer=core_memory_buffer_value_gb, ratio=EVALUATOR_MEMORY_BUFFER_RATIO, unit="gb")

        required_memory = get_memory_imprint(n_docs=n_docs, n_classes=len(classes), eval_type=evaluator_object.evaluation_type, unit="gb", int_size=64)
        enough_memory = is_enough_memory_available(required_memory=required_memory, memory_buffer=memory_buffer_gb, unit="gb")

        # Enable scoring after each scroll if there isn't enough memory
        # for calculating the scores for the whole set of documents at once.
        score_after_scroll = not enough_memory

        # If scoring after each scroll is enabled and the scores are averaged after each scroll,
        # the results for every averaging function besides `micro` are imprecise.
        scores_imprecise = bool(score_after_scroll and average != "micro")

        # Store the document count, the labels' class counts and the indicator for imprecise scores.
        evaluator_object.document_count = n_docs
        evaluator_object.n_true_classes = len(true_set)
        evaluator_object.n_predicted_classes = len(pred_set)
        evaluator_object.n_total_classes = n_total_classes
        evaluator_object.scores_imprecise = scores_imprecise
        evaluator_object.score_after_scroll = score_after_scroll

        # Save the model updates.
        evaluator_object.save()
        logging.getLogger(INFO_LOGGER).info(f"Enough available memory: {enough_memory} | Score after scroll: {score_after_scroll}")

        # Get the number of batches for the logger.
        n_batches = math.ceil(n_docs / scroll_size)

        # Scroll through the documents and score the tags.
        scores, bin_scores = scroll_and_score(
            generator=searcher,
            evaluator_object=evaluator_object,
            true_fact=true_fact,
            pred_fact=pred_fact,
            true_fact_value=true_fact_value,
            pred_fact_value=pred_fact_value,
            classes=classes,
            average=average,
            score_after_scroll=score_after_scroll,
            n_batches=n_batches,
            add_individual_results=add_individual_results
        )

        logging.getLogger(INFO_LOGGER).info(f"Final scores: {scores}")

        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()

        confusion = scores["confusion_matrix"]
        confusion = np.asarray(confusion, dtype="int64")

        if len(classes) <= choices.DEFAULT_MAX_CONFUSION_CLASSES:
            # Delete the empty rows and columns corresponding to missing pred/true labels from the confusion matrix.
            confusion, classes = delete_empty_rows_and_cols(confusion, classes)
        scores["confusion_matrix"] = confusion.tolist()

        # Generate the confusion matrix plot and save it.
        image_name = f"{secrets.token_hex(15)}.png"
        evaluator_object.plot.save(image_name, create_confusion_plot(scores["confusion_matrix"], classes), save=False)
        image_path = pathlib.Path(MEDIA_URL) / image_name
        evaluator_object.plot.name = str(image_path)

        # Add the final scores to the model.
        evaluator_object.precision = scores["precision"]
        evaluator_object.recall = scores["recall"]
        evaluator_object.f1_score = scores["f1_score"]
        evaluator_object.accuracy = scores["accuracy"]
        evaluator_object.confusion_matrix = json.dumps(scores["confusion_matrix"])
        evaluator_object.individual_results = json.dumps(remove_not_found(bin_scores), ensure_ascii=False)
        evaluator_object.add_misclassified_examples = False

        evaluator_object.save()
        evaluator_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take the first 100 characters in case the error message is massive.
        evaluator_object.task.add_error(error_message)
        evaluator_object.task.update_status(Task.STATUS_FAILED)
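# `get_memory_imprint` is defined elsewhere; one plausible reading of the estimate,
# given the int_size=64 argument above: an n_docs x n_classes matrix of 64-bit integers,
# converted into gigabytes. A sketch under those assumptions, not the actual implementation.
def _sketch_memory_imprint_gb(n_docs: int, n_classes: int, int_size: int = 64) -> float:
    bytes_per_cell = int_size // 8          # 64 bits -> 8 bytes per matrix cell
    return (n_docs * n_classes * bytes_per_cell) / 1024 ** 3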
def tag_random_doc(self, request, pk=None, project_pk=None):
    """API endpoint for tagging a random document."""
    logging.getLogger(INFO_LOGGER).info(f"[Tag Random doc] Starting tag_random_doc...")
    # Get the hybrid tagger object.
    hybrid_tagger_object = self.get_object()
    # Check if any of the models are ready.
    if not hybrid_tagger_object.taggers.filter(task__status=Task.STATUS_COMPLETED):
        raise NonExistantModelError()

    # Retrieve the tagger fields from the first object.
    first_tagger = hybrid_tagger_object.taggers.first()
    tagger_fields = json.loads(first_tagger.fields)

    # Raise an error if Redis is not available.
    if not get_redis_status()['alive']:
        raise RedisNotAvailable('Redis not available. Check if Redis is running.')

    serializer = TagRandomDocSerializer(data=request.data)
    serializer.is_valid(raise_exception=True)

    indices = [index["name"] for index in serializer.validated_data["indices"]]
    indices = first_tagger.get_available_or_all_indices(indices)

    if not ElasticCore().check_if_indices_exist(indices):
        return Response({'error': f'One or more index from {list(indices)} does not exist'}, status=status.HTTP_400_BAD_REQUEST)

    # Retrieve a random document.
    random_doc = ElasticSearcher(indices=indices).random_documents(size=1)[0]
    # Filter out the correct fields from the document.
    random_doc_filtered = {k: v for k, v in random_doc.items() if k in tagger_fields}

    tagger_group_id = self.get_object().pk

    # Combine the document field values into one string.
    combined_texts = '\n'.join(random_doc_filtered.values())
    combined_texts, tags = get_mlp(tagger_group_id, combined_texts, lemmatize=False)

    # Retrieve the tag candidates.
    tag_candidates = get_tag_candidates(tagger_group_id, combined_texts, ignore_tags=tags)

    # Get the tags.
    tags += apply_tagger_group(tagger_group_id, random_doc_filtered, tag_candidates, request, input_type='doc')

    # Return the document with the tags.
    response = {"document": random_doc, "tags": tags}
    return Response(response, status=status.HTTP_200_OK)
def evaluate_entity_tags_task(object_id: int, indices: List[str], query: dict, es_timeout: int = 10, scroll_size: int = 100):
    evaluator_object = Evaluator.objects.get(pk=object_id)
    try:
        logging.getLogger(INFO_LOGGER).info(f"Starting entity evaluator task for Evaluator with ID {object_id}.")
        progress = ShowProgress(evaluator_object.task, multiplier=1)

        true_fact = evaluator_object.true_fact
        pred_fact = evaluator_object.predicted_fact
        add_misclassified_examples = evaluator_object.add_misclassified_examples
        token_based = evaluator_object.token_based

        # If the user hasn't defined a field, retrieve it automatically.
        if not evaluator_object.field:
            es_aggregator = ElasticAggregator(indices=indices, query=deepcopy(query))
            true_fact_doc_paths = es_aggregator.facts_abstract(key_field="fact", value_field="doc_path", filter_by_key=true_fact)
            doc_path = true_fact_doc_paths[0]
        else:
            doc_path = evaluator_object.field

        searcher = ElasticSearcher(
            indices=indices,
            field_data=[doc_path, "texta_facts"],
            query=query,
            output=ElasticSearcher.OUT_RAW,
            timeout=f"{es_timeout}m",
            callback_progress=progress,
            scroll_size=scroll_size
        )

        # Get the number of documents.
        n_docs = searcher.count()
        evaluator_object.task.total = n_docs
        evaluator_object.task.save()

        evaluator_object.document_count = n_docs
        evaluator_object.scores_imprecise = False
        evaluator_object.score_after_scroll = False
        evaluator_object.add_individual_results = False

        # Save the model updates.
        evaluator_object.save()

        # Get the number of batches for the logger.
        n_batches = math.ceil(n_docs / scroll_size)

        scores, misclassified = scroll_and_score_entity(searcher, evaluator_object, true_fact, pred_fact, doc_path, token_based, n_batches, add_misclassified_examples)

        logging.getLogger(INFO_LOGGER).info(f"Final scores: {scores}")

        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()

        # Generate the confusion matrix plot and save it.
        image_name = f"{secrets.token_hex(15)}.png"
        classes = ["other", true_fact]
        evaluator_object.plot.save(image_name, create_confusion_plot(scores["confusion_matrix"], classes), save=False)
        image_path = pathlib.Path(MEDIA_URL) / image_name
        evaluator_object.plot.name = str(image_path)

        evaluator_object.save()
        evaluator_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take the first 100 characters in case the error message is massive.
        evaluator_object.task.add_error(error_message)
        evaluator_object.task.update_status(Task.STATUS_FAILED)