def tag_doc(self, request, pk=None, project_pk=None):
    """Apply this regex tagger to a caller-supplied document.

    Validates the payload, flattens the document once, then runs the
    tagger's patterns over each requested field and collects raw matches
    (each annotated with the field it was found in).

    :param request: DRF request whose data carries "doc" and "fields".
    :param pk: Primary key of the RegexTagger, resolved via get_object().
    :param project_pk: Project primary key from the nested route (unused here).
    :return: 200 Response with tagger id/description, boolean "result" and
        the list of matches.
    """
    serializer = RegexTaggerGroupTagDocumentSerializer(data=request.data)
    serializer.is_valid(raise_exception=True)

    tagger_object: RegexTagger = self.get_object()

    input_document = serializer.validated_data["doc"]
    fields = serializer.validated_data["fields"]

    # Flatten once: the flattened form does not depend on which field is
    # inspected, so recomputing it inside the loop (as before) was wasted work.
    flattened_doc = ElasticCore(check_connection=False).flatten(input_document)

    # apply tagger
    results = {
        "tagger_id": tagger_object.pk,
        "tag": tagger_object.description,
        "result": False,
        "matches": []
    }

    final_matches = []
    for field in fields:
        text = flattened_doc.get(field, None)
        # Skip fields absent from the document instead of matching on None.
        if text is None:
            continue
        matches = tagger_object.match_texts([text], as_texta_facts=False)
        if matches:
            for match in matches:
                match.update(field=field)
            final_matches.extend(matches)
            results["result"] = True

    results["matches"] = final_matches
    return Response(results, status=status.HTTP_200_OK)
def tag_random_doc(self, request, pk=None, project_pk=None): """Returns prediction for a random document in Elasticsearch.""" # get tagger object tagger_object: RegexTaggerGroup = self.get_object() serializer = TagRandomDocSerializer(data=request.data) serializer.is_valid(raise_exception=True) project_object = Project.objects.get(pk=project_pk) indices = [ index["name"] for index in serializer.validated_data["indices"] ] indices = project_object.get_available_or_all_project_indices(indices) # retrieve tagger fields fields = serializer.validated_data["fields"] if not ElasticCore().check_if_indices_exist( tagger_object.project.get_indices()): return Response( { 'error': f'One or more index from {list(tagger_object.project.get_indices())} do not exist' }, status=status.HTTP_400_BAD_REQUEST) # retrieve random document random_doc = ElasticSearcher(indices=indices).random_documents( size=1)[0] flattened_doc = ElasticCore(check_connection=False).flatten(random_doc) # apply tagger results = { "tagger_group_id": tagger_object.pk, "tagger_group_tag": tagger_object.description, "result": False, "matches": [], "document": flattened_doc } final_matches = [] for field in fields: text = flattened_doc.get(field, None) results["document"][field] = text matches = tagger_object.match_texts([text], as_texta_facts=True, field=field) if matches: final_matches.extend(matches) results["result"] = True results["matches"] = final_matches return Response(results, status=status.HTTP_200_OK)
def extract_from_random_doc(self, request, pk=None, project_pk=None):
    """Returns prediction for a random document in Elasticsearch.

    Fetches a single random document from the resolved indices, flattens it
    and runs the Rakun keyword detector over each requested field,
    accumulating the extracted keywords in the response payload.
    """
    # Resolve the extractor and validate the incoming payload.
    extractor: RakunExtractor = RakunExtractor.objects.get(pk=pk)
    serializer = RakunExtractorRandomDocSerializer(data=request.data)
    serializer.is_valid(raise_exception=True)

    validated = serializer.validated_data
    target_fields = validated["fields"]
    add_spans = validated["add_spans"]

    # Narrow the requested indices down to what the project allows.
    project = Project.objects.get(pk=project_pk)
    index_names = [entry["name"] for entry in validated["indices"]]
    index_names = project.get_available_or_all_project_indices(index_names)

    # Pull one random document and flatten it for field-path access.
    sample = ElasticSearcher(indices=index_names).random_documents(size=1)[0]
    flat_sample = ElasticCore(check_connection=False).flatten(sample)

    payload = {
        "rakun_id": extractor.pk,
        "description": extractor.description,
        "result": False,
        "keywords": [],
        "document": flat_sample
    }

    detector = extractor.load_rakun_keyword_detector()
    collected = []
    for field_path in target_fields:
        field_text = flat_sample.get(field_path, "")
        payload["document"][field_path] = field_text
        extracted = extractor.get_rakun_keywords(
            keyword_detector=detector,
            texts=[field_text],
            field_path=field_path,
            fact_name=extractor.description,
            fact_value="",
            add_spans=add_spans
        )
        if extracted:
            collected.extend(extracted)
            payload["result"] = True

    payload["keywords"] = collected
    return Response(payload, status=status.HTTP_200_OK)
def tag_docs(self, fields: List[str], docs: List[dict]):
    """Apply this tagger to a batch of documents, attaching matches as facts.

    :param fields: Flattened field paths to run the tagger over.
    :param docs: Documents to tag; mutated in place (TEXTA_TAGS_KEY updated).
    :return: The same list of documents with de-duplicated facts attached.
    """
    # Hoisted out of the loops: the helper is invariant across docs/fields.
    elastic_core = ElasticCore(check_connection=False)
    for doc in docs:
        # Flatten once per document — the flattened form does not depend on
        # the field being inspected (previously recomputed per field).
        flattened_doc = elastic_core.flatten(doc)
        for field in fields:
            text = flattened_doc.get(field, None)
            matches_as_facts = self.match_texts([text],
                                                as_texta_facts=True,
                                                field=field)
            # Stamp every fact with this tagger's description.
            for fact in matches_as_facts:
                fact.update(fact=self.description)
            pre_existing_facts = doc.get(TEXTA_TAGS_KEY, [])
            # Merge with existing facts and drop duplicates after each field.
            doc[TEXTA_TAGS_KEY] = ElasticDocument.remove_duplicate_facts(
                pre_existing_facts + matches_as_facts)
    return docs