Example #1
    def tag_doc(self, request, pk=None, project_pk=None):
        serializer = RegexTaggerGroupTagDocumentSerializer(data=request.data)
        serializer.is_valid(raise_exception=True)
        tagger_object: RegexTagger = self.get_object()

        input_document = serializer.validated_data["doc"]
        fields = serializer.validated_data["fields"]

        # apply tagger
        results = {
            "tagger_id": tagger_object.pk,
            "tag": tagger_object.description,
            "result": False,
            "matches": []
        }
        final_matches = []
        # flatten the document once so nested fields can be looked up by dotted path
        flattened_doc = ElasticCore(check_connection=False).flatten(input_document)
        for field in fields:
            # default to an empty string so missing fields don't break matching
            text = flattened_doc.get(field, "")
            matches = tagger_object.match_texts([text], as_texta_facts=False)

            if matches:
                for match in matches:
                    match.update(field=field)
                final_matches.extend(matches)
                results["result"] = True

        results["matches"] = final_matches

        return Response(results, status=status.HTTP_200_OK)
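
A minimal usage sketch for this action, assuming RegexTaggerGroupTagDocumentSerializer accepts the "doc" and "fields" keys read above; the document content and response values below are hypothetical:

# Example request payload for the tag_doc action (field names hypothetical).
payload = {
    "doc": {"text": "Sample sentence for the tagger to match."},
    "fields": ["text"]
}

# Shape of the response assembled above (values illustrative):
# {
#     "tagger_id": 1,
#     "tag": "<tagger description>",
#     "result": True,
#     "matches": [{"field": "text", ...}]  # one dict per regex match
# }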
Example #2
    def tag_random_doc(self, request, pk=None, project_pk=None):
        """Returns prediction for a random document in Elasticsearch."""
        # get tagger object
        tagger_object: RegexTaggerGroup = self.get_object()

        serializer = TagRandomDocSerializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        project_object = Project.objects.get(pk=project_pk)
        indices = [
            index["name"] for index in serializer.validated_data["indices"]
        ]
        indices = project_object.get_available_or_all_project_indices(indices)

        # retrieve tagger fields
        fields = serializer.validated_data["fields"]
        if not ElasticCore().check_if_indices_exist(tagger_object.project.get_indices()):
            return Response(
                {'error': f'One or more indices from {list(tagger_object.project.get_indices())} do not exist'},
                status=status.HTTP_400_BAD_REQUEST
            )

        # retrieve random document
        random_doc = ElasticSearcher(indices=indices).random_documents(size=1)[0]
        flattened_doc = ElasticCore(check_connection=False).flatten(random_doc)

        # apply tagger
        results = {
            "tagger_group_id": tagger_object.pk,
            "tagger_group_tag": tagger_object.description,
            "result": False,
            "matches": [],
            "document": flattened_doc
        }

        final_matches = []
        for field in fields:
            # default to an empty string so missing fields don't break matching
            text = flattened_doc.get(field, "")
            results["document"][field] = text
            matches = tagger_object.match_texts([text], as_texta_facts=True, field=field)

            if matches:
                final_matches.extend(matches)
                results["result"] = True

        results["matches"] = final_matches

        return Response(results, status=status.HTTP_200_OK)
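
A corresponding request sketch, assuming TagRandomDocSerializer reads "indices" and "fields" as used above; the index and field names are made up for illustration:

# Example request payload for tag_random_doc (names hypothetical).
payload = {
    "indices": [{"name": "my_project_index"}],
    "fields": ["comment_content"]
}

# The response echoes the flattened random document under "document" and,
# because as_texta_facts=True, returns matches as texta_facts dicts with
# the matched field recorded via the field= argument.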
Example #3
    def extract_from_random_doc(self, request, pk=None, project_pk=None):
        """Returns prediction for a random document in Elasticsearch."""
        # get rakun object
        rakun_object: RakunExtractor = RakunExtractor.objects.get(pk=pk)

        serializer = RakunExtractorRandomDocSerializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        project_object = Project.objects.get(pk=project_pk)
        indices = [index["name"] for index in serializer.validated_data["indices"]]
        indices = project_object.get_available_or_all_project_indices(indices)

        # retrieve rakun fields
        fields = serializer.validated_data["fields"]

        # retrieve param add_spans
        add_spans = serializer.validated_data["add_spans"]

        # retrieve random document
        random_doc = ElasticSearcher(indices=indices).random_documents(size=1)[0]
        flattened_doc = ElasticCore(check_connection=False).flatten(random_doc)

        # apply rakun
        results = {
            "rakun_id": rakun_object.pk,
            "description": rakun_object.description,
            "result": False,
            "keywords": [],
            "document": flattened_doc
        }
        final_keywords = []
        keyword_detector = rakun_object.load_rakun_keyword_detector()
        for field in fields:
            text = flattened_doc.get(field, "")
            results["document"][field] = text
            keywords = rakun_object.get_rakun_keywords(
                keyword_detector=keyword_detector,
                texts=[text],
                field_path=field,
                fact_name=rakun_object.description,
                fact_value="",
                add_spans=add_spans
            )

            if keywords:
                final_keywords.extend(keywords)
                results["result"] = True

        results["keywords"] = final_keywords
        return Response(results, status=status.HTTP_200_OK)
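
A request sketch in the same vein, assuming RakunExtractorRandomDocSerializer reads "indices", "fields" and "add_spans" as shown above; names are hypothetical:

# Example request payload for extract_from_random_doc (names hypothetical).
payload = {
    "indices": [{"name": "my_project_index"}],
    "fields": ["comment_content"],
    "add_spans": True  # include character spans with each extracted keyword
}

# Response keys assembled above: "rakun_id", "description", "result",
# "keywords" and "document".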
Example #4
    def tag_docs(self, fields: List[str], docs: List[dict]):
        # apply tagger
        for doc in docs:
            # flatten each document once instead of once per field
            flattened_doc = ElasticCore(check_connection=False).flatten(doc)
            for field in fields:
                # default to an empty string so missing fields don't break matching
                text = flattened_doc.get(field, "")
                matches_as_facts = self.match_texts([text], as_texta_facts=True, field=field)
                for fact in matches_as_facts:
                    fact.update(fact=self.description)

                pre_existing_facts = doc.get(TEXTA_TAGS_KEY, [])
                filtered_facts = ElasticDocument.remove_duplicate_facts(
                    pre_existing_facts + matches_as_facts)
                doc[TEXTA_TAGS_KEY] = filtered_facts

        return docs
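
Unlike the view actions above, tag_docs is called directly on a tagger instance. A minimal sketch, assuming `tagger` is a loaded instance of the class defining this method and TEXTA_TAGS_KEY is the same constant the method uses:

# Documents are returned with new facts merged into doc[TEXTA_TAGS_KEY],
# deduplicated against any pre-existing facts.
tagged_docs = tagger.tag_docs(
    fields=["text"],
    docs=[{"text": "Sample sentence for the tagger to match."}]
)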