示例#1
0
 def multitag_text(self, request, pk=None, project_pk=None):
     """
     Apply the project's taggers to arbitrary text.

     Unlike a Tagger Group, **every** completed tagger in the project may be
     used and they do not have to belong to the same Tagger Group. Returns
     the produced tags sorted by probability (descending).
     """
     serializer = TaggerMultiTagSerializer(data=request.data)
     if not serializer.is_valid():
         raise SerializerNotValid(detail=serializer.errors)
     # Only taggers whose training task has completed are usable.
     project = Project.objects.get(pk=project_pk)
     usable_taggers = Tagger.objects.filter(project=project).filter(
         task__status=Task.STATUS_COMPLETED)
     # Optionally narrow down to an explicitly requested subset.
     requested_ids = serializer.validated_data['taggers']
     if requested_ids:
         usable_taggers = usable_taggers.filter(pk__in=requested_ids)
     if not usable_taggers:
         raise NonExistantModelError(detail='No tagging models available.')
     # Unpack tagging parameters.
     lemmatize = serializer.validated_data['lemmatize']
     feedback = serializer.validated_data['feedback_enabled']
     text = serializer.validated_data['text']
     hide_false = serializer.validated_data['hide_false']
     # The taggers run as celery tasks, so the broker must be up.
     if not get_redis_status()['alive']:
         raise RedisNotAvailable()
     # Lemmatize once up front so the individual taggers can skip it.
     if lemmatize:
         text = CeleryLemmatizer().lemmatize(text)
     # Fan the text out to every tagger with the celery group primitive.
     tagging_group = group(
         apply_tagger.s(tagger.pk,
                        text,
                        input_type='text',
                        lemmatize=False,
                        feedback=feedback) for tagger in usable_taggers)
     results = [
         item for item in tagging_group.apply(
             queue=CELERY_SHORT_TERM_TASK_QUEUE).get() if item
     ]
     # Optionally drop non-hits.
     if hide_false is True:
         results = [item for item in results if item['result']]
     # If feedback was enabled, attach the feedback URLs.
     results = [add_finite_url_to_feedback(item, request) for item in results]
     # Highest-probability tags first.
     sorted_tags = sorted(results, key=lambda item: item['probability'], reverse=True)
     return Response(sorted_tags, status=status.HTTP_200_OK)
示例#2
0
    def get(self, request):
        """Returns health statistics about host machine and running services."""
        anonymous = request.user.is_anonymous

        toolkit_status = {
            "services": {
                "elastic": get_elastic_status(anonymous),
                "redis": get_redis_status(anonymous),
            },
            "host": {},
            "toolkit": {
                "version": get_version(),
                "available_langs": DEFAULT_MLP_LANGUAGE_CODES,
            },
        }

        # Anonymous callers only get service liveness and the version.
        if anonymous is True:
            toolkit_status.pop("host")
            toolkit_status["toolkit"].pop("available_langs")
            return Response(toolkit_status, status=status.HTTP_200_OK)

        gib = 2 ** 30  # bytes per GiB, for the GB figures below

        disk_total, disk_used, disk_free = shutil.disk_usage("/")
        toolkit_status["host"]["disk"] = {
            "free": disk_free / gib,
            "total": disk_total / gib,
            "used": disk_used / gib,
            "unit": "GB"
        }

        memory = psutil.virtual_memory()
        toolkit_status["host"]["memory"] = {
            "free": memory.available / gib,
            "total": memory.total / gib,
            "used": memory.used / gib,
            "unit": "GB"
        }

        toolkit_status["host"]["cpu"] = {
            "percent": psutil.cpu_percent(),
            "count": os.cpu_count()
        }

        toolkit_status["host"]["gpu"] = {
            "count": torch.cuda.device_count(),
            "devices": get_gpu_devices()
        }

        toolkit_status["toolkit"]["active_tasks"] = get_active_tasks(
            toolkit_status["services"]["redis"]["alive"])

        return Response(toolkit_status, status=status.HTTP_200_OK)
示例#3
0
    def tag_text(self, request, pk=None, project_pk=None):
        """
        API endpoint for tagging raw text with tagger group.

        Validates the payload, optionally runs MLP (lemmatization / NER) over
        the text, picks candidate tags from similar documents and applies the
        tagger group. Returns the list of tags.
        """
        # NOTE: no f-string prefix — the message has no placeholders.
        logging.getLogger(INFO_LOGGER).info("[Tag Text] Starting tag_text...")
        serializer = TaggerGroupTagTextSerializer(data=request.data)
        # check if valid request
        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)
        hybrid_tagger_object = self.get_object()
        # check if any of the models ready
        if not hybrid_tagger_object.taggers.filter(
                task__status=Task.STATUS_COMPLETED):
            raise NonExistantModelError()
        # error if redis not available
        if not get_redis_status()['alive']:
            raise RedisNotAvailable()
        # declare tag candidates variables
        text = serializer.validated_data['text']
        n_similar_docs = serializer.validated_data['n_similar_docs']
        n_candidate_tags = serializer.validated_data['n_candidate_tags']
        lemmatize = serializer.validated_data['lemmatize']
        use_ner = serializer.validated_data['use_ner']
        feedback = serializer.validated_data['feedback_enabled']

        # reuse the object fetched above instead of a second self.get_object() query
        tagger_group_id = hybrid_tagger_object.pk
        # update text and tags with MLP
        text, tags = get_mlp(tagger_group_id,
                             text,
                             lemmatize=lemmatize,
                             use_ner=use_ner)
        # retrieve tag candidates
        tag_candidates = get_tag_candidates(tagger_group_id,
                                            text,
                                            ignore_tags=tags,
                                            n_similar_docs=n_similar_docs,
                                            max_candidates=n_candidate_tags)
        # get tags
        tags += apply_tagger_group(tagger_group_id,
                                   text,
                                   tag_candidates,
                                   request,
                                   input_type='text',
                                   feedback=feedback)
        return Response(tags, status=status.HTTP_200_OK)
示例#4
0
    def apply_to_index(self, request, pk=None, project_pk=None):
        """
        Start a background job applying this Tagger Group to the given indices.

        Creates a Task row for progress tracking and schedules the actual work
        on the long-term celery queue once the transaction commits.
        """
        with transaction.atomic():
            # We're pulling the serializer with the function bc otherwise it will not
            # fetch the context for whatever reason.
            serializer = self.get_serializer(data=request.data)
            serializer.is_valid(raise_exception=True)

            # Check the broker BEFORE creating any database rows so a dead
            # Redis does not trigger needless writes that only get rolled back.
            if not get_redis_status()['alive']:
                raise RedisNotAvailable(
                    'Redis not available. Check if Redis is running.')

            tagger_group_object = self.get_object()
            tagger_group_object.task = Task.objects.create(
                taggergroup=tagger_group_object,
                status=Task.STATUS_CREATED,
                task_type=Task.TYPE_APPLY)
            tagger_group_object.save()

            # Raises Project.DoesNotExist for an unknown project_pk.
            Project.objects.get(pk=project_pk)
            indices = [
                index["name"] for index in serializer.validated_data["indices"]
            ]

            if not ElasticCore().check_if_indices_exist(indices):
                return Response(
                    {
                        'error':
                        f'One or more index from {indices} does not exist'
                    },
                    status=status.HTTP_400_BAD_REQUEST)

            fields = serializer.validated_data["fields"]
            fact_name = serializer.validated_data["new_fact_name"]
            query = serializer.validated_data["query"]
            bulk_size = serializer.validated_data["bulk_size"]
            max_chunk_bytes = serializer.validated_data["max_chunk_bytes"]
            es_timeout = serializer.validated_data["es_timeout"]
            use_ner = serializer.validated_data["use_ner"]
            lemmatize = serializer.validated_data["lemmatize"]
            n_similar_docs = serializer.validated_data["n_similar_docs"]
            n_candidate_tags = serializer.validated_data["n_candidate_tags"]
            max_tags = serializer.validated_data["max_tags"]

            object_args = {
                "n_similar_docs": n_similar_docs,
                "n_candidate_tags": n_candidate_tags,
                "lemmatize": lemmatize,
                "use_ner": use_ner
            }

            # fact value is always tagger description when applying the tagger group
            fact_value = ""
            object_type = "tagger_group"

            args = (pk, indices, fields, fact_name, fact_value, query,
                    bulk_size, max_chunk_bytes, es_timeout, object_type,
                    object_args, max_tags)
            # Dispatch only after the surrounding transaction commits so the
            # worker is guaranteed to see the Task row created above.
            transaction.on_commit(lambda: apply_tagger_to_index.apply_async(
                args=args, queue=CELERY_LONG_TERM_TASK_QUEUE))

            # Fixed: this endpoint applies a Tagger Group, not a single Tagger.
            message = "Started process of applying Tagger Group with id: {}".format(
                tagger_group_object.id)
            return Response({"message": message},
                            status=status.HTTP_201_CREATED)
示例#5
0
    def tag_random_doc(self, request, pk=None, project_pk=None):
        """
        API endpoint for tagging a random document.

        Picks one random document from the requested (or all available)
        indices, applies the tagger group to the fields the taggers were
        trained on, and returns the document together with its tags.
        """
        # NOTE: no f-string prefix — the message has no placeholders.
        logging.getLogger(INFO_LOGGER).info(
            "[Tag Random doc] Starting tag_random_doc...")
        # get hybrid tagger object
        hybrid_tagger_object = self.get_object()

        # check if any of the models ready
        if not hybrid_tagger_object.taggers.filter(
                task__status=Task.STATUS_COMPLETED):
            raise NonExistantModelError()

        # all taggers of a group are trained on the same fields,
        # so the first tagger's field list describes the whole group
        first_tagger = hybrid_tagger_object.taggers.first()
        tagger_fields = json.loads(first_tagger.fields)

        # error if redis not available
        if not get_redis_status()['alive']:
            raise RedisNotAvailable(
                'Redis not available. Check if Redis is running.')

        serializer = TagRandomDocSerializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        indices = [
            index["name"] for index in serializer.validated_data["indices"]
        ]
        indices = first_tagger.get_available_or_all_indices(indices)

        if not ElasticCore().check_if_indices_exist(indices):
            return Response(
                {
                    'error':
                    f'One or more index from {list(indices)} does not exist'
                },
                status=status.HTTP_400_BAD_REQUEST)

        # retrieve random document
        random_doc = ElasticSearcher(indices=indices).random_documents(
            size=1)[0]
        # keep only the fields the taggers were trained on
        random_doc_filtered = {
            k: v
            for k, v in random_doc.items() if k in tagger_fields
        }

        # reuse the object fetched above instead of a second self.get_object() query
        tagger_group_id = hybrid_tagger_object.pk

        # combine document field values into one string
        combined_texts = '\n'.join(random_doc_filtered.values())
        combined_texts, tags = get_mlp(tagger_group_id,
                                       combined_texts,
                                       lemmatize=False)
        # retrieve tag candidates
        tag_candidates = get_tag_candidates(tagger_group_id,
                                            combined_texts,
                                            ignore_tags=tags)
        # get tags
        tags += apply_tagger_group(tagger_group_id,
                                   random_doc_filtered,
                                   tag_candidates,
                                   request,
                                   input_type='doc')
        # return document with tags
        response = {"document": random_doc, "tags": tags}
        return Response(response, status=status.HTTP_200_OK)
示例#6
0
    def tag_doc(self, request, pk=None, project_pk=None):
        """
        API endpoint for tagging JSON documents with tagger group.

        Validates the document against the taggers' training fields, runs MLP
        over the combined field text, picks candidate tags and applies the
        tagger group. Returns the list of tags.
        """
        # NOTE: no f-string prefix — the message has no placeholders.
        logging.getLogger(INFO_LOGGER).info("[Tag Doc] Starting tag_doc...")
        serializer = TaggerGroupTagDocumentSerializer(data=request.data)
        # check if valid request
        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)
        hybrid_tagger_object = self.get_object()
        # check if any of the models ready
        if not hybrid_tagger_object.taggers.filter(
                task__status=Task.STATUS_COMPLETED):
            raise NonExistantModelError()
        # error if redis not available
        if not get_redis_status()['alive']:
            raise RedisNotAvailable(
                'Redis not available. Check if Redis is running.')
        # retrieve field data from the first element
        # we can do that safely because all taggers inside
        # hybrid tagger instance are trained on same fields
        hybrid_tagger_field_data = json.loads(
            hybrid_tagger_object.taggers.first().fields)
        # declare input_document variable
        input_document = serializer.validated_data['doc']
        # validate input document
        input_document = validate_input_document(input_document,
                                                 hybrid_tagger_field_data)
        if isinstance(input_document, Exception):
            return input_document
        # combine document field values into one string
        combined_texts = '\n'.join(input_document.values())

        # declare tag candidates variables
        n_similar_docs = serializer.validated_data['n_similar_docs']
        n_candidate_tags = serializer.validated_data['n_candidate_tags']
        lemmatize = serializer.validated_data['lemmatize']
        use_ner = serializer.validated_data['use_ner']
        feedback = serializer.validated_data['feedback_enabled']

        # reuse the object fetched above instead of a second self.get_object() query
        tagger_group_id = hybrid_tagger_object.pk

        # update text and tags with MLP
        combined_texts, tags = get_mlp(tagger_group_id,
                                       combined_texts,
                                       lemmatize=lemmatize,
                                       use_ner=use_ner)
        # retrieve tag candidates
        tag_candidates = get_tag_candidates(tagger_group_id,
                                            combined_texts,
                                            ignore_tags=tags,
                                            n_similar_docs=n_similar_docs,
                                            max_candidates=n_candidate_tags)
        # get tags
        tags += apply_tagger_group(tagger_group_id,
                                   input_document,
                                   tag_candidates,
                                   request,
                                   input_type='doc',
                                   lemmatize=lemmatize,
                                   feedback=feedback)
        return Response(tags, status=status.HTTP_200_OK)