Example #1
    def post(self, request, project_pk: int):
        """
        Returns existing fact names and values from Elasticsearch.
        """
        serializer = ProjectFactAggregatorSerializer(data=request.data)

        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)

        indices = serializer.validated_data["indices"]
        indices = [index["name"] for index in indices]

        # retrieve and validate project indices
        project = get_object_or_404(Project, pk=project_pk)
        self.check_object_permissions(request, project)
        project_indices = project.get_available_or_all_project_indices(indices)  # Gives all if none (the default) is entered.

        if not project_indices:
            return Response([])

        key_field = serializer.validated_data["key_field"]
        value_field = serializer.validated_data["value_field"]
        filter_by_key = serializer.validated_data["filter_by_key"]
        max_count = serializer.validated_data["max_count"]
        query = serializer.validated_data["query"]

        if isinstance(query, str):
            query = json.loads(query)

        aggregator = ElasticAggregator(indices=project_indices, query=query)
        results = aggregator.facts_abstract(key_field=key_field, value_field=value_field, filter_by_key=filter_by_key, size=max_count)

        return Response(results, status=status.HTTP_200_OK)
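A minimal client-side sketch of calling this endpoint, assuming a hypothetical base URL, route name, project id and token; the payload fields mirror what ProjectFactAggregatorSerializer expects above.

    import requests

    # All values below are assumptions for illustration -- adjust the host,
    # route, project id and token to your deployment.
    BASE_URL = "http://localhost:8000/api/v2"
    TOKEN = "<api-token>"

    payload = {
        "indices": [{"name": "my_index"}],
        "key_field": "fact",
        "value_field": "str_val",
        "filter_by_key": "",
        "max_count": 10,
        "query": {"query": {"match_all": {}}},
    }

    response = requests.post(
        f"{BASE_URL}/projects/1/aggregate_facts/",  # hypothetical route
        json=payload,
        headers={"Authorization": f"Token {TOKEN}"},
    )
    print(response.status_code, response.json())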
Example #2
    def stop_words(self, request, pk=None, project_pk=None):
        """Adds stop word to Rakun. Input should be a list of strings, e.g. ['word1', 'word2', 'word3']."""
        rakun_object = self.get_object()

        existing_stop_words = load_stop_words(rakun_object.stopwords)

        if self.request.method == 'GET':
            success = {'stopwords': existing_stop_words}
            return Response(success, status=status.HTTP_200_OK)

        elif self.request.method == 'POST':
            serializer = StopWordSerializer(data=request.data)

            # check if valid request
            if not serializer.is_valid():
                raise SerializerNotValid(detail=serializer.errors)

            new_stop_words = serializer.validated_data['stopwords']
            overwrite_existing = serializer.validated_data['overwrite_existing']

            if not overwrite_existing:
                # Add previous stopwords to the new ones
                new_stop_words += existing_stop_words

            # Remove duplicates
            new_stop_words = list(set(new_stop_words))

            # save rakun object
            rakun_object.stopwords = json.dumps(new_stop_words)
            rakun_object.save()

            return Response({"stopwords": new_stop_words}, status=status.HTTP_200_OK)
Example #3
 def tag_doc(self, request, pk=None, project_pk=None):
     """Returns list of tags for input document."""
     serializer = TaggerTagDocumentSerializer(data=request.data)
     # check if valid request
     if not serializer.is_valid():
         raise SerializerNotValid(detail=serializer.errors)
     # retrieve tagger object
     tagger_object = self.get_object()
     # check if tagger model exists
     if not tagger_object.model.path:
         raise NonExistantModelError()
     # declare input_document variable
     input_document = serializer.validated_data['doc']
     # load field data
     tagger_field_data = json.loads(tagger_object.fields)
     # validate input document
     input_document = validate_input_document(input_document,
                                              tagger_field_data)
     if isinstance(input_document, Exception):
         return input_document
     # apply tagger
     tagger_response = apply_tagger(
         tagger_object.id,
         input_document,
         input_type='doc',
         lemmatize=serializer.validated_data['lemmatize'],
         feedback=serializer.validated_data['feedback_enabled'],
     )
     # if feedback was enabled, add url
     tagger_response = add_finite_url_to_feedback(tagger_response, request)
     return Response(tagger_response, status=status.HTTP_200_OK)
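validate_input_document is expected to return either a usable document or an error object, which is why the caller checks isinstance(input_document, Exception). A rough sketch of that contract (not the actual helper, whose behaviour may differ):

    from rest_framework.exceptions import ValidationError


    def validate_input_document(input_document, field_data):
        """Return the document restricted to the trained fields, or an Exception."""
        if not isinstance(input_document, dict):
            return ValidationError("Input document must be a JSON object.")
        filtered = {key: value for key, value in input_document.items() if key in field_data}
        if not filtered:
            return ValidationError(f"Document has none of the trained fields: {field_data}.")
        return filtered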
Example #4
 def multitag_text(self, request, pk=None, project_pk=None):
     """
     Applies the list of tagger objects inside the project to any text.
     This is different from a Tagger Group, as **all** taggers in the project are used and they do not have to reside in the same Tagger Group.
     Returns list of tags.
     """
     serializer = TaggerMultiTagSerializer(data=request.data)
     # validate serializer
     if not serializer.is_valid():
         raise SerializerNotValid(detail=serializer.errors)
     # get project object
     project_object = Project.objects.get(pk=project_pk)
     # get available taggers from project
     taggers = Tagger.objects.filter(project=project_object).filter(
         task__status=Task.STATUS_COMPLETED)
     # filter again
     if serializer.validated_data['taggers']:
         taggers = taggers.filter(
             pk__in=serializer.validated_data['taggers'])
     # error if filtering resulted in 0 taggers
     if not taggers:
         raise NonExistantModelError(detail='No tagging models available.')
     # retrieve params
     lemmatize = serializer.validated_data['lemmatize']
     feedback = serializer.validated_data['feedback_enabled']
     text = serializer.validated_data['text']
     hide_false = serializer.validated_data['hide_false']
     # error if redis not available
     if not get_redis_status()['alive']:
         raise RedisNotAvailable()
     # lemmatize text just once before giving it to taggers!
     if lemmatize:
         text = CeleryLemmatizer().lemmatize(text)
     # tag text using celery group primitive
     group_task = group(
         apply_tagger.s(tagger.pk,
                        text,
                        input_type='text',
                        lemmatize=False,
                        feedback=feedback) for tagger in taggers)
     group_results = [
         a for a in group_task.apply(
             queue=CELERY_SHORT_TERM_TASK_QUEUE).get() if a
     ]
     # remove non-hits
     if hide_false:
         group_results = [a for a in group_results if a['result']]
     # if feedback was enabled, add urls
     group_results = [
         add_finite_url_to_feedback(a, request) for a in group_results
     ]
     # sort & return tags
     sorted_tags = sorted(group_results,
                          key=lambda k: k['probability'],
                          reverse=True)
     return Response(sorted_tags, status=status.HTTP_200_OK)
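The fan-out above uses Celery's group primitive: one apply_tagger signature per tagger, executed as a batch and collected with .get(). A self-contained toy showing the same pattern with eager, in-process execution (score is a made-up stand-in for apply_tagger):

    from celery import Celery, group

    app = Celery("demo")
    app.conf.task_always_eager = True  # run tasks in-process; no broker needed


    @app.task
    def score(tagger_id, text):
        # Stand-in for apply_tagger: fake hit with a per-tagger probability.
        return {"tagger": tagger_id, "result": True, "probability": 1.0 / tagger_id}


    results = group(score.s(pk, "some text") for pk in [1, 2, 3]).apply().get()
    print(sorted(results, key=lambda r: r["probability"], reverse=True))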
Example #5
    def post(self, request, project_pk: int):
        """
        Returns existing fact names and values from Elasticsearch.
        """
        serializer = ProjectGetFactsSerializer(data=request.data)

        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)

        indices = serializer.validated_data["indices"]
        indices = [index["name"] for index in indices]

        # retrieve and validate project indices
        project = get_object_or_404(Project, pk=project_pk)
        self.check_object_permissions(request, project)
        project_indices = project.get_available_or_all_project_indices(indices)  # Gives all if none (the default) is entered.

        if not project_indices:
            return Response([])

        vals_per_name = serializer.validated_data['values_per_name']
        include_values = serializer.validated_data['include_values']
        fact_name = serializer.validated_data['fact_name']
        include_doc_path = serializer.validated_data['include_doc_path']
        exclude_zero_spans = serializer.validated_data['exclude_zero_spans']
        mlp_doc_path = serializer.validated_data['mlp_doc_path']

        aggregator = ElasticAggregator(indices=project_indices)

        if mlp_doc_path and exclude_zero_spans:
            # If exclude_zero_spans is enabled and mlp_doc_path is specified, the other values don't have any effect -
            # this behaviour might need to change at some point
            fact_map = aggregator.facts(size=1, include_values=True, include_doc_path=True, exclude_zero_spans=exclude_zero_spans)

        else:
            fact_map = aggregator.facts(size=vals_per_name, include_values=include_values, filter_by_fact_name=fact_name, include_doc_path=include_doc_path, exclude_zero_spans=exclude_zero_spans)

        if fact_name:
            fact_map_list = [v for v in fact_map]

        elif mlp_doc_path and exclude_zero_spans:
            # Return only fact names where doc_path contains mlp_doc_path as a parent field and facts have spans.
            # NB! Doesn't take into account the situation where facts have the same name, but different doc paths! Could happen!
            fact_map_list = [k for k, v in fact_map.items() if v and mlp_doc_path == v[0]["doc_path"].rsplit(".", 1)[0]]

        elif include_values:
            fact_map_list = [{'name': k, 'values': v} for k, v in fact_map.items()]
        else:
            fact_map_list = [v for v in fact_map]
        return Response(fact_map_list, status=status.HTTP_200_OK)
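The mlp_doc_path branch compares mlp_doc_path against the parent path of each fact's doc_path, which rsplit produces by dropping the last segment. A quick illustration (the field name is made up):

    doc_path = "text.lemmas.entity"
    # rsplit(".", 1)[0] keeps everything before the last dot, i.e. the parent field.
    assert doc_path.rsplit(".", 1)[0] == "text.lemmas"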
Example #6
    def tag_text(self, request, pk=None, project_pk=None):
        """
        API endpoint for tagging raw text with tagger group.
        """
        logging.getLogger(INFO_LOGGER).info("[Tag Text] Starting tag_text...")
        data = request.data
        serializer = TaggerGroupTagTextSerializer(data=data)
        # check if valid request
        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)
        hybrid_tagger_object = self.get_object()
        # check if any of the models are ready
        if not hybrid_tagger_object.taggers.filter(
                task__status=Task.STATUS_COMPLETED):
            raise NonExistantModelError()
        # error if redis not available
        if not get_redis_status()['alive']:
            raise RedisNotAvailable()
        # declare tag candidates variables
        text = serializer.validated_data['text']
        n_similar_docs = serializer.validated_data['n_similar_docs']
        n_candidate_tags = serializer.validated_data['n_candidate_tags']
        lemmatize = serializer.validated_data['lemmatize']
        use_ner = serializer.validated_data['use_ner']
        feedback = serializer.validated_data['feedback_enabled']

        tagger_group_id = hybrid_tagger_object.pk
        # update text and tags with MLP
        text, tags = get_mlp(tagger_group_id,
                             text,
                             lemmatize=lemmatize,
                             use_ner=use_ner)
        # retrieve tag candidates
        tag_candidates = get_tag_candidates(tagger_group_id,
                                            text,
                                            ignore_tags=tags,
                                            n_similar_docs=n_similar_docs,
                                            max_candidates=n_candidate_tags)
        # get tags
        tags += apply_tagger_group(tagger_group_id,
                                   text,
                                   tag_candidates,
                                   request,
                                   input_type='text',
                                   feedback=feedback)
        return Response(tags, status=status.HTTP_200_OK)
Example #7
    def autocomplete_fact_names(self, request, pk=None, project_pk=None):
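        """Returns fact names from the project's indices that start with the given string."""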
        serializer = ProjectSuggestFactNamesSerializer(data=request.data)
        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)

        project_object: Project = self.get_object()
        indices = [index["name"] for index in serializer.validated_data["indices"]]
        indices = project_object.get_available_or_all_project_indices(indices)

        if not indices:
            return Response([])

        limit = serializer.validated_data['limit']
        startswith = serializer.validated_data['startswith']

        autocomplete = Autocomplete(project_object, indices, limit)
        fact_values = autocomplete.get_fact_names(startswith)

        return Response(fact_values, status=status.HTTP_200_OK)
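The Autocomplete helper isn't shown here; conceptually it narrows the project's fact names to those starting with the given prefix, capped at limit. A hypothetical pure-Python reduction of that behaviour:

    def suggest_fact_names(fact_names, startswith, limit):
        """Case-insensitive prefix match over known fact names (sketch only)."""
        prefix = startswith.lower()
        matches = [name for name in fact_names if name.lower().startswith(prefix)]
        return sorted(matches)[:limit]


    print(suggest_fact_names(["PER", "ORG", "PERCENT"], "pe", 10))  # ['PER', 'PERCENT']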
Example #8
    def post(self, request, project_pk: int):
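        """Returns a document from the project's indices by its ID."""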
        project: Project = get_object_or_404(Project, pk=project_pk)
        self.check_object_permissions(request, project)

        serializer = ProjectDocumentSerializer(data=request.data)
        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)

        indices = project.get_available_or_all_project_indices(serializer.validated_data["indices"])
        if not indices:
            raise ProjectValidationFailed(detail="No indices supplied and project has no indices")

        doc_id = serializer.validated_data["doc_id"]
        if not doc_id:
            raise InvalidInputDocument(detail="No doc_id supplied")

        es = ElasticDocument(index=indices)
        results = es.get(doc_id)
        return Response(results, status=status.HTTP_200_OK)
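ElasticDocument wraps retrieval by id over the project's indices. With the official elasticsearch-py client, the equivalent single-index lookup would be roughly the following (cluster address, index and id are made up):

    from elasticsearch import Elasticsearch

    es = Elasticsearch("http://localhost:9200")  # assumed local cluster
    document = es.get(index="my_index", id="some_document_id")["_source"]
    print(document)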
Example #9
    def post(self, request, project_pk: int):
        """Simplified search interface for making Elasticsearch queries."""
        serializer = ProjectSimplifiedSearchSerializer(data=request.data)
        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)

        project_object = get_object_or_404(Project, pk=project_pk)
        self.check_object_permissions(request, project_object)
        project_indices = list(project_object.get_indices())
        project_fields = project_object.get_elastic_fields(path_list=True)
        # test if indices exist
        if not project_indices:
            raise ProjectValidationFailed(detail="Project has no indices")
        # test if indices are valid
        if serializer.validated_data['match_indices']:
            if not set(serializer.validated_data['match_indices']).issubset(set(project_indices)):
                raise ProjectValidationFailed(detail=f"Index names are not valid for this project. allowed values are: {project_indices}")
        # test if fields are valid
        if serializer.validated_data['match_fields']:
            if not set(serializer.validated_data['match_fields']).issubset(set(project_fields)):
                raise ProjectValidationFailed(detail=f"Fields names are not valid for this project. allowed values are: {project_fields}")

        es = ElasticSearcher(indices=project_indices, output=ElasticSearcher.OUT_DOC)
        q = Query(operator=serializer.validated_data['operator'])
        # if input is string, convert to list
        # if unknown format, return error
        match_text = serializer.validated_data['match_text']
        if isinstance(match_text, list):
            match_texts = [str(item) for item in match_text if item]
        elif isinstance(match_text, str):
            match_texts = [match_text]
        else:
            return Response({'error': f'match text is in unknown format: {match_text}'}, status=status.HTTP_400_BAD_REQUEST)
        # add query filters
        for item in match_texts:
            q.add_string_filter(item, match_type=serializer.validated_data["match_type"])
        # update query
        es.update_query(q.query)
        # retrieve results
        results = es.search(size=serializer.validated_data["size"])
        return Response(results, status=status.HTTP_200_OK)
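The match_text handling above accepts either a string or a list of strings and rejects anything else. Factored out, the rule is simply:

    def normalize_match_text(match_text):
        """Mirrors the branch above: a list of non-empty strings, or None."""
        if isinstance(match_text, list):
            return [str(item) for item in match_text if item]
        if isinstance(match_text, str):
            return [match_text]
        return None


    assert normalize_match_text("foo") == ["foo"]
    assert normalize_match_text(["a", "", "b"]) == ["a", "b"]
    assert normalize_match_text(42) is None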
Example #10
 def tag_text(self, request, pk=None, project_pk=None):
     """Returns list of tags for input text."""
     serializer = TaggerTagTextSerializer(data=request.data)
     # check if valid request
     if not serializer.is_valid():
         raise SerializerNotValid(detail=serializer.errors)
     # retrieve tagger object
     tagger_object = self.get_object()
     # check if tagger model exists
     if not tagger_object.model.path:
         raise NonExistantModelError()
     # apply tagger
     tagger_response = apply_tagger(
         tagger_object.id,
         serializer.validated_data['text'],
         input_type='text',
         lemmatize=serializer.validated_data['lemmatize'],
         feedback=serializer.validated_data['feedback_enabled'])
     # if feedback was enabled, add url
     tagger_response = add_finite_url_to_feedback(tagger_response, request)
     return Response(tagger_response, status=status.HTTP_200_OK)
Example #11
 def tag_text(self, request, pk=None, project_pk=None):
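     """Applies MLP and the CRF extractor to the input text and returns the extraction results."""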
     serializer = CRFExtractorTagTextSerializer(data=request.data)
     # check if valid request
     if not serializer.is_valid():
         raise SerializerNotValid(detail=serializer.errors)
     # retrieve extractor object
     extractor: CRFExtractor = self.get_object()
     # check if extractor model exists
     if not extractor.model.path:
         raise NonExistantModelError()
     # apply mlp
     text = serializer.validated_data["text"]
     with allow_join_result():
         mlp = apply_mlp_on_list.apply_async(kwargs={"texts": [text], "analyzers": extractor.mlp_analyzers}, queue=CELERY_MLP_TASK_QUEUE).get()
         mlp_document = mlp[0]
     # apply extractor
     extractor_response = apply_crf_extractor(
         extractor.id,
         mlp_document
     )
     return Response(extractor_response, status=status.HTTP_200_OK)
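allow_join_result is needed because calling .get() on a subtask's result from inside another Celery task raises a RuntimeError by default; the context manager lifts that guard. A self-contained sketch of the pattern (mlp_stub is a made-up stand-in for apply_mlp_on_list):

    from celery import Celery
    from celery.result import allow_join_result

    app = Celery("demo")
    app.conf.task_always_eager = True  # in-process execution for the sketch


    @app.task
    def mlp_stub(texts):
        # Stand-in for apply_mlp_on_list: pretend analysis by lowercasing.
        return [text.lower() for text in texts]


    with allow_join_result():
        mlp = mlp_stub.apply_async(kwargs={"texts": ["Hello World"]}).get()
    print(mlp)  # ['hello world']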
Example #12
    def post(self, request, project_pk: int):
        """Executes **raw** Elasticsearch query on all project indices."""
        project = get_object_or_404(Project, pk=project_pk)
        self.check_object_permissions(request, project)
        serializer = ProjectSearchByQuerySerializer(data=request.data)

        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)

        indices = project.get_available_or_all_project_indices(serializer.validated_data["indices"])

        if not indices:
            raise ProjectValidationFailed(detail="No indices supplied and project has no indices")

        output_type = serializer.validated_data["output_type"] or ElasticSearcher.OUT_DOC_WITH_TOTAL_HL_AGGS
        es = ElasticSearcher(indices=indices, output=output_type)

        es.update_query(serializer.validated_data["query"])
        results = es.search()
        return Response(results, status=status.HTTP_200_OK)
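The query field takes any valid raw Elasticsearch query DSL; an illustrative body (the field name is made up) might be:

    query = {
        "query": {
            "bool": {
                "must": [{"match": {"text": "example"}}]
            }
        }
    }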
Example #13
    def tag_doc(self, request, pk=None, project_pk=None):
        """
        API endpoint for tagging JSON documents with tagger group.
        """
        logging.getLogger(INFO_LOGGER).info("[Tag Doc] Starting tag_doc...")
        data = request.data
        serializer = TaggerGroupTagDocumentSerializer(data=data)
        # check if valid request
        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)
        hybrid_tagger_object = self.get_object()
        # check if any of the models are ready
        if not hybrid_tagger_object.taggers.filter(
                task__status=Task.STATUS_COMPLETED):
            raise NonExistantModelError()
        # error if redis not available
        if not get_redis_status()['alive']:
            raise RedisNotAvailable(
                'Redis not available. Check if Redis is running.')
        # retrieve field data from the first element;
        # this is safe because all taggers inside a
        # hybrid tagger instance are trained on the same fields
        hybrid_tagger_field_data = json.loads(
            hybrid_tagger_object.taggers.first().fields)
        # declare input_document variable
        input_document = serializer.validated_data['doc']
        # validate input document
        input_document = validate_input_document(input_document,
                                                 hybrid_tagger_field_data)
        if isinstance(input_document, Exception):
            return input_document
        # combine document field values into one string
        combined_texts = '\n'.join(input_document.values())

        # declare tag candidates variables
        n_similar_docs = serializer.validated_data['n_similar_docs']
        n_candidate_tags = serializer.validated_data['n_candidate_tags']
        lemmatize = serializer.validated_data['lemmatize']
        use_ner = serializer.validated_data['use_ner']
        feedback = serializer.validated_data['feedback_enabled']

        tagger_group_id = hybrid_tagger_object.pk

        # update text and tags with MLP
        combined_texts, tags = get_mlp(tagger_group_id,
                                       combined_texts,
                                       lemmatize=lemmatize,
                                       use_ner=use_ner)
        # retrieve tag candidates
        tag_candidates = get_tag_candidates(tagger_group_id,
                                            combined_texts,
                                            ignore_tags=tags,
                                            n_similar_docs=n_similar_docs,
                                            max_candidates=n_candidate_tags)
        # get tags
        tags += apply_tagger_group(tagger_group_id,
                                   input_document,
                                   tag_candidates,
                                   request,
                                   input_type='doc',
                                   lemmatize=lemmatize,
                                   feedback=feedback)
        return Response(tags, status=status.HTTP_200_OK)