def post(self, request, project_pk: int): """ Returns existing fact names and values from Elasticsearch. """ serializer = ProjectFactAggregatorSerializer(data=request.data) if not serializer.is_valid(): raise SerializerNotValid(detail=serializer.errors) indices = serializer.validated_data["indices"] indices = [index["name"] for index in indices] # retrieve and validate project indices project = get_object_or_404(Project, pk=project_pk) self.check_object_permissions(request, project) project_indices = project.get_available_or_all_project_indices(indices) # Gives all if n one, the default, is entered. if not project_indices: return Response([]) key_field = serializer.validated_data["key_field"] value_field = serializer.validated_data["value_field"] filter_by_key = serializer.validated_data["filter_by_key"] max_count = serializer.validated_data["max_count"] query = serializer.validated_data["query"] if isinstance(query, str): query = json.loads(query) aggregator = ElasticAggregator(indices=project_indices, query=query) results = aggregator.facts_abstract(key_field=key_field, value_field=value_field, filter_by_key=filter_by_key, size=max_count) return Response(results, status=status.HTTP_200_OK)
def stop_words(self, request, pk=None, project_pk=None):
    """Adds stop words to Rakun. Input should be a list of strings, e.g. ['word1', 'word2', 'word3']."""
    rakun_object = self.get_object()
    existing_stop_words = load_stop_words(rakun_object.stopwords)

    if self.request.method == 'GET':
        success = {'stopwords': existing_stop_words}
        return Response(success, status=status.HTTP_200_OK)

    elif self.request.method == 'POST':
        serializer = StopWordSerializer(data=request.data)

        # check if valid request
        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)

        new_stop_words = serializer.validated_data['stopwords']
        overwrite_existing = serializer.validated_data['overwrite_existing']

        if not overwrite_existing:
            # Add previous stop words to the new ones
            new_stop_words += existing_stop_words

        # Remove duplicates
        new_stop_words = list(set(new_stop_words))

        # save rakun object
        rakun_object.stopwords = json.dumps(new_stop_words)
        rakun_object.save()

        return Response({"stopwords": new_stop_words}, status=status.HTTP_200_OK)
def tag_doc(self, request, pk=None, project_pk=None): """Returns list of tags for input document.""" serializer = TaggerTagDocumentSerializer(data=request.data) # check if valid request if not serializer.is_valid(): raise SerializerNotValid(detail=serializer.errors) # retrieve tagger object tagger_object = self.get_object() # check if tagger exists if not tagger_object.model.path: raise NonExistantModelError() # declare input_document variable input_document = serializer.validated_data['doc'] # load field data tagger_field_data = json.loads(tagger_object.fields) # validate input document input_document = validate_input_document(input_document, tagger_field_data) if isinstance(input_document, Exception): return input_document # apply tagger tagger_response = apply_tagger( tagger_object.id, input_document, input_type='doc', lemmatize=serializer.validated_data['lemmatize'], feedback=serializer.validated_data['feedback_enabled'], ) # if feedback was enabled, add url tagger_response = add_finite_url_to_feedback(tagger_response, request) return Response(tagger_response, status=status.HTTP_200_OK)
def multitag_text(self, request, pk=None, project_pk=None):
    """
    Applies list of tagger objects inside the project to any text.
    This is different from a Tagger Group, as **all** taggers in the project are used
    and they do not have to reside in the same Tagger Group.
    Returns list of tags.
    """
    serializer = TaggerMultiTagSerializer(data=request.data)

    # validate serializer
    if not serializer.is_valid():
        raise SerializerNotValid(detail=serializer.errors)

    # get project object
    project_object = Project.objects.get(pk=project_pk)

    # get available taggers from project
    taggers = Tagger.objects.filter(project=project_object).filter(task__status=Task.STATUS_COMPLETED)

    # filter again
    if serializer.validated_data['taggers']:
        taggers = taggers.filter(pk__in=serializer.validated_data['taggers'])

    # error if filtering resulted in 0 taggers
    if not taggers:
        raise NonExistantModelError(detail='No tagging models available.')

    # retrieve params
    lemmatize = serializer.validated_data['lemmatize']
    feedback = serializer.validated_data['feedback_enabled']
    text = serializer.validated_data['text']
    hide_false = serializer.validated_data['hide_false']

    # error if redis not available
    if not get_redis_status()['alive']:
        raise RedisNotAvailable()

    # lemmatize text just once before giving it to taggers!
    if lemmatize:
        text = CeleryLemmatizer().lemmatize(text)

    # tag text using celery group primitive
    group_task = group(apply_tagger.s(tagger.pk, text, input_type='text', lemmatize=False, feedback=feedback) for tagger in taggers)
    group_results = [a for a in group_task.apply(queue=CELERY_SHORT_TERM_TASK_QUEUE).get() if a]

    # remove non-hits
    if hide_false is True:
        group_results = [a for a in group_results if a['result']]

    # if feedback was enabled, add urls
    group_results = [add_finite_url_to_feedback(a, request) for a in group_results]

    # sort & return tags
    sorted_tags = sorted(group_results, key=lambda k: k['probability'], reverse=True)

    return Response(sorted_tags, status=status.HTTP_200_OK)
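# Illustrative request body for the multitag endpoint above, built from the serializer fields
# the view reads; the values and tagger IDs are assumptions, not taken from this codebase.
#
# {
#     "text": "Some raw text to tag.",
#     "taggers": [],                 # optional subset of tagger IDs; empty/omitted means all completed taggers in the project
#     "lemmatize": false,
#     "feedback_enabled": false,
#     "hide_false": true
# }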
def post(self, request, project_pk: int): """ Returns existing fact names and values from Elasticsearch. """ serializer = ProjectGetFactsSerializer(data=request.data) if not serializer.is_valid(): raise SerializerNotValid(detail=serializer.errors) indices = serializer.validated_data["indices"] indices = [index["name"] for index in indices] # retrieve and validate project indices project = get_object_or_404(Project, pk=project_pk) self.check_object_permissions(request, project) project_indices = project.get_available_or_all_project_indices(indices) # Gives all if n one, the default, is entered. if not project_indices: return Response([]) vals_per_name = serializer.validated_data['values_per_name'] include_values = serializer.validated_data['include_values'] fact_name = serializer.validated_data['fact_name'] include_doc_path = serializer.validated_data['include_doc_path'] exclude_zero_spans = serializer.validated_data['exclude_zero_spans'] mlp_doc_path = serializer.validated_data['mlp_doc_path'] aggregator = ElasticAggregator(indices=project_indices) if mlp_doc_path and exclude_zero_spans: # If exclude_zerp_spans is enabled and mlp_doc_path specified, the other values don't have any effect - # this behaviour might need to change at some point fact_map = aggregator.facts(size=1, include_values=True, include_doc_path=True, exclude_zero_spans=exclude_zero_spans) else: fact_map = aggregator.facts(size=vals_per_name, include_values=include_values, filter_by_fact_name=fact_name, include_doc_path=include_doc_path, exclude_zero_spans=exclude_zero_spans) if fact_name: fact_map_list = [v for v in fact_map] elif mlp_doc_path and exclude_zero_spans: # Return only fact names where doc_path contains mlp_doc_path as a parent field and facts have spans. # NB! Doesn't take into account the situation where facts have the same name, but different doc paths! Could happen! fact_map_list = [k for k, v in fact_map.items() if v and mlp_doc_path == v[0]["doc_path"].rsplit(".", 1)[0]] elif include_values: fact_map_list = [{'name': k, 'values': v} for k, v in fact_map.items()] else: fact_map_list = [v for v in fact_map] return Response(fact_map_list, status=status.HTTP_200_OK)
def tag_text(self, request, pk=None, project_pk=None): """ API endpoint for tagging raw text with tagger group. """ logging.getLogger(INFO_LOGGER).info(f"[Tag Text] Starting tag_text...") data = request.data serializer = TaggerGroupTagTextSerializer(data=data) # check if valid request if not serializer.is_valid(): raise SerializerNotValid(detail=serializer.errors) hybrid_tagger_object = self.get_object() # check if any of the models ready if not hybrid_tagger_object.taggers.filter( task__status=Task.STATUS_COMPLETED): raise NonExistantModelError() # error if redis not available if not get_redis_status()['alive']: raise RedisNotAvailable() # declare tag candidates variables text = serializer.validated_data['text'] n_similar_docs = serializer.validated_data['n_similar_docs'] n_candidate_tags = serializer.validated_data['n_candidate_tags'] lemmatize = serializer.validated_data['lemmatize'] use_ner = serializer.validated_data['use_ner'] feedback = serializer.validated_data['feedback_enabled'] tagger_group_id = self.get_object().pk # update text and tags with MLP text, tags = get_mlp(tagger_group_id, text, lemmatize=lemmatize, use_ner=use_ner) # retrieve tag candidates tag_candidates = get_tag_candidates(tagger_group_id, text, ignore_tags=tags, n_similar_docs=n_similar_docs, max_candidates=n_candidate_tags) # get tags tags += apply_tagger_group(tagger_group_id, text, tag_candidates, request, input_type='text', feedback=feedback) return Response(tags, status=status.HTTP_200_OK)
def autocomplete_fact_names(self, request, pk=None, project_pk=None):
    """Returns fact names from the project's indices that start with the given string."""
    serializer = ProjectSuggestFactNamesSerializer(data=request.data)
    if not serializer.is_valid():
        raise SerializerNotValid(detail=serializer.errors)

    project_object: Project = self.get_object()
    indices = [index["name"] for index in serializer.validated_data["indices"]]
    indices = project_object.get_available_or_all_project_indices(indices)
    if not indices:
        return Response([])

    limit = serializer.validated_data['limit']
    startswith = serializer.validated_data['startswith']

    autocomplete = Autocomplete(project_object, indices, limit)
    fact_values = autocomplete.get_fact_names(startswith)

    return Response(fact_values, status=status.HTTP_200_OK)
def post(self, request, project_pk: int):
    """Retrieves a document by its ID from the project's indices."""
    project: Project = get_object_or_404(Project, pk=project_pk)
    self.check_object_permissions(request, project)

    serializer = ProjectDocumentSerializer(data=request.data)
    if not serializer.is_valid():
        raise SerializerNotValid(detail=serializer.errors)

    indices = project.get_available_or_all_project_indices(serializer.validated_data["indices"])
    if not indices:
        raise ProjectValidationFailed(detail="No indices supplied and project has no indices")

    doc_id = serializer.validated_data["doc_id"]
    if not doc_id:
        raise InvalidInputDocument(detail="No doc_id supplied")

    es = ElasticDocument(index=indices)
    results = es.get(doc_id)

    return Response(results, status=status.HTTP_200_OK)
def post(self, request, project_pk: int): """Simplified search interface for making Elasticsearch queries.""" serializer = ProjectSimplifiedSearchSerializer(data=request.data) if not serializer.is_valid(): raise SerializerNotValid(detail=serializer.errors) project_object = get_object_or_404(Project, pk=project_pk) self.check_object_permissions(request, project_object) project_indices = list(project_object.get_indices()) project_fields = project_object.get_elastic_fields(path_list=True) # test if indices exist if not project_indices: raise ProjectValidationFailed(detail="Project has no indices") # test if indices are valid if serializer.validated_data['match_indices']: if not set(serializer.validated_data['match_indices']).issubset(set(project_indices)): raise ProjectValidationFailed(detail=f"Index names are not valid for this project. allowed values are: {project_indices}") # test if fields are valid if serializer.validated_data['match_fields']: if not set(serializer.validated_data['match_fields']).issubset(set(project_fields)): raise ProjectValidationFailed(detail=f"Fields names are not valid for this project. allowed values are: {project_fields}") es = ElasticSearcher(indices=project_indices, output=ElasticSearcher.OUT_DOC) q = Query(operator=serializer.validated_data['operator']) # if input is string, convert to list # if unknown format, return error match_text = serializer.validated_data['match_text'] if isinstance(match_text, list): match_texts = [str(item) for item in match_text if item] elif isinstance(match_text, str): match_texts = [match_text] else: return Response({'error': f'match text is in unknown format: {match_text}'}, status=status.HTTP_400_BAD_REQUEST) # add query filters for item in match_texts: q.add_string_filter(item, match_type=serializer.validated_data["match_type"]) # update query es.update_query(q.query) # retrieve results results = es.search(size=serializer.validated_data["size"]) return Response(results, status=status.HTTP_200_OK)
def tag_text(self, request, pk=None, project_pk=None): """Returns list of tags for input text.""" serializer = TaggerTagTextSerializer(data=request.data) # check if valid request if not serializer.is_valid(): raise SerializerNotValid(detail=serializer.errors) # retrieve tagger object tagger_object = self.get_object() # check if tagger exists if not tagger_object.model.path: raise NonExistantModelError() # apply tagger tagger_response = apply_tagger( tagger_object.id, serializer.validated_data['text'], input_type='text', lemmatize=serializer.validated_data['lemmatize'], feedback=serializer.validated_data['feedback_enabled']) # if feedback was enabled, add url tagger_response = add_finite_url_to_feedback(tagger_response, request) return Response(tagger_response, status=status.HTTP_200_OK)
def tag_text(self, request, pk=None, project_pk=None):
    """Applies the CRF extractor to raw input text (MLP is applied first)."""
    serializer = CRFExtractorTagTextSerializer(data=request.data)

    # check if valid request
    if not serializer.is_valid():
        raise SerializerNotValid(detail=serializer.errors)

    # retrieve extractor object
    extractor: CRFExtractor = self.get_object()

    # check if extractor model exists
    if not extractor.model.path:
        raise NonExistantModelError()

    # apply mlp
    text = serializer.validated_data["text"]
    with allow_join_result():
        mlp = apply_mlp_on_list.apply_async(kwargs={"texts": [text], "analyzers": extractor.mlp_analyzers}, queue=CELERY_MLP_TASK_QUEUE).get()
        mlp_document = mlp[0]

    # apply extractor
    extractor_response = apply_crf_extractor(extractor.id, mlp_document)

    return Response(extractor_response, status=status.HTTP_200_OK)
def post(self, request, project_pk: int): """Executes **raw** Elasticsearch query on all project indices.""" project = get_object_or_404(Project, pk=project_pk) self.check_object_permissions(request, project) serializer = ProjectSearchByQuerySerializer(data=request.data) if not serializer.is_valid(): raise SerializerNotValid(detail=serializer.errors) indices = project.get_available_or_all_project_indices(serializer.validated_data["indices"]) if not indices: raise ProjectValidationFailed(detail="No indices supplied and project has no indices") es = None if serializer.validated_data["output_type"]: es = ElasticSearcher(indices=indices, output=serializer.validated_data["output_type"]) else: es = ElasticSearcher(indices=indices, output=ElasticSearcher.OUT_DOC_WITH_TOTAL_HL_AGGS) es.update_query(serializer.validated_data["query"]) results = es.search() return Response(results, status=status.HTTP_200_OK)
def tag_doc(self, request, pk=None, project_pk=None): """ API endpoint for tagging JSON documents with tagger group. """ logging.getLogger(INFO_LOGGER).info(f"[Tag Doc] Starting tag_doc...") data = request.data serializer = TaggerGroupTagDocumentSerializer(data=data) # check if valid request if not serializer.is_valid(): raise SerializerNotValid(detail=serializer.errors) hybrid_tagger_object = self.get_object() # check if any of the models ready if not hybrid_tagger_object.taggers.filter( task__status=Task.STATUS_COMPLETED): raise NonExistantModelError() # error if redis not available if not get_redis_status()['alive']: raise RedisNotAvailable( 'Redis not available. Check if Redis is running.') # retrieve field data from the first element # we can do that safely because all taggers inside # hybrid tagger instance are trained on same fields hybrid_tagger_field_data = json.loads( hybrid_tagger_object.taggers.first().fields) # declare input_document variable input_document = serializer.validated_data['doc'] # validate input document input_document = validate_input_document(input_document, hybrid_tagger_field_data) if isinstance(input_document, Exception): return input_document # combine document field values into one string combined_texts = '\n'.join(input_document.values()) # declare tag candidates variables n_similar_docs = serializer.validated_data['n_similar_docs'] n_candidate_tags = serializer.validated_data['n_candidate_tags'] lemmatize = serializer.validated_data['lemmatize'] use_ner = serializer.validated_data['use_ner'] feedback = serializer.validated_data['feedback_enabled'] tagger_group_id = self.get_object().pk # update text and tags with MLP combined_texts, tags = get_mlp(tagger_group_id, combined_texts, lemmatize=lemmatize, use_ner=use_ner) # retrieve tag candidates tag_candidates = get_tag_candidates(tagger_group_id, combined_texts, ignore_tags=tags, n_similar_docs=n_similar_docs, max_candidates=n_candidate_tags) # get tags tags += apply_tagger_group(tagger_group_id, input_document, tag_candidates, request, input_type='doc', lemmatize=lemmatize, feedback=feedback) return Response(tags, status=status.HTTP_200_OK)