Example No. 1
    def get_significant_words(indices: List[str], fields: List[str], document_ids: List[str], stop_words: List = None, exclude=""):
        """
        This is a helper function to parse all the given fields and use the document_ids
        as input to make a significant_words aggregation.
        Args:
            exclude: Regex compatible string for which words to exclude, uses the exclude parameter of Elasticsearch aggregations.
            stop_words: Optional parameter to remove stopwords from the results.
            indices: Indices from which to perform the aggregation.
            fields: From which fields can you get the text content needed for comparison.
            document_ids: IDs of the documents you want to use as baseline for the aggregation.

        Returns: List of dictionaries with the significant word and how many times it occurs in the documents.

        """
        ed = ElasticDocument("*")
        ea = ElasticAggregator(indices=indices)

        stop_words = StopWords._get_stop_words(custom_stop_words=stop_words)
        # Validate that those documents exist.
        validated_docs: List[dict] = ed.get_bulk(document_ids)
        if validated_docs:
            unique_ids = list(set([index["_id"] for index in validated_docs]))
            significant_words = []
            for field in fields:
                sw = ea.get_significant_words(document_ids=unique_ids, field=field, stop_words=stop_words, exclude=exclude)
                significant_words += sw

            return significant_words
        else:
            return []
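A minimal usage sketch of the helper above; the index name, fields, document IDs and stop words are placeholders, and a configured texta-toolkit environment is assumed:

    # Hypothetical call; all argument values below are illustrative.
    significant = get_significant_words(
        indices=["my_index"],
        fields=["comment_text"],
        document_ids=["doc_1", "doc_2"],
        stop_words=["the", "and"],
        exclude=""
    )
    # Expected: a list of dicts pairing each significant word with its occurrence count.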
Example No. 2
 def get(self, request, pk: int, index: str, document_id: str):
     validate_index_and_project_perms(request, pk, index)
     ed = ElasticDocument(index)
     document = ed.get(document_id)
     if not document:
         raise NotFound(f"Could not find document with ID '{document_id}' from index '{index}'!")
     return Response(document)
Example No. 3
 def _get_sample_document(self, id_field: str, id_value: str, index: str):
     query = Search().query(Q("term", **{f"{id_field}.keyword": id_value})).to_dict()
     es = ElasticSearcher(query=query, output=ElasticSearcher.OUT_RAW)
     ed = ElasticDocument(index=index)
     response = es.search()["hits"]["hits"]
     document = response[0] if response else None
     return ed, document
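The elasticsearch_dsl expression above reduces to a plain term query on the keyword sub-field; an equivalent raw dictionary, using illustrative values for id_field and id_value:

    # Equivalent raw query for id_field="uuid", id_value="abc-123" (illustrative values).
    query = {"query": {"term": {"uuid.keyword": "abc-123"}}}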
Example No. 4
    def post(self, request, pk: int, index: str, document_id: str):
        validate_index_and_project_perms(request, pk, index)
        serializer = self.get_serializer(data=request.data)
        if serializer.is_valid(raise_exception=True):
            ed = ElasticDocument(index)
            document = ed.get(document_id, fields=[TEXTA_TAGS_KEY])
            if not document:
                raise NotFound(
                    f"Could not find document with ID '{document_id}' from index '{index}'!"
                )

            document = document.get("_source")
            target_facts = serializer.validated_data.get("facts", [])
            existing_facts = document.get(TEXTA_TAGS_KEY, [])

            # Keep only the existing facts that are not matched by any fact marked for removal.
            new_facts = []
            for existing_fact in existing_facts:
                if not any(fact.items() <= existing_fact.items() for fact in target_facts):
                    new_facts.append(existing_fact)

            document[TEXTA_TAGS_KEY] = new_facts
            ed.update(index, document_id, doc=document)
            return Response({
                "message": f"Removed given facts from document with the ID of {document_id}!"
            })
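The removal logic relies on dict.items() behaving like a set, so a target fact counts as a match when all of its key-value pairs occur in an existing fact; a small standalone illustration (the fact keys below are illustrative):

    # dict.items() supports set-style comparison; a partial fact matches a fuller one.
    target = {"fact": "PER", "str_val": "John"}
    existing = {"fact": "PER", "str_val": "John", "doc_path": "text"}
    print(target.items() <= existing.items())           # True  -> would be removed
    print({"fact": "ORG"}.items() <= existing.items())  # False -> would be kept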
Example No. 5
    def post(self, request, pk: int):
        ed = ElasticDocument(index=None)

        # Validate payload and project permissions.
        serializer: InsertDocumentsSerializer = self.get_serializer(data=request.data)
        serializer.is_valid(raise_exception=True)
        project = get_object_or_404(Project, pk=pk)
        if project.users.filter(pk=request.user.pk).exists() is False:
            raise PermissionDenied("You do not have permissions for this project!")

        # Split indices on whether they have index access or lack any index details at all.
        documents = serializer.validated_data["documents"]
        split_fields = serializer.validated_data["split_text_in_fields"]
        indices = project.get_indices()

        correct_actions, failed_actions, missing_actions = self._split_documents_per_index(allowed_indices=indices, documents=documents)
        missing_actions, index_name, has_new_index = self._normalize_missing_index_values(missing_actions, project.pk, indices)
        split_actions = self._split_text(correct_actions + missing_actions, split_fields)

        if has_new_index:
            ed.core.create_index(index_name)
            ed.core.add_texta_facts_mapping(index_name)
            index, is_created = Index.objects.get_or_create(name=index_name, is_open=True)
            project.indices.add(index)

        # Send the documents to Elasticsearch.
        success_count, errors = ed.bulk_add_generator(actions=split_actions, stats_only=False)
        return Response(
            {
                "successfully_indexed": success_count,
                "errors": errors,
                "failed_index_permissions": len(failed_actions)
            }
        )
Example No. 6
def apply_mlp_on_es_docs(self, source_and_meta_docs: List[str], mlp_id: int):
    """
    Applies MLP on documents received by previous tasks and updates them in Elasticsearch.
    :param self: Reference to the Celery Task object of this task, courtesy of the bind parameter in the decorator.
    :param source_and_meta_docs: List of Elasticsearch document ID's to pull from Elasticsearch.
    :param mlp_id: ID of the MLPObject which contains progress.
    """
    mlp_object = get_mlp_object(mlp_id)

    task_object = mlp_object.task

    # Get the necessary fields.
    field_data: List[str] = json.loads(mlp_object.fields)
    if TEXTA_TAGS_KEY not in field_data:
        # Add in existing facts so that proper duplicate filtering would be applied.
        field_data.append(TEXTA_TAGS_KEY)

    analyzers: List[str] = json.loads(mlp_object.analyzers)

    # retrieve document from ES
    document_wrapper = ElasticDocument(index=None)
    source_and_meta_docs = document_wrapper.get_bulk(
        doc_ids=source_and_meta_docs, fields=field_data)

    source_documents = [doc["_source"] for doc in source_and_meta_docs]
    mlp_docs = apply_mlp_on_documents(source_documents, analyzers, field_data,
                                      mlp_id)
    es_documents = unite_source_with_meta(source_and_meta_docs, mlp_docs)
    update_documents_in_es(es_documents)

    # Update progress
    task_object.update_progress_iter(len(source_and_meta_docs))
    return True
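get_bulk returns full Elasticsearch hits, which is why the code above pulls out "_source" before running MLP and later reunites the results with the metadata; an assumed shape of one returned item (field names illustrative):

    # Assumed shape of one item returned by ElasticDocument.get_bulk(); only
    # "_source" is passed through MLP, while "_index"/"_id" are kept for the update.
    example_hit = {
        "_index": "my_index",   # illustrative
        "_id": "abc123",        # illustrative
        "_source": {"text": "raw text", "texta_facts": []}
    }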
Example No. 7
 def _initialize_es(self, project_pk, text_processor, callback_progress,
                    prediction_to_match):
     # create es doc
     es_doc = ElasticDocument(self.feedback_index)
     # if no model objects, return nones for query and search
     if not self.model_object:
         return es_doc, None, None
     # create matching query
     query = Query()
     query.add_string_filter(query_string=self.model_object.MODEL_TYPE,
                             fields=["model_type"])
     if self.model_object:
         query.add_string_filter(query_string=str(self.model_object.pk),
                                 fields=["model_id"])
     if prediction_to_match:
         query.add_string_filter(query_string=prediction_to_match,
                                 fields=["correct_result"])
     # if no index, don't create searcher object
     if not self.check_index_exists():
         return es_doc, None, query.query
     # create es search
     es_search = ElasticSearcher(indices=self.feedback_index,
                                 query=query.query,
                                 text_processor=text_processor,
                                 output=ElasticSearcher.OUT_DOC_WITH_ID,
                                 callback_progress=callback_progress)
     # return objects
     return es_doc, es_search, query.query
Example No. 8
def fact_delete_query_task(self, worker_id: int):
    worker_object = DeleteFactsByQueryTask.objects.get(pk=worker_id)

    try:
        show_progress = ShowProgress(worker_object.task, multiplier=1)
        show_progress.update_step(
            'Scrolling through the indices to delete the facts.')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        target_facts = json.loads(worker_object.facts)
        scroll_size = worker_object.scroll_size

        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=indices,
            field_data=[TEXTA_TAGS_KEY],
            output=ElasticSearcher.OUT_RAW,
            callback_progress=show_progress,
            scroll_size=scroll_size,
            scroll_timeout=f"{worker_object.es_timeout}m")

        ed = ElasticDocument(index=None)
        actions = query_delete_actions_generator(searcher, target_facts)
        ed.bulk_update(actions)

        worker_object.task.complete()
        worker_object.save()

        return worker_id

    except Exception as e:
        worker_object.task.handle_failed_task(e)
        raise e
Example No. 9
def update_documents_in_es(documents: List[dict]):
    """
    Updates the documents inside Elasticsearch, either with the MLP results or the
    error messages.

    :param documents: Full Elasticsearch documents.
    """
    ed = ElasticDocument(index=None)
    ed.bulk_update(actions=documents)
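bulk_update consumes the documents as bulk actions; the exact action format is not shown in these examples, so the sketch below assumes the standard elasticsearch helpers update convention with illustrative values:

    # Assumed action shape (elasticsearch.helpers-style partial update); values are illustrative.
    action = {
        "_op_type": "update",
        "_index": "my_index",
        "_id": "abc123",
        "doc": {"some_field": "updated value"}
    }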
Example No. 10
    def delete(self, request, pk: int, index: str, document_id: str):
        validate_index_and_project_perms(request, pk, index)

        try:
            ed = ElasticDocument(index)
            document = ed.delete(doc_id=document_id)
            return Response(document)
        except texta_elastic.exceptions.NotFoundError:
            return Response(status=status.HTTP_404_NOT_FOUND)
Example No. 11
def tag_cluster(self, cluster_pk: int, clustering_object_pk: int, fact: dict):
    ed = ElasticDocument("")
    cluster = Cluster.objects.get(pk=cluster_pk)
    clustering_object = ClusteringResult.objects.get(pk=clustering_object_pk)
    doc_ids = json.loads(cluster.document_ids)
    ignored_ids = json.loads(clustering_object.ignored_ids)
    ed.add_fact_to_documents(fact=fact, doc_ids=doc_ids)
    clustering_object.ignored_ids = json.dumps(doc_ids + ignored_ids)
    clustering_object.save()
    return True
Example No. 12
    def patch(self, request, pk: int, index: str, document_id: str):
        validate_index_and_project_perms(request, pk, index)

        try:
            ed = ElasticDocument(index)
            document = ed.update(index=index, doc_id=document_id, doc=request.data)
            return Response(document)
        except elasticsearch.exceptions.RequestError as e:
            if e.error == "mapper_parsing_exception":  # TODO Extend the decorator with different variants of the request error instead.
                return Response(e.info["error"]["root_cause"], status=status.HTTP_400_BAD_REQUEST)
        except texta_elastic.exceptions.NotFoundError:
            return Response(status=status.HTTP_404_NOT_FOUND)
Example No. 13
    def pull_document_by_id(self, request, pk=None, project_pk=None):
        annotator: Annotator = self.get_object()
        serializer = self.get_serializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        ed = ElasticDocument(index=annotator.get_indices())
        document_id = serializer.validated_data["document_id"]
        document = ed.get(document_id)
        if document:
            document = self._process_document_output(document, annotator)
            return Response(document)
        else:
            return Response({"message": "No such document!"},
                            status=status.HTTP_404_NOT_FOUND)
Example No. 14
def apply_search_fields_tagger_on_index(object_id: int):
    """Apply Search Fields Tagger to index."""
    search_fields_tagger = SearchFieldsTagger.objects.get(pk=object_id)
    task_object = search_fields_tagger.task
    try:
        progress = ShowProgress(task_object)
        progress.update_step('scrolling search fields')

        # Get the necessary fields.
        indices: List[str] = search_fields_tagger.get_indices()
        fields: List[str] = json.loads(search_fields_tagger.fields)
        fact_name: str = search_fields_tagger.fact_name
        scroll_timeout = search_fields_tagger.es_timeout
        scroll_size = search_fields_tagger.bulk_size

        use_breakup = search_fields_tagger.use_breakup
        breakup_character = search_fields_tagger.breakup_character

        ec = ElasticCore()
        [ec.add_texta_facts_mapping(index) for index in indices]

        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=json.loads(search_fields_tagger.query),
            output=ElasticSearcher.OUT_RAW,
            scroll_timeout=f"{scroll_timeout}m",
            callback_progress=progress,
            scroll_size=scroll_size)

        actions = update_search_fields_generator(
            generator=searcher,
            ec=ec,
            fields=fields,
            fact_name=fact_name,
            search_field_tagger_object=search_fields_tagger,
            use_breakup=use_breakup,
            breakup_character=breakup_character)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)
        return object_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
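update_search_fields_generator is not shown here; judging from the use_breakup and breakup_character parameters, field values are split into separate fact values before tagging. A hypothetical illustration of that split, not the actual generator code:

    # Hypothetical illustration only.
    use_breakup, breakup_character = True, "\n"
    field_value = "olgu\nõnnistatud\npüha"
    fact_values = field_value.split(breakup_character) if use_breakup else [field_value]
    # -> ["olgu", "õnnistatud", "püha"]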
Example No. 15
def apply_rakun_extractor_to_index(self, object_id: int, indices: List[str],
                                   fields: List[str], query: dict,
                                   es_timeout: int, bulk_size: int,
                                   fact_name: str, add_spans: bool):
    """Apply Rakun Keyword Extractor to index."""
    logging.getLogger(INFO_LOGGER).info(
        f"Starting task 'apply_rakun_extractor_to_index' with ID: {object_id}!"
    )
    rakun_extractor_object = RakunExtractor.objects.get(id=object_id)
    try:
        progress = ShowProgress(rakun_extractor_object.task)

        # retrieve fields
        field_data = fields

        ec = ElasticCore()
        [ec.add_texta_facts_mapping(index) for index in indices]

        searcher = ElasticSearcher(
            indices=indices,
            field_data=field_data + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            timeout=f"{es_timeout}m",
            output=ElasticSearcher.OUT_RAW,
            callback_progress=progress,
            scroll_size=bulk_size)
        keyword_detector = rakun_extractor_object.load_rakun_keyword_detector()
        actions = update_generator(
            keyword_detector=keyword_detector,
            generator=searcher,
            ec=ec,
            fields=field_data,
            rakun_extractor_object=rakun_extractor_object,
            fact_name=fact_name,
            fact_value="",
            add_spans=add_spans)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)

        rakun_extractor_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        rakun_extractor_object.task.add_error(error_message)
        rakun_extractor_object.task.update_status(Task.STATUS_FAILED)
Example No. 16
def apply_analyzers_on_indices(self, worker_id: int):
    worker_object = ApplyESAnalyzerWorker.objects.get(pk=worker_id)
    task_object = worker_object.task
    try:
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step(
            'scrolling through the indices to apply lang')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        fields = json.loads(worker_object.fields)
        detect_lang = worker_object.detect_lang
        snowball_language = worker_object.stemmer_lang
        scroll_timeout = f"{worker_object.es_timeout}m"
        scroll_size = worker_object.bulk_size
        analyzers = json.loads(worker_object.analyzers)
        tokenizer = worker_object.tokenizer
        strip_html = worker_object.strip_html

        searcher = ElasticSearcher(query=json.loads(worker_object.query),
                                   indices=indices,
                                   field_data=fields,
                                   output=ElasticSearcher.OUT_RAW,
                                   callback_progress=show_progress,
                                   scroll_size=scroll_size,
                                   scroll_timeout=scroll_timeout)

        task_object.set_total(searcher.count())

        actions = process_analyzer_actions(generator=searcher,
                                           worker=worker_object,
                                           detect_lang=detect_lang,
                                           snowball_language=snowball_language,
                                           fields_to_parse=fields,
                                           analyzers=analyzers,
                                           tokenizer=tokenizer,
                                           strip_html=strip_html)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        ed.bulk_update(actions=actions, chunk_size=scroll_size)

        worker_object.task.complete()

        return worker_id

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
Example No. 17
    def import_dataset(self) -> list:
        error_container = []
        # retrieve content from file
        success, file_content = self._get_file_content()
        file_content = file_content.dropna(how="all")

        # check if file was parsed
        if not success:
            error_container.append('unknown file type')
            return error_container

        # convert content to list of records (dicts)
        records = file_content.to_dict(orient='records')
        # set num_records
        self.num_records = len(records)
        # set total number of documents for progress
        if self.show_progress:
            self.show_progress.set_total(self.num_records)

        # add documents to ES
        es_doc = ElasticDocument(self.index)
        # create index
        es_doc.core.create_index(self.index)
        # add mapping for texta facts
        es_doc.core.add_texta_facts_mapping(self.index)
        # get records
        chunk_size = 500
        records = [{
            k: v
            for k, v in record.items() if pd.Series(v).notna().all()
        } for record in records]
        record_chunks = list(chunks(records, chunk_size))

        for documents in record_chunks:
            success, errors = es_doc.bulk_add(documents,
                                              chunk_size=chunk_size,
                                              stats_only=False,
                                              raise_on_error=False)
            self.num_records_success += success
            if self.show_progress:
                self.show_progress.update(success)

            for error in list(errors):
                message = error["index"]["error"]["reason"] if isinstance(
                    error, dict) else str(error)
                error_container.append(message)

        return error_container
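The record cleanup above removes NaN values that pandas produces for empty cells before the documents are indexed; a small standalone illustration of that comprehension:

    import pandas as pd

    record = {"title": "doc", "body": float("nan"), "tags": ["a", "b"]}
    cleaned = {k: v for k, v in record.items() if pd.Series(v).notna().all()}
    # -> {"title": "doc", "tags": ["a", "b"]}; the NaN "body" value is dropped.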
Example No. 18
def apply_crf_extractor_to_index(object_id: int, indices: List[str],
                                 mlp_fields: List[str], label_suffix: str,
                                 query: dict, bulk_size: int,
                                 max_chunk_bytes: int, es_timeout: int):
    """
    Applies Extractor to ES index.
    """
    try:
        # load model
        crf_object = CRFExtractorObject.objects.get(pk=object_id)
        extractor = crf_object.load_extractor()
        # progress
        progress = ShowProgress(crf_object.task)
        # add fact field if missing
        ec = ElasticCore()
        [ec.add_texta_facts_mapping(index) for index in indices]
        # search
        searcher = ElasticSearcher(
            indices=indices,
            field_data=mlp_fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            output=ElasticSearcher.OUT_RAW,
            timeout=f"{es_timeout}m",
            callback_progress=progress,
            scroll_size=bulk_size)
        # create update actions
        actions = update_generator(generator=searcher,
                                   ec=ec,
                                   mlp_fields=mlp_fields,
                                   label_suffix=label_suffix,
                                   object_id=object_id,
                                   extractor=extractor)
        # perform updates
        try:
            # as we have defined indices in actions there is no need to do it again (None)
            ElasticDocument(None).bulk_update(actions)
        except Exception as e:
            logging.getLogger(ERROR_LOGGER).exception(e)
        # all done
        crf_object.task.complete()
        return True

    except Exception as e:
        crf_object.task.handle_failed_task(e)
        raise e
Example No. 19
    def setUp(self):
        self.user = create_test_user('user', '*****@*****.**', 'pw')
        self.new_test_index_name = f"ttk_test_query_tagger_{uuid.uuid4().hex[:5]}"
        self.project = project_creation("SearchQueryTaggerTestProject", self.new_test_index_name, self.user)
        self.project.users.add(self.user)
        self.url = reverse("v2:search_query_tagger-list", kwargs={"project_pk": self.project.pk})

        self.uuid = "adasda-5874856a-das4das98f5"
        self.document = {
            "Field_1": "This is sentence1. This is sentence2. This is sentence3. This is sentence4. This is sentence5.",
            "Field_2": "This is a different sentence.",
            "Field_3": "This is test data.",
            "uuid": self.uuid}

        self.ed = ElasticDocument(index=self.new_test_index_name)

        self.ed.add(self.document)
        self.client.login(username='******', password='******')
Example No. 20
    def post(self, request, project_pk: int):
        project: Project = get_object_or_404(Project, pk=project_pk)
        self.check_object_permissions(request, project)

        serializer = ProjectDocumentSerializer(data=request.data)
        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)

        indices = project.get_available_or_all_project_indices(serializer.validated_data["indices"])
        if not indices:
            raise ProjectValidationFailed(detail="No indices supplied and project has no indices")

        doc_id = serializer.validated_data["doc_id"]
        if not doc_id:
            raise InvalidInputDocument(detail="No doc_id supplied")

        es = ElasticDocument(index=indices)
        results = es.get(doc_id)
        return Response(results, status=status.HTTP_200_OK)
Example No. 21
    def count_indices(self, request, pk=None, project_pk=None):
        serializer = self.get_serializer(data=request.data)
        serializer.is_valid()

        indices = [{"name": name} for name in serializer.validated_data.get("indices", [])]
        serializer = IndexSerializer(data=indices, many=True)
        serializer.is_valid(raise_exception=True)

        project: Project = self.get_object()
        ed = ElasticDocument(None)

        indices = [index["name"] for index in indices]
        if indices:
            # We check for indices before to prevent the default behaviour of picking all the indices in project.
            indices = project.get_available_or_all_project_indices(indices)
            count = ed.count(indices=indices)
            return Response(count)
        else:
            return Response(0)
Example No. 22
    def setUpTestData(cls):
        cls.user = create_test_user('user', '*****@*****.**', 'pw')
        cls.project = project_creation("SummarizerTestProject",
                                       "test_summarizer_index", cls.user)
        cls.project.users.add(cls.user)
        cls.url = reverse("v2:summarizer_index-list",
                          kwargs={"project_pk": cls.project.pk})

        cls.uuid = "adasda-5874856a-das4das98f4"
        cls.document = {
            "Field_1": "This is sentence1. This is sentence2. This is sentence3. This is sentence4. This is sentence5.",
            "uuid": cls.uuid
        }

        cls.ed = ElasticDocument(index="test_summarizer_index")

        cls.ed.add(cls.document)

        cls.summarizer_id = None
Example No. 23
def apply_lang_on_indices(self, apply_worker_id: int):
    worker_object = ApplyLangWorker.objects.get(pk=apply_worker_id)
    task_object = worker_object.task
    try:
        load_mlp()
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step(
            'scrolling through the indices to apply lang')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        field = worker_object.field

        scroll_size = 100
        searcher = ElasticSearcher(query=json.loads(worker_object.query),
                                   indices=indices,
                                   field_data=[field],
                                   output=ElasticSearcher.OUT_RAW,
                                   callback_progress=show_progress,
                                   scroll_size=scroll_size,
                                   scroll_timeout="15m")

        for index in indices:
            searcher.core.add_texta_facts_mapping(index=index)

        actions = process_lang_actions(generator=searcher,
                                       field=field,
                                       worker_id=apply_worker_id,
                                       mlp_class=mlp)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)

        worker_object.task.complete()

        return apply_worker_id

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
Example No. 24
def apply_summarizer_on_index(self, summarizer_id: int):
    summarizer_object = Summarizer.objects.get(pk=summarizer_id)
    task_object = summarizer_object.task
    try:
        load_sumy()
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling summarizer')

        # Get the necessary fields.
        indices: List[str] = summarizer_object.get_indices()
        field_data: List[str] = json.loads(summarizer_object.fields)
        ratio_data: float = summarizer_object.ratio
        algorithm_data: List[str] = summarizer_object.algorithm

        scroll_size = 100
        searcher = ElasticSearcher(query=json.loads(summarizer_object.query),
                                   indices=indices,
                                   field_data=field_data,
                                   output=ElasticSearcher.OUT_RAW,
                                   callback_progress=show_progress,
                                   scroll_size=scroll_size,
                                   scroll_timeout="30m")

        actions = process_actions(searcher,
                                  field_data,
                                  ratio_data,
                                  algorithm=algorithm_data,
                                  summarizer_class=sumy,
                                  summarizer_id=summarizer_id)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)
        return summarizer_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
Example No. 25
    def retrieve(self, request, *args, **kwargs):
        # API v1 to v2 compliance
        if "clustering_pk" in self.kwargs:
            topic_analyzer_pk = self.kwargs["clustering_pk"]
        elif "topic_analyzer_pk" in self.kwargs:
            topic_analyzer_pk = self.kwargs["topic_analyzer_pk"]

        queryset = Cluster.objects.filter(
            clusteringresult__project__pk=self.kwargs["project_pk"],
            clusteringresult__pk=topic_analyzer_pk)
        cluster = get_object_or_404(queryset, pk=self.kwargs["pk"])

        doc_ids = json.loads(cluster.document_ids)
        fields = json.loads(cluster.fields)
        indices = json.loads(cluster.indices)
        significant_words = json.loads(cluster.significant_words)
        display_fields = json.loads(cluster.display_fields)

        if display_fields:
            fields += display_fields

        ed = ElasticDocument(index=",".join(indices))

        documents = ed.get_bulk(doc_ids, flatten=True)
        documents = documents if documents else []
        documents = [{
            "id": doc["_id"],
            "index": doc["_index"],
            "content": doc["_source"]
        } for doc in documents]

        formatted_cluster = {
            "id": cluster.pk,
            "intracluster_similarity": cluster.intracluster_similarity,
            "document_count": cluster.get_document_count(),
            "significant_words": significant_words,
            "documents": documents
        }
        return Response(formatted_cluster)
Example No. 26
    def update(self, request, *args, **kwargs):
        serializer = ClusterSerializer(data=request.data, partial=True)
        serializer.is_valid()

        cluster = Cluster.objects.get(pk=kwargs["pk"])
        clustering_object = ClusteringResult.objects.get(
            pk=kwargs["topic_analyzer_pk"])

        fields = json.loads(cluster.fields)
        stop_words = json.loads(clustering_object.stop_words)
        indices = json.loads(cluster.indices)

        if "document_ids" in serializer.validated_data:
            document_ids = serializer.validated_data["document_ids"]
            ed = ElasticDocument("*")

            # Validate that those documents exist.
            validated_docs = ed.get_bulk(document_ids)
            if validated_docs:
                unique_ids = list(
                    set([index["_id"] for index in validated_docs]))
                cluster.document_ids = json.dumps(unique_ids)

                sw = Cluster.get_significant_words(indices=indices,
                                                   fields=fields,
                                                   document_ids=unique_ids,
                                                   stop_words=stop_words)
                cluster.significant_words = json.dumps(sw)

                cluster_content = ClusterContent(
                    unique_ids,
                    vectors_filepath=clustering_object.vector_model.name)
                cluster.intracluster_similarity = cluster_content.get_intracluster_similarity()
            else:
                cluster.document_ids = json.dumps([])

        cluster.save()
        return Response({"message": "Cluster has been updated successfully!"})
Example No. 27
    def skip_document(self, request, pk=None, project_pk=None):
        serializer: DocumentIDSerializer = self.get_serializer(
            data=request.data)
        serializer.is_valid(raise_exception=True)
        annotator: Annotator = self.get_object()

        ed = ElasticDocument(index=annotator.get_indices())
        document_id = serializer.validated_data["document_id"]
        document = ed.get(document_id)
        texta_annotations = document["_source"].get("texta_annotator", [])

        for texta_annotation in texta_annotations:
            processed_timestamp = texta_annotation.get("processed_timestamp_utc", None)
            if processed_timestamp:
                return Response({
                    "detail": f"Document with ID: {serializer.validated_data['document_id']} is already annotated"
                })

        annotator.skip_document(serializer.validated_data["document_id"],
                                serializer.validated_data["index"],
                                user=request.user)
        return Response({
            "detail": f"Skipped document with ID: {serializer.validated_data['document_id']}"
        })
Example No. 28
    def get_new_index_name(project_id: int, indices: List[str] = []):
        """
        Creates a name for the new index based on the number of documents already in the project-related indices.
        New name is given based on the number of indices matching the base name pattern.
        This prevents the indices from getting too large during production.
        """
        base_index_name = f"texta-{DEPLOY_KEY}-import-project-{project_id}"

        indices = DocumentImportView.get_indices_with_timestamp(base_index_name)
        sorted_indices = sorted(indices, reverse=True, key=lambda x: x["creation_date"])

        # if no indices exist for the pattern, use base name
        if not sorted_indices:
            return base_index_name
        # get last index name
        last_index_name = sorted_indices[0]["name"]
        # count documents in last index
        last_index_count = ElasticDocument(index=last_index_name).count()
        # compare count
        if last_index_count >= get_core_setting("TEXTA_ES_MAX_DOCS_PER_INDEX"):
            # generate new name based on number of existing indices
            new_index_name = f"{base_index_name}-{len(sorted_indices)}"
            return new_index_name
        return last_index_name
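A sketch of the resulting naming pattern; DEPLOY_KEY and the project ID are deployment-specific, so the values below are illustrative:

    # Illustrative values only; the suffix is the number of indices already matching the pattern.
    base_index_name = "texta-mydeploy-import-project-12"
    existing_indices = ["texta-mydeploy-import-project-12"]  # e.g. only the base index exists so far
    new_index_name = f"{base_index_name}-{len(existing_indices)}"
    # -> "texta-mydeploy-import-project-12-1"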
Example No. 29
    def setUp(self):
        self.user = create_test_user('user', '*****@*****.**', 'pw')
        self.index_uuid = uuid.uuid4().hex[:5]
        self.new_test_index_name = f"ttk_test_fields_tagger_{self.index_uuid}"

        self.ed = ElasticDocument(index=self.new_test_index_name)
        self.ed.core.es.indices.create(index=self.new_test_index_name, ignore=[400, 404])

        self.project = project_creation("SearchFieldsTaggerTestProject", self.new_test_index_name, self.user)
        self.project.users.add(self.user)
        self.url = reverse(f"{VERSION_NAMESPACE}:search_fields_tagger-list", kwargs={"project_pk": self.project.pk})

        self.uuid = uuid.uuid4().hex[:10]
        self.document = {
            "Field_1": "This is sentence1. This is sentence2. This is sentence3. This is sentence4. This is sentence5.",
            "Field_2": "This is a different sentence.",
            "Field_3": "This is test data.",
            "newline_break": "olgu\nõnnistatud\npüha\nkäsikranaat",
            "array_break": ["olgu", "õnnistatud", "püha", "käsikranaat"],
            "uuid": self.uuid
        }

        self.ed.add(self.document)
        self.client.login(username='******', password='******')
Example No. 30
    def add_documents(self, request, *args, **kwargs):
        serializer = ClusteringIdsSerializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        clustering_pk = ClusterViewSet.__handle_clustering_pk(kwargs)
        clustering_obj = ClusteringResult.objects.get(pk=clustering_pk)
        cluster_obj = clustering_obj.cluster_result.get(pk=kwargs["pk"])
        indices = clustering_obj.get_indices()
        stop_words = json.loads(clustering_obj.stop_words)
        fields = json.loads(clustering_obj.fields)

        ed = ElasticDocument(indices)

        # Get full elasticsearch documents with id, index, type and source values.
        existing_documents: List[dict] = ed.get_bulk(
            serializer.validated_data["ids"])
        existing_documents: List[str] = [
            document["_id"] for document in existing_documents
        ]

        saved_documents = json.loads(cluster_obj.document_ids)
        unique_ids = list(set(existing_documents + saved_documents))
        cluster_obj.document_ids = json.dumps(unique_ids)

        # get texts of new documents
        new_documents = []
        phraser = None
        new_ids = [
            doc_id for doc_id in unique_ids if doc_id not in saved_documents
        ]
        if len(new_ids) > 0:
            indices = clustering_obj.get_indices()
            stop_words = json.loads(clustering_obj.stop_words)
            ignored_ids = json.loads(clustering_obj.ignored_ids)
            fields = json.loads(clustering_obj.fields)
            document_limit = clustering_obj.document_limit
            query = {"query": {"ids": {"values": new_ids}}}

            text_processor = TextProcessor(remove_stop_words=True,
                                           custom_stop_words=stop_words)
            elastic_search = ElasticSearcher(
                indices=indices,
                query=query,
                text_processor=text_processor,
                ignore_ids=set(ignored_ids),
                output=ElasticSearcher.OUT_TEXT_WITH_ID,
                field_data=fields,
                scroll_limit=document_limit)

            for doc_id, text in elastic_search:
                new_documents.append({"id": doc_id, "text": text})

            if clustering_obj.embedding:
                embedding = clustering_obj.embedding.get_embedding()
                embedding.load_django(clustering_obj.embedding)
                phraser = embedding.phraser
            else:
                phraser = None

        # Update the similarity score since the documents were changed.
        cc = ClusterContent(doc_ids=unique_ids,
                            vectors_filepath=clustering_obj.vector_model.path)
        cluster_obj.intracluster_similarity = float(
            cc.get_intracluster_similarity(new_documents=new_documents,
                                           phraser=phraser))

        # Update the significant words since the documents were changed.
        sw = Cluster.get_significant_words(indices=indices,
                                           fields=fields,
                                           document_ids=unique_ids,
                                           stop_words=stop_words)
        cluster_obj.significant_words = json.dumps(sw)

        cluster_obj.save()
        return Response({
            "message": f"{len(new_ids)} new document(s) successfully added to the cluster!"
        })