# NOTE: only stdlib imports are shown; project-internal helpers (ElasticDocument,
# get_mlp_object, apply_mlp_on_documents, unite_source_with_meta,
# update_documents_in_es, TEXTA_TAGS_KEY) are imported from the surrounding module.
import json
from typing import List


def apply_mlp_on_es_docs(self, source_and_meta_docs: List[str], mlp_id: int):
    """
    Applies MLP on documents received by previous tasks and updates them in Elasticsearch.

    :param self: Reference to the Celery Task object of this task, courtesy of the bind parameter in the decorator.
    :param source_and_meta_docs: List of Elasticsearch document IDs to pull from Elasticsearch.
    :param mlp_id: ID of the MLPObject which contains progress.
    """
    mlp_object = get_mlp_object(mlp_id)
    task_object = mlp_object.task

    # Get the necessary fields.
    field_data: List[str] = json.loads(mlp_object.fields)
    if TEXTA_TAGS_KEY not in field_data:
        # Add in existing facts so that proper duplicate filtering would be applied.
        field_data.append(TEXTA_TAGS_KEY)

    analyzers: List[str] = json.loads(mlp_object.analyzers)

    # Retrieve the documents from Elasticsearch.
    document_wrapper = ElasticDocument(index=None)
    source_and_meta_docs = document_wrapper.get_bulk(doc_ids=source_and_meta_docs, fields=field_data)
    source_documents = [doc["_source"] for doc in source_and_meta_docs]

    # Apply MLP, merge the results back into the full documents and push the updates to Elasticsearch.
    mlp_docs = apply_mlp_on_documents(source_documents, analyzers, field_data, mlp_id)
    es_documents = unite_source_with_meta(source_and_meta_docs, mlp_docs)
    update_documents_in_es(es_documents)

    # Update progress.
    task_object.update_progress_iter(len(source_and_meta_docs))
    return True
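
def example_queue_mlp_batch():
    # Minimal usage sketch, assuming apply_mlp_on_es_docs is registered as a Celery
    # task with bind=True (the decorator is omitted from this excerpt, but the
    # docstring above implies it). The document IDs and mlp_id below are
    # hypothetical placeholders, not values from the real project.
    doc_ids = ["hypothetical_es_id_1", "hypothetical_es_id_2"]
    apply_mlp_on_es_docs.apply_async(args=(doc_ids, 13))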
def get_significant_words(indices: List[str], fields: List[str], document_ids: List[str], stop_words: List = None, exclude=""):
    """
    Helper function to parse all the given fields and use the document_ids as input
    for a significant_words aggregation.

    Args:
        indices: Indices from which to perform the aggregation.
        fields: Fields from which to get the text content needed for comparison.
        document_ids: IDs of the documents to use as the baseline for the aggregation.
        stop_words: Optional parameter to remove stop words from the results.
        exclude: Regex-compatible string for which words to exclude; uses the exclude parameter of Elasticsearch aggregations.

    Returns:
        List of dictionaries with the significant word and how many times it occurs in the documents.
    """
    ed = ElasticDocument("*")
    ea = ElasticAggregator(indices=indices)

    stop_words = StopWords._get_stop_words(custom_stop_words=stop_words)

    # Validate that those documents exist.
    validated_docs: List[dict] = ed.get_bulk(document_ids)
    if validated_docs:
        unique_ids = list(set([doc["_id"] for doc in validated_docs]))

        significant_words = []
        for field in fields:
            sw = ea.get_significant_words(document_ids=unique_ids, field=field, stop_words=stop_words, exclude=exclude)
            significant_words += sw

        return significant_words
    else:
        return []
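
def example_significant_words():
    # Hedged usage sketch for the helper above; the index, field and document IDs
    # are hypothetical placeholders rather than values from the real project.
    words = get_significant_words(
        indices=["hypothetical_index"],
        fields=["comment_content"],           # hypothetical text field
        document_ids=["doc_1", "doc_2"],      # hypothetical baseline documents
        stop_words=["the", "and"],            # optional extra stop words
        exclude="[0-9]+"                      # regex: drop purely numeric tokens
    )
    # Per the docstring, each entry pairs a significant word with its occurrence
    # count; the exact dictionary keys depend on ElasticAggregator's implementation.
    return words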
def retrieve(self, request, *args, **kwargs):
    # API v1 to v2 compliance: accept the parent pk under either URL keyword.
    if "clustering_pk" in self.kwargs:
        topic_analyzer_pk = self.kwargs["clustering_pk"]
    elif "topic_analyzer_pk" in self.kwargs:
        topic_analyzer_pk = self.kwargs["topic_analyzer_pk"]

    queryset = Cluster.objects.filter(
        clusteringresult__project__pk=self.kwargs["project_pk"],
        clusteringresult__pk=topic_analyzer_pk
    )
    cluster = get_object_or_404(queryset, pk=self.kwargs["pk"])

    # Deserialize the JSON-encoded model fields.
    doc_ids = json.loads(cluster.document_ids)
    fields = json.loads(cluster.fields)
    indices = json.loads(cluster.indices)
    significant_words = json.loads(cluster.significant_words)
    display_fields = json.loads(cluster.display_fields)

    if display_fields:
        fields += display_fields

    # Pull the cluster's documents from Elasticsearch.
    ed = ElasticDocument(index=",".join(indices))
    documents = ed.get_bulk(doc_ids, flatten=True)
    documents = documents if documents else []
    documents = [{"id": doc["_id"], "index": doc["_index"], "content": doc["_source"]} for doc in documents]

    formatted_cluster = {
        "id": cluster.pk,
        "intracluster_similarity": cluster.intracluster_similarity,
        "document_count": cluster.get_document_count(),
        "significant_words": significant_words,
        "documents": documents
    }

    return Response(formatted_cluster)
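
# Hedged request sketch for the retrieve endpoint above. The URL pattern is an
# assumption inferred from the kwargs it reads ("project_pk", "topic_analyzer_pk",
# "pk"); the actual routing lives in the project's URL configuration.
#
#     GET /projects/1/topic_analyzer/2/clusters/3/
#
# Response shape (as built above, with hypothetical values):
#     {
#         "id": 3,
#         "intracluster_similarity": 0.42,
#         "document_count": 10,
#         "significant_words": [...],
#         "documents": [{"id": "...", "index": "...", "content": {...}}, ...]
#     }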
def update(self, request, *args, **kwargs):
    serializer = ClusterSerializer(data=request.data, partial=True)
    # Raise a validation error on bad input instead of silently discarding the result.
    serializer.is_valid(raise_exception=True)

    cluster = Cluster.objects.get(pk=kwargs["pk"])
    clustering_object = ClusteringResult.objects.get(pk=kwargs["topic_analyzer_pk"])

    fields = json.loads(cluster.fields)
    stop_words = json.loads(clustering_object.stop_words)
    indices = json.loads(cluster.indices)

    if "document_ids" in serializer.validated_data:
        document_ids = serializer.validated_data["document_ids"]
        ed = ElasticDocument("*")

        # Validate that those documents exist.
        validated_docs = ed.get_bulk(document_ids)
        if validated_docs:
            unique_ids = list(set([doc["_id"] for doc in validated_docs]))
            cluster.document_ids = json.dumps(unique_ids)

            # Recompute the significant words for the new document set.
            sw = Cluster.get_significant_words(indices=indices, fields=fields, document_ids=unique_ids, stop_words=stop_words)
            cluster.significant_words = json.dumps(sw)

            # Recompute the similarity score for the new document set.
            cluster_content = ClusterContent(unique_ids, vectors_filepath=clustering_object.vector_model.name)
            cluster.intracluster_similarity = cluster_content.get_intracluster_similarity()
        else:
            cluster.document_ids = json.dumps([])

    cluster.save()
    return Response({"message": "Cluster has been updated successfully!"})
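
# Hedged request sketch for the update endpoint above; the URL pattern and IDs are
# assumptions inferred from the kwargs. Because the serializer is constructed with
# partial=True, a payload containing only "document_ids" is enough; sending it
# replaces the cluster's documents and recomputes the significant words and the
# intracluster similarity score.
#
#     PUT /projects/1/topic_analyzer/2/clusters/3/
#     {"document_ids": ["es_doc_id_1", "es_doc_id_2"]}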
def add_documents(self, request, *args, **kwargs):
    serializer = ClusteringIdsSerializer(data=request.data)
    serializer.is_valid(raise_exception=True)

    clustering_pk = ClusterViewSet.__handle_clustering_pk(kwargs)
    clustering_obj = ClusteringResult.objects.get(pk=clustering_pk)
    cluster_obj = clustering_obj.cluster_result.get(pk=kwargs["pk"])

    indices = clustering_obj.get_indices()
    stop_words = json.loads(clustering_obj.stop_words)
    fields = json.loads(clustering_obj.fields)

    ed = ElasticDocument(indices)

    # Get the full Elasticsearch documents with their id, index, type and source
    # values, keeping only the IDs that actually exist.
    existing_documents: List[dict] = ed.get_bulk(serializer.validated_data["ids"])
    existing_ids: List[str] = [document["_id"] for document in existing_documents]

    saved_documents = json.loads(cluster_obj.document_ids)
    unique_ids = list(set(existing_ids + saved_documents))
    cluster_obj.document_ids = json.dumps(unique_ids)

    # Fetch the texts of the documents that are new to the cluster.
    new_documents = []
    phraser = None
    new_ids = [doc_id for doc_id in unique_ids if doc_id not in saved_documents]
    if len(new_ids) > 0:
        ignored_ids = json.loads(clustering_obj.ignored_ids)
        document_limit = clustering_obj.document_limit
        query = {"query": {"ids": {"values": new_ids}}}

        text_processor = TextProcessor(remove_stop_words=True, custom_stop_words=stop_words)
        elastic_search = ElasticSearcher(
            indices=indices,
            query=query,
            text_processor=text_processor,
            ignore_ids=set(ignored_ids),
            output=ElasticSearcher.OUT_TEXT_WITH_ID,
            field_data=fields,
            scroll_limit=document_limit
        )
        for doc_id, text in elastic_search:
            new_documents.append({"id": doc_id, "text": text})

        if clustering_obj.embedding:
            embedding = clustering_obj.embedding.get_embedding()
            embedding.load_django(clustering_obj.embedding)
            phraser = embedding.phraser

    # Update the similarity score since the documents were changed.
    cc = ClusterContent(doc_ids=unique_ids, vectors_filepath=clustering_obj.vector_model.path)
    cluster_obj.intracluster_similarity = float(cc.get_intracluster_similarity(new_documents=new_documents, phraser=phraser))

    # Update the significant words since the documents were changed.
    sw = Cluster.get_significant_words(indices=indices, fields=fields, document_ids=unique_ids, stop_words=stop_words)
    cluster_obj.significant_words = json.dumps(sw)

    cluster_obj.save()
    return Response({"message": f"{len(new_ids)} new document(s) successfully added to the cluster!"})
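
# Hedged request sketch for the add_documents action above. The URL is an
# assumption (a custom DRF action route is implied by the method name); the
# payload key "ids" comes from ClusteringIdsSerializer.
#
#     POST /projects/1/topic_analyzer/2/clusters/3/add_documents/
#     {"ids": ["es_doc_id_4", "es_doc_id_5"]}
#
# On success the response echoes the count of newly added documents, e.g.
#     {"message": "2 new document(s) successfully added to the cluster!"}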