Exemplo n.º 1
0
def apply_tagger_to_index(object_id: int, indices: List[str], fields: List[str], fact_name: str, fact_value: str, query: dict, bulk_size: int, max_chunk_bytes: int, es_timeout: int):
    """Apply Torch Tagger to index.

    Scrolls the documents matched by ``query`` and streams the update actions
    produced by ``update_generator`` back to Elasticsearch in bulk.

    Args:
        object_id: Primary key of the TorchTaggerObject to apply.
        indices: Names of the indices to scroll through.
        fields: Document fields handed to the update generator.
        fact_name: Fact name handed to the update generator.
        fact_value: Fact value handed to the update generator.
        query: Elasticsearch query selecting the documents.
        bulk_size: Used both as scroll size and as bulk chunk size.
        max_chunk_bytes: Upper bound of a single bulk request in bytes.
        es_timeout: Timeout in minutes; passed to the searcher as "<n>m".

    Returns:
        True on success; None when the run failed (the error is then stored
        on the task and the task is marked as failed).
    """
    # Looked up outside the try block so that the error handler below can
    # always reach the task object instead of failing on an unbound name.
    tagger_object = TorchTaggerObject.objects.get(pk=object_id)

    try:
        tagger = tagger_object.load_tagger()

        progress = ShowProgress(tagger_object.task)

        ec = ElasticCore()
        for index in indices:
            ec.add_texta_facts_mapping(index)

        searcher = ElasticSearcher(
            indices = indices,
            field_data = fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query = query,
            output = ElasticSearcher.OUT_RAW,
            timeout = f"{es_timeout}m",
            callback_progress=progress,
            scroll_size = bulk_size
        )

        actions = update_generator(generator=searcher, ec=ec, fields=fields, fact_name=fact_name, fact_value=fact_value, tagger_object=tagger_object, tagger=tagger)
        for success, info in streaming_bulk(client=ec.es, actions=actions, refresh="wait_for", chunk_size=bulk_size, max_chunk_bytes=max_chunk_bytes, max_retries=3):
            if not success:
                # No exception is active here, so log a plain error record.
                logging.getLogger(ERROR_LOGGER).error(json.dumps(info))

        tagger_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        tagger_object.task.add_error(error_message)
        tagger_object.task.update_status(Task.STATUS_FAILED)
Exemplo n.º 2
0
    def test_index_processing(self):
        """Post a processing task restricted by a match query and check the matching documents."""
        query_string = "inimene"
        match_query = {'query': {'match': {'comment_content_lemmas': query_string}}}
        payload = {
            "description": "TestingIndexProcessing",
            "fields": [TEST_FIELD],
            "query": json.dumps(match_query, ensure_ascii=False),
        }

        response = self.client.post(self.url, data=payload, format="json")
        print_output("test_index_processing:response.data", response.data)

        # Check if MLP was applied to the documents properly.
        # NOTE(review): the query is handed over as the JSON string from the
        # payload, not as a dict - confirm ElasticSearcher accepts that.
        searcher = ElasticSearcher(
            indices=[self.test_index_name],
            output=ElasticSearcher.OUT_DOC,
            query=payload["query"],
        )
        for document in searcher:
            self._assert_mlp_contents(document, TEST_FIELD)
Exemplo n.º 3
0
    def test_that_split_index_with_nested_field_still_has_nested_field(self):
        """Split the test index and verify the object field is still a dict in the resulting indices."""
        payload = {
            "description": "Random index splitting",
            "indices": [{
                "name": self.test_index_name
            }],
            "train_index": INDEX_SPLITTING_TRAIN_INDEX,
            "test_index": INDEX_SPLITTING_TEST_INDEX,
            "distribution": "random",
            "test_size": 20
        }

        response = self.client.post(self.url, data=payload, format="json")
        print_output(
            'test_that_split_index_with_nested_field_still_has_nested_field:response.data',
            response.data)

        # Scroll both split results (train and test), unflattened so that
        # nested objects keep their dict form.
        searcher = ElasticSearcher(
            indices=[INDEX_SPLITTING_TRAIN_INDEX, INDEX_SPLITTING_TEST_INDEX],
            field_data=[TEST_INDEX_OBJECT_FIELD],
            flatten=False)

        at_least_once = False
        for item in searcher:
            data = item.get(TEST_INDEX_OBJECT_FIELD, None)
            if data:
                self.assertIsInstance(data, dict)
                at_least_once = True
        # Guard against a vacuous pass when no document carried the field.
        self.assertTrue(at_least_once)
Exemplo n.º 4
0
def fact_delete_query_task(self, worker_id: int):
    """Delete the configured facts from every document matched by the worker's query.

    Returns ``worker_id`` on success. On any exception the failure is handed
    to the task object and the exception is re-raised.
    """
    worker_object = DeleteFactsByQueryTask.objects.get(pk=worker_id)

    try:
        progress = ShowProgress(worker_object.task, multiplier=1)
        progress.update_step('Scrolling through the indices to delete the facts.')

        # Read the task parameters stored on the worker object.
        indices: List[str] = worker_object.get_indices()
        target_facts = json.loads(worker_object.facts)
        scroll_size = worker_object.scroll_size
        query = json.loads(worker_object.query)
        scroll_timeout = f"{worker_object.es_timeout}m"

        searcher = ElasticSearcher(
            query=query,
            indices=indices,
            field_data=[TEXTA_TAGS_KEY],
            output=ElasticSearcher.OUT_RAW,
            callback_progress=progress,
            scroll_size=scroll_size,
            scroll_timeout=scroll_timeout,
        )

        # The document wrapper is created without a fixed index.
        delete_actions = query_delete_actions_generator(searcher, target_facts)
        ElasticDocument(index=None).bulk_update(delete_actions)

        worker_object.task.complete()
        worker_object.save()

        return worker_id

    except Exception as e:
        worker_object.task.handle_failed_task(e)
        raise e
Exemplo n.º 5
0
 def _initialize_es(self, project_pk, text_processor, callback_progress,
                    prediction_to_match):
     """Create the feedback document wrapper, the feedback searcher and the matching query.

     Returns a ``(es_doc, es_search, query)`` tuple:
       * ``(es_doc, None, None)`` when there is no model object,
       * ``(es_doc, None, query)`` when the feedback index does not exist,
       * ``(es_doc, es_search, query)`` otherwise.

     ``project_pk`` is accepted but not used in this method.
     """
     # create es doc
     es_doc = ElasticDocument(self.feedback_index)
     # if no model objects, return nones for query and search
     if not self.model_object:
         return es_doc, None, None
     # create matching query; the model object is known to be set from here on
     query = Query()
     query.add_string_filter(query_string=self.model_object.MODEL_TYPE,
                             fields=["model_type"])
     query.add_string_filter(query_string=str(self.model_object.pk),
                             fields=["model_id"])
     if prediction_to_match:
         query.add_string_filter(query_string=prediction_to_match,
                                 fields=["correct_result"])
     # if no index, don't create searcher object
     if not self.check_index_exists():
         return es_doc, None, query.query
     # create es search
     es_search = ElasticSearcher(indices=self.feedback_index,
                                 query=query.query,
                                 text_processor=text_processor,
                                 output=ElasticSearcher.OUT_DOC_WITH_ID,
                                 callback_progress=callback_progress)
     # return objects
     return es_doc, es_search, query.query
Exemplo n.º 6
0
    def tag_random_doc(self, request, pk=None, project_pk=None):
        """Returns prediction for a random document in Elasticsearch.

        Raises NonExistantModelError when the tagger has no model path and
        answers with HTTP 400 when the resolved indices do not all exist.
        Otherwise responds with the random document and the tagger output for
        the subset of its fields the tagger is configured with.
        """
        # get tagger object
        tagger_object = self.get_object()

        # check if tagger exists
        if not tagger_object.model.path:
            raise NonExistantModelError()

        serializer = TagRandomDocSerializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        indices = [
            index["name"] for index in serializer.validated_data["indices"]
        ]
        indices = tagger_object.get_available_or_all_indices(indices)

        # retrieve tagger fields
        tagger_fields = json.loads(tagger_object.fields)
        if not ElasticCore().check_if_indices_exist(indices):
            return Response(
                {
                    'error':
                    f'One or more index from {list(indices)} do not exist'
                },
                status=status.HTTP_400_BAD_REQUEST)

        # retrieve random document
        random_doc = ElasticSearcher(indices=indices).random_documents(
            size=1)[0]

        # filter out correct fields from the document
        random_doc_filtered = {
            k: v
            for k, v in random_doc.items() if k in tagger_fields
        }

        # apply tagger
        tagger_response = apply_tagger(tagger_object.id,
                                       random_doc_filtered,
                                       input_type='doc')
        response = {"document": random_doc, "prediction": tagger_response}
        return Response(response, status=status.HTTP_200_OK)
Exemplo n.º 7
0
 def _get_split_documents_by_id(self, id_field, id_value, text_field):
     """Return every raw hit whose ``<id_field>.keyword`` term equals ``id_value``.

     Only ``id_field`` and ``text_field`` are requested from the searcher.
     """
     query = Search().query(
         Q("term", **{f"{id_field}.keyword": id_value})
     ).to_dict()
     searcher = ElasticSearcher(
         query=query,
         field_data=[id_field, text_field],
         output=ElasticSearcher.OUT_RAW,
     )
     # Each iteration step yields an iterable of documents; flatten them.
     return [document for batch in searcher for document in batch]
Exemplo n.º 8
0
    def tag_random_doc(self, request, pk=None, project_pk=None):
        """Returns prediction for a random document in Elasticsearch.

        The random document is flattened, each requested field is matched
        against the regex tagger group, and all matches are returned together
        with the document.
        """
        group: RegexTaggerGroup = self.get_object()

        serializer = TagRandomDocSerializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        project_object = Project.objects.get(pk=project_pk)
        requested_indices = [
            entry["name"] for entry in serializer.validated_data["indices"]
        ]
        indices = project_object.get_available_or_all_project_indices(
            requested_indices)

        # Fields to run the tagger group on.
        fields = serializer.validated_data["fields"]

        # NOTE(review): existence is checked for all indices of the tagger's
        # project, not for the `indices` resolved above - confirm intended.
        indices_exist = ElasticCore().check_if_indices_exist(
            group.project.get_indices())
        if not indices_exist:
            error_body = {
                'error':
                f'One or more index from {list(group.project.get_indices())} do not exist'
            }
            return Response(error_body, status=status.HTTP_400_BAD_REQUEST)

        # Pick one random document and flatten it.
        random_doc = ElasticSearcher(indices=indices).random_documents(
            size=1)[0]
        flattened_doc = ElasticCore(check_connection=False).flatten(random_doc)

        results = {
            "tagger_group_id": group.pk,
            "tagger_group_tag": group.description,
            "result": False,
            "matches": [],
            "document": flattened_doc
        }

        collected = []
        for field in fields:
            text = flattened_doc.get(field, None)
            # The returned document always carries the requested field, even
            # when it was missing (value None).
            results["document"][field] = text
            matches = group.match_texts([text],
                                        as_texta_facts=True,
                                        field=field)
            if not matches:
                continue
            collected.extend(matches)
            results["result"] = True

        results["matches"] = collected

        return Response(results, status=status.HTTP_200_OK)
Exemplo n.º 9
0
    def post(self, request, project_pk: int):
        """Simplified search interface for making Elasticsearch queries.

        Validates the request, checks the requested indices and fields against
        the project, builds a string-filter query from ``match_text`` and
        returns the search results.
        """
        serializer = ProjectSimplifiedSearchSerializer(data=request.data)
        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)

        project_object = get_object_or_404(Project, pk=project_pk)
        self.check_object_permissions(request, project_object)
        project_indices = list(project_object.get_indices())
        project_fields = project_object.get_elastic_fields(path_list=True)
        params = serializer.validated_data

        # The project must have at least one index.
        if not project_indices:
            raise ProjectValidationFailed(detail="Project has no indices")

        # Requested indices must all belong to the project.
        requested_indices = params['match_indices']
        if requested_indices and not set(requested_indices).issubset(set(project_indices)):
            raise ProjectValidationFailed(detail=f"Index names are not valid for this project. allowed values are: {project_indices}")

        # Requested fields must all belong to the project.
        requested_fields = params['match_fields']
        if requested_fields and not set(requested_fields).issubset(set(project_fields)):
            raise ProjectValidationFailed(detail=f"Fields names are not valid for this project. allowed values are: {project_fields}")

        es = ElasticSearcher(indices=project_indices, output=ElasticSearcher.OUT_DOC)
        q = Query(operator=params['operator'])

        # Normalise the match text to a list of strings; reject anything else.
        match_text = params['match_text']
        if isinstance(match_text, str):
            match_texts = [match_text]
        elif isinstance(match_text, list):
            match_texts = [str(item) for item in match_text if item]
        else:
            return Response({'error': f'match text is in unknown format: {match_text}'}, status=status.HTTP_400_BAD_REQUEST)

        # One string filter per match text.
        for item in match_texts:
            q.add_string_filter(item, match_type=params["match_type"])

        es.update_query(q.query)
        results = es.search(size=params["size"])
        return Response(results, status=status.HTTP_200_OK)
Exemplo n.º 10
0
    def test_create_splitter_object_and_task_signal(self):
        """Create an IndexSplitter through the API and check that its task exists and is completed."""
        payload = {
            "description": "Random index splitting",
            "indices": [{"name": self.test_index_name}],
            "train_index": INDEX_SPLITTING_TRAIN_INDEX,
            "test_index": INDEX_SPLITTING_TEST_INDEX,
            "distribution": "random",
            "test_size": 20,
        }

        body = json.dumps(payload)
        response = self.client.post(self.url, body, content_type='application/json')

        print_output('test_create_splitter_object_and_task_signal:response.data', response.data)

        splitter_obj = IndexSplitter.objects.get(id=response.data['id'])
        print_output("indices:", splitter_obj.get_indices())

        # The IndexSplitter object was created ...
        self.assertEqual(response.status_code, status.HTTP_201_CREATED)
        # ... it carries a Task ...
        self.assertIsNotNone(splitter_obj.task)
        print_output("status of IndexSplitter's Task object", splitter_obj.task.status)
        # ... and that Task has completed.
        self.assertEqual(splitter_obj.task.status, Task.STATUS_COMPLETED)

        sleep(5)

        # Document counts are only printed, not asserted, in this test.
        original_count, test_count, train_count = [
            ElasticSearcher(indices=index_name).count()
            for index_name in (self.test_index_name,
                               INDEX_SPLITTING_TEST_INDEX,
                               INDEX_SPLITTING_TRAIN_INDEX)
        ]

        print_output('original_count, test_count, train_count',
                     [original_count, test_count, train_count])
Exemplo n.º 11
0
class Autocomplete:
    """Prefix suggestions for fact names and fact values, based on nested ``texta_facts`` aggregations."""

    def __init__(self, project, indices, limit = 10):
        self.project = project
        self.limit = limit
        self.es = ElasticSearcher(output=ElasticSearcher.OUT_RAW, indices=indices)

    def _run_aggregation(self, query):
        """Install ``query`` on the searcher and return the search response."""
        self.es.update_query(query)
        return self.es.search()

    def get_fact_names(self, startswith):
        """Return up to ``self.limit`` fact names matching the ``<startswith>.*`` include pattern."""
        names_terms = {
            "field": "texta_facts.fact",
            "size": self.limit,
            "include": f"{startswith}.*",
        }
        query = {
            "aggs": {
                'fact': {
                    "nested": {"path": "texta_facts"},
                    "aggs": {'fact': {"terms": names_terms}},
                }
            }
        }

        response = self._run_aggregation(query)
        buckets = response['aggregations']['fact']['fact']['buckets']
        return [bucket['key'] for bucket in buckets]

    def get_fact_values(self, startswith, fact_name):
        """Return the values of ``fact_name`` matching the ``<startswith>.*`` include pattern.

        The aggregation groups by fact name first; only the bucket(s) whose
        key equals ``fact_name`` contribute values.
        """
        values_terms = {
            "field": "texta_facts.str_val",
            "size": self.limit,
            "include": f"{startswith}.*",
        }
        query = {
            "aggs": {
                'str_val': {
                    "nested": {"path": "texta_facts"},
                    "aggs": {
                        'str_val': {
                            "terms": {"field": "texta_facts.fact"},
                            "aggs": {"fact_values": {"terms": values_terms}},
                        }
                    },
                }
            }
        }

        response = self._run_aggregation(query)

        values = []
        for bucket in response['aggregations']['str_val']['str_val']['buckets']:
            if bucket['key'] != fact_name:
                continue
            values.extend(sub_bucket['key'] for sub_bucket in bucket['fact_values']['buckets'])
        return values

    def get_lexicons(self, startswith):
        # TODO: not implemented yet.
        return None
Exemplo n.º 12
0
def __add_meta_to_original_index(indices: List[str], index_fields: List[str], show_progress: ShowProgress, query: dict, scroll_size: int, elastic_wrapper: ElasticCore):
    """Scroll the matched documents and bulk-send the actions produced by ``add_doc_uuid``.

    Args:
        indices: Names of the indices to scroll through.
        index_fields: Fields requested from the searcher.
        show_progress: Progress callback handed to the searcher.
        query: Elasticsearch query selecting the documents.
        scroll_size: Used both as scroll size and as bulk chunk size.
        elastic_wrapper: Wrapper whose ``es`` client performs the bulk requests.
    """
    index_elastic_search = ElasticSearcher(
        indices=indices,
        field_data=index_fields,
        callback_progress=show_progress,
        query=query,
        output=ElasticSearcher.OUT_RAW,
        scroll_size=scroll_size
    )
    index_actions = add_doc_uuid(generator=index_elastic_search)
    for success, info in streaming_bulk(client=elastic_wrapper.es, actions=index_actions, refresh="wait_for", chunk_size=scroll_size, max_retries=3):
        if not success:
            # No exception is active here, so log a plain error record
            # instead of calling Logger.exception.
            logging.getLogger(ERROR_LOGGER).error(json.dumps(info))
Exemplo n.º 13
0
def apply_search_fields_tagger_on_index(object_id: int):
    """Apply Search Fields Tagger to index.

    Scrolls the documents matched by the tagger's stored query and bulk-sends
    the actions produced by ``update_search_fields_generator``.

    Returns ``object_id`` on success. On any exception the error is logged,
    stored on the task, the task is marked as failed and the exception is
    re-raised.

    NOTE(review): the task is not marked as completed here - presumably a
    follow-up step does that; confirm against the caller.
    """
    search_fields_tagger = SearchFieldsTagger.objects.get(pk=object_id)
    task_object = search_fields_tagger.task
    try:
        progress = ShowProgress(task_object)
        progress.update_step('scrolling search fields')

        # Get the necessary fields.
        indices: List[str] = search_fields_tagger.get_indices()
        fields: List[str] = json.loads(search_fields_tagger.fields)
        fact_name: str = search_fields_tagger.fact_name
        scroll_timeout = search_fields_tagger.es_timeout
        scroll_size = search_fields_tagger.bulk_size

        use_breakup = search_fields_tagger.use_breakup
        breakup_character = search_fields_tagger.breakup_character

        ec = ElasticCore()
        for index in indices:
            ec.add_texta_facts_mapping(index)

        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields +
            ["texta_facts"],  # Get facts to add upon existing ones.
            query=json.loads(search_fields_tagger.query),
            output=ElasticSearcher.OUT_RAW,
            scroll_timeout=f"{scroll_timeout}m",
            callback_progress=progress,
            scroll_size=scroll_size)

        actions = update_search_fields_generator(
            generator=searcher,
            ec=ec,
            fields=fields,
            fact_name=fact_name,
            search_field_tagger_object=search_fields_tagger,
            use_breakup=use_breakup,
            breakup_character=breakup_character)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        ed.bulk_update(actions=actions)
        return object_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise
Exemplo n.º 14
0
def apply_rakun_extractor_to_index(self, object_id: int, indices: List[str],
                                   fields: List[str], query: dict,
                                   es_timeout: int, bulk_size: int,
                                   fact_name: str, add_spans: bool):
    """Apply Rakun Keyword Extractor to index.

    Returns True on success. On failure the exception is logged, a shortened
    message is stored on the task, the task is marked as failed and None is
    returned (the exception is not re-raised).
    """
    logging.getLogger(INFO_LOGGER).info(
        f"Starting task 'apply_rakun_extractor_to_index' with ID: {object_id}!"
    )
    rakun_extractor_object = RakunExtractor.objects.get(id=object_id)
    try:
        progress = ShowProgress(rakun_extractor_object.task)

        # Make sure every target index has the facts mapping.
        ec = ElasticCore()
        for index_name in indices:
            ec.add_texta_facts_mapping(index_name)

        # Existing facts are fetched as well so new ones can be added upon them.
        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields + ["texta_facts"],
            query=query,
            timeout=f"{es_timeout}m",
            output=ElasticSearcher.OUT_RAW,
            callback_progress=progress,
            scroll_size=bulk_size,
        )
        keyword_detector = rakun_extractor_object.load_rakun_keyword_detector()
        actions = update_generator(
            keyword_detector=keyword_detector,
            generator=searcher,
            ec=ec,
            fields=fields,
            rakun_extractor_object=rakun_extractor_object,
            fact_name=fact_name,
            fact_value="",
            add_spans=add_spans,
        )

        # Send the data towards Elasticsearch
        ElasticDocument("_all").bulk_update(actions=actions)

        rakun_extractor_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        # Take first 100 characters in case the error message is massive.
        error_message = f"{str(e)[:100]}..."
        rakun_extractor_object.task.add_error(error_message)
        rakun_extractor_object.task.update_status(Task.STATUS_FAILED)
Exemplo n.º 15
0
    def test_create_random_split(self):
        """Split the test index randomly (test_size 20) and check task status and resulting index sizes."""
        payload = {
            "description": "Random index splitting",
            "indices": [{
                "name": self.test_index_name
            }],
            "train_index": INDEX_SPLITTING_TRAIN_INDEX,
            "test_index": INDEX_SPLITTING_TEST_INDEX,
            "distribution": "random",
            "test_size": 20
        }

        response = self.client.post(self.url, data=payload)
        print_output('test_create_random_split:response.data', response.data)

        splitter_obj = IndexSplitter.objects.get(id=response.data['id'])

        # Assert Task gets completed: compare the status of the created
        # splitter's task, not the constant with itself.
        self.assertEqual(splitter_obj.task.status, Task.STATUS_COMPLETED)
        print_output("Task status", splitter_obj.task.status)

        sleep(5)

        original_count = ElasticSearcher(indices=self.test_index_name).count()
        test_count = ElasticSearcher(
            indices=INDEX_SPLITTING_TEST_INDEX).count()
        train_count = ElasticSearcher(
            indices=INDEX_SPLITTING_TRAIN_INDEX).count()

        print_output('original_count, test_count, train_count',
                     [original_count, test_count, train_count])
        # To avoid any inconsistencies caused by rounding assume sizes are between small limits
        self.assertTrue(self.is_between_limits(test_count, original_count,
                                               0.2))
        self.assertTrue(
            self.is_between_limits(train_count, original_count, 0.8))
Exemplo n.º 16
0
 def test_applying_lang_detect_with_query(self):
     """Apply language detection restricted by a query and expect "et" on every matching document that has the field."""
     query_string = "inimene"
     match_query = {'query': {'match': {'comment_content_lemmas': query_string}}}
     payload = {
         "description": "TestingIndexProcessing",
         "field": TEST_FIELD,
         "query": json.dumps(match_query, ensure_ascii=False),
     }

     response = self.client.post(self.url, data=payload, format="json")
     print_output("test_applying_lang_detect_with_query:response.data", response.data)
     self.assertEqual(response.status_code, status.HTTP_201_CREATED)

     # Key under which the detected language is expected in a hit.
     detected_key = f"{TEST_FIELD}_mlp.language.detected"
     searcher = ElasticSearcher(
         indices=[self.test_index_name],
         output=ElasticSearcher.OUT_DOC,
         query=json.loads(payload["query"]),
     )
     for hit in searcher:
         # Documents without the source field are skipped.
         if TEST_FIELD not in hit:
             continue
         self.assertIn(detected_key, hit)
         self.assertEqual(hit[detected_key], "et")
Exemplo n.º 17
0
def apply_crf_extractor_to_index(object_id: int, indices: List[str],
                                 mlp_fields: List[str], label_suffix: str,
                                 query: dict, bulk_size: int,
                                 max_chunk_bytes: int, es_timeout: int):
    """
    Applies Extractor to ES index.

    Args:
        object_id: Primary key of the CRFExtractorObject to apply.
        indices: Names of the indices to scroll through.
        mlp_fields: Fields requested from the searcher and handed to the update generator.
        label_suffix: Handed to the update generator.
        query: Elasticsearch query selecting the documents.
        bulk_size: Scroll size of the searcher.
        max_chunk_bytes: Accepted but not used in this function.
        es_timeout: Timeout in minutes; passed to the searcher as "<n>m".

    Returns:
        True when the run finished.

    Raises:
        Re-raises any exception after handing it to the task's failure handler.
    """
    # Looked up outside the try block so that the failure handler below can
    # always reach the task instead of failing on an unbound name.
    crf_object = CRFExtractorObject.objects.get(pk=object_id)
    try:
        # load model
        extractor = crf_object.load_extractor()
        # progress
        progress = ShowProgress(crf_object.task)
        # add fact field if missing
        ec = ElasticCore()
        for index in indices:
            ec.add_texta_facts_mapping(index)
        # search
        searcher = ElasticSearcher(
            indices=indices,
            field_data=mlp_fields +
            ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            output=ElasticSearcher.OUT_RAW,
            timeout=f"{es_timeout}m",
            callback_progress=progress,
            scroll_size=bulk_size)
        # create update actions
        actions = update_generator(generator=searcher,
                                   ec=ec,
                                   mlp_fields=mlp_fields,
                                   label_suffix=label_suffix,
                                   object_id=object_id,
                                   extractor=extractor)
        # perform updates
        try:
            # as we have defined indices in actions there is no need to do it again (None)
            ElasticDocument(None).bulk_update(actions)
        except Exception as e:
            # NOTE(review): bulk failures are only logged and the task is
            # still marked complete below - confirm this best-effort is intended.
            logging.getLogger(ERROR_LOGGER).exception(e)
        # all done
        crf_object.task.complete()
        return True

    except Exception as e:
        crf_object.task.handle_failed_task(e)
        raise
Exemplo n.º 18
0
    def extract_from_random_doc(self, request, pk=None, project_pk=None):
        """Returns prediction for a random document in Elasticsearch.

        The random document is flattened and the Rakun keyword detector is run
        on each requested field; all keywords are returned with the document.
        """
        rakun_object: RakunExtractor = RakunExtractor.objects.get(pk=pk)

        serializer = RakunExtractorRandomDocSerializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        project_object = Project.objects.get(pk=project_pk)
        requested_indices = [entry["name"] for entry in serializer.validated_data["indices"]]
        indices = project_object.get_available_or_all_project_indices(requested_indices)

        # Request parameters: fields to process and whether to add spans.
        fields = serializer.validated_data["fields"]
        add_spans = serializer.validated_data["add_spans"]

        # Pick one random document and flatten it.
        random_doc = ElasticSearcher(indices=indices).random_documents(size=1)[0]
        flattened_doc = ElasticCore(check_connection=False).flatten(random_doc)

        results = {
            "rakun_id": rakun_object.pk,
            "description": rakun_object.description,
            "result": False,
            "keywords": [],
            "document": flattened_doc
        }

        collected_keywords = []
        keyword_detector = rakun_object.load_rakun_keyword_detector()
        for field in fields:
            # Missing fields are treated as empty text and written back to the document.
            text = flattened_doc.get(field, "")
            results["document"][field] = text
            keywords = rakun_object.get_rakun_keywords(
                keyword_detector=keyword_detector,
                texts=[text],
                field_path=field,
                fact_name=rakun_object.description,
                fact_value="",
                add_spans=add_spans,
            )
            if not keywords:
                continue
            collected_keywords.extend(keywords)
            results["result"] = True

        results["keywords"] = collected_keywords
        return Response(results, status=status.HTTP_200_OK)
Exemplo n.º 19
0
 def test_processing_with_just_tokenizer(self):
     """Run only the tokenizer analyzer and expect a differing ``_es.tokenized_text`` field on every queried hit."""
     payload = {
         "description": "hello there, kenobi.",
         "fields": [TEST_FIELD],
         "analyzers": ["tokenizer"],
         "indices": [{"name": self.test_index_name}],
         "query": json.dumps(TEST_QUERY, ensure_ascii=False),
     }

     response = self.client.post(self.list_url, data=payload, format="json")
     print_output("test_processing_with_just_tokenizer:response.data", response.data)
     self.assertEqual(response.status_code, status.HTTP_201_CREATED)

     # Name of the field the tokenizer output is expected under.
     tokenized_field = f'{TEST_FIELD}_es.tokenized_text'
     searcher = ElasticSearcher(indices=[self.test_index_name], query=TEST_QUERY)
     for hit in searcher:
         self.assertIn(tokenized_field, hit)
         self.assertNotEqual(hit[tokenized_field], hit[TEST_FIELD])
Exemplo n.º 20
0
 def _get_negatives(self, size):
     """Scroll a negative sample of at most ``int(size * negative_multiplier)`` documents.

     Documents whose ids are in ``self.ignore_ids`` are passed to the searcher
     to be ignored. When ``self.join_fields`` is set, the sample is passed
     through ``self._join_fields`` before it is returned.
     """
     self.show_progress.update_step("scrolling negative sample")
     self.show_progress.update_view(0)

     # Searcher over the negative examples.
     searcher = ElasticSearcher(
         indices=self.indices,
         field_data=self.field_data,
         output=ElasticSearcher.OUT_DOC,
         callback_progress=self.show_progress,
         text_processor=self.text_processor,
         scroll_limit=int(size * float(self.tagger_object.negative_multiplier)),
         ignore_ids=self.ignore_ids,
     )

     # Exhaust the iterator into a list.
     sample = list(searcher)
     if not self.join_fields:
         return sample
     # Join the document fields if asked.
     return self._join_fields(sample)
Exemplo n.º 21
0
 def test_normal_process_application(self):
     """Run the stemmer analyzer and expect a differing ``_es.stems`` field on the first hit."""
     payload = {
         "description": "hello there, kenobi.",
         "analyzers": ["stemmer"],
         "fields": [TEST_FIELD],
         "stemmer_lang": "estonian",
         "indices": [{"name": self.test_index_name}],
     }

     response = self.client.post(self.list_url, data=payload, format="json")
     print_output("test_normal_process_application:response.data", response.data)
     self.assertEqual(response.status_code, status.HTTP_201_CREATED)

     # Name of the field the stemmer output is expected under.
     stems_field = f'{TEST_FIELD}_es.stems'
     searcher = ElasticSearcher(indices=[self.test_index_name])
     for hit in searcher:
         self.assertIn(stems_field, hit)
         self.assertNotEqual(hit[stems_field], hit[TEST_FIELD])
         # Only the first hit is inspected.
         break
Exemplo n.º 22
0
def apply_lang_on_indices(self, apply_worker_id: int):
    """Scroll the worker's indices and bulk-send the actions produced by ``process_lang_actions``.

    Returns ``apply_worker_id`` on success. On any exception the failure is
    handed to the task object and the exception is re-raised.
    """
    worker_object = ApplyLangWorker.objects.get(pk=apply_worker_id)
    task_object = worker_object.task
    try:
        load_mlp()
        progress = ShowProgress(task_object, multiplier=1)
        progress.update_step('scrolling through the indices to apply lang')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        field = worker_object.field

        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=indices,
            field_data=[field],
            output=ElasticSearcher.OUT_RAW,
            callback_progress=progress,
            scroll_size=100,
            scroll_timeout="15m",
        )

        # Add the facts mapping to every target index.
        for index_name in indices:
            searcher.core.add_texta_facts_mapping(index=index_name)

        actions = process_lang_actions(
            generator=searcher,
            field=field,
            worker_id=apply_worker_id,
            mlp_class=mlp,
        )

        # Send the data towards Elasticsearch
        ElasticDocument("_all").bulk_update(actions=actions)

        worker_object.task.complete()

        return apply_worker_id

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
Exemplo n.º 23
0
    def _get_class_sample(self, query, class_name):
        """Returns sample for given class

        The scroll limit is the tagger's maximum sample size, reduced by the
        number of feedback entries for ``class_name``. The ids of collected
        documents are added to ``self.ignore_ids`` and removed from the
        documents themselves.
        """
        # Limit the docs according to max sample size & feedback size.
        limit = int(self.tagger_object.maximum_sample_size)
        if class_name in self.feedback:
            limit -= len(self.feedback[class_name])

        info_logger = logging.getLogger(INFO_LOGGER)
        info_logger.info(
            f"Collecting examples for class {self.class_display_name} (max limit = {limit})..."
        )

        # Searcher retrieving the positive sample by query.
        searcher = ElasticSearcher(
            query=query,
            indices=self.indices,
            field_data=self.field_data,
            output=ElasticSearcher.OUT_DOC_WITH_ID,
            callback_progress=self.show_progress,
            scroll_limit=limit,
            text_processor=self.text_processor,
        )

        sample = []
        for doc in searcher:
            # Remember the id so it is ignored while scrolling for negatives,
            # and strip it from the document.
            self.ignore_ids.add(doc.pop("_id"))
            sample.append(doc)

        logging.getLogger(INFO_LOGGER).info(
            f"Found {len(sample)} examples for {self.class_display_name}..."
        )

        # If class balancing is enabled, modify number of required samples
        if self.balance:
            sample = self._duplicate_examples(sample, class_name, limit)

        # Join the document fields if asked.
        if self.join_fields:
            sample = self._join_fields(sample)
        return sample
Exemplo n.º 24
0
def apply_summarizer_on_index(self, summarizer_id: int):
    """
    Summarize the documents selected by a Summarizer object and bulk-update them in Elasticsearch.

    The task is not marked as completed here; on success only the id is returned.

    :param self: First positional argument, unused in the body (presumably the bound task instance - TODO confirm).
    :param summarizer_id: Primary key of the Summarizer object describing the job.
    :return: The ``summarizer_id`` that was passed in.
    :raises Exception: Any error is logged, stored on the task, the task is marked as failed
        and the error is re-raised.
    """
    summarizer_object = Summarizer.objects.get(pk=summarizer_id)
    task_object = summarizer_object.task
    try:
        load_sumy()
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling summarizer')

        # Get the necessary fields.
        indices: List[str] = summarizer_object.get_indices()
        field_data: List[str] = json.loads(summarizer_object.fields)
        ratio_data: float = summarizer_object.ratio
        algorithm_data: List[str] = summarizer_object.algorithm

        scroll_size = 100
        searcher = ElasticSearcher(query=json.loads(summarizer_object.query),
                                   indices=indices,
                                   field_data=field_data,
                                   output=ElasticSearcher.OUT_RAW,
                                   callback_progress=show_progress,
                                   scroll_size=scroll_size,
                                   scroll_timeout="30m")

        actions = process_actions(searcher,
                                  field_data,
                                  ratio_data,
                                  algorithm=algorithm_data,
                                  summarizer_class=sumy,
                                  summarizer_id=summarizer_id)

        # Send the data towards Elasticsearch; the response is not needed here.
        ed = ElasticDocument("_all")
        ed.bulk_update(actions=actions)
        return summarizer_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        # Bare raise keeps the original traceback intact.
        raise
Exemplo n.º 25
0
    def test_that_lang_detect_enters_nan_token_on_bogus_fields(self):
        """
        Index a document holding a purely numeric value, run the endpoint on it and
        check that the detected language of the first hit equals NAN_LANGUAGE_TOKEN_KEY.
        """
        # Set up the index with the target document that ensures NAN response.
        ec = ElasticCore()
        query_string = 159784984949
        document_id = "test_that_lang_detect_enters_nan_token_on_bogus_fields"
        ec.es.index(index=self.test_index_name, id=document_id, body={TEST_FIELD: query_string}, refresh="wait_for")

        try:
            payload = {
                "description": "TestingIndexProcessing",
                "field": TEST_FIELD,
                "query": json.dumps({'query': {'match': {TEST_FIELD: query_string}}}, ensure_ascii=False)
            }
            response = self.client.post(self.url, data=payload, format="json")
            print_output("test_that_lang_detect_enters_nan_token_on_bogus_fields:response.data", response.data)
            self.assertEqual(response.status_code, status.HTTP_201_CREATED)

            s = ElasticSearcher(indices=[self.test_index_name], output=ElasticSearcher.OUT_DOC, query=json.loads(payload["query"]))
            # Only the first hit is inspected.
            for hit in s:
                self.assertEqual(hit[f"{TEST_FIELD}_mlp.language.detected"], NAN_LANGUAGE_TOKEN_KEY)
                break
        finally:
            # Clean up the document from the index, even when an assertion above fails,
            # so a failing run does not leave the fixture behind.
            ec.es.delete(index=self.test_index_name, id=document_id, refresh="wait_for")
Exemplo n.º 26
0
    def test_automatic_lang_detection_process(self):
        """
        Post a job with ``detect_lang`` enabled and verify on the first document of the
        test index that the tokenized and stemmed fields exist and differ from the source field.
        """
        request_body = {
            "description": "hello there, kenobi.",
            "fields": [TEST_FIELD],
            "analyzers": ["stemmer", "tokenizer"],
            "detect_lang": True,
            "indices": [{"name": self.test_index_name}],
        }
        response = self.client.post(self.list_url, data=request_body, format="json")
        print_output("test_automatic_lang_detection_process:response.data", response.data)
        self.assertTrue(response.status_code == status.HTTP_201_CREATED)

        # Output fields expected next to the source field.
        expected_fields = (f'{TEST_FIELD}_es.tokenized_text', f'{TEST_FIELD}_es.stems')

        # Only the first hit is inspected.
        for hit in ElasticSearcher(indices=[self.test_index_name]):
            self.assertTrue(all(name in hit for name in expected_fields))
            self.assertTrue(all(hit[name] != hit[TEST_FIELD] for name in expected_fields))
            break
Exemplo n.º 27
0
    def post(self, request, project_pk: int):
        """Executes **raw** Elasticsearch query on all project indices."""
        # Resolve the project (404 when missing) and enforce object-level permissions.
        project = get_object_or_404(Project, pk=project_pk)
        self.check_object_permissions(request, project)

        serializer = ProjectSearchByQuerySerializer(data=request.data)
        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)
        params = serializer.validated_data

        indices = project.get_available_or_all_project_indices(params["indices"])
        if not indices:
            raise ProjectValidationFailed(detail="No indices supplied and project has no indices")

        # Fall back to the combined docs/total/highlight/aggregations output when no type was requested.
        output_type = params["output_type"] or ElasticSearcher.OUT_DOC_WITH_TOTAL_HL_AGGS
        searcher = ElasticSearcher(indices=indices, output=output_type)
        searcher.update_query(params["query"])

        return Response(searcher.search(), status=status.HTTP_200_OK)
Exemplo n.º 28
0
def evaluate_tags_task(object_id: int,
                       indices: List[str],
                       query: dict,
                       es_timeout: int = 10,
                       scroll_size: int = 100):
    """
    Score predicted facts against true facts for an Evaluator object and store the results on it.

    Runs a binary evaluation when both a true and a predicted fact value are set on the
    Evaluator, otherwise a multilabel evaluation over all values of the two facts.
    Scores, the confusion matrix, its plot and individual results are saved on the Evaluator.

    :param object_id: Primary key of the Evaluator object.
    :param indices: Indices handed to the searcher and the aggregator.
    :param query: Elasticsearch query selecting the documents to evaluate.
    :param es_timeout: Searcher timeout; passed on as ``"<es_timeout>m"``.
    :param scroll_size: Scroll size of the searcher; also used to compute the batch count.
    :return: True on success. When an error is handled (logged and stored on the task,
        task marked as failed) the function returns None.
    :raises Exception: Re-raised only when the Evaluator object itself could not be loaded,
        because there is then no task to record the failure on.
    """
    # Bound before the ``try`` so the error handler can tell whether the lookup succeeded.
    evaluator_object = None
    try:
        logging.getLogger(INFO_LOGGER).info(
            f"Starting evaluator task for Evaluator with ID {object_id}.")

        evaluator_object = Evaluator.objects.get(pk=object_id)
        progress = ShowProgress(evaluator_object.task, multiplier=1)

        # Retrieve facts and sklearn average function from the model
        true_fact = evaluator_object.true_fact
        pred_fact = evaluator_object.predicted_fact
        true_fact_value = evaluator_object.true_fact_value
        pred_fact_value = evaluator_object.predicted_fact_value

        average = evaluator_object.average_function
        add_individual_results = evaluator_object.add_individual_results

        searcher = ElasticSearcher(indices=indices,
                                   field_data=["texta_facts"],
                                   query=query,
                                   output=ElasticSearcher.OUT_RAW,
                                   timeout=f"{es_timeout}m",
                                   callback_progress=progress,
                                   scroll_size=scroll_size)

        # Binary
        if true_fact_value and pred_fact_value:
            logging.getLogger(INFO_LOGGER).info(
                f"Starting binary evaluation. Comparing following fact and fact value pairs: TRUE: ({true_fact}: {true_fact_value}), PREDICTED: ({pred_fact}: {pred_fact_value})."
            )

            # Set the evaluation type in the model
            evaluator_object.evaluation_type = "binary"

            true_set = {true_fact_value, "other"}
            pred_set = {pred_fact_value, "other"}

            classes = ["other", true_fact_value]
            n_total_classes = len(classes)

        # Multilabel/multiclass
        else:
            logging.getLogger(INFO_LOGGER).info(
                f"Starting multilabel evaluation. Comparing facts TRUE: '{true_fact}', PRED: '{pred_fact}'."
            )

            # Make deepcopy of the query to avoid modifying Searcher's query.
            es_aggregator = ElasticAggregator(indices=indices,
                                              query=deepcopy(query))

            # Get all fact values corresponding to true and predicted facts to construct total set of labels
            # needed for confusion matrix, individual score calculations and memory imprint calculations
            true_fact_values = es_aggregator.facts(
                size=choices.DEFAULT_MAX_AGGREGATION_SIZE,
                filter_by_fact_name=true_fact)
            pred_fact_values = es_aggregator.facts(
                size=choices.DEFAULT_MAX_AGGREGATION_SIZE,
                filter_by_fact_name=pred_fact)

            true_set = set(true_fact_values)
            pred_set = set(pred_fact_values)

            classes = list(true_set.union(pred_set))
            # The total is taken before the dummy labels below are appended.
            n_total_classes = len(classes)

            # Add dummy classes for missing labels
            classes.extend(
                [choices.MISSING_TRUE_LABEL, choices.MISSING_PRED_LABEL])

            # Set the evaluation type in the model
            evaluator_object.evaluation_type = "multilabel"

            # NOTE(review): the key is the lower-cased first element of each label, so string
            # labels are ordered by their first character only - confirm this is intended.
            classes.sort(key=lambda x: x[0].lower())

        # Get number of documents in the query to estimate memory imprint
        n_docs = searcher.count()
        evaluator_object.task.total = n_docs
        evaluator_object.task.save()

        logging.getLogger(INFO_LOGGER).info(
            f"Number of documents: {n_docs} | Number of classes: {len(classes)}"
        )

        # Get the memory buffer value from core variables
        core_memory_buffer_value_gb = get_core_setting(
            "TEXTA_EVALUATOR_MEMORY_BUFFER_GB")

        # Calculate the value based on given ratio if the core variable is empty
        memory_buffer_gb = calculate_memory_buffer(
            memory_buffer=core_memory_buffer_value_gb,
            ratio=EVALUATOR_MEMORY_BUFFER_RATIO,
            unit="gb")

        required_memory = get_memory_imprint(
            n_docs=n_docs,
            n_classes=len(classes),
            eval_type=evaluator_object.evaluation_type,
            unit="gb",
            int_size=64)
        enough_memory = is_enough_memory_available(
            required_memory=required_memory,
            memory_buffer=memory_buffer_gb,
            unit="gb")

        # Enable scoring after each scroll if there isn't enough memory
        # for calculating the scores for the whole set of documents at once.
        score_after_scroll = not enough_memory

        # If scoring after each scroll is enabled and scores are averaged after each scroll
        # the results for each averaging function besides `micro` are imprecise
        scores_imprecise = score_after_scroll and average != "micro"

        # Store document counts, labels' class counts and indicator if scores are imprecise
        evaluator_object.document_count = n_docs
        evaluator_object.n_true_classes = len(true_set)
        evaluator_object.n_predicted_classes = len(pred_set)
        evaluator_object.n_total_classes = n_total_classes
        evaluator_object.scores_imprecise = scores_imprecise
        evaluator_object.score_after_scroll = score_after_scroll

        # Save model updates
        evaluator_object.save()

        logging.getLogger(INFO_LOGGER).info(
            f"Enough available memory: {enough_memory} | Score after scroll: {score_after_scroll}"
        )

        # Get number of batches for the logger
        n_batches = math.ceil(n_docs / scroll_size)

        # Scroll and score tags
        scores, bin_scores = scroll_and_score(
            generator=searcher,
            evaluator_object=evaluator_object,
            true_fact=true_fact,
            pred_fact=pred_fact,
            true_fact_value=true_fact_value,
            pred_fact_value=pred_fact_value,
            classes=classes,
            average=average,
            score_after_scroll=score_after_scroll,
            n_batches=n_batches,
            add_individual_results=add_individual_results)

        logging.getLogger(INFO_LOGGER).info(f"Final scores: {scores}")

        # Drop database connections that became unusable or obsolete before saving again.
        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()

        confusion = scores["confusion_matrix"]
        confusion = np.asarray(confusion, dtype="int64")

        if len(classes) <= choices.DEFAULT_MAX_CONFUSION_CLASSES:
            # Delete empty rows and columns corresponding to missing pred/true labels from the confusion matrix
            confusion, classes = delete_empty_rows_and_cols(confusion, classes)

        scores["confusion_matrix"] = confusion.tolist()

        # Generate confusion matrix plot and save it
        image_name = f"{secrets.token_hex(15)}.png"
        evaluator_object.plot.save(image_name,
                                   create_confusion_plot(
                                       scores["confusion_matrix"], classes),
                                   save=False)
        image_path = pathlib.Path(MEDIA_URL) / image_name
        evaluator_object.plot.name = str(image_path)

        # Add final scores to the model
        evaluator_object.precision = scores["precision"]
        evaluator_object.recall = scores["recall"]
        evaluator_object.f1_score = scores["f1_score"]
        evaluator_object.accuracy = scores["accuracy"]
        evaluator_object.confusion_matrix = json.dumps(
            scores["confusion_matrix"])

        evaluator_object.individual_results = json.dumps(
            remove_not_found(bin_scores), ensure_ascii=False)
        evaluator_object.add_misclassified_examples = False

        evaluator_object.save()
        evaluator_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        if evaluator_object is None:
            # The Evaluator lookup itself failed: there is no task to mark as failed, so surface
            # the real error instead of tripping over an unbound variable below.
            raise
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        evaluator_object.task.add_error(error_message)
        evaluator_object.task.update_status(Task.STATUS_FAILED)
Exemplo n.º 29
0
    def tag_random_doc(self, request, pk=None, project_pk=None):
        """
        API endpoint for tagging a random document.

        Picks one random document from the requested (or the first tagger's) indices,
        keeps the fields the first tagger was built on and applies the tagger group to it.

        :param request: Incoming request; its data is validated with TagRandomDocSerializer.
        :param pk: Not used directly in the body.
        :param project_pk: Not used directly in the body.
        :return: 200 response holding the random document and its tags, or a 400 response
            when an index does not exist or no document could be retrieved.
        :raises NonExistantModelError: When none of the group's taggers has a completed task.
        :raises RedisNotAvailable: When Redis is reported as not alive.
        """
        logging.getLogger(INFO_LOGGER).info(
            "[Tag Random doc] Starting tag_random_doc...")
        # get hybrid tagger object
        hybrid_tagger_object = self.get_object()

        # check if any of the models ready
        if not hybrid_tagger_object.taggers.filter(
                task__status=Task.STATUS_COMPLETED):
            raise NonExistantModelError()

        # retrieve tagger fields from the first object
        first_tagger = hybrid_tagger_object.taggers.first()
        tagger_fields = json.loads(first_tagger.fields)

        # error if redis not available
        if not get_redis_status()['alive']:
            raise RedisNotAvailable(
                'Redis not available. Check if Redis is running.')

        serializer = TagRandomDocSerializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        indices = [
            index["name"] for index in serializer.validated_data["indices"]
        ]
        indices = first_tagger.get_available_or_all_indices(indices)

        if not ElasticCore().check_if_indices_exist(indices):
            return Response(
                {
                    'error':
                    f'One or more index from {list(indices)} does not exist'
                },
                status=status.HTTP_400_BAD_REQUEST)

        # retrieve random document; guard against an empty result instead of failing on [0]
        random_docs = ElasticSearcher(indices=indices).random_documents(
            size=1)
        if not random_docs:
            return Response(
                {
                    'error':
                    f'No documents found in {list(indices)}'
                },
                status=status.HTTP_400_BAD_REQUEST)
        random_doc = random_docs[0]

        # filter out correct fields from the document
        random_doc_filtered = {
            k: v
            for k, v in random_doc.items() if k in tagger_fields
        }

        # reuse the object fetched above instead of resolving it a second time
        tagger_group_id = hybrid_tagger_object.pk

        # combine document field values into one string
        combined_texts = '\n'.join(random_doc_filtered.values())
        combined_texts, tags = get_mlp(tagger_group_id,
                                       combined_texts,
                                       lemmatize=False)
        # retrieve tag candidates
        tag_candidates = get_tag_candidates(tagger_group_id,
                                            combined_texts,
                                            ignore_tags=tags)
        # get tags
        tags += apply_tagger_group(tagger_group_id,
                                   random_doc_filtered,
                                   tag_candidates,
                                   request,
                                   input_type='doc')
        # return document with tags
        response = {"document": random_doc, "tags": tags}
        return Response(response, status=status.HTTP_200_OK)
Exemplo n.º 30
0
def evaluate_entity_tags_task(object_id: int,
                              indices: List[str],
                              query: dict,
                              es_timeout: int = 10,
                              scroll_size: int = 100):
    """
    Score predicted entity facts against true entity facts for an Evaluator object.

    Scrolls the selected documents, scores them with ``scroll_and_score_entity`` and stores
    the confusion matrix plot on the Evaluator before completing its task.

    :param object_id: Primary key of the Evaluator object.
    :param indices: Indices handed to the searcher and the aggregator.
    :param query: Elasticsearch query selecting the documents to evaluate.
    :param es_timeout: Searcher timeout; passed on as ``"<es_timeout>m"``.
    :param scroll_size: Scroll size of the searcher; also used to compute the batch count.
    :return: True on success. When an error is handled (logged and stored on the task,
        task marked as failed) the function returns None.
    :raises Exception: Re-raised only when the Evaluator object itself could not be loaded,
        because there is then no task to record the failure on.
    """
    # Bound before the ``try`` so the error handler can tell whether the lookup succeeded.
    evaluator_object = None
    try:
        logging.getLogger(INFO_LOGGER).info(
            f"Starting entity evaluator task for Evaluator with ID {object_id}."
        )

        evaluator_object = Evaluator.objects.get(pk=object_id)
        progress = ShowProgress(evaluator_object.task, multiplier=1)

        true_fact = evaluator_object.true_fact
        pred_fact = evaluator_object.predicted_fact

        add_misclassified_examples = evaluator_object.add_misclassified_examples
        token_based = evaluator_object.token_based

        # If the user hasn't defined a field, retrieve it automatically
        if not evaluator_object.field:
            es_aggregator = ElasticAggregator(indices=indices,
                                              query=deepcopy(query))
            true_fact_doc_paths = es_aggregator.facts_abstract(
                key_field="fact",
                value_field="doc_path",
                filter_by_key=true_fact)
            # NOTE(review): an empty result fails here on [0]; the error is then
            # recorded on the task by the handler below.
            doc_path = true_fact_doc_paths[0]
        else:
            doc_path = evaluator_object.field

        searcher = ElasticSearcher(indices=indices,
                                   field_data=[doc_path, "texta_facts"],
                                   query=query,
                                   output=ElasticSearcher.OUT_RAW,
                                   timeout=f"{es_timeout}m",
                                   callback_progress=progress,
                                   scroll_size=scroll_size)

        # Get number of documents
        n_docs = searcher.count()
        evaluator_object.task.total = n_docs
        evaluator_object.task.save()

        evaluator_object.document_count = n_docs
        evaluator_object.scores_imprecise = False
        evaluator_object.score_after_scroll = False
        evaluator_object.add_individual_results = False

        # Save model updates
        evaluator_object.save()

        # Get number of batches for the logger
        n_batches = math.ceil(n_docs / scroll_size)

        scores, misclassified = scroll_and_score_entity(
            searcher, evaluator_object, true_fact, pred_fact, doc_path,
            token_based, n_batches, add_misclassified_examples)

        logging.getLogger(INFO_LOGGER).info(f"Final scores: {scores}")

        # Drop database connections that became unusable or obsolete before saving again.
        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()

        # Generate confusion matrix plot and save it
        image_name = f"{secrets.token_hex(15)}.png"
        classes = ["other", true_fact]
        evaluator_object.plot.save(image_name,
                                   create_confusion_plot(
                                       scores["confusion_matrix"], classes),
                                   save=False)
        image_path = pathlib.Path(MEDIA_URL) / image_name
        evaluator_object.plot.name = str(image_path)

        evaluator_object.save()
        evaluator_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        if evaluator_object is None:
            # The Evaluator lookup itself failed: there is no task to mark as failed, so surface
            # the real error instead of tripping over an unbound variable below.
            raise
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        evaluator_object.task.add_error(error_message)
        evaluator_object.task.update_status(Task.STATUS_FAILED)