Example #1
    def get_significant_words(indices: List[str], fields: List[str], document_ids: List[str], stop_words: List = None, exclude=""):
        """
        This is a helper function to parse all the given fields and use the document_ids
        as input to make a significant_words aggregation.
        Args:
            indices: Indices on which to perform the aggregation.
            fields: Fields from which to get the text content needed for comparison.
            document_ids: IDs of the documents to use as the baseline for the aggregation.
            stop_words: Optional list of stop words to remove from the results.
            exclude: Regex-compatible string of words to exclude; uses the exclude parameter of Elasticsearch aggregations.

        Returns: List of dictionaries with each significant word and how many times it occurs in the documents.

        """
        ed = ElasticDocument("*")
        ea = ElasticAggregator(indices=indices)

        stop_words = StopWords._get_stop_words(custom_stop_words=stop_words)
        # Validate that those documents exist.
        validated_docs: List[dict] = ed.get_bulk(document_ids)
        if validated_docs:
            unique_ids = list({doc["_id"] for doc in validated_docs})
            significant_words = []
            for field in fields:
                sw = ea.get_significant_words(document_ids=unique_ids, field=field, stop_words=stop_words, exclude=exclude)
                significant_words += sw

            return significant_words
        else:
            return []
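
A minimal usage sketch of the helper above; the index, field and document ID values are hypothetical:

    # Hypothetical inputs: substitute real index names, fields and document IDs.
    words = get_significant_words(
        indices=["news_articles"],
        fields=["title", "text"],
        document_ids=["doc_1", "doc_2"],
        stop_words=["the", "a"],
        exclude="[0-9]+"  # Passed through to the Elasticsearch exclude parameter.
    )
    # Each entry pairs a significant word with its occurrence count in the documents.
    for entry in words:
        print(entry)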
Example #2
    def post(self, request, project_pk: int):
        """
        Returns existing fact names and values from Elasticsearch.
        """
        serializer = ProjectFactAggregatorSerializer(data=request.data)

        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)

        indices = serializer.validated_data["indices"]
        indices = [index["name"] for index in indices]

        # retrieve and validate project indices
        project = get_object_or_404(Project, pk=project_pk)
        self.check_object_permissions(request, project)
        project_indices = project.get_available_or_all_project_indices(indices)  # Gives all if None, the default, is entered.

        if not project_indices:
            return Response([])

        key_field = serializer.validated_data["key_field"]
        value_field = serializer.validated_data["value_field"]
        filter_by_key = serializer.validated_data["filter_by_key"]
        max_count = serializer.validated_data["max_count"]
        query = serializer.validated_data["query"]

        if isinstance(query, str):
            query = json.loads(query)

        aggregator = ElasticAggregator(indices=project_indices, query=query)
        results = aggregator.facts_abstract(key_field=key_field, value_field=value_field, filter_by_key=filter_by_key, size=max_count)

        return Response(results, status=status.HTTP_200_OK)
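
A sketch of the request body this view expects, mirroring the serializer fields it reads; the route and all values are hypothetical:

    # Hypothetical payload; the keys mirror the validated_data lookups in the view.
    payload = {
        "indices": [{"name": "news_articles"}],
        "key_field": "fact",
        "value_field": "str_val",
        "filter_by_key": "ORG",
        "max_count": 30,
        "query": json.dumps({"query": {"match_all": {}}})  # A dict works as well.
    }
    # Posted e.g. with Django's test client against a hypothetical route:
    # response = self.client.post(f"/projects/{project_pk}/aggregate_facts/", payload, format="json")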
Example #3
def validate_pos_label(data):
    """ For Tagger, TorchTagger and BertTagger.
    Checks if the provided pos label is present among the fact values.
    """

    fact_name = data.get("fact_name")

    # If fact name is not selected, the value for pos label doesn't matter
    if not fact_name:
        return data

    indices = [index.get("name") for index in data.get("indices")]
    pos_label = data.get("pos_label")
    serializer_query = data.get("query")

    try:
        # The query may be passed as a JSON string
        query = json.loads(serializer_query)
    except Exception:
        # The query may be passed as a JSON dict
        query = serializer_query

    ag = ElasticAggregator(indices=indices, query=query)
    fact_values = ag.facts(size=10, filter_by_fact_name=fact_name, include_values=True)

    # If there exist exactly two possible values for the selected fact, check that pos label
    # is selected and present in the corresponding fact values.
    if len(fact_values) == 2:
        if not pos_label:
            raise ValidationError(f"The fact values corresponding to the selected query and fact '{fact_name}' are binary. You must specify param 'pos_label' for evaluation purposes. Allowed values for 'pos_label' are: {fact_values}")
        elif pos_label not in fact_values:
            raise ValidationError(f"The specified pos label '{pos_label}' is NOT one of the fact values for fact '{fact_name}'. Please select an existing fact value. Allowed fact values are: {fact_values}")
    return data
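
A sketch of calling the validator with serializer-style data; the fact name, label and index are hypothetical:

    # Hypothetical data dict; raises ValidationError if the fact is binary and
    # pos_label is missing or not among the fact's values.
    data = {
        "fact_name": "SENTIMENT",
        "pos_label": "positive",
        "indices": [{"name": "news_articles"}],
        "query": json.dumps({"query": {"match_all": {}}})
    }
    data = validate_pos_label(data)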
Example #4
def validate_fact(indices: List[str], query: dict, fact: str):
    """ Check if given fact exists in the selected indices. """
    ag = ElasticAggregator(indices=indices, query=deepcopy(query))
    fact_values = ag.get_fact_values_distribution(
        fact, fact_name_size=choices.DEFAULT_MAX_FACT_AGGREGATION_SIZE)
    if not fact_values:
        raise ValidationError(
            f"Fact '{fact}' not present in any of the selected indices ({indices})."
        )
    return True
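
A minimal usage sketch, assuming a hypothetical index and fact name:

    # Raises ValidationError if the fact is absent from every selected index.
    validate_fact(
        indices=["news_articles"],
        query={"query": {"match_all": {}}},
        fact="ORG"
    )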
Example #5
 def _get_tags(self, fact_name, min_count=50, max_count=None, query=None):
     """Finds possible tags for training by aggregating active project's indices."""
     query = query if query is not None else {}  # Avoid a shared mutable default argument.
     active_indices = self.tagger_object.get_indices()
     es_a = ElasticAggregator(indices=active_indices, query=query)
     # limit size to 10000 unique tags
     tag_values = es_a.facts(filter_by_fact_name=fact_name,
                             min_count=min_count,
                             max_count=max_count,
                             size=10000)
     return tag_values
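
The same aggregation can be sketched directly against ElasticAggregator; the index and fact names are hypothetical:

    ea = ElasticAggregator(indices=["news_articles"], query={})
    # Returns up to 10000 unique tag values occurring at least min_count times.
    tag_values = ea.facts(filter_by_fact_name="TOPIC", min_count=50, size=10000)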
Example #6
    def post(self, request, project_pk: int):
        """
        Returns existing fact names and values from Elasticsearch.
        """
        serializer = ProjectGetFactsSerializer(data=request.data)

        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)

        indices = serializer.validated_data["indices"]
        indices = [index["name"] for index in indices]

        # retrieve and validate project indices
        project = get_object_or_404(Project, pk=project_pk)
        self.check_object_permissions(request, project)
        project_indices = project.get_available_or_all_project_indices(indices)  # Gives all if None, the default, is entered.

        if not project_indices:
            return Response([])

        vals_per_name = serializer.validated_data['values_per_name']
        include_values = serializer.validated_data['include_values']
        fact_name = serializer.validated_data['fact_name']
        include_doc_path = serializer.validated_data['include_doc_path']
        exclude_zero_spans = serializer.validated_data['exclude_zero_spans']
        mlp_doc_path = serializer.validated_data['mlp_doc_path']

        aggregator = ElasticAggregator(indices=project_indices)

        if mlp_doc_path and exclude_zero_spans:
            # If exclude_zero_spans is enabled and mlp_doc_path is specified, the other values don't have any effect -
            # this behaviour might need to change at some point
            fact_map = aggregator.facts(size=1, include_values=True, include_doc_path=True, exclude_zero_spans=exclude_zero_spans)

        else:
            fact_map = aggregator.facts(size=vals_per_name, include_values=include_values, filter_by_fact_name=fact_name, include_doc_path=include_doc_path, exclude_zero_spans=exclude_zero_spans)

        if fact_name:
            fact_map_list = list(fact_map)

        elif mlp_doc_path and exclude_zero_spans:
            # Return only fact names where doc_path contains mlp_doc_path as a parent field and facts have spans.
            # NB! Doesn't take into account the situation where facts have the same name, but different doc paths! Could happen!
            fact_map_list = [k for k, v in fact_map.items() if v and mlp_doc_path == v[0]["doc_path"].rsplit(".", 1)[0]]

        elif include_values:
            fact_map_list = [{'name': k, 'values': v} for k, v in fact_map.items()]
        else:
            fact_map_list = list(fact_map)
        return Response(fact_map_list, status=status.HTTP_200_OK)
Example #7
    def test_create_custom_split(self):
        custom_distribution = {"FUBAR": 10, "bar": 15}
        payload = {
            "description": "Original index splitting",
            "indices": [{
                "name": self.test_index_name
            }],
            "train_index": INDEX_SPLITTING_TRAIN_INDEX,
            "test_index": INDEX_SPLITTING_TEST_INDEX,
            "distribution": "custom",
            "fact": self.FACT,
            "custom_distribution": json.dumps(custom_distribution)
        }

        response = self.client.post(self.url, data=payload, format="json")
        print_output('test_create_custom_split:response.data', response.data)

        splitter_obj = IndexSplitter.objects.get(id=response.data['id'])

        # Assert the task gets completed
        self.assertEqual(splitter_obj.task.status, Task.STATUS_COMPLETED)
        print_output("Task status", splitter_obj.task.status)

        sleep(5)

        original_distribution = ElasticAggregator(
            indices=self.test_index_name).get_fact_values_distribution(
                self.FACT)
        test_distribution = ElasticAggregator(
            indices=INDEX_SPLITTING_TEST_INDEX).get_fact_values_distribution(
                self.FACT)
        train_distribution = ElasticAggregator(
            indices=INDEX_SPLITTING_TRAIN_INDEX).get_fact_values_distribution(
                self.FACT)

        print_output(
            'original_dist, test_dist, train_dist',
            [original_distribution, test_distribution, train_distribution])

        for label, quant in custom_distribution.items():
            self.assertEqual(test_distribution[label],
                             min(quant, original_distribution[label]))

        for label in original_distribution.keys():
            if label not in custom_distribution:
                self.assertTrue(label not in test_distribution)
                self.assertEqual(original_distribution[label],
                                 train_distribution[label])
Example #8
 def get_tags(self,
              fact_name,
              active_project,
              min_count=1000,
              max_count=None,
              indices=None):
     """Finds possible tags for training by aggregating active project's indices."""
     active_indices = list(
         active_project.get_indices()) if indices is None else indices
     es_a = ElasticAggregator(indices=active_indices)
     # limit size to 10000 unique tags
     tag_values = es_a.facts(filter_by_fact_name=fact_name,
                             min_count=min_count,
                             max_count=max_count,
                             size=10000)
     return tag_values
Example #9
    def run_apply_crf_to_index_with_specified_label_suffix(self):
        """Tests applying extractor to index with specified label suffix using apply_to_index endpoint."""
        test_tagger_id = self.test_crf_ids[0]
        url = f'{self.url}{test_tagger_id}/apply_to_index/'
        label_suffix = "CRF_TEST"
        payload = {
            "description": "apply crf test task",
            "mlp_fields": ["text_mlp"],
            "indices": [{
                "name": self.test_index_copy
            }],
            "label_suffix": label_suffix
        }
        response = self.client.post(url, payload, format='json')
        print_output('test_apply_crf_to_index:response.data', response.data)
        self.assertEqual(response.status_code, status.HTTP_201_CREATED)
        tagger_object = CRFExtractor.objects.get(pk=test_tagger_id)

        # Wait until the task has finished
        while tagger_object.task.status != Task.STATUS_COMPLETED:
            print_output(
                'test_apply_crf_to_index_with_specified_label_suffix: waiting for applying tagger task to finish, current status:',
                tagger_object.task.status)
            sleep(2)

        results = ElasticAggregator(
            indices=[self.test_index_copy]).get_fact_values_distribution(
                f"GPE_{label_suffix}")
        print_output(
            "test_apply_crf_to_index_with_specified_label_suffix:elastic aggregator results:",
            results)

        # assert that facts with the label suffix were added
        self.assertTrue(len(results) > 1)
Example #10
def validate_fact_value(indices: List[str], query: dict, fact: str,
                        fact_value: str):
    """ Check if given fact value exists under given fact. """
    # Fact value is allowed to be empty
    if not fact_value:
        return True

    ag = ElasticAggregator(indices=indices, query=deepcopy(query))

    fact_values = ag.facts(size=choices.DEFAULT_MAX_AGGREGATION_SIZE,
                           filter_by_fact_name=fact,
                           include_values=True)
    if fact_value not in fact_values:
        raise ValidationError(
            f"Fact value '{fact_value}' not in the list of fact values for fact '{fact}'."
        )
    return True
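
A minimal usage sketch with hypothetical fact and value names:

    # An empty fact_value passes immediately; otherwise a ValidationError is
    # raised when the value is not found under the given fact.
    validate_fact_value(
        indices=["news_articles"],
        query={"query": {"match_all": {}}},
        fact="SENTIMENT",
        fact_value="positive"
    )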
Example #11
def validate_evaluation_type(indices: List[str], query: dict,
                             evaluation_type: str, true_fact: str,
                             pred_fact: str, true_value: str, pred_value: str):
    """ Checks if the chosen facts (and values) are applicable for the chosen evaluation type.
    """

    if evaluation_type == "binary":
        if not true_value or not pred_value:
            raise ValidationError(
                "Please specify true and predicted values for evaluation type 'binary'."
            )
    #elif evaluation_type == "multilabel":
    #    if true_value or pred_value:
    #        raise ValidationError("Please leave true and predicted values unspecified for evaluation type 'multilabel'.")
    elif evaluation_type == "entity":
        if true_value or pred_value:
            raise ValidationError(
                "Please leave true and predicted values unspecified for evaluation type 'entity'."
            )

        ag = ElasticAggregator(indices=indices, query=deepcopy(query))

        true_fact_results = ag.facts_abstract(key_field="fact",
                                              value_field="spans",
                                              filter_by_key=true_fact,
                                              size=5)
        pred_fact_results = ag.facts_abstract(key_field="fact",
                                              value_field="spans",
                                              filter_by_key=pred_fact,
                                              size=5)

        if len(true_fact_results) == 1:
            spans = json.loads(true_fact_results[0])
            if not spans[0] or (spans[0][0] == 0 and spans[0][1] == 0):
                raise ValidationError(
                    f"Did not find non-zero spans for selected true fact '{true_fact}'. Please make sure to use facts with existing spans for evaluation_type 'entity'."
                )

        if len(pred_fact_results) == 1:
            spans = json.loads(pred_fact_results[0])
            if not spans[0] or (spans[0][0] == 0 and spans[0][1] == 0):
                raise ValidationError(
                    f"Did not find non-zero spans for selected predicted fact '{pred_fact}'. Please make sure to use facts with existing spans for evaluation_type 'entity'."
                )

    return True
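
A sketch of validating an entity evaluation; the fact names are hypothetical:

    # For evaluation_type "entity" the true/pred values must stay empty and
    # both facts need non-zero spans, otherwise a ValidationError is raised.
    validate_evaluation_type(
        indices=["news_articles"],
        query={"query": {"match_all": {}}},
        evaluation_type="entity",
        true_fact="GPE_TRUE",
        pred_fact="GPE_PRED",
        true_value="",
        pred_value=""
    )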
Example #12
    def test_create_equal_split(self):
        payload = {
            "description": "Original index splitting",
            "indices": [{
                "name": self.test_index_name
            }],
            "train_index": INDEX_SPLITTING_TRAIN_INDEX,
            "test_index": INDEX_SPLITTING_TEST_INDEX,
            "distribution": "equal",
            "test_size": 20,
            "fact": self.FACT
        }

        response = self.client.post(self.url, data=payload)
        print_output('test_create_equal_split:response.data', response.data)

        splitter_obj = IndexSplitter.objects.get(id=response.data['id'])

        # Assert the task gets completed
        self.assertEqual(splitter_obj.task.status, Task.STATUS_COMPLETED)
        print_output("Task status", splitter_obj.task.status)

        sleep(5)

        original_distribution = ElasticAggregator(
            indices=self.test_index_name).get_fact_values_distribution(
                self.FACT)
        test_distribution = ElasticAggregator(
            indices=INDEX_SPLITTING_TEST_INDEX).get_fact_values_distribution(
                self.FACT)
        train_distribution = ElasticAggregator(
            indices=INDEX_SPLITTING_TRAIN_INDEX).get_fact_values_distribution(
                self.FACT)

        print_output(
            'original_dist, test_dist, train_dist',
            [original_distribution, test_distribution, train_distribution])

        for label, quant in original_distribution.items():
            if quant > 20:
                self.assertEqual(test_distribution[label], 20)
                self.assertEqual(train_distribution[label], quant - 20)
            else:
                self.assertEqual(test_distribution[label], quant)
                self.assertTrue(label not in train_distribution)
Example #13
    def test_create_original_split_fact_value_given(self):
        payload = {
            "description": "Original index splitting",
            "indices": [{
                "name": self.test_index_name
            }],
            "train_index": INDEX_SPLITTING_TRAIN_INDEX,
            "test_index": INDEX_SPLITTING_TEST_INDEX,
            "distribution": "original",
            "test_size": 20,
            "fact": self.FACT,
            "str_val": "FUBAR"
        }

        response = self.client.post(self.url, data=payload, format="json")
        print_output(
            'test_create_original_split_fact_value_given:response.data',
            response.data)

        splitter_obj = IndexSplitter.objects.get(id=response.data['id'])

        sleep(5)

        original_distribution = ElasticAggregator(
            indices=self.test_index_name).get_fact_values_distribution(
                self.FACT)
        test_distribution = ElasticAggregator(
            indices=INDEX_SPLITTING_TEST_INDEX).get_fact_values_distribution(
                self.FACT)
        train_distribution = ElasticAggregator(
            indices=INDEX_SPLITTING_TRAIN_INDEX).get_fact_values_distribution(
                self.FACT)

        print_output(
            'original_dist, test_dist, train_dist',
            [original_distribution, test_distribution, train_distribution])

        for label, quant in original_distribution.items():
            if label == "FUBAR":
                self.assertTrue(
                    self.is_between_limits(test_distribution[label], quant,
                                           0.2))
                self.assertTrue(
                    self.is_between_limits(train_distribution[label], quant,
                                           0.8))
Example #14
    def _get_max_class_size(self) -> int:
        """Aggregates over values of the selected fact and returns the size of the largest class."""
        max_class_size = 0
        fact_name = self._get_fact_name()

        try:
            # The stored query may be a JSON string
            query = json.loads(self.tagger_object.query)
        except (TypeError, json.JSONDecodeError):
            # The query is already a dict
            query = self.tagger_object.query

        if fact_name:
            es_aggregator = ElasticAggregator(indices=self.indices,
                                              query=query)
            facts = es_aggregator.get_fact_values_distribution(
                fact_name=fact_name, fact_name_size=10, fact_value_size=10)
            logging.getLogger(INFO_LOGGER).info(f"Class frequencies: {facts}")
            max_class_size = max(facts.values())
        return max_class_size
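
get_fact_values_distribution returns a mapping from fact value to document count, so the largest class size is just the maximum over its values; the index and fact names below are hypothetical:

    ea = ElasticAggregator(indices=["news_articles"])
    distribution = ea.get_fact_values_distribution(fact_name="TOPIC")
    # e.g. {"sports": 120, "politics": 85} -> 120
    max_class_size = max(distribution.values()) if distribution else 0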
Example #15
def validate_entity_facts(indices: List[str], query: dict, true_fact: str,
                          pred_fact: str, doc_path: str):
    """ Check if facts chosen for entity evaluation follow all the necessary requirements. """

    ag = ElasticAggregator(indices=indices, query=deepcopy(query))

    true_fact_doc_paths = ag.facts_abstract(key_field="fact",
                                            value_field="doc_path",
                                            filter_by_key=true_fact)
    pred_fact_doc_paths = ag.facts_abstract(key_field="fact",
                                            value_field="doc_path",
                                            filter_by_key=pred_fact)

    if doc_path:
        if doc_path not in true_fact_doc_paths:
            raise ValidationError(
                f"The selected true_fact ('{true_fact}') doesn't contain any instances corresponding to the selected field ('{doc_path}')."
            )

        if doc_path not in pred_fact_doc_paths:
            raise ValidationError(
                f"The selected predicted_fact ('{pred_fact}') doesn't contain any instances corresponding to the selected field ('{doc_path}')."
            )

    if not doc_path:
        if set(true_fact_doc_paths) != set(pred_fact_doc_paths):
            raise ValidationError(
                f"The doc paths for true and predicted facts are different (true = {true_fact_doc_paths}; predicted = {pred_fact_doc_paths}). Please make sure you are evaluating facts based on the same fields."
            )

        if len(true_fact_doc_paths) > 1:
            raise ValidationError(
                f"Selected true fact ({true_fact}) is related to two or more fields {true_fact_doc_paths}, but the value for parameter 'field' isn't defined. Please define parameter 'field'."
            )

        if len(pred_fact_doc_paths) > 1:
            raise ValidationError(
                f"Selected predicted fact ({pred_fact}) is related to two or more fields {pred_fact_doc_paths}, but the value for parameter 'field' isn't defined. Please define parameter 'field'."
            )

    return True
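
A minimal usage sketch with hypothetical facts and field:

    # With doc_path given, both facts must have instances in that field;
    # without it, the facts must share exactly one common doc path.
    validate_entity_facts(
        indices=["news_articles"],
        query={"query": {"match_all": {}}},
        true_fact="GPE_TRUE",
        pred_fact="GPE_PRED",
        doc_path="text_mlp.text"
    )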
Example #16
    def run_apply_tagger_group_to_index(self):
        """Tests applying tagger group to index using apply_to_index endpoint."""
        # Make sure reindexer task has finished
        while self.reindexer_object.task.status != Task.STATUS_COMPLETED:
            print_output(
                'test_apply_tagger_group_to_index: waiting for reindexer task to finish, current status:',
                self.reindexer_object.task.status)
            sleep(2)

        url = f'{self.url}{self.test_imported_tagger_group_id}/apply_to_index/'

        payload = {
            "description": "apply tagger test task",
            "new_fact_name": self.new_fact_name,
            "indices": [{
                "name": self.test_index_copy
            }],
            "fields": [TEST_FIELD],
            "lemmatize": False,
            "n_similar_docs": 10,
            "n_candidate_tags": 10
        }
        response = self.client.post(url, payload, format='json')
        print_output('test_apply_tagger_group_to_index:response.data',
                     response.data)
        self.assertEqual(response.status_code, status.HTTP_201_CREATED)
        tagger_group_object = TaggerGroup.objects.get(
            pk=self.test_imported_tagger_group_id)

        # Wait until the task has finished
        while tagger_group_object.task.status != Task.STATUS_COMPLETED:
            print_output(
                'test_apply_tagger_group_to_index: waiting for applying tagger task to finish, current status:',
                tagger_group_object.task.status)
            sleep(2)

        results = ElasticAggregator(
            indices=[self.test_index_copy]).get_fact_values_distribution(
                self.new_fact_name)
        print_output(
            "test_apply_tagger_group_to_index:elastic aggregator results:",
            results)

        # Check if at least one new fact is added
        self.assertTrue(len(results) >= 1)

        # clean
        imported_tagger_group = TaggerGroup.objects.get(
            id=self.test_imported_tagger_group_id)

        for tagger in imported_tagger_group.taggers.all():
            # Remove tagger files after test is done
            self.add_cleanup_files(tagger.id)
Example #17
    def test_query_given(self):
        payload = {
            "description": "Original index splitting",
            "indices": [{
                "name": self.test_index_name
            }],
            "train_index": INDEX_SPLITTING_TRAIN_INDEX,
            "test_index": INDEX_SPLITTING_TEST_INDEX,
            "distribution": "original",
            "test_size": 20,
            "fact": self.FACT,
            "str_val": "bar",
            "query": json.dumps(TEST_QUERY)
        }

        response = self.client.post(self.url, data=payload, format="json")
        print_output('test_query_given:response.data', response.data)

        original_distribution = ElasticAggregator(
            indices=self.test_index_name).get_fact_values_distribution(
                self.FACT)
        test_distribution = ElasticAggregator(
            indices=INDEX_SPLITTING_TEST_INDEX).get_fact_values_distribution(
                self.FACT)
        train_distribution = ElasticAggregator(
            indices=INDEX_SPLITTING_TRAIN_INDEX).get_fact_values_distribution(
                self.FACT)

        print_output(
            'original_dist, test_dist, train_dist',
            [original_distribution, test_distribution, train_distribution])

        self.assertTrue("bar" in test_distribution)
        self.assertTrue("bar" in train_distribution)
        self.assertTrue("foo" not in train_distribution
                        and "foo" not in test_distribution)
        self.assertTrue("FUBAR" not in train_distribution
                        and "FUBAR" not in test_distribution)
Example #18
    def run_apply_crf_to_index(self):
        """Tests applying extractor to index using apply_to_index endpoint."""
        test_tagger_id = self.test_crf_ids[0]
        url = f'{self.url}{test_tagger_id}/apply_to_index/'
        payload = {
            "description": "apply crf test task",
            "mlp_fields": ["text_mlp"],
            "indices": [{
                "name": self.test_index_copy
            }],
        }
        response = self.client.post(url, payload, format='json')
        print_output('test_apply_crf_to_index:response.data', response.data)
        self.assertEqual(response.status_code, status.HTTP_201_CREATED)
        tagger_object = CRFExtractor.objects.get(pk=test_tagger_id)

        # Wait until the task has finished
        while tagger_object.task.status != Task.STATUS_COMPLETED:
            print_output(
                'test_apply_crf_to_index: waiting for applying tagger task to finish, current status:',
                tagger_object.task.status)
            sleep(2)

        results_old = ElasticAggregator(
            indices=[self.test_index_name]).get_fact_values_distribution("GPE")
        print_output(
            "test_apply_crf_to_index_before:elastic aggregator results:",
            results_old)

        results_new = ElasticAggregator(
            indices=[self.test_index_copy]).get_fact_values_distribution("GPE")
        print_output(
            "test_apply_crf_to_index_after:elastic aggregator results:",
            results_new)

        # assert we have more facts than before
        for item in ["China", "Russia", "Iran"]:
            self.assertTrue(results_old[item] < results_new[item])
Example #19
    def run_test_apply_tagger_to_index(self):
        """Tests applying tagger to index using apply_to_index endpoint."""

        # Make sure reindexer task has finished
        while self.reindexer_object.task.status != Task.STATUS_COMPLETED:
            print_output('[Regex Tagger] test_apply_tagger_to_index: waiting for reindexer task to finish, current status:', self.reindexer_object.task.status)
            sleep(2)

        tagger_payload = {
            "description": "LOLL",
            "lexicon": ["loll"],
            "counter_lexicon": ["päris"]
        }

        response = self.client.post(self.url, tagger_payload)
        print_output('[Regex Tagger] new regex tagger for applying on the index:response.data', response.data)
        created_id = response.data['id']

        self.tagger_id = created_id
        url = f'{self.url}{self.tagger_id}/apply_to_index/'

        payload = {
            "description": "apply tagger test task",
            "indices": [{"name": self.test_index_copy}],
            "fields": [TEST_FIELD]
        }
        response = self.client.post(url, payload, format='json')
        print_output('[Regex Tagger] test_apply_tagger_to_index:response.data', response.data)
        self.assertEqual(response.status_code, status.HTTP_201_CREATED)
        tagger_object = RegexTagger.objects.get(pk=self.tagger_id)

        # Wait until the task has finished
        while tagger_object.task.status != Task.STATUS_COMPLETED:
            print_output("tagger object:", tagger_object.to_json())
            print_output('[Regex Tagger] test_apply_tagger_to_index: waiting for applying tagger task to finish, current status:', tagger_object.task.status)
            sleep(2)

        results = ElasticAggregator(indices=[self.test_index_copy]).get_fact_values_distribution("LOLL")
        print_output("[Regex Tagger] test_apply_tagger_to_index:elastic aggerator results:", results)

        # Check if the expected number of new facts is added
        fact_value_1 = "loll"
        fact_value_2 = "lollikindel"
        n_fact_value_1 = 28
        n_fact_value_2 = 1

        self.assertIn(fact_value_1, results)
        self.assertIn(fact_value_2, results)
        self.assertEqual(results[fact_value_1], n_fact_value_1)
        self.assertEqual(results[fact_value_2], n_fact_value_2)
Example #20
    def run_apply_multiclass_tagger_to_index(self):
        """Tests applying multiclass BERT tagger to index using apply_to_index endpoint."""
        # Make sure reindexer task has finished
        while self.reindexer_object.task.status != Task.STATUS_COMPLETED:
            print_output(
                'test_apply_multiclass_bert_tagger_to_index: waiting for reindexer task to finish, current status:',
                self.reindexer_object.task.status)
            sleep(2)

        url = f'{self.url}{self.test_imported_multiclass_gpu_tagger_id}/apply_to_index/'

        payload = {
            "description": "apply bert tagger to index test task",
            "new_fact_name": self.new_multiclass_fact_name,
            "new_fact_value": self.new_fact_value,
            "indices": [{
                "name": self.test_index_copy
            }],
            "fields": TEST_FIELD_CHOICE
        }
        response = self.client.post(url, payload, format='json')
        print_output(
            'test_apply_multiclass_bert_tagger_to_index:response.data',
            response.data)
        self.assertEqual(response.status_code, status.HTTP_201_CREATED)
        tagger_object = BertTaggerObject.objects.get(
            pk=self.test_imported_multiclass_gpu_tagger_id)

        # Wait until the task has finished
        while tagger_object.task.status != Task.STATUS_COMPLETED:
            print_output(
                'test_apply_multiclass_bert_tagger_to_index: waiting for applying tagger task to finish, current status:',
                tagger_object.task.status)
            sleep(2)

        results = ElasticAggregator(
            indices=[self.test_index_copy]).get_fact_values_distribution(
                self.new_multiclass_fact_name)
        print_output(
            "test_apply_multiclass_bert_tagger_to_index:elastic aggregator results:",
            results)

        # Check if the expected facts and the expected number of them are added to the index
        expected_fact_value = "bar"
        expected_number_of_facts = 30
        self.assertIn(expected_fact_value, results)
        self.assertEqual(results[expected_fact_value], expected_number_of_facts)

        self.add_cleanup_files(self.test_imported_multiclass_gpu_tagger_id)
Example #21
    def run_apply_multiclass_tagger_to_index(self):
        """Tests applying multiclass tagger to index using apply_to_index endpoint."""
        # Make sure reindexer task has finished
        while self.reindexer_object.task.status != Task.STATUS_COMPLETED:
            print_output('test_apply_multiclass_tagger_to_index: waiting for reindexer task to finish, current status:', self.reindexer_object.task.status)
            sleep(2)

        test_tagger_id = self.test_imported_multiclass_tagger_id
        url = f'{self.url}{test_tagger_id}/apply_to_index/'

        payload = {
            "description": "apply multiclass tagger test task",
            "new_fact_name": self.new_fact_name,
            "indices": [{"name": self.test_index_copy}],
            "fields": TEST_FIELD_CHOICE,
            "lemmatize": False
        }
        response = self.client.post(url, payload, format='json')
        print_output('test_apply_multiclass_tagger_to_index:response.data', response.data)
        self.assertEqual(response.status_code, status.HTTP_201_CREATED)
        tagger_object = Tagger.objects.get(pk=test_tagger_id)

        # Wait until the task has finished
        while tagger_object.task.status != Task.STATUS_COMPLETED:
            print_output('test_apply_multiclass_tagger_to_index: waiting for applying tagger task to finish, current status:', tagger_object.task.status)
            sleep(2)

        results = ElasticAggregator(indices=[self.test_index_copy]).get_fact_values_distribution(self.new_fact_name)
        print_output("test_apply_multiclass_tagger_to_index:elastic aggerator results:", results)

        # Check if applying the tagger results in at least 1 new fact
        self.assertTrue(len(results) >= 1)

        fact_value_1 = "bar"
        fact_value_2 = "foo"

        n_fact_value_1 = 18
        n_fact_value_2 = 12

        # Check if expected number of new facts is added to the index
        self.assertIn(fact_value_1, results)
        self.assertIn(fact_value_2, results)
        self.assertEqual(results[fact_value_1], n_fact_value_1)
        self.assertEqual(results[fact_value_2], n_fact_value_2)

        self.add_cleanup_files(test_tagger_id)
Example #22
    def run_apply_binary_tagger_to_index(self):
        """Tests applying binary torch tagger to index using apply_to_index endpoint."""
        # Make sure reindexer task has finished
        while self.reindexer_object.task.status != Task.STATUS_COMPLETED:
            print_output(
                'test_apply_binary_torch_tagger_to_index: waiting for reindexer task to finish, current status:',
                self.reindexer_object.task.status)
            sleep(2)

        url = f'{self.url}{self.test_tagger_id}/apply_to_index/'

        payload = {
            "description": "apply torch tagger to index test task",
            "new_fact_name": self.new_fact_name,
            "new_fact_value": self.new_fact_value,
            "indices": [{
                "name": self.test_index_copy
            }],
            "fields": TEST_FIELD_CHOICE
        }
        response = self.client.post(url, payload, format='json')
        print_output('test_apply_binary_torch_tagger_to_index:response.data',
                     response.data)
        self.assertEqual(response.status_code, status.HTTP_201_CREATED)
        tagger_object = TorchTagger.objects.get(pk=self.test_tagger_id)

        # Wait until the task has finished
        while tagger_object.task.status != Task.STATUS_COMPLETED:
            print_output(
                'test_apply_binary_torch_tagger_to_index: waiting for applying tagger task to finish, current status:',
                tagger_object.task.status)
            sleep(2)

        results = ElasticAggregator(
            indices=[self.test_index_copy]).get_fact_values_distribution(
                self.new_fact_name)
        print_output(
            "test_apply_binary_torch_tagger_to_index:elastic aggregator results:",
            results)

        # Check that a sufficient number of facts was added
        self.assertGreater(results[self.new_fact_value], 10)
Example #23
def evaluate_entity_tags_task(object_id: int,
                              indices: List[str],
                              query: dict,
                              es_timeout: int = 10,
                              scroll_size: int = 100):
    try:
        logging.getLogger(INFO_LOGGER).info(
            f"Starting entity evaluator task for Evaluator with ID {object_id}."
        )

        evaluator_object = Evaluator.objects.get(pk=object_id)
        progress = ShowProgress(evaluator_object.task, multiplier=1)

        true_fact = evaluator_object.true_fact
        pred_fact = evaluator_object.predicted_fact

        add_misclassified_examples = evaluator_object.add_misclassified_examples
        token_based = evaluator_object.token_based

        # If the user hasn't defined a field, retrieve it automatically
        if not evaluator_object.field:
            es_aggregator = ElasticAggregator(indices=indices,
                                              query=deepcopy(query))
            true_fact_doc_paths = es_aggregator.facts_abstract(
                key_field="fact",
                value_field="doc_path",
                filter_by_key=true_fact)
            doc_path = true_fact_doc_paths[0]
        else:
            doc_path = evaluator_object.field

        searcher = ElasticSearcher(indices=indices,
                                   field_data=[doc_path, "texta_facts"],
                                   query=query,
                                   output=ElasticSearcher.OUT_RAW,
                                   timeout=f"{es_timeout}m",
                                   callback_progress=progress,
                                   scroll_size=scroll_size)

        # Get number of documents
        n_docs = searcher.count()
        evaluator_object.task.total = n_docs
        evaluator_object.task.save()

        evaluator_object.document_count = n_docs
        evaluator_object.scores_imprecise = False
        evaluator_object.score_after_scroll = False
        evaluator_object.add_individual_results = False

        # Save model updates
        evaluator_object.save()

        # Get number of batches for the logger
        n_batches = math.ceil(n_docs / scroll_size)

        scores, misclassified = scroll_and_score_entity(
            searcher, evaluator_object, true_fact, pred_fact, doc_path,
            token_based, n_batches, add_misclassified_examples)

        logging.getLogger(INFO_LOGGER).info(f"Final scores: {scores}")

        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()

        # Generate confusion matrix plot and save it
        image_name = f"{secrets.token_hex(15)}.png"
        classes = ["other", true_fact]
        evaluator_object.plot.save(image_name,
                                   create_confusion_plot(
                                       scores["confusion_matrix"], classes),
                                   save=False)
        image_path = pathlib.Path(MEDIA_URL) / image_name
        evaluator_object.plot.name = str(image_path)

        evaluator_object.save()
        evaluator_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        evaluator_object.task.add_error(error_message)
        evaluator_object.task.update_status(Task.STATUS_FAILED)
Example #24
def evaluate_tags_task(object_id: int,
                       indices: List[str],
                       query: dict,
                       es_timeout: int = 10,
                       scroll_size: int = 100):
    try:
        logging.getLogger(INFO_LOGGER).info(
            f"Starting evaluator task for Evaluator with ID {object_id}.")

        evaluator_object = Evaluator.objects.get(pk=object_id)
        progress = ShowProgress(evaluator_object.task, multiplier=1)

        # Retrieve facts and the sklearn average function from the model
        true_fact = evaluator_object.true_fact
        pred_fact = evaluator_object.predicted_fact
        true_fact_value = evaluator_object.true_fact_value
        pred_fact_value = evaluator_object.predicted_fact_value

        average = evaluator_object.average_function
        add_individual_results = evaluator_object.add_individual_results

        searcher = ElasticSearcher(indices=indices,
                                   field_data=["texta_facts"],
                                   query=query,
                                   output=ElasticSearcher.OUT_RAW,
                                   timeout=f"{es_timeout}m",
                                   callback_progress=progress,
                                   scroll_size=scroll_size)

        # Binary
        if true_fact_value and pred_fact_value:
            logging.getLogger(INFO_LOGGER).info(
                f"Starting binary evaluation. Comparing following fact and fact value pairs: TRUE: ({true_fact}: {true_fact_value}), PREDICTED: ({pred_fact}: {pred_fact_value})."
            )

            # Set the evaluation type in the model
            evaluator_object.evaluation_type = "binary"

            true_set = {true_fact_value, "other"}
            pred_set = {pred_fact_value, "other"}

            classes = ["other", true_fact_value]
            n_total_classes = len(classes)

        # Multilabel/multiclass
        else:
            logging.getLogger(INFO_LOGGER).info(
                f"Starting multilabel evaluation. Comparing facts TRUE: '{true_fact}', PRED: '{pred_fact}'."
            )

            # Make deepcopy of the query to avoid modifying Searcher's query.
            es_aggregator = ElasticAggregator(indices=indices,
                                              query=deepcopy(query))

            # Get all fact values corresponding to true and predicted facts to construct total set of labels
            # needed for confusion matrix, individual score calculations and memory imprint calculations
            true_fact_values = es_aggregator.facts(
                size=choices.DEFAULT_MAX_AGGREGATION_SIZE,
                filter_by_fact_name=true_fact)
            pred_fact_values = es_aggregator.facts(
                size=choices.DEFAULT_MAX_AGGREGATION_SIZE,
                filter_by_fact_name=pred_fact)

            true_set = set(true_fact_values)
            pred_set = set(pred_fact_values)

            classes = list(true_set.union(pred_set))
            n_total_classes = len(classes)

            # Add dummy classes for missing labels
            classes.extend(
                [choices.MISSING_TRUE_LABEL, choices.MISSING_PRED_LABEL])

            # Set the evaluation type in the model
            evaluator_object.evaluation_type = "multilabel"

            classes.sort(key=lambda x: x.lower())  # Case-insensitive sort over the full labels.

        # Get number of documents in the query to estimate memory imprint
        n_docs = searcher.count()
        evaluator_object.task.total = n_docs
        evaluator_object.task.save()

        logging.getLogger(INFO_LOGGER).info(
            f"Number of documents: {n_docs} | Number of classes: {len(classes)}"
        )

        # Get the memory buffer value from core variables
        core_memory_buffer_value_gb = get_core_setting(
            "TEXTA_EVALUATOR_MEMORY_BUFFER_GB")

        # Calculate the value based on given ratio if the core variable is empty
        memory_buffer_gb = calculate_memory_buffer(
            memory_buffer=core_memory_buffer_value_gb,
            ratio=EVALUATOR_MEMORY_BUFFER_RATIO,
            unit="gb")

        required_memory = get_memory_imprint(
            n_docs=n_docs,
            n_classes=len(classes),
            eval_type=evaluator_object.evaluation_type,
            unit="gb",
            int_size=64)
        enough_memory = is_enough_memory_available(
            required_memory=required_memory,
            memory_buffer=memory_buffer_gb,
            unit="gb")

        # Enable scoring after each scroll if there isn't enough memory
        # for calculating the scores for the whole set of documents at once.
        score_after_scroll = not enough_memory

        # If scoring after each scroll is enabled and scores are averaged after each scroll
        # the results for each averaging function besides `micro` are imprecise
        scores_imprecise = score_after_scroll and average != "micro"

        # Store document counts, labels' class counts and indicatior if scores are imprecise
        evaluator_object.document_count = n_docs
        evaluator_object.n_true_classes = len(true_set)
        evaluator_object.n_predicted_classes = len(pred_set)
        evaluator_object.n_total_classes = n_total_classes
        evaluator_object.scores_imprecise = scores_imprecise
        evaluator_object.score_after_scroll = score_after_scroll

        # Save model updates
        evaluator_object.save()

        logging.getLogger(INFO_LOGGER).info(
            f"Enough available memory: {enough_memory} | Score after scroll: {score_after_scroll}"
        )

        # Get number of batches for the logger
        n_batches = math.ceil(n_docs / scroll_size)

        # Scroll and score tags
        scores, bin_scores = scroll_and_score(
            generator=searcher,
            evaluator_object=evaluator_object,
            true_fact=true_fact,
            pred_fact=pred_fact,
            true_fact_value=true_fact_value,
            pred_fact_value=pred_fact_value,
            classes=classes,
            average=average,
            score_after_scroll=score_after_scroll,
            n_batches=n_batches,
            add_individual_results=add_individual_results)

        logging.getLogger(INFO_LOGGER).info(f"Final scores: {scores}")

        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()

        confusion = scores["confusion_matrix"]
        confusion = np.asarray(confusion, dtype="int64")

        if len(classes) <= choices.DEFAULT_MAX_CONFUSION_CLASSES:
            # Delete empty rows and columns corresponding to missing pred/true labels from the confusion matrix
            confusion, classes = delete_empty_rows_and_cols(confusion, classes)

        scores["confusion_matrix"] = confusion.tolist()

        # Generate confusion matrix plot and save it
        image_name = f"{secrets.token_hex(15)}.png"
        evaluator_object.plot.save(image_name,
                                   create_confusion_plot(
                                       scores["confusion_matrix"], classes),
                                   save=False)
        image_path = pathlib.Path(MEDIA_URL) / image_name
        evaluator_object.plot.name = str(image_path)

        # Add final scores to the model
        evaluator_object.precision = scores["precision"]
        evaluator_object.recall = scores["recall"]
        evaluator_object.f1_score = scores["f1_score"]
        evaluator_object.accuracy = scores["accuracy"]
        evaluator_object.confusion_matrix = json.dumps(
            scores["confusion_matrix"])

        evaluator_object.individual_results = json.dumps(
            remove_not_found(bin_scores), ensure_ascii=False)
        evaluator_object.add_misclassified_examples = False

        evaluator_object.save()
        evaluator_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        evaluator_object.task.add_error(error_message)
        evaluator_object.task.update_status(Task.STATUS_FAILED)