    def evaluate(self,
                 sentences: List[Sentence],
                 eval_class_metrics: bool = False,
                 mini_batch_size: int = 32,
                 embeddings_in_memory: bool = False,
                 metric_name: str = 'MICRO_AVG') -> Tuple[Metric, float]:
        """
        Evaluates the model with the given list of sentences.
        :param sentences: the list of sentences
        :param eval_class_metrics: whether to also track per-class metrics
        :param mini_batch_size: the mini batch size to use
        :param embeddings_in_memory: whether embeddings should be kept in memory between batches
        :param metric_name: the name of the metric to compute
        :return: the evaluation metric and the average loss
        """
        with torch.no_grad():
            eval_loss = 0

            batches = [
                sentences[x:x + mini_batch_size]
                for x in range(0, len(sentences), mini_batch_size)
            ]

            metric = Metric(metric_name)

            for batch in batches:
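                # score the mini-batch, decode its predicted labels, and accumulate the loss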
                scores = self.model.forward(batch)
                labels = self.model.obtain_labels(scores)
                loss = self.model.calculate_loss(scores, batch)

                clear_embeddings(
                    batch, also_clear_word_embeddings=not embeddings_in_memory)

                eval_loss += loss

                for predictions, true_values in zip(
                    [[label.value for label in sent_labels]
                     for sent_labels in labels],
                    [sentence.get_label_names() for sentence in batch]):
                    for prediction in predictions:
                        if prediction in true_values:
                            metric.tp()
                            if eval_class_metrics: metric.tp(prediction)
                        else:
                            metric.fp()
                            if eval_class_metrics: metric.fp(prediction)

                    for true_value in true_values:
                        if true_value not in predictions:
                            metric.fn()
                            if eval_class_metrics: metric.fn(true_value)
                        else:
                            metric.tn()
                            if eval_class_metrics: metric.tn(true_value)

            eval_loss /= len(sentences)

            return metric, eval_loss
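The per-label bookkeeping above boils down to set-membership counting over each sentence's predicted and gold label names. A minimal, self-contained sketch of that counting logic, using a plain collections.Counter in place of flair's Metric class (the tp/fp/fn counts are what feed micro-averaged precision, recall and F1):

from collections import Counter
from typing import List

def micro_counts(predicted: List[List[str]], gold: List[List[str]]) -> Counter:
    # Mirrors the loop above: a predicted label found among the sentence's
    # gold labels is a true positive, otherwise a false positive; a gold
    # label missing from the predictions is a false negative.
    counts = Counter()
    for preds, truths in zip(predicted, gold):
        for p in preds:
            counts['tp' if p in truths else 'fp'] += 1
        for t in truths:
            if t not in preds:
                counts['fn'] += 1
    return counts

c = micro_counts(predicted=[['POS'], ['POS', 'NEG']], gold=[['POS'], ['POS']])
precision = c['tp'] / (c['tp'] + c['fp'])   # 2 / 3
recall = c['tp'] / (c['tp'] + c['fn'])      # 2 / 2
print(c, 2 * precision * recall / (precision + recall))  # F1 = 0.8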
Example #2
    def evaluate(self,
                 evaluation: List[Sentence],
                 out_path=None,
                 evaluation_method: str = 'F1',
                 eval_batch_size: int = 32,
                 embeddings_in_memory: bool = True):

        batch_no: int = 0
        batches = [
            evaluation[x:x + eval_batch_size]
            for x in range(0, len(evaluation), eval_batch_size)
        ]

        metric = Metric('')

        lines: List[str] = []

        for batch in batches:
            batch_no += 1

            scores, tag_seq = self.model._predict_scores_batch(batch)
            predicted_ids = tag_seq
            all_tokens = []
            for sentence in batch:
                all_tokens.extend(sentence.tokens)

            for (token, score, predicted_id) in zip(all_tokens, scores,
                                                    predicted_ids):
                token: Token = token
                # get the predicted tag
                predicted_value = self.model.tag_dictionary.get_item_for_index(
                    predicted_id)
                token.add_tag('predicted', predicted_value, score)

            for sentence in batch:

                # add predicted tags
                for token in sentence.tokens:
                    predicted_tag: Label = token.get_tag('predicted')

                    # append both to file for evaluation
                    eval_line = '{} {} {}\n'.format(
                        token.text,
                        token.get_tag(self.model.tag_type).value,
                        predicted_tag.value)

                    lines.append(eval_line)
                lines.append('\n')

                # make list of gold tags
                gold_tags = [
                    str(tag) for tag in sentence.get_spans(self.model.tag_type)
                ]

                # make list of predicted tags
                predicted_tags = [
                    str(tag) for tag in sentence.get_spans('predicted')
                ]

                # check for true positives, false positives and false negatives
                for prediction in predicted_tags:
                    if prediction in gold_tags:
                        metric.tp()
                    else:
                        metric.fp()

                for gold in gold_tags:
                    if gold not in predicted_tags:
                        metric.fn()

            if not embeddings_in_memory:
                self.clear_embeddings_in_batch(batch)

        if out_path is not None:
            test_tsv = os.path.join(out_path, "test.tsv")
            with open(test_tsv, "w", encoding='utf-8') as outfile:
                outfile.write(''.join(lines))

        if evaluation_method == 'accuracy':
            score = metric.accuracy()
            return score, metric

        if evaluation_method == 'F1':
            score = metric.f_score()
            return score, metric
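Spans are compared as plain strings here, so a prediction only counts as correct when both its boundaries and its tag match the gold span exactly. A standalone sketch of that exact-match scoring (the span strings below are purely illustrative, not flair's actual str(Span) format):

from typing import List, Tuple

def exact_match_scores(gold_spans: List[str], predicted_spans: List[str]) -> Tuple[float, float, float]:
    # Same logic as the loop above: a predicted span is a true positive only
    # if the identical span string occurs in the gold list; unmatched
    # predictions are false positives, unmatched gold spans are false negatives.
    tp = sum(1 for span in predicted_spans if span in gold_spans)
    fp = len(predicted_spans) - tp
    fn = sum(1 for span in gold_spans if span not in predicted_spans)
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1

gold = ['PER [1,2]: "John Smith"', 'LOC [5]: "Berlin"']
pred = ['PER [1,2]: "John Smith"', 'ORG [5]: "Berlin"']
print(exact_match_scores(gold, pred))  # (0.5, 0.5, 0.5)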
Example #3
    def evaluate(self, evaluation: List[Sentence], out_path=None, evaluation_method: str = 'F1',
                 embeddings_in_memory: bool = True):

        batch_no: int = 0
        mini_batch_size = 32
        batches = [evaluation[x:x + mini_batch_size] for x in
                   range(0, len(evaluation), mini_batch_size)]

        metric = Metric('')

        lines: List[str] = []

        for batch in batches:
            batch_no += 1

            self.model.embeddings.embed(batch)

            for sentence in batch:

                sentence: Sentence = sentence

                # run the forward pass to get scores and the predicted tag sequence
                score, tag_seq = self.model.predict_scores(sentence)

                # compute per-token predictions and compare them against the gold tags
                predicted_id = tag_seq
                for (token, pred_id) in zip(sentence.tokens, predicted_id):
                    token: Token = token
                    # get the predicted tag
                    predicted_tag = self.model.tag_dictionary.get_item_for_index(pred_id)
                    token.add_tag('predicted', predicted_tag)

                    # get the gold tag
                    gold_tag = token.get_tag(self.model.tag_type)

                    # append both to file for evaluation
                    eval_line = token.text + ' ' + gold_tag + ' ' + predicted_tag + "\n"

                    # positives
                    if predicted_tag != '':
                        # true positives
                        if predicted_tag == gold_tag:
                            metric.tp()
                        # false positive
                        if predicted_tag != gold_tag:
                            metric.fp()

                    # negatives
                    if predicted_tag == '':
                        # true negative
                        if predicted_tag == gold_tag:
                            metric.tn()
                        # false negative
                        if predicted_tag != gold_tag:
                            metric.fn()

                    lines.append(eval_line)

                lines.append('\n')

            if not embeddings_in_memory:
                self.clear_embeddings_in_batch(batch)

        if out_path is not None:
            test_tsv = os.path.join(out_path, "test.tsv")
            with open(test_tsv, "w", encoding='utf-8') as outfile:
                outfile.write(''.join(lines))

        if evaluation_method == 'span-F1':

            # get the eval script
            eval_script = cached_path('https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/scripts/conll03_eval_script.pl', cache_dir='scripts')
            os.chmod(eval_script, 0o777)

            eval_data = ''.join(lines)

            p = run(eval_script, stdout=PIPE, input=eval_data, encoding='utf-8')
            main_result = p.stdout
            print(main_result)

            main_result = main_result.split('\n')[1]

            # parse the result file
            main_result = re.sub(';', ' ', main_result)
            main_result = re.sub('precision', 'p', main_result)
            main_result = re.sub('recall', 'r', main_result)
            main_result = re.sub('accuracy', 'acc', main_result)

            f_score = float(re.findall(r'\d+\.\d+$', main_result)[0])
            return f_score, metric._fp, main_result

        if evaluation_method == 'accuracy':
            score = metric.accuracy()
            return score, metric._fp, str(score)

        if evaluation_method == 'F1':
            score = metric.f_score()
            return score, metric._fp, str(metric)