Example #1
    def evaluate(self,
                 sentences: List[Sentence],
                 eval_class_metrics: bool = False,
                 mini_batch_size: int = 32,
                 embeddings_in_memory: bool = False,
                 metric_name: str = 'MICRO_AVG') -> (Metric, float):
        """
        Evaluates the model with the given list of sentences.
        :param sentences: the list of sentences
        :param eval_class_metrics: boolean indicating whether to additionally track per-class counts
        :param mini_batch_size: the mini batch size to use
        :param embeddings_in_memory: boolean indicating whether embeddings should be kept in memory or cleared after each batch
        :param metric_name: the name of the Metric to compute
        :return: the populated Metric and the average evaluation loss
        """
        with torch.no_grad():
            eval_loss = 0

            batches = [
                sentences[x:x + mini_batch_size]
                for x in range(0, len(sentences), mini_batch_size)
            ]

            metric = Metric(metric_name)

            for batch in batches:
                scores = self.model.forward(batch)
                labels = self.model.obtain_labels(scores)
                loss = self.model.calculate_loss(scores, batch)

                clear_embeddings(
                    batch, also_clear_word_embeddings=not embeddings_in_memory)

                eval_loss += loss

                for predictions, true_values in zip(
                    [[label.value for label in sent_labels]
                     for sent_labels in labels],
                    [sentence.get_label_names() for sentence in batch]):
                    for prediction in predictions:
                        if prediction in true_values:
                            metric.tp()
                            if eval_class_metrics: metric.tp(prediction)
                        else:
                            metric.fp()
                            if eval_class_metrics: metric.fp(prediction)

                    for true_value in true_values:
                        if true_value not in predictions:
                            metric.fn()
                            if eval_class_metrics: metric.fn(true_value)
                        else:
                            metric.tn()
                            if eval_class_metrics: metric.tn(true_value)

            eval_loss /= len(sentences)

            return metric, eval_loss
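
The counting logic in this example can be checked in isolation. The sketch below is a standalone illustration with made-up label lists; it does not use Flair's Metric class, only plain Python counts.

# Standalone sketch of the per-sentence counting above: predicted labels found among
# the gold labels count as true positives, the rest as false positives, and gold
# labels that were never predicted as false negatives (illustrative data only).
predictions = ['sports', 'politics']
true_values = ['sports', 'weather']

tp = sum(1 for p in predictions if p in true_values)      # 1 ('sports')
fp = sum(1 for p in predictions if p not in true_values)  # 1 ('politics')
fn = sum(1 for t in true_values if t not in predictions)  # 1 ('weather')
print(tp, fp, fn)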
Example #2
    def evaluate(self, evaluation: List[Sentence], out_path=None, evaluation_method: str = 'F1',
                 embeddings_in_memory: bool = True):

        batch_no: int = 0
        mini_batch_size = 32
        batches = [evaluation[x:x + mini_batch_size] for x in
                   range(0, len(evaluation), mini_batch_size)]

        metric = Metric('')

        lines: List[str] = []

        for batch in batches:
            batch_no += 1

            self.model.embeddings.embed(batch)

            for sentence in batch:

                sentence: Sentence = sentence

                # run the forward pass to obtain the score and the predicted tag sequence
                score, tag_seq = self.model.predict_scores(sentence)

                # map each predicted tag index back to its tag string
                predicted_id = tag_seq
                for (token, pred_id) in zip(sentence.tokens, predicted_id):
                    token: Token = token
                    # get the predicted tag
                    predicted_tag = self.model.tag_dictionary.get_item_for_index(pred_id)
                    token.add_tag('predicted', predicted_tag)

                    # get the gold tag
                    gold_tag = token.get_tag(self.model.tag_type)

                    # append both to file for evaluation
                    eval_line = token.text + ' ' + gold_tag + ' ' + predicted_tag + "\n"

                    # positives: the model predicted a tag for this token
                    if predicted_tag != '':
                        # true positive if it matches the gold tag, otherwise false positive
                        if predicted_tag == gold_tag:
                            metric.tp()
                        else:
                            metric.fp()
                    # negatives: the model predicted no tag
                    else:
                        # true negative if the gold tag is also empty, otherwise false negative
                        if predicted_tag == gold_tag:
                            metric.tn()
                        else:
                            metric.fn()

                    lines.append(eval_line)

                lines.append('\n')

            if not embeddings_in_memory:
                self.clear_embeddings_in_batch(batch)

        if out_path is not None:
            test_tsv = os.path.join(out_path, "test.tsv")
            with open(test_tsv, "w", encoding='utf-8') as outfile:
                outfile.write(''.join(lines))

        if evaluation_method == 'span-F1':

            # get the eval script
            eval_script = cached_path('https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/scripts/conll03_eval_script.pl', cache_dir='scripts')
            os.chmod(eval_script, 0o777)

            eval_data = ''.join(lines)

            p = run(eval_script, stdout=PIPE, input=eval_data, encoding='utf-8')
            main_result = p.stdout
            print(main_result)

            main_result = main_result.split('\n')[1]

            # parse the result file
            main_result = re.sub(';', ' ', main_result)
            main_result = re.sub('precision', 'p', main_result)
            main_result = re.sub('recall', 'r', main_result)
            main_result = re.sub('accuracy', 'acc', main_result)

            f_score = float(re.findall(r'\d+\.\d+$', main_result)[0])
            return f_score, metric._fp, main_result

        if evaluation_method == 'accuracy':
            score = metric.accuracy()
            return score, metric._fp, str(score)

        if evaluation_method == 'F1':
            score = metric.f_score()
            return score, metric._fp, str(metric)
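
For reference, the per-token decision above can be reproduced without Flair. The sketch below uses made-up (gold, predicted) string pairs and a plain dict in place of the Metric object; an empty predicted tag is treated as a negative, exactly as in the loop above.

# Standalone sketch of the per-token counting above (illustrative data only):
# a non-empty predicted tag is a positive (tp if it matches the gold tag, else fp),
# an empty predicted tag is a negative (tn if the gold tag is also empty, else fn).
counts = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}
for gold_tag, predicted_tag in [('PER', 'PER'), ('LOC', 'ORG'), ('', ''), ('MISC', '')]:
    if predicted_tag != '':
        counts['tp' if predicted_tag == gold_tag else 'fp'] += 1
    else:
        counts['tn' if predicted_tag == gold_tag else 'fn'] += 1
print(counts)  # {'tp': 1, 'fp': 1, 'tn': 1, 'fn': 1}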
Example #3
    def evaluate(self,
                 evaluation: List[Sentence],
                 out_path=None,
                 evaluation_method: str = 'F1',
                 eval_batch_size: int = 32,
                 embeddings_in_memory: bool = True):

        with torch.no_grad():
            batch_no: int = 0
            batches = [
                evaluation[x:x + eval_batch_size]
                for x in range(0, len(evaluation), eval_batch_size)
            ]

            metric = Metric('')

            lines: List[str] = []

            for batch in batches:
                batch_no += 1

                scores, tag_seq = self.model._predict_scores_batch(batch)
                predicted_ids = tag_seq
                all_tokens = []
                for sentence in batch:
                    all_tokens.extend(sentence.tokens)

                for (token, score,
                     predicted_id) in zip(all_tokens, scores, predicted_ids):
                    token: Token = token
                    # get the predicted tag
                    predicted_value = self.model.tag_dictionary.get_item_for_index(
                        predicted_id)
                    token.add_tag('predicted', predicted_value, score)

                for sentence in batch:

                    # add predicted tags
                    for token in sentence.tokens:
                        predicted_tag: Label = token.get_tag('predicted')

                        # append both to file for evaluation
                        eval_line = '{} {} {}\n'.format(
                            token.text,
                            token.get_tag(self.model.tag_type).value,
                            predicted_tag.value)

                        lines.append(eval_line)
                    lines.append('\n')

                    # make list of gold tags
                    gold_tags = [
                        (tag.tag, str(tag))
                        for tag in sentence.get_spans(self.model.tag_type)
                    ]

                    # make list of predicted tags
                    predicted_tags = [
                        (tag.tag, str(tag))
                        for tag in sentence.get_spans('predicted')
                    ]

                    # check for true positives, false positives and false negatives
                    for tag, prediction in predicted_tags:
                        if (tag, prediction) in gold_tags:
                            metric.tp()
                            metric.tp(tag)
                        else:
                            metric.fp()
                            metric.fp(tag)

                    for tag, gold in gold_tags:
                        if (tag, gold) not in predicted_tags:
                            metric.fn()
                            metric.fn(tag)
                        else:
                            metric.tn()
                            metric.tn(tag)

                if not embeddings_in_memory:
                    self.clear_embeddings_in_batch(batch)

            if out_path is not None:
                test_tsv = os.path.join(out_path, "test.tsv")
                with open(test_tsv, "w", encoding='utf-8') as outfile:
                    outfile.write(''.join(lines))

            if evaluation_method == 'accuracy':
                score = metric.accuracy()
                return score, metric

            if evaluation_method == 'F1':
                score = metric.f_score()
                return score, metric
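
The span-level comparison in this example boils down to membership tests on (tag, span) pairs. The sketch below illustrates that logic with hand-written tuples standing in for Flair's Span objects; the data is made up.

# Standalone sketch of the span-level counting above (illustrative data only):
# a predicted span found among the gold spans is a true positive, otherwise a false
# positive; a gold span that was never predicted is a false negative.
gold_tags = [('PER', 'PER-span [1,2]'), ('LOC', 'LOC-span [5]')]
predicted_tags = [('PER', 'PER-span [1,2]'), ('ORG', 'ORG-span [7]')]

tp = sum(1 for span in predicted_tags if span in gold_tags)      # 1
fp = sum(1 for span in predicted_tags if span not in gold_tags)  # 1
fn = sum(1 for span in gold_tags if span not in predicted_tags)  # 1
print(tp, fp, fn)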