Example #1
def test_metric_with_classes():
    metric = Metric("Test")

    metric.add_tp("class-1")
    metric.add_tn("class-1")
    metric.add_tn("class-1")
    metric.add_fp("class-1")

    metric.add_tp("class-2")
    metric.add_tn("class-2")
    metric.add_tn("class-2")
    metric.add_fp("class-2")

    for i in range(0, 10):
        metric.add_tp("class-3")
    for i in range(0, 90):
        metric.add_fp("class-3")

    metric.add_tp("class-4")
    metric.add_tn("class-4")
    metric.add_tn("class-4")
    metric.add_fp("class-4")

    print(metric)

    assert metric.precision("class-1") == 0.5
    assert metric.precision("class-2") == 0.5
    assert metric.precision("class-3") == 0.1
    assert metric.precision("class-4") == 0.5

    assert metric.recall("class-1") == 1
    assert metric.recall("class-2") == 1
    assert metric.recall("class-3") == 1
    assert metric.recall("class-4") == 1

    assert metric.accuracy() == metric.micro_avg_accuracy()
    assert metric.f_score() == metric.micro_avg_f_score()

    assert metric.f_score("class-1") == 0.6666666666666666
    assert metric.f_score("class-2") == 0.6666666666666666
    assert metric.f_score("class-3") == 0.18181818181818182
    assert metric.f_score("class-4") == 0.6666666666666666

    assert metric.accuracy("class-1") == 0.75
    assert metric.accuracy("class-2") == 0.75
    assert metric.accuracy("class-3") == 0.1
    assert metric.accuracy("class-4") == 0.75

    assert metric.micro_avg_f_score() == 0.21848739495798317
    assert metric.macro_avg_f_score() == 0.5454545454545454

    assert metric.micro_avg_accuracy() == 0.16964285714285715
    assert metric.macro_avg_accuracy() == 0.5875

    assert metric.precision() == 0.12264150943396226
    assert metric.recall() == 1
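
The asserted values follow directly from the standard confusion-matrix formulas. As a sanity check, here is a short standalone sketch (not part of the example; the Metric class used above is presumably flair's flair.training_utils.Metric) that recomputes the class-3 and the micro-averaged numbers by hand:

# Recomputing Example #1's assertions from the raw counts (illustration only).
# Counts accumulated above: class-1/2/4 each have tp=1, fp=1, tn=2, fn=0;
# class-3 has tp=10, fp=90, tn=0, fn=0.
tp, fp, fn, tn = 10, 90, 0, 0                           # class-3
precision = tp / (tp + fp)                              # 10 / 100 = 0.1
recall = tp / (tp + fn)                                 # 10 / 10  = 1.0
f1 = 2 * precision * recall / (precision + recall)      # ~0.18181818
accuracy = (tp + tn) / (tp + fp + fn + tn)              # 10 / 100 = 0.1

# Micro averages pool the counts of all four classes before applying the same formulas:
TP, FP, FN, TN = 13, 93, 0, 6
micro_precision = TP / (TP + FP)                        # ~0.12264151
micro_f1 = 2 * micro_precision / (micro_precision + 1)  # ~0.21848739 (micro recall is 1)
micro_accuracy = (TP + TN) / (TP + FP + FN + TN)        # 19 / 112 ~ 0.16964286
# Macro averages are the unweighted means of the per-class scores instead.

The slightly different values asserted in Examples #2 and #3 below appear to come from other versions of the Metric class, which round to four decimals and, in Example #2's case, define accuracy as tp / (tp + fp + fn) rather than including true negatives.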
Example #2
File: test_utils.py  Project: bluesea0/ditk
def test_metric_with_classes():
    metric = Metric("Test")

    metric.add_tp("class-1")
    metric.add_tn("class-1")
    metric.add_tn("class-1")
    metric.add_fp("class-1")

    metric.add_tp("class-2")
    metric.add_tn("class-2")
    metric.add_tn("class-2")
    metric.add_fp("class-2")

    for i in range(0, 10):
        metric.add_tp("class-3")
    for i in range(0, 90):
        metric.add_fp("class-3")

    metric.add_tp("class-4")
    metric.add_tn("class-4")
    metric.add_tn("class-4")
    metric.add_fp("class-4")

    assert metric.precision("class-1") == 0.5
    assert metric.precision("class-2") == 0.5
    assert metric.precision("class-3") == 0.1
    assert metric.precision("class-4") == 0.5

    assert metric.recall("class-1") == 1
    assert metric.recall("class-2") == 1
    assert metric.recall("class-3") == 1
    assert metric.recall("class-4") == 1

    assert metric.accuracy() == metric.micro_avg_accuracy()
    assert metric.f_score() == metric.micro_avg_f_score()

    assert metric.f_score("class-1") == 0.6667
    assert metric.f_score("class-2") == 0.6667
    assert metric.f_score("class-3") == 0.1818
    assert metric.f_score("class-4") == 0.6667

    assert metric.accuracy("class-1") == 0.5
    assert metric.accuracy("class-2") == 0.5
    assert metric.accuracy("class-3") == 0.1
    assert metric.accuracy("class-4") == 0.5

    assert metric.micro_avg_f_score() == 0.2184
    assert metric.macro_avg_f_score() == 0.5454749999999999

    assert metric.micro_avg_accuracy() == 0.1226
    assert metric.macro_avg_accuracy() == 0.4

    assert metric.precision() == 0.1226
    assert metric.recall() == 1
Example #3
def test_metric_with_classes():
    metric = Metric('Test')

    metric.add_tp('class-1')
    metric.add_tn('class-1')
    metric.add_tn('class-1')
    metric.add_fp('class-1')

    metric.add_tp('class-2')
    metric.add_tn('class-2')
    metric.add_tn('class-2')
    metric.add_fp('class-2')

    for i in range(0, 10):
        metric.add_tp('class-3')
    for i in range(0, 90):
        metric.add_fp('class-3')

    metric.add_tp('class-4')
    metric.add_tn('class-4')
    metric.add_tn('class-4')
    metric.add_fp('class-4')

    assert(metric.precision('class-1') == 0.5)
    assert(metric.precision('class-2') == 0.5)
    assert(metric.precision('class-3') == 0.1)
    assert(metric.precision('class-4') == 0.5)

    assert(metric.recall('class-1') == 1)
    assert(metric.recall('class-2') == 1)
    assert(metric.recall('class-3') == 1)
    assert(metric.recall('class-4') == 1)

    assert(metric.accuracy() == metric.micro_avg_accuracy())
    assert(metric.f_score() == metric.micro_avg_f_score())

    assert(metric.f_score('class-1') == 0.6667)
    assert(metric.f_score('class-2') == 0.6667)
    assert(metric.f_score('class-3') == 0.1818)
    assert(metric.f_score('class-4') == 0.6667)

    assert(metric.accuracy('class-1') == 0.75)
    assert(metric.accuracy('class-2') == 0.75)
    assert(metric.accuracy('class-3') == 0.1)
    assert(metric.accuracy('class-4') == 0.75)

    assert(metric.micro_avg_f_score() == 0.2184)
    assert(metric.macro_avg_f_score() == 0.4)

    assert(metric.micro_avg_accuracy() == 0.1696)
    assert(metric.macro_avg_accuracy() == 0.5875)

    assert(metric.precision() == 0.1226)
    assert(metric.recall() == 1)
Example #4
    def evaluate(
        self,
        data_loader: DataLoader,
        out_path: Path = None,
        embeddings_storage_mode: str = "none",
        eval_mode: EvalMode = EvalMode.Standard,
        misspell_mode: MisspellingMode = MisspellingMode.Random,
        misspelling_rate: float = 0.0,
        char_vocab: set = {},
        lut: dict = {},
        cmx: np.array = None,
        typos: dict = {},
        correction_mode: CorrectionMode = CorrectionMode.NotSpecified,
        eval_dict_name=None,
        evaluation_metric: EvaluationMetric = EvaluationMetric.MICRO_F1_SCORE,
    ) -> (Result, float):

        if type(out_path) == str:
            out_path = Path(out_path)

        from robust_ner.spellcheck import load_correction_dict, get_lang_from_corpus_name

        if correction_mode == CorrectionMode.NotSpecified:
            eval_dict = None
        else:
            eval_dict = load_correction_dict(eval_dict_name, log)
            # note: use 'save_correction_dict' to re-generate a dictionary

        lang = get_lang_from_corpus_name(eval_dict_name)

        eval_params = {}
        eval_params["eval_mode"] = eval_mode
        eval_params["misspelling_rate"] = misspelling_rate
        eval_params["misspell_mode"] = misspell_mode
        eval_params["char_vocab"] = char_vocab
        eval_params["lut"] = lut
        eval_params["cmx"] = cmx
        eval_params["typos"] = typos
        eval_params["correction_mode"] = correction_mode
        eval_params["lang"] = lang
        eval_params["dictionary"] = eval_dict

        with torch.no_grad():
            eval_loss = 0

            batch_no: int = 0

            metric = Metric("Evaluation")

            lines: List[str] = []

            if self.use_crf:
                transitions = self.transitions.detach().cpu().numpy()
            else:
                transitions = None

            for batch in data_loader:
                batch_no += 1

                with torch.no_grad():
                    features = self.forward(batch, eval_params)
                    loss = self._calculate_loss(features, batch)
                    tags, _ = self._obtain_labels(
                        feature=features,
                        batch_sentences=batch,
                        transitions=transitions,
                        get_all_tags=False,
                    )

                eval_loss += loss

                for (sentence, sent_tags) in zip(batch, tags):
                    for (token, tag) in zip(sentence.tokens, sent_tags):
                        token: Token = token
                        token.add_tag("predicted", tag.value, tag.score)

                        # append both to file for evaluation
                        eval_line = "{} {} {} {}\n".format(
                            token.text,
                            token.get_tag(self.tag_type).value,
                            tag.value,
                            tag.score,
                        )
                        lines.append(eval_line)
                    lines.append("\n")

                for sentence in batch:
                    # make list of gold tags
                    gold_tags = [(tag.tag, tag.text)
                                 for tag in sentence.get_spans(self.tag_type)]
                    # make list of predicted tags
                    predicted_tags = [
                        (tag.tag, tag.text)
                        for tag in sentence.get_spans("predicted")
                    ]

                    # check for true positives, false positives and false negatives
                    for tag, prediction in predicted_tags:
                        if (tag, prediction) in gold_tags:
                            metric.add_tp(tag)
                        else:
                            metric.add_fp(tag)

                    for tag, gold in gold_tags:
                        if (tag, gold) not in predicted_tags:
                            metric.add_fn(tag)
                        else:
                            metric.add_tn(tag)

                store_embeddings(batch, embeddings_storage_mode)

            eval_loss /= batch_no

            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            detailed_result = (
                f"\nMICRO_AVG: acc {metric.micro_avg_accuracy():.4f} - f1-score {metric.micro_avg_f_score():.4f}"
                f"\nMACRO_AVG: acc {metric.macro_avg_accuracy():.4f} - f1-score {metric.macro_avg_f_score():.4f}"
            )
            for class_name in metric.get_classes():
                detailed_result += (
                    f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                    f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                    f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                    f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                    f"{metric.f_score(class_name):.4f}")

            if evaluation_metric == EvaluationMetric.MICRO_F1_SCORE:
                main_score = metric.micro_avg_f_score()
            elif evaluation_metric == EvaluationMetric.MACRO_F1_SCORE:
                main_score = metric.macro_avg_f_score()
            elif evaluation_metric == EvaluationMetric.MICRO_ACCURACY:
                main_score = metric.micro_avg_accuracy()
            elif evaluation_metric == EvaluationMetric.MACRO_ACCURACY:
                main_score = metric.macro_avg_accuracy()
            elif evaluation_metric == EvaluationMetric.MEAN_SQUARED_ERROR:
                main_score = metric.mean_squared_error()
            else:
                log.error(f"unknown evaluation metric: {evaluation_metric}")

            result = Result(
                main_score=main_score,
                log_line=
                f"{metric.precision():.4f}\t{metric.recall():.4f}\t{main_score:.4f}",
                log_header="PRECISION\tRECALL\tF1",
                detailed_results=detailed_result,
            )

            return result, eval_loss
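
For orientation, a call to an evaluate method with this signature might look roughly as follows. The tagger and corpus objects, the batch size, and the flair-style DataLoader import are assumptions for illustration only; they are not taken from the example itself.

from pathlib import Path
from flair.datasets import DataLoader  # assumed import; the example only annotates the parameter type

# `tagger` is assumed to be an instance of the sequence tagger class that defines the
# evaluate() method above, and `corpus.test` a dataset of flair Sentences.
result, eval_loss = tagger.evaluate(
    DataLoader(corpus.test, batch_size=32),
    out_path=Path("predictions.txt"),
    embeddings_storage_mode="none",
)
print(result.log_header)        # PRECISION / RECALL / F1 (tab-separated)
print(result.log_line)          # the corresponding scores
print(result.detailed_results)  # per-class tp/fp/fn/tn breakdown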
Example #5
 def evaluate(self,
              data_loader: DataLoader,
              out_path: Path = None,
              embeddings_storage_mode: str = 'cpu') -> (Result, float):
     with torch.no_grad():
         eval_loss = 0
         metric = Metric('Evaluation')
         lines = []
         batch_count = 0
         for batch in data_loader:
             batch_count += 1
             (labels, loss) = self.forward_labels_and_loss(batch)
             eval_loss += loss
             sentences_for_batch = [
                 sent.to_plain_string() for sent in batch
             ]
             confidences_for_batch = [[
                 label.score for label in sent_labels
             ] for sent_labels in labels]
             predictions_for_batch = [[
                 label.value for label in sent_labels
             ] for sent_labels in labels]
             true_values_for_batch = [
                 sentence.get_label_names() for sentence in batch
             ]
             available_labels = self.label_dictionary.get_items()
             for (sentence, confidence, prediction, true_value) in zip(
                     sentences_for_batch, confidences_for_batch,
                     predictions_for_batch, true_values_for_batch):
                 eval_line = '{}\t{}\t{}\t{}\n'.format(
                     sentence, true_value, prediction, confidence)
                 lines.append(eval_line)
             for (predictions_for_sentence,
                  true_values_for_sentence) in zip(predictions_for_batch,
                                                   true_values_for_batch):
                 for label in available_labels:
                     if ((label in predictions_for_sentence)
                             and (label in true_values_for_sentence)):
                         metric.add_tp(label)
                     elif ((label in predictions_for_sentence)
                           and (label not in true_values_for_sentence)):
                         metric.add_fp(label)
                     elif ((label not in predictions_for_sentence)
                           and (label in true_values_for_sentence)):
                         metric.add_fn(label)
                     elif ((label not in predictions_for_sentence)
                           and (label not in true_values_for_sentence)):
                         metric.add_tn(label)
             store_embeddings(batch, embeddings_storage_mode)
         eval_loss /= batch_count
         detailed_result = ''.join([
             '\nMICRO_AVG: acc ', '{}'.format(metric.micro_avg_accuracy()),
             ' - f1-score ', '{}'.format(metric.micro_avg_f_score()),
             '\nMACRO_AVG: acc ', '{}'.format(metric.macro_avg_accuracy()),
             ' - f1-score ', '{}'.format(metric.macro_avg_f_score())
         ])
         for class_name in metric.get_classes():
             detailed_result += ''.join([
                 '\n', '{:<10}'.format(class_name), ' tp: ',
                 '{}'.format(metric.get_tp(class_name)), ' - fp: ',
                 '{}'.format(metric.get_fp(class_name)), ' - fn: ',
                 '{}'.format(metric.get_fn(class_name)), ' - tn: ',
                 '{}'.format(metric.get_tn(class_name)), ' - precision: ',
                 '{:.4f}'.format(metric.precision(class_name)),
                 ' - recall: ', '{:.4f}'.format(metric.recall(class_name)),
                 ' - accuracy: ', '{:.4f}'.format(
                     metric.accuracy(class_name)), ' - f1-score: ',
                 '{:.4f}'.format(metric.f_score(class_name))
             ])
         result = Result(main_score=metric.micro_avg_f_score(),
                         log_line=''.join([
                             '{}'.format(metric.precision()), '\t',
                             '{}'.format(metric.recall()), '\t',
                             '{}'.format(metric.micro_avg_f_score())
                         ]),
                         log_header='PRECISION\tRECALL\tF1',
                         detailed_results=detailed_result)
         if (out_path is not None):
             with open(out_path, 'w', encoding='utf-8') as outfile:
                 outfile.write(''.join(lines))
         return (result, eval_loss)
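
The label bookkeeping in this example is a per-label confusion count over the full label set. A small self-contained sketch of the same logic (with invented label names, not taken from the example) is shown below:

# Toy illustration of the per-label counting loop in Example #5 (label names invented).
available_labels = ["sports", "politics", "tech"]
predictions_for_sentence = ["sports", "tech"]
true_values_for_sentence = ["sports", "politics"]

counts = {label: {"tp": 0, "fp": 0, "fn": 0, "tn": 0} for label in available_labels}
for label in available_labels:
    predicted = label in predictions_for_sentence
    gold = label in true_values_for_sentence
    if predicted and gold:
        counts[label]["tp"] += 1   # correctly assigned label
    elif predicted and not gold:
        counts[label]["fp"] += 1   # spurious label
    elif not predicted and gold:
        counts[label]["fn"] += 1   # missed label
    else:
        counts[label]["tn"] += 1   # correctly not assigned

print(counts)  # sports -> tp, politics -> fn, tech -> fp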
Example #6
 def evaluate(self,
              data_loader: DataLoader,
              out_path: Path = None,
              embeddings_storage_mode: str = 'cpu') -> (Result, float):
     with torch.no_grad():
         eval_loss = 0
         batch_no = 0
         metric = Metric('Evaluation')
         lines = []
         for batch in data_loader:
             batch_no += 1
             with torch.no_grad():
                 features = self.forward(batch)
                 loss = self._calculate_loss(features, batch)
                 (tags, _) = self._obtain_labels(features, batch)
             eval_loss += loss
             for (sentence, sent_tags) in zip(batch, tags):
                 for (token, tag) in zip(sentence.tokens, sent_tags):
                     token = token
                     token.add_tag_label('predicted', tag)
                     eval_line = '{} {} {} {}\n'.format(
                         token.text,
                         token.get_tag(self.tag_type).value, tag.value,
                         tag.score)
                     lines.append(eval_line)
                 lines.append('\n')
             for sentence in batch:
                 gold_tags = [(tag.tag, str(tag))
                              for tag in sentence.get_spans(self.tag_type)]
                 predicted_tags = [
                     (tag.tag, str(tag))
                     for tag in sentence.get_spans('predicted')
                 ]
                 for (tag, prediction) in predicted_tags:
                     if ((tag, prediction) in gold_tags):
                         metric.add_tp(tag)
                     else:
                         metric.add_fp(tag)
                 for (tag, gold) in gold_tags:
                     if ((tag, gold) not in predicted_tags):
                         metric.add_fn(tag)
                     else:
                         metric.add_tn(tag)
             store_embeddings(batch, embeddings_storage_mode)
         eval_loss /= batch_no
         if (out_path is not None):
             with open(out_path, 'w', encoding='utf-8') as outfile:
                 outfile.write(''.join(lines))
         detailed_result = ''.join([
             '\nMICRO_AVG: acc ', '{}'.format(metric.micro_avg_accuracy()),
             ' - f1-score ', '{}'.format(metric.micro_avg_f_score()),
             '\nMACRO_AVG: acc ', '{}'.format(metric.macro_avg_accuracy()),
             ' - f1-score ', '{}'.format(metric.macro_avg_f_score())
         ])
         for class_name in metric.get_classes():
             detailed_result += ''.join([
                 '\n', '{:<10}'.format(class_name), ' tp: ',
                 '{}'.format(metric.get_tp(class_name)), ' - fp: ',
                 '{}'.format(metric.get_fp(class_name)), ' - fn: ',
                 '{}'.format(metric.get_fn(class_name)), ' - tn: ',
                 '{}'.format(metric.get_tn(class_name)), ' - precision: ',
                 '{:.4f}'.format(metric.precision(class_name)),
                 ' - recall: ', '{:.4f}'.format(metric.recall(class_name)),
                 ' - accuracy: ', '{:.4f}'.format(
                     metric.accuracy(class_name)), ' - f1-score: ',
                 '{:.4f}'.format(metric.f_score(class_name))
             ])
         result = Result(main_score=metric.micro_avg_f_score(),
                         log_line=''.join([
                             '{}'.format(metric.precision()), '\t',
                             '{}'.format(metric.recall()), '\t',
                             '{}'.format(metric.micro_avg_f_score())
                         ]),
                         log_header='PRECISION\tRECALL\tF1',
                         detailed_results=detailed_result)
         return (result, eval_loss)
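
The gold/predicted comparison in Examples #4 and #6 operates on (tag, span) pairs. The following standalone sketch (with invented spans, not taken from the examples) reproduces that counting logic, including the somewhat unusual add_tn call for gold spans that were predicted correctly:

# Toy illustration of the span-level counting in Examples #4 and #6 (spans invented).
gold_tags = [("PER", "John Smith"), ("LOC", "Berlin")]
predicted_tags = [("PER", "John Smith"), ("ORG", "Berlin")]

counts = {}  # tag -> {"tp": .., "fp": .., "fn": .., "tn": ..}
def bucket(tag):
    return counts.setdefault(tag, {"tp": 0, "fp": 0, "fn": 0, "tn": 0})

for tag, span in predicted_tags:
    if (tag, span) in gold_tags:
        bucket(tag)["tp"] += 1  # predicted span matches a gold span exactly
    else:
        bucket(tag)["fp"] += 1  # spurious prediction
for tag, span in gold_tags:
    if (tag, span) not in predicted_tags:
        bucket(tag)["fn"] += 1  # missed gold span
    else:
        bucket(tag)["tn"] += 1  # matched gold span (these examples count it as a tn)

print(counts)  # PER: 1 tp + 1 tn, ORG: 1 fp, LOC: 1 fn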