示例#1
0
    def test_epoch_end(self, outputs):
        """Epoch-end test hook: log average loss and nervaluate F1 metrics.

        Decodes the buffered token-id sequences back to BIO tags, scores them
        with nervaluate's Evaluator, logs the aggregate F1 values, prints a
        per-class strict-F1 summary, and clears the accumulation buffers.
        ``outputs`` is unused; state comes from the ``self.test_*`` buffers.
        """
        avg_loss = sum(self.test_loss) / len(self.test_loss)

        id2tag = self.bio2_id2tag
        gold, pred = [], []
        for gold_ids, pred_ids in zip(self.test_y, self.test_y_hat):
            gold.append([id2tag[i] for i in gold_ids])
            pred.append([id2tag[i] for i in pred_ids])

        results, results_by_tag = Evaluator(
            gold, pred, tags=self.tag_list, loader="list"
        ).evaluate()

        self.log("test/avg_loss", avg_loss, prog_bar=True)
        for measure in ("ent_type", "partial", "strict", "exact"):
            self.log(f"test/{measure}", results[measure]["f1"])

        import pprint
        print("_" * 120)
        print("\n\n Test results: \n")
        pprint.pprint(results["strict"])
        print("\n Per class Strict-F1 values:")
        for cls in self.tag_list:
            print(f'\t {cls} : \t{results_by_tag[cls]["strict"]["f1"]:.3f}')

        # Reset the per-epoch accumulators for the next run.
        self.test_y_hat = []
        self.test_y = []
        self.test_loss = []
def test_evaluator_list_non_matching_corpus_length():
    """Evaluating corpora with different document counts must raise ValueError."""
    gold_docs = [['O', 'B-ORG', 'I-ORG', 'O', 'O'], ['O', 'O', 'O', 'O']]
    pred_docs = [['O', 'B-MISC', 'I-MISC', 'O']]

    with pytest.raises(ValueError):
        Evaluator(gold_docs, pred_docs, tags=['PER', 'MISC'], loader="list").evaluate()
示例#3
0
    def score(self, y_true, y_pred, tags):
        """
        Evaluate the model's performance on the data
        for entity types in the 'tags' list, e.g. ['PERSON', 'LOC'].

        Returns a dict mapping each tag to its partial-match F1, plus an
        "Overall" key with the aggregate partial-match F1.
        """
        results, results_by_tag = Evaluator(y_true, y_pred, tags=tags).evaluate()

        scores = {tag: results_by_tag[tag]["partial"]["f1"] for tag in tags}
        scores["Overall"] = results["partial"]["f1"]
        return scores
示例#4
0
def ner_classification_report(y_true, y_pred, groups, tags):
    """
    Evaluate the model's performance for each grouping of data
    for the NER labels given in 'tags'.

    Input:
        y_true: a list of gold entities (one item per document)
        y_pred: a list of predicted entities (one item per document)
        groups: the group each of the pred or gold entities belongs to
        tags: NER labels to evaluate, e.g. ['PER', 'LOC']

    Output:
        report: evaluation metrics for each group
                in a nice format for printing
    """
    # Fixed column headers, hoisted out of the loop: the original defined them
    # from the last loop iteration's dict and raised NameError on empty input.
    headers = ['precision (partial)', 'recall (partial)', 'f1-score', 'support']

    unique_groups = sorted(set(groups))
    rows = []

    for group in unique_groups:
        pred_doc_entities = [
            y_pred[i] for i, g in enumerate(groups) if g == group
        ]
        true_doc_entities = [
            y_true[i] for i, g in enumerate(groups) if g == group
        ]

        evaluator = Evaluator(true_doc_entities, pred_doc_entities, tags=tags)
        results, _ = evaluator.evaluate()

        # NOTE(review): 'support' counts documents in the group, not entities —
        # confirm that is the intended meaning.
        rows.append([
            group,
            results['partial']['precision'],
            results['partial']['recall'],
            results['partial']['f1'],
            len(pred_doc_entities),
        ])

    # default=0 keeps the header row well-formed when 'groups' is empty
    # (the original max() raised ValueError on an empty sequence).
    width = max((len(name) for name in unique_groups), default=0)
    head_fmt = '{:>{width}s} ' + ' {:>17}' * len(headers)
    report = head_fmt.format('', *headers, width=width)
    report += '\n\n'
    # Three float columns (precision, recall, f1) followed by the int support.
    row_fmt = '{:>{width}s} ' + ' {:>17.{digits}f}' * 3 + ' {:>17}\n'

    for row in rows:
        report += row_fmt.format(*row, width=width, digits=3)

    report += '\n'

    return report
def test_evaluator_list_wrong_prediction_length():
    """A pred document shorter than its gold counterpart must raise ValueError."""
    gold_docs = [['O', 'B-ORG', 'I-ORG', 'O', 'O']]
    pred_docs = [['O', 'B-MISC', 'I-MISC', 'O']]

    evaluator = Evaluator(gold_docs, pred_docs, tags=['PER', 'MISC'], loader="list")

    with pytest.raises(ValueError):
        evaluator.evaluate()
示例#6
0
def print_report(label_list, true_file, predict_file, output_file):
    """Write tag-level and nervaluate NER reports for one run to a file.

    Reads gold and predicted tag sequences with ``read_ner_file``, then writes
    a ``classification_report`` plus nervaluate overall and per-tag results to
    ``output_file`` (UTF-8, overwritten).
    """
    _, y_true = read_ner_file(true_file)
    _, y_pred = read_ner_file(predict_file)
    # 'with' guarantees the report file is flushed and closed; the original
    # opened the handle with a bare open() and never closed it.
    with open(output_file, 'w', encoding='utf-8') as fout:
        print('List of label: ', label_list, file=fout)
        print('=========== Classification Report ===========', file=fout)
        print(classification_report(y_true, y_pred, digits=4), file=fout)
        evaluator = Evaluator(y_true, y_pred, tags=label_list, loader="list")
        results, results_by_tag = evaluator.evaluate()
        print('=========== Overall Results ===========', file=fout)
        print(results, file=fout)
        print('=========== Tags Detail Results ===========', file=fout)
        print(results_by_tag, file=fout)
示例#7
0
    def validation_epoch_end(self, outputs):
        """Epoch-end validation hook: log average loss and nervaluate F1 metrics.

        Decodes the buffered token-id sequences back to BIO tags, scores them
        with nervaluate's Evaluator, logs the aggregate F1 values, and clears
        the accumulation buffers. ``outputs`` is unused; state comes from the
        ``self.valid_*`` buffers.
        """
        print()
        avg_loss = sum(self.valid_loss) / len(self.valid_loss)

        id2tag = self.bio2_id2tag
        gold, pred = [], []
        for gold_ids, pred_ids in zip(self.valid_y, self.valid_y_hat):
            gold.append([id2tag[i] for i in gold_ids])
            pred.append([id2tag[i] for i in pred_ids])

        results, _ = Evaluator(
            gold, pred, tags=self.tag_list, loader="list"
        ).evaluate()

        self.log("valid/avg_loss", avg_loss, prog_bar=True)
        for measure in ("ent_type", "partial", "strict", "exact"):
            self.log(f"valid/{measure}", results[measure]["f1"])

        # Reset the per-epoch accumulators for the next run.
        self.valid_y_hat = []
        self.valid_y = []
        self.valid_loss = []
示例#8
0
    def training_epoch_end(self, outputs):
        """Epoch-end training hook: log average loss and nervaluate F1 metrics.

        Inverse-transforms the buffered label ids back to tag strings, scores
        them with nervaluate's Evaluator, logs the aggregate F1 values, and
        clears the accumulation buffers. ``outputs`` is unused; state comes
        from the ``self.train_*`` buffers.
        """
        avg_loss = sum(self.train_loss) / len(self.train_loss)

        decode = self.label_encoder.inverse_transform
        gold = [decode(labels) for labels in self.train_y]
        pred = [decode(labels) for labels in self.train_y_hat]

        results, _ = Evaluator(
            gold, pred, tags=self.tags, loader="list"
        ).evaluate()

        self.log("train/avg_loss", avg_loss, prog_bar=True)
        self.log("train/ent_type", results["ent_type"]["f1"])
        self.log("train/partial", results["partial"]["f1"])
        self.log("train/strict", results["strict"]["f1"])
        self.log("train/exact", results["exact"]["f1"], prog_bar=True)

        # Reset the per-epoch accumulators for the next run.
        self.train_y_hat = []
        self.train_y = []
        self.train_loss = []
示例#9
0
def main(input_folder,
         output_file="precision-et-rappel.csv",
         tagset=('LOC', 'PER', 'MISC')):
    """Build a precision/recall report from per-file prediction TSVs.

    Each file in ``input_folder`` holds tab-separated lines of
    ``token \\t guess \\t gold \\t validity``. Tokens are re-emitted in the
    two-column 'conll' format, scored with nervaluate's Evaluator, and the
    overall results are pretty-printed to ``output_file``.
    """
    actual = []
    predicted = []
    # Loop over every file in the predictions folder.
    # (The original used a bare string literal here, which is a no-op
    # statement, not a comment.)
    for input_file in pathlib.Path(input_folder).glob("*"):
        guess_annotation = []
        gold_annotation = []
        # load the predictions and gold standard tags
        with open(input_file, 'r', encoding='utf-8') as fin:
            for line in fin:
                line = line.strip()
                if not line:
                    continue
                try:
                    token, guess, gold, validity = line.split('\t')
                except ValueError:
                    # Surface the malformed line before re-raising.
                    print(line)
                    raise
                guess = guess.upper()
                # begin formatting the tags in the 'conll' format
                guess_annotation.append(f"{token}\t{guess}")
                gold_annotation.append(f"{token}\t{gold}")
        # finish 'conll' format
        actual.extend(gold_annotation)
        predicted.extend(guess_annotation)
    # generate precision and recall report
    evaluator = Evaluator('\n'.join(actual),
                          '\n'.join(predicted),
                          tags=tagset,
                          loader="conll")
    results, results_by_tag = evaluator.evaluate()

    # print to file
    with open(output_file, "w", encoding="utf-8") as fout:
        pretty_print(results, outfile=fout)
def test_evaluator_simple_case_filtered_tags():
    """
    Check that tags can be excluded by passing the tags argument.
    """
    true = [
        [{"label": "PER", "start": 2, "end": 4}],
        [{"label": "LOC", "start": 1, "end": 2},
         {"label": "LOC", "start": 3, "end": 4}],
    ]
    pred = [
        [{"label": "PER", "start": 2, "end": 4}],
        [{"label": "LOC", "start": 1, "end": 2},
         {"label": "LOC", "start": 3, "end": 4}],
    ]

    results, _ = Evaluator(true, pred, tags=['PER', 'LOC']).evaluate()

    # Predictions match gold exactly, so every measure is a perfect score.
    perfect = {
        'correct': 3, 'incorrect': 0, 'partial': 0, 'missed': 0,
        'spurious': 0, 'possible': 3, 'actual': 3,
        'precision': 1.0, 'recall': 1.0, 'f1': 1.0,
    }

    for measure in ('strict', 'ent_type', 'partial', 'exact'):
        assert results[measure] == perfect
def test_evaluator_extra_classes():
    """
    Case when model predicts a class that is not in the gold (true) data.
    """
    true = [[{"label": "ORG", "start": 1, "end": 3}]]
    pred = [[{"label": "FOO", "start": 1, "end": 3}]]

    results, _ = Evaluator(true, pred, tags=['ORG', 'FOO']).evaluate()

    # Shared counts: one gold entity, one predicted entity, same span.
    base = {'partial': 0, 'missed': 0, 'spurious': 0, 'possible': 1, 'actual': 1}
    # Type-sensitive measures see a wrong label; span-only measures see a match.
    wrong_type = dict(base, correct=0, incorrect=1, precision=0, recall=0.0, f1=0)
    right_span = dict(base, correct=1, incorrect=0, precision=1.0, recall=1.0, f1=1.0)

    assert results['strict'] == wrong_type
    assert results['ent_type'] == wrong_type
    assert results['partial'] == right_span
    assert results['exact'] == right_span
示例#12
0
def test_evaluator_simple_case():
    """Identical gold and predicted entities give perfect scores everywhere."""
    true = [
        [{"label": "PER", "start": 2, "end": 4}],
        [{"label": "LOC", "start": 1, "end": 2},
         {"label": "LOC", "start": 3, "end": 4}],
    ]
    pred = [
        [{"label": "PER", "start": 2, "end": 4}],
        [{"label": "LOC", "start": 1, "end": 2},
         {"label": "LOC", "start": 3, "end": 4}],
    ]

    results, _ = Evaluator(true, pred, tags=["LOC", "PER"]).evaluate()

    perfect = {
        "correct": 3, "incorrect": 0, "partial": 0, "missed": 0,
        "spurious": 0, "possible": 3, "actual": 3,
        "precision": 1.0, "recall": 1.0, "f1": 1.0,
    }

    for measure in ("strict", "ent_type", "partial", "exact"):
        assert results[measure] == perfect
示例#13
0
def test_evaluator_with_extra_keys_in_true():
    """Extra keys on gold entity dicts must not affect the comparison."""
    true = [
        [{"label": "PER", "start": 0, "end": 5,
          "token_start": 0, "token_end": 25}],
    ]
    pred = [
        [{"label": "PER", "start": 0, "end": 5}],
    ]

    results, _ = Evaluator(true, pred, tags=["PER"]).evaluate()

    perfect = {
        "correct": 1, "incorrect": 0, "partial": 0, "missed": 0,
        "spurious": 0, "possible": 1, "actual": 1,
        "precision": 1.0, "recall": 1.0, "f1": 1.0,
    }

    for measure in ("strict", "ent_type", "partial", "exact"):
        assert results[measure] == perfect
示例#14
0
def test_evaluator_conll_no_entities_in_prediction():
    """
    Case when model predicts a class that is not in the gold (true) data.
    """
    true = "word\tO\nword\tO\nword\tB-PER\nword\tI-PER\nword\tO\nword\tO"
    pred = "word\tO\nword\tO\nword\tO\nword\tO\nword\tO\nword\tO"

    results, _ = Evaluator(true, pred, tags=['PER'], loader="conll").evaluate()

    # The single gold entity is missed entirely; no predictions were made.
    all_missed = {
        'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 1,
        'spurious': 0, 'possible': 1, 'actual': 0,
        'precision': 0, 'recall': 0, 'f1': 0.0,
    }

    for measure in ('strict', 'ent_type', 'partial', 'exact'):
        assert results[measure] == all_missed
def test_evaluator_list_no_entities_in_prediction():
    """
    Case when model predicts a class that is not in the gold (true) data.
    """
    true = [['O', 'O', 'B-PER', 'I-PER', 'O', 'O']]
    pred = [['O', 'O', 'O', 'O', 'O', 'O']]

    results, _ = Evaluator(true, pred, tags=['PER'], loader="list").evaluate()

    # The single gold entity is missed entirely; no predictions were made.
    all_missed = {
        'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 1,
        'spurious': 0, 'possible': 1, 'actual': 0,
        'precision': 0, 'recall': 0, 'f1': 0,
    }

    for measure in ('strict', 'ent_type', 'partial', 'exact'):
        assert results[measure] == all_missed
示例#16
0
def test_evaluator_simple_case_filtered_tags():
    """
    Check that tags can be excluded by passing the tags argument.
    """
    true = [
        [{"label": "PER", "start": 2, "end": 4}],
        [{"label": "LOC", "start": 1, "end": 2},
         {"label": "LOC", "start": 3, "end": 4}],
    ]
    pred = [
        [{"label": "PER", "start": 2, "end": 4}],
        [{"label": "LOC", "start": 1, "end": 2},
         {"label": "LOC", "start": 3, "end": 4}],
    ]

    results, _ = Evaluator(true, pred, tags=["PER", "LOC"]).evaluate()

    perfect = {
        "correct": 3, "incorrect": 0, "partial": 0, "missed": 0,
        "spurious": 0, "possible": 3, "actual": 3,
        "precision": 1.0, "recall": 1.0, "f1": 1.0,
    }

    for measure in ("strict", "ent_type", "partial", "exact"):
        assert results[measure] == perfect
def test_evaluator_compare_results_and_results_agg_1():
    """
    Test case when model predicts a label not in the test data.
    """
    true = [
        [],
        [{"label": "ORG", "start": 2, "end": 4}],
        [{"label": "MISC", "start": 2, "end": 4}],
    ]
    pred = [
        [{"label": "PER", "start": 2, "end": 4}],
        [{"label": "ORG", "start": 2, "end": 4}],
        [{"label": "MISC", "start": 2, "end": 4}],
    ]

    results, results_agg = Evaluator(
        true, pred, tags=['PER', 'ORG', 'MISC']
    ).evaluate()

    # Overall: two correct entities plus one spurious PER prediction.
    overall = {
        'correct': 2, 'incorrect': 0, 'partial': 0, 'missed': 0,
        'spurious': 1, 'possible': 2, 'actual': 3,
        'precision': 0.6666666666666666, 'recall': 1.0, 'f1': 0.8,
    }
    # Per tag: ORG and MISC each have one correct hit; the spurious PER
    # prediction is charged against every tag's 'actual' count.
    per_tag = {
        'correct': 1, 'incorrect': 0, 'partial': 0, 'missed': 0,
        'spurious': 1, 'possible': 1, 'actual': 2,
        'precision': 0.5, 'recall': 1, 'f1': 0.6666666666666666,
    }

    for measure in ('strict', 'ent_type', 'partial', 'exact'):
        assert results_agg["ORG"][measure] == per_tag
        assert results_agg["MISC"][measure] == per_tag
        assert results[measure] == overall
def test_evaluator_list_simple_case_filtered_tags():
    """
    Check that tags can be excluded by passing the tags argument.
    """
    true = [
        ['O', 'O', 'B-PER', 'I-PER', 'O'],
        ['O', 'B-LOC', 'I-LOC', 'B-LOC', 'I-LOC', 'O'],
        ['O', 'B-MISC', 'I-MISC', 'O', 'O', 'O'],
    ]
    pred = [
        ['O', 'O', 'B-PER', 'I-PER', 'O'],
        ['O', 'B-LOC', 'I-LOC', 'B-LOC', 'I-LOC', 'O'],
        ['O', 'B-MISC', 'I-MISC', 'O', 'O', 'O'],
    ]

    # MISC is deliberately left out of 'tags', so only PER/LOC are scored.
    results, _ = Evaluator(true, pred, tags=['PER', 'LOC'], loader="list").evaluate()

    perfect = {
        'correct': 3, 'incorrect': 0, 'partial': 0, 'missed': 0,
        'spurious': 0, 'possible': 3, 'actual': 3,
        'precision': 1.0, 'recall': 1.0, 'f1': 1.0,
    }

    for measure in ('strict', 'ent_type', 'partial', 'exact'):
        assert results[measure] == perfect
示例#19
0
def test_evaluator_compare_results_and_results_agg():
    """
    Check that the label level results match the total results.
    """
    true = "word\tO\nword\tO\nword\tB-PER\nword\tI-PER\nword\tO\nword\tO"
    pred = "word\tO\nword\tO\nword\tB-PER\nword\tI-PER\nword\tO\nword\tO"

    results, results_agg = Evaluator(true, pred, tags=['PER'], loader="conll").evaluate()

    # Single PER entity, predicted exactly: perfect scores everywhere,
    # and the PER-level aggregate must equal the overall results.
    perfect = {
        'correct': 1, 'incorrect': 0, 'partial': 0, 'missed': 0,
        'spurious': 0, 'possible': 1, 'actual': 1,
        'precision': 1, 'recall': 1, 'f1': 1.0,
    }

    for measure in ('strict', 'ent_type', 'partial', 'exact'):
        assert results_agg["PER"][measure] == perfect
        assert results[measure] == perfect
        assert results[measure] == results_agg["PER"][measure]
def test_evaluator_list_simple_case():
    # Prediction equals gold (1 PER + 2 LOC entities), so every evaluation
    # schema reports a perfect score.
    true = [
        ['O', 'O', 'B-PER', 'I-PER', 'O'],
        ['O', 'B-LOC', 'I-LOC', 'B-LOC', 'I-LOC', 'O'],
    ]

    pred = [
        ['O', 'O', 'B-PER', 'I-PER', 'O'],
        ['O', 'B-LOC', 'I-LOC', 'B-LOC', 'I-LOC', 'O'],
    ]

    results, results_agg = Evaluator(
        true, pred, tags=['LOC', 'PER'], loader="list"
    ).evaluate()

    perfect = {
        'correct': 3,
        'incorrect': 0,
        'partial': 0,
        'missed': 0,
        'spurious': 0,
        'possible': 3,
        'actual': 3,
        'precision': 1.0,
        'recall': 1.0,
        'f1': 1.0,
    }

    for schema in ('strict', 'ent_type', 'partial', 'exact'):
        assert results[schema] == perfect
示例#21
0
def test_evaluator_compare_results_and_results_agg_1():
    """
    Test case when the model predicts a label (PER) not present in the
    gold data: the extra prediction is counted as spurious.

    NOTE(review): the per-tag aggregates here assume a spurious entity is
    counted against every tag (spurious=1, actual=2 for both ORG and MISC)
    — confirm against the nervaluate version in use.
    """

    true = (
        "word\tO\nword\tO\nword\tO\nword\tO\nword\tO\nword\tO\n\n"
        "word\tO\nword\tO\nword\tB-ORG \nword\tI-ORG \nword\tO\nword\tO\n\n"
        "word\tO\nword\tO\nword\tB-MISC\nword\tI-MISC\nword\tO\nword\tO\n\n"
    )

    pred = (
        "word\tO\nword\tO\nword\tB-PER\nword\tI-PER\nword\tO\nword\tO\n\n"
        "word\tO\nword\tO\nword\tB-ORG \nword\tI-ORG \nword\tO\nword\tO\n\n"
        "word\tO\nword\tO\nword\tB-MISC\nword\tI-MISC\nword\tO\nword\tO\n\n"
    )

    evaluator = Evaluator(true, pred, tags=['PER', 'ORG', 'MISC'], loader="conll")
    results, results_agg = evaluator.evaluate()

    # Overall: 2 correct entities, 1 spurious PER prediction.
    # Bug fix: f1 was listed as 1.0, inconsistent with p=2/3, r=1;
    # f1 = 2*p*r/(p+r) = 2*(2/3)*1/(5/3) = 0.8.
    overall = {
        'correct': 2,
        'incorrect': 0,
        'partial': 0,
        'missed': 0,
        'spurious': 1,
        'possible': 2,
        'actual': 3,
        'precision': 0.6666666666666666,
        'recall': 1.0,
        'f1': 0.8,
    }

    # Per tag: 1 correct entity plus the shared spurious prediction.
    # Bug fix: f1 was listed as 1.0, inconsistent with p=0.5, r=1;
    # f1 = 2*0.5*1/1.5 = 2/3.
    per_tag = {
        'correct': 1,
        'incorrect': 0,
        'partial': 0,
        'missed': 0,
        'spurious': 1,
        'possible': 1,
        'actual': 2,
        'precision': 0.5,
        'recall': 1,
        'f1': 0.6666666666666666,
    }

    schemas = ('strict', 'ent_type', 'partial', 'exact')
    expected = {schema: dict(overall) for schema in schemas}
    expected_agg = {
        tag: {schema: dict(per_tag) for schema in schemas}
        for tag in ('ORG', 'MISC')
    }

    # Bug fix: the original test built the expectations but asserted nothing,
    # so it could never fail.
    for tag in ('ORG', 'MISC'):
        for schema in schemas:
            assert results_agg[tag][schema] == expected_agg[tag][schema]
    for schema in schemas:
        assert results[schema] == expected[schema]
示例#22
0
def test_evaluator_conll_simple_case_filtered_tags():
    """
    Check that tags can be excluded by passing the tags argument

    NOTE(review): the data only contains PER and LOC and both are passed
    in ``tags``, so nothing is actually filtered here — confirm intent.
    """

    # Bug fix: the CoNLL strings read "...\tO\B-PER\nword\tI-PER..." — the
    # "\nword\t" separator before "B-PER" was missing, producing a malformed
    # token/tag pair instead of a PER entity (cf. test_evaluator_simple_case).
    true = "word\tO\nword\tO\nword\tB-PER\nword\tI-PER\nword\tO\n\nword\tO\nword\tB-LOC\nword\tI-LOC\nword\tB-LOC\nword\tI-LOC\nword\tO\n"

    pred = "word\tO\nword\tO\nword\tB-PER\nword\tI-PER\nword\tO\n\nword\tO\nword\tB-LOC\nword\tI-LOC\nword\tB-LOC\nword\tI-LOC\nword\tO\n"

    evaluator = Evaluator(true, pred, tags=['PER', 'LOC'], loader="conll")

    results, results_agg = evaluator.evaluate()

    # All 3 entities (1 PER + 2 LOC) match exactly under every schema.
    perfect = {
        'correct': 3,
        'incorrect': 0,
        'partial': 0,
        'missed': 0,
        'spurious': 0,
        'possible': 3,
        'actual': 3,
        'precision': 1.0,
        'recall': 1.0,
        'f1': 1.0,
    }

    for schema in ('strict', 'ent_type', 'partial', 'exact'):
        assert results[schema] == perfect
def test_evaluator_no_entities_in_prediction():
    """
    Case when the prediction contains no entities at all: the single gold
    entity is counted as missed under every evaluation schema.
    """
    # Docstring fix: the previous docstring described a different scenario
    # (a predicted class absent from the gold data).

    true = [
        [{
            "label": "PER",
            "start": 2,
            "end": 4
        }],
    ]

    pred = [
        [],
    ]

    evaluator = Evaluator(true, pred, tags=['PER'])

    results, results_agg = evaluator.evaluate()

    # One possible entity, zero predicted: everything is a miss, so
    # precision, recall and f1 are all zero.
    missed_case = {
        'correct': 0,
        'incorrect': 0,
        'partial': 0,
        'missed': 1,
        'spurious': 0,
        'possible': 1,
        'actual': 0,
        'precision': 0,
        'recall': 0,
        'f1': 0
    }

    for schema in ('strict', 'ent_type', 'partial', 'exact'):
        assert results[schema] == missed_case
示例#24
0
def test_evaluator_no_entities_in_prediction():
    """
    Case when the prediction contains no entities at all: the single gold
    entity is counted as missed under every evaluation schema.
    """
    # Docstring fix: the previous docstring described a different scenario
    # (a predicted class absent from the gold data).

    true = [
        [{
            "label": "PER",
            "start": 2,
            "end": 4
        }],
    ]

    pred = [
        [],
    ]

    evaluator = Evaluator(true, pred, tags=["PER"])

    results, results_agg = evaluator.evaluate()

    # One possible entity, zero predicted: everything is a miss, so
    # precision, recall and f1 are all zero.
    missed_case = {
        "correct": 0,
        "incorrect": 0,
        "partial": 0,
        "missed": 1,
        "spurious": 0,
        "possible": 1,
        "actual": 0,
        "precision": 0,
        "recall": 0,
        "f1": 0,
    }

    for schema in ("strict", "ent_type", "partial", "exact"):
        assert results[schema] == missed_case
def test_evaluator_compare_results_and_results_agg():
    """
    Check that the label level results match the total results.
    """

    true = [
        [{
            "label": "PER",
            "start": 2,
            "end": 4
        }],
    ]

    pred = [
        [{
            "label": "PER",
            "start": 2,
            "end": 4
        }],
    ]

    evaluator = Evaluator(true, pred, tags=['PER'])

    results, results_agg = evaluator.evaluate()

    # One gold entity, matched exactly: every counter and metric is perfect,
    # and the same numbers apply to each of the four evaluation schemas.
    per_schema = {
        'correct': 1,
        'incorrect': 0,
        'partial': 0,
        'missed': 0,
        'spurious': 0,
        'possible': 1,
        'actual': 1,
        'precision': 1,
        'recall': 1,
        'f1': 1
    }

    schemas = ('strict', 'ent_type', 'partial', 'exact')
    expected = {schema: dict(per_schema) for schema in schemas}
    expected_agg = {'PER': {schema: dict(per_schema) for schema in schemas}}

    for schema in schemas:
        # Per-label results, overall results, and the cross-check that the
        # single-label aggregate equals the totals.
        assert results_agg["PER"][schema] == expected_agg["PER"][schema]
        assert results[schema] == expected[schema]
        assert results[schema] == expected_agg['PER'][schema]
示例#26
0
def test_evaluator_compare_results_and_results_agg():
    """
    Check that the label level results match the total results.
    """

    true = [
        [{
            "label": "PER",
            "start": 2,
            "end": 4
        }],
    ]

    pred = [
        [{
            "label": "PER",
            "start": 2,
            "end": 4
        }],
    ]

    evaluator = Evaluator(true, pred, tags=["PER"])

    results, results_agg = evaluator.evaluate()

    # One gold entity, matched exactly: every counter and metric is perfect,
    # and the same numbers apply to each of the four evaluation schemas.
    per_schema = {
        "correct": 1,
        "incorrect": 0,
        "partial": 0,
        "missed": 0,
        "spurious": 0,
        "possible": 1,
        "actual": 1,
        "precision": 1,
        "recall": 1,
        "f1": 1,
    }

    schemas = ("strict", "ent_type", "partial", "exact")
    expected = {schema: dict(per_schema) for schema in schemas}
    expected_agg = {"PER": {schema: dict(per_schema) for schema in schemas}}

    for schema in schemas:
        # Per-label results, overall results, and the cross-check that the
        # single-label aggregate equals the totals.
        assert results_agg["PER"][schema] == expected_agg["PER"][schema]
        assert results[schema] == expected[schema]
        assert results[schema] == expected_agg["PER"][schema]
def test_evaluator_with_extra_keys_in_true():
    # Extra span keys in the gold data ("token_start"/"token_end") must be
    # ignored: predictions carrying only "label"/"start"/"end" still match.
    true = [
        [{"label": "PER", "start": 2, "end": 4,
          "token_start": 0, "token_end": 4}],
        [{"label": "LOC", "start": 1, "end": 2,
          "token_start": 0, "token_end": 5},
         {"label": "LOC", "start": 3, "end": 4,
          "token_start": 7, "token_end": 9}],
    ]

    pred = [
        [{"label": "PER", "start": 2, "end": 4}],
        [{"label": "LOC", "start": 1, "end": 2},
         {"label": "LOC", "start": 3, "end": 4}],
    ]

    results, results_agg = Evaluator(true, pred, tags=['LOC', 'PER']).evaluate()

    # All 3 entities (1 PER + 2 LOC) are exact matches under every schema.
    perfect = {
        'correct': 3,
        'incorrect': 0,
        'partial': 0,
        'missed': 0,
        'spurious': 0,
        'possible': 3,
        'actual': 3,
        'precision': 1.0,
        'recall': 1.0,
        'f1': 1.0
    }

    for schema in ('strict', 'ent_type', 'partial', 'exact'):
        assert results[schema] == perfect
示例#28
0
def test_evaluator_compare_results_and_results_agg_1():
    """
    Test case when model predicts a label not in the test data.
    """

    true = [
        [],
        [{
            "label": "ORG",
            "start": 2,
            "end": 4
        }],
        [{
            "label": "MISC",
            "start": 2,
            "end": 4
        }],
    ]

    pred = [
        [{
            "label": "PER",
            "start": 2,
            "end": 4
        }],
        [{
            "label": "ORG",
            "start": 2,
            "end": 4
        }],
        [{
            "label": "MISC",
            "start": 2,
            "end": 4
        }],
    ]

    evaluator = Evaluator(true, pred, tags=["PER", "ORG", "MISC"])

    results, results_agg = evaluator.evaluate()

    schemas = ("strict", "ent_type", "partial", "exact")

    # Overall: 2 correct entities plus 1 spurious PER prediction, identical
    # across all four schemas.
    overall = {
        "correct": 2,
        "incorrect": 0,
        "partial": 0,
        "missed": 0,
        "spurious": 1,
        "possible": 2,
        "actual": 3,
        "precision": 0.6666666666666666,
        "recall": 1.0,
        "f1": 0.8,
    }

    # Per label (ORG and MISC each): a single, perfectly matched entity.
    per_tag = {
        "correct": 1,
        "incorrect": 0,
        "partial": 0,
        "missed": 0,
        "spurious": 0,
        "possible": 1,
        "actual": 1,
        "precision": 1,
        "recall": 1,
        "f1": 1,
    }

    expected = {schema: dict(overall) for schema in schemas}
    expected_agg = {
        tag: {schema: dict(per_tag) for schema in schemas}
        for tag in ("ORG", "MISC")
    }

    for tag in ("ORG", "MISC"):
        for schema in schemas:
            assert results_agg[tag][schema] == expected_agg[tag][schema]

    for schema in schemas:
        assert results[schema] == expected[schema]
示例#29
0
def test_issue_29():
    # Gold has two PER entities; the prediction splits the second one
    # (tokens 3-10) into two shorter spans (3-5 and 6-10).
    true = [[
        {
            "label": "PER",
            "start": 1,
            "end": 2
        },
        {
            "label": "PER",
            "start": 3,
            "end": 10
        },
    ]]

    pred = [[
        {
            "label": "PER",
            "start": 1,
            "end": 2
        },
        {
            "label": "PER",
            "start": 3,
            "end": 5
        },
        {
            "label": "PER",
            "start": 6,
            "end": 10
        },
    ]]

    results, results_agg = Evaluator(true, pred, tags=["PER"]).evaluate()

    # Counters shared by every schema: 2 possible, 3 predicted, 1 spurious.
    base = {"missed": 0, "spurious": 1, "possible": 2, "actual": 3}

    expected = {
        # Boundary mismatch on the split entity is an error under strict.
        "strict": dict(base, correct=1, incorrect=1, partial=0,
                       precision=0.3333333333333333, recall=0.5, f1=0.4),
        # Types all agree, so ent_type counts both gold entities correct.
        "ent_type": dict(base, correct=2, incorrect=0, partial=0,
                         precision=0.6666666666666666, recall=1.0, f1=0.8),
        # The overlapping split span scores as a partial match.
        "partial": dict(base, correct=1, incorrect=0, partial=1,
                        precision=0.5, recall=0.75, f1=0.6),
        # Exact boundaries required: same numbers as strict here.
        "exact": dict(base, correct=1, incorrect=1, partial=0,
                      precision=0.3333333333333333, recall=0.5, f1=0.4),
    }

    for schema in ("strict", "ent_type", "partial", "exact"):
        assert results[schema] == expected[schema]
示例#30
0
def test_evaluator_simple_case():
    # Two CoNLL-format sentences where the prediction equals the gold:
    # 1 PER + 2 LOC entities, all matched exactly under every schema.
    true = "word\tO\nword\tO\nword\tB-PER\nword\tI-PER\nword\tO\n\nword\tO\nword\tB-LOC\nword\tI-LOC\nword\tB-LOC\nword\tI-LOC\nword\tO\n"

    pred = "word\tO\nword\tO\nword\tB-PER\nword\tI-PER\nword\tO\n\nword\tO\nword\tB-LOC\nword\tI-LOC\nword\tB-LOC\nword\tI-LOC\nword\tO\n"

    results, results_agg = Evaluator(
        true, pred, tags=['LOC', 'PER'], loader="conll"
    ).evaluate()

    perfect = {
        'correct': 3,
        'incorrect': 0,
        'partial': 0,
        'missed': 0,
        'spurious': 0,
        'possible': 3,
        'actual': 3,
        'precision': 1.0,
        'recall': 1.0,
        'f1': 1.0,
    }

    for schema in ('strict', 'ent_type', 'partial', 'exact'):
        assert results[schema] == perfect