# Shared imports for the snippets below. `Evaluator` comes from nervaluate;
# `classification_report` is assumed to be seqeval's entity-level report.
import pathlib
import pprint

import pytest
from seqeval.metrics import classification_report

from nervaluate import Evaluator


def test_evaluator_list_non_matching_corpus_length():
    true = [
        ['O', 'B-ORG', 'I-ORG', 'O', 'O'],
        ['O', 'O', 'O', 'O'],
    ]
    pred = [
        ['O', 'B-MISC', 'I-MISC', 'O'],
    ]
    with pytest.raises(ValueError):
        evaluator = Evaluator(true, pred, tags=['PER', 'MISC'], loader="list")
        evaluator.evaluate()

def test_evaluator_list_wrong_prediction_length():
    true = [
        ['O', 'B-ORG', 'I-ORG', 'O', 'O'],
    ]
    pred = [
        ['O', 'B-MISC', 'I-MISC', 'O'],
    ]
    evaluator = Evaluator(true, pred, tags=['PER', 'MISC'], loader="list")
    with pytest.raises(ValueError):
        evaluator.evaluate()

def test_epoch_end(self, outputs):
    mean_test_loss = sum(self.test_loss) / len(self.test_loss)

    # Map token ids back to BIO2 tag strings before scoring.
    gold, pred = [], []
    for y, y_hat in zip(self.test_y, self.test_y_hat):
        gold.append([self.bio2_id2tag[token_id] for token_id in y])
        pred.append([self.bio2_id2tag[token_id] for token_id in y_hat])

    evaluator = Evaluator(gold, pred, tags=self.tag_list, loader="list")
    results, results_by_tag = evaluator.evaluate()

    self.log("test/avg_loss", mean_test_loss, prog_bar=True)
    self.log("test/ent_type", results["ent_type"]["f1"])
    self.log("test/partial", results["partial"]["f1"])
    self.log("test/strict", results["strict"]["f1"])
    self.log("test/exact", results["exact"]["f1"])

    print("_" * 120)
    print("\n\n Test results: \n")
    pprint.pprint(results["strict"])
    print("\n Per class Strict-F1 values:")
    for cls in self.tag_list:
        print(f'\t {cls} : \t{results_by_tag[cls]["strict"]["f1"]:.3f}')

    # Reset the accumulators for the next run.
    self.test_y_hat = []
    self.test_y = []
    self.test_loss = []

def score(self, y_true, y_pred, tags):
    """
    Evaluate the model's performance on the data for entity types in the
    'tags' list, e.g. ['PERSON', 'LOC'].
    """
    evaluator = Evaluator(y_true, y_pred, tags=tags)
    results, results_by_tag = evaluator.evaluate()
    score = {tag: results_by_tag[tag]["partial"]["f1"] for tag in tags}
    score["Overall"] = results["partial"]["f1"]
    return score

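# Usage sketch for `score` (hypothetical data). With nervaluate's default
# loader, each document is a list of {"label", "start", "end"} span dicts.
# The method never touches `self`, so for illustration we pass None for it.
y_true_example = [
    [{"label": "PER", "start": 2, "end": 4}],
    [{"label": "LOC", "start": 1, "end": 2}],
]
y_pred_example = [
    [{"label": "PER", "start": 2, "end": 4}],
    [],  # the LOC entity is missed entirely
]
print(score(None, y_true_example, y_pred_example, tags=["PER", "LOC"]))
# expected, following the conventions in the tests below:
# {'PER': 1.0, 'LOC': 0, 'Overall': 0.6666666666666666}
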
def ner_classification_report(y_true, y_pred, groups, tags):
    """
    Evaluate the model's performance for each grouping of data for the NER
    labels given in 'tags'.

    Input:
        y_true: a list of gold entities, one item per document
        y_pred: a list of predicted entities, one item per document
        groups: a list of group identifiers, parallel to y_true/y_pred,
            giving the group each document belongs to
    Output:
        report: evaluation metrics for each group in a nice format for printing
    """
    unique_groups = sorted(set(groups))
    headers = ['precision (partial)', 'recall (partial)', 'f1-score', 'support']
    outputs = []
    for group in unique_groups:
        # Select the documents belonging to this group.
        pred_doc_entities = [y_pred[i] for i, g in enumerate(groups) if g == group]
        true_doc_entities = [y_true[i] for i, g in enumerate(groups) if g == group]
        evaluator = Evaluator(true_doc_entities, pred_doc_entities, tags=tags)
        results, _ = evaluator.evaluate()
        outputs.append([
            group,
            results['partial']['precision'],
            results['partial']['recall'],
            results['partial']['f1'],
            len(pred_doc_entities),
        ])

    width = max(len(cn) for cn in unique_groups)
    head_fmt = '{:>{width}s} ' + ' {:>17}' * len(headers)
    report = head_fmt.format('', *headers, width=width)
    report += '\n\n'
    row_fmt = '{:>{width}s} ' + ' {:>17.{digits}f}' * 3 + ' {:>17}\n'
    for row in outputs:
        report += row_fmt.format(*row, width=width, digits=3)
    report += '\n'
    return report

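# Usage sketch for `ner_classification_report` (hypothetical data): documents
# 0 and 2 belong to "news", document 1 to "wiki"; entities use the default
# span-dict format.
y_true_by_doc = [
    [{"label": "PER", "start": 2, "end": 4}],
    [{"label": "LOC", "start": 1, "end": 2}],
    [],
]
y_pred_by_doc = [
    [{"label": "PER", "start": 2, "end": 4}],
    [],
    [{"label": "LOC", "start": 0, "end": 1}],  # spurious prediction
]
print(ner_classification_report(y_true_by_doc, y_pred_by_doc,
                                groups=["news", "wiki", "news"],
                                tags=["PER", "LOC"]))
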
def print_report(label_list, true_file, predict_file, output_file):
    _, y_true = read_ner_file(true_file)
    _, y_pred = read_ner_file(predict_file)

    with open(output_file, 'w', encoding='utf-8') as fout:
        print('List of labels: ', label_list, file=fout)
        print('=========== Classification Report ===========', file=fout)
        print(classification_report(y_true, y_pred, digits=4), file=fout)

        evaluator = Evaluator(y_true, y_pred, tags=label_list, loader="list")
        results, results_by_tag = evaluator.evaluate()
        print('=========== Overall Results ===========', file=fout)
        print(results, file=fout)
        print('=========== Tags Detail Results ===========', file=fout)
        print(results_by_tag, file=fout)

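# `read_ner_file` is project-local and not shown above. A minimal sketch,
# assuming a two-column CoNLL file (token<TAB>tag, blank line between
# sentences) and a (tokens, tags) return value of parallel lists of lists:
def read_ner_file(path):
    tokens, tags = [[]], [[]]
    with open(path, encoding='utf-8') as fin:
        for line in fin:
            line = line.strip()
            if not line:  # a blank line closes the current sentence
                if tokens[-1]:
                    tokens.append([])
                    tags.append([])
                continue
            token, tag = line.split('\t')
            tokens[-1].append(token)
            tags[-1].append(tag)
    if not tokens[-1]:  # drop a trailing empty sentence
        tokens.pop()
        tags.pop()
    return tokens, tags
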
def validation_epoch_end(self, outputs):
    print()
    mean_val_loss = sum(self.valid_loss) / len(self.valid_loss)

    # Map token ids back to BIO2 tag strings before scoring.
    gold, pred = [], []
    for y, y_hat in zip(self.valid_y, self.valid_y_hat):
        gold.append([self.bio2_id2tag[token_id] for token_id in y])
        pred.append([self.bio2_id2tag[token_id] for token_id in y_hat])

    evaluator = Evaluator(gold, pred, tags=self.tag_list, loader="list")
    results, _ = evaluator.evaluate()

    self.log("valid/avg_loss", mean_val_loss, prog_bar=True)
    self.log("valid/ent_type", results["ent_type"]["f1"])
    self.log("valid/partial", results["partial"]["f1"])
    self.log("valid/strict", results["strict"]["f1"])
    self.log("valid/exact", results["exact"]["f1"])

    # Reset the accumulators for the next epoch.
    self.valid_y_hat = []
    self.valid_y = []
    self.valid_loss = []

def training_epoch_end(self, outputs):
    mean_train_loss = sum(self.train_loss) / len(self.train_loss)

    # Decode label ids back to tag strings before scoring.
    gold = [self.label_encoder.inverse_transform(labels) for labels in self.train_y]
    pred = [self.label_encoder.inverse_transform(labels) for labels in self.train_y_hat]

    evaluator = Evaluator(gold, pred, tags=self.tags, loader="list")
    results, _ = evaluator.evaluate()

    self.log("train/avg_loss", mean_train_loss, prog_bar=True)
    self.log("train/ent_type", results["ent_type"]["f1"])
    self.log("train/partial", results["partial"]["f1"])
    self.log("train/strict", results["strict"]["f1"])
    self.log("train/exact", results["exact"]["f1"], prog_bar=True)

    # Reset the accumulators for the next epoch.
    self.train_y_hat = []
    self.train_y = []
    self.train_loss = []

def main(input_folder, output_file="precision-et-rappel.csv", tagset=('LOC', 'PER', 'MISC')):
    actual = []
    predicted = []
    # Loop over the files in the "Prédictions" folder.
    for input_file in pathlib.Path(input_folder).glob("*"):
        guess_annotation = []
        gold_annotation = []
        # Load the predicted and gold-standard tags.
        with open(input_file, 'r', encoding='utf-8') as fin:
            for line in fin:
                line = line.strip()
                if not line:
                    continue
                try:
                    token, guess, gold, validity = line.split('\t')
                except ValueError:
                    print(line)
                    raise
                guess = guess.upper()
                # Accumulate "token<TAB>tag" lines in the 'conll' format.
                guess_annotation.append(f"{token}\t{guess}")
                gold_annotation.append(f"{token}\t{gold}")
        # A blank line separates documents in the 'conll' format.
        actual.extend(gold_annotation + [""])
        predicted.extend(guess_annotation + [""])

    # Generate the precision and recall report.
    evaluator = Evaluator('\n'.join(actual), '\n'.join(predicted), tags=tagset, loader="conll")
    results, results_by_tag = evaluator.evaluate()

    # Print to file.
    with open(output_file, "w", encoding="utf-8") as fout:
        pretty_print(results, outfile=fout)

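# The input files are assumed to be four-column TSV, one token per line,
# matching the `line.split('\t')` above (the validity column is unused):
#   token<TAB>predicted_tag<TAB>gold_tag<TAB>validity
# e.g. "Paris\tB-LOC\tB-LOC\tok". `pretty_print` is project-local and not
# shown here.
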
def test_evaluator_simple_case_filtered_tags():
    """
    Check that tags can be excluded by passing the tags argument
    """
    # MISC entities are present in the data but excluded via the tags argument.
    true = [
        [{"label": "PER", "start": 2, "end": 4}],
        [{"label": "LOC", "start": 1, "end": 2}, {"label": "LOC", "start": 3, "end": 4}],
        [{"label": "MISC", "start": 2, "end": 4}],
    ]
    pred = [
        [{"label": "PER", "start": 2, "end": 4}],
        [{"label": "LOC", "start": 1, "end": 2}, {"label": "LOC", "start": 3, "end": 4}],
        [{"label": "MISC", "start": 2, "end": 4}],
    ]
    evaluator = Evaluator(true, pred, tags=['PER', 'LOC'])
    results, results_agg = evaluator.evaluate()
    metrics = {'correct': 3, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0,
               'possible': 3, 'actual': 3, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
    expected = {'strict': metrics, 'ent_type': metrics, 'partial': metrics, 'exact': metrics}
    assert results['strict'] == expected['strict']
    assert results['ent_type'] == expected['ent_type']
    assert results['partial'] == expected['partial']
    assert results['exact'] == expected['exact']

def test_evaluator_extra_classes():
    """
    Case when model predicts a class that is not in the gold (true) data
    """
    true = [
        [{"label": "ORG", "start": 1, "end": 3}],
    ]
    pred = [
        [{"label": "FOO", "start": 1, "end": 3}],
    ]
    evaluator = Evaluator(true, pred, tags=['ORG', 'FOO'])
    results, results_agg = evaluator.evaluate()
    # The span matches exactly but the label is wrong, so strict and ent_type
    # fail while partial and exact succeed.
    wrong_label = {'correct': 0, 'incorrect': 1, 'partial': 0, 'missed': 0, 'spurious': 0,
                   'possible': 1, 'actual': 1, 'precision': 0, 'recall': 0.0, 'f1': 0}
    right_span = {'correct': 1, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0,
                  'possible': 1, 'actual': 1, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
    expected = {'strict': wrong_label, 'ent_type': wrong_label,
                'partial': right_span, 'exact': right_span}
    assert results['strict'] == expected['strict']
    assert results['ent_type'] == expected['ent_type']
    assert results['partial'] == expected['partial']
    assert results['exact'] == expected['exact']

def test_evaluator_simple_case():
    true = [
        [{"label": "PER", "start": 2, "end": 4}],
        [{"label": "LOC", "start": 1, "end": 2}, {"label": "LOC", "start": 3, "end": 4}],
    ]
    pred = [
        [{"label": "PER", "start": 2, "end": 4}],
        [{"label": "LOC", "start": 1, "end": 2}, {"label": "LOC", "start": 3, "end": 4}],
    ]
    evaluator = Evaluator(true, pred, tags=["LOC", "PER"])
    results, results_agg = evaluator.evaluate()
    metrics = {"correct": 3, "incorrect": 0, "partial": 0, "missed": 0, "spurious": 0,
               "possible": 3, "actual": 3, "precision": 1.0, "recall": 1.0, "f1": 1.0}
    expected = {"strict": metrics, "ent_type": metrics, "partial": metrics, "exact": metrics}
    assert results["strict"] == expected["strict"]
    assert results["ent_type"] == expected["ent_type"]
    assert results["partial"] == expected["partial"]
    assert results["exact"] == expected["exact"]

def test_evaluator_with_extra_keys_in_true():
    true = [
        [{"label": "PER", "start": 0, "end": 5, "token_start": 0, "token_end": 25}],
    ]
    pred = [
        [{"label": "PER", "start": 0, "end": 5}],
    ]
    evaluator = Evaluator(true, pred, tags=["PER"])
    results, results_agg = evaluator.evaluate()
    metrics = {"correct": 1, "incorrect": 0, "partial": 0, "missed": 0, "spurious": 0,
               "possible": 1, "actual": 1, "precision": 1.0, "recall": 1.0, "f1": 1.0}
    expected = {"strict": metrics, "ent_type": metrics, "partial": metrics, "exact": metrics}
    assert results["strict"] == expected["strict"]
    assert results["ent_type"] == expected["ent_type"]
    assert results["partial"] == expected["partial"]
    assert results["exact"] == expected["exact"]

def test_evaluator_conll_no_entities_in_prediction():
    """
    Case when the model predicts no entities at all.
    """
    true = "word\tO\nword\tO\nword\tB-PER\nword\tI-PER\nword\tO\nword\tO"
    pred = "word\tO\nword\tO\nword\tO\nword\tO\nword\tO\nword\tO"
    evaluator = Evaluator(true, pred, tags=['PER'], loader="conll")
    results, results_agg = evaluator.evaluate()
    metrics = {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 1, 'spurious': 0,
               'possible': 1, 'actual': 0, 'precision': 0, 'recall': 0, 'f1': 0.0}
    expected = {'strict': metrics, 'ent_type': metrics, 'partial': metrics, 'exact': metrics}
    assert results['strict'] == expected['strict']
    assert results['ent_type'] == expected['ent_type']
    assert results['partial'] == expected['partial']
    assert results['exact'] == expected['exact']

def test_evaluator_list_no_entities_in_prediction():
    """
    Case when the model predicts no entities at all.
    """
    true = [
        ['O', 'O', 'B-PER', 'I-PER', 'O', 'O'],
    ]
    pred = [
        ['O', 'O', 'O', 'O', 'O', 'O'],
    ]
    evaluator = Evaluator(true, pred, tags=['PER'], loader="list")
    results, results_agg = evaluator.evaluate()
    metrics = {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 1, 'spurious': 0,
               'possible': 1, 'actual': 0, 'precision': 0, 'recall': 0, 'f1': 0}
    expected = {'strict': metrics, 'ent_type': metrics, 'partial': metrics, 'exact': metrics}
    assert results['strict'] == expected['strict']
    assert results['ent_type'] == expected['ent_type']
    assert results['partial'] == expected['partial']
    assert results['exact'] == expected['exact']

def test_evaluator_compare_results_and_results_agg_1():
    """
    Test case when model predicts a label not in the test data.
    """
    true = [
        [],
        [{"label": "ORG", "start": 2, "end": 4}],
        [{"label": "MISC", "start": 2, "end": 4}],
    ]
    pred = [
        [{"label": "PER", "start": 2, "end": 4}],
        [{"label": "ORG", "start": 2, "end": 4}],
        [{"label": "MISC", "start": 2, "end": 4}],
    ]
    evaluator = Evaluator(true, pred, tags=['PER', 'ORG', 'MISC'])
    results, results_agg = evaluator.evaluate()
    overall = {'correct': 2, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 1,
               'possible': 2, 'actual': 3, 'precision': 0.6666666666666666, 'recall': 1.0, 'f1': 0.8}
    expected = {'strict': overall, 'ent_type': overall, 'partial': overall, 'exact': overall}
    # The spurious PER prediction is counted against every evaluated tag.
    per_tag = {'correct': 1, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 1,
               'possible': 1, 'actual': 2, 'precision': 0.5, 'recall': 1, 'f1': 0.6666666666666666}
    expected_agg = {
        'ORG': {'strict': per_tag, 'ent_type': per_tag, 'partial': per_tag, 'exact': per_tag},
        'MISC': {'strict': per_tag, 'ent_type': per_tag, 'partial': per_tag, 'exact': per_tag},
    }
    assert results_agg["ORG"]["strict"] == expected_agg["ORG"]["strict"]
    assert results_agg["ORG"]["ent_type"] == expected_agg["ORG"]["ent_type"]
    assert results_agg["ORG"]["partial"] == expected_agg["ORG"]["partial"]
    assert results_agg["ORG"]["exact"] == expected_agg["ORG"]["exact"]
    assert results_agg["MISC"]["strict"] == expected_agg["MISC"]["strict"]
    assert results_agg["MISC"]["ent_type"] == expected_agg["MISC"]["ent_type"]
    assert results_agg["MISC"]["partial"] == expected_agg["MISC"]["partial"]
    assert results_agg["MISC"]["exact"] == expected_agg["MISC"]["exact"]
    assert results['strict'] == expected['strict']
    assert results['ent_type'] == expected['ent_type']
    assert results['partial'] == expected['partial']
    assert results['exact'] == expected['exact']

def test_evaluator_list_simple_case_filtered_tags():
    """
    Check that tags can be excluded by passing the tags argument
    """
    true = [
        ['O', 'O', 'B-PER', 'I-PER', 'O'],
        ['O', 'B-LOC', 'I-LOC', 'B-LOC', 'I-LOC', 'O'],
        ['O', 'B-MISC', 'I-MISC', 'O', 'O', 'O'],
    ]
    pred = [
        ['O', 'O', 'B-PER', 'I-PER', 'O'],
        ['O', 'B-LOC', 'I-LOC', 'B-LOC', 'I-LOC', 'O'],
        ['O', 'B-MISC', 'I-MISC', 'O', 'O', 'O'],
    ]
    evaluator = Evaluator(true, pred, tags=['PER', 'LOC'], loader="list")
    results, results_agg = evaluator.evaluate()
    metrics = {'correct': 3, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0,
               'possible': 3, 'actual': 3, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
    expected = {'strict': metrics, 'ent_type': metrics, 'partial': metrics, 'exact': metrics}
    assert results['strict'] == expected['strict']
    assert results['ent_type'] == expected['ent_type']
    assert results['partial'] == expected['partial']
    assert results['exact'] == expected['exact']

def test_evaluator_compare_results_and_results_agg():
    """
    Check that the label level results match the total results.
    """
    true = "word\tO\nword\tO\nword\tB-PER\nword\tI-PER\nword\tO\nword\tO"
    pred = "word\tO\nword\tO\nword\tB-PER\nword\tI-PER\nword\tO\nword\tO"
    evaluator = Evaluator(true, pred, tags=['PER'], loader="conll")
    results, results_agg = evaluator.evaluate()
    metrics = {'correct': 1, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0,
               'possible': 1, 'actual': 1, 'precision': 1, 'recall': 1, 'f1': 1.0}
    expected = {'strict': metrics, 'ent_type': metrics, 'partial': metrics, 'exact': metrics}
    expected_agg = {'PER': expected}
    assert results_agg["PER"]["strict"] == expected_agg["PER"]["strict"]
    assert results_agg["PER"]["ent_type"] == expected_agg["PER"]["ent_type"]
    assert results_agg["PER"]["partial"] == expected_agg["PER"]["partial"]
    assert results_agg["PER"]["exact"] == expected_agg["PER"]["exact"]
    assert results['strict'] == expected['strict']
    assert results['ent_type'] == expected['ent_type']
    assert results['partial'] == expected['partial']
    assert results['exact'] == expected['exact']
    assert results['strict'] == expected_agg['PER']['strict']
    assert results['ent_type'] == expected_agg['PER']['ent_type']
    assert results['partial'] == expected_agg['PER']['partial']
    assert results['exact'] == expected_agg['PER']['exact']

def test_evaluator_list_simple_case():
    true = [
        ['O', 'O', 'B-PER', 'I-PER', 'O'],
        ['O', 'B-LOC', 'I-LOC', 'B-LOC', 'I-LOC', 'O'],
    ]
    pred = [
        ['O', 'O', 'B-PER', 'I-PER', 'O'],
        ['O', 'B-LOC', 'I-LOC', 'B-LOC', 'I-LOC', 'O'],
    ]
    evaluator = Evaluator(true, pred, tags=['LOC', 'PER'], loader="list")
    results, results_agg = evaluator.evaluate()
    metrics = {'correct': 3, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0,
               'possible': 3, 'actual': 3, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
    expected = {'strict': metrics, 'ent_type': metrics, 'partial': metrics, 'exact': metrics}
    assert results['strict'] == expected['strict']
    assert results['ent_type'] == expected['ent_type']
    assert results['partial'] == expected['partial']
    assert results['exact'] == expected['exact']

def test_evaluator_compare_results_and_results_agg_1():
    """
    Test case when model predicts a label not in the test data.
    """
    true = (
        "word\tO\nword\tO\nword\tO\nword\tO\nword\tO\nword\tO\n\n"
        "word\tO\nword\tO\nword\tB-ORG\nword\tI-ORG\nword\tO\nword\tO\n\n"
        "word\tO\nword\tO\nword\tB-MISC\nword\tI-MISC\nword\tO\nword\tO\n\n"
    )
    pred = (
        "word\tO\nword\tO\nword\tB-PER\nword\tI-PER\nword\tO\nword\tO\n\n"
        "word\tO\nword\tO\nword\tB-ORG\nword\tI-ORG\nword\tO\nword\tO\n\n"
        "word\tO\nword\tO\nword\tB-MISC\nword\tI-MISC\nword\tO\nword\tO\n\n"
    )
    evaluator = Evaluator(true, pred, tags=['PER', 'ORG', 'MISC'], loader="conll")
    results, results_agg = evaluator.evaluate()
    overall = {'correct': 2, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 1,
               'possible': 2, 'actual': 3, 'precision': 0.6666666666666666, 'recall': 1.0, 'f1': 0.8}
    expected = {'strict': overall, 'ent_type': overall, 'partial': overall, 'exact': overall}
    # The spurious PER prediction is counted against every evaluated tag.
    per_tag = {'correct': 1, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 1,
               'possible': 1, 'actual': 2, 'precision': 0.5, 'recall': 1, 'f1': 0.6666666666666666}
    expected_agg = {
        'ORG': {'strict': per_tag, 'ent_type': per_tag, 'partial': per_tag, 'exact': per_tag},
        'MISC': {'strict': per_tag, 'ent_type': per_tag, 'partial': per_tag, 'exact': per_tag},
    }
    assert results_agg["ORG"]["strict"] == expected_agg["ORG"]["strict"]
    assert results_agg["ORG"]["ent_type"] == expected_agg["ORG"]["ent_type"]
    assert results_agg["ORG"]["partial"] == expected_agg["ORG"]["partial"]
    assert results_agg["ORG"]["exact"] == expected_agg["ORG"]["exact"]
    assert results_agg["MISC"]["strict"] == expected_agg["MISC"]["strict"]
    assert results_agg["MISC"]["ent_type"] == expected_agg["MISC"]["ent_type"]
    assert results_agg["MISC"]["partial"] == expected_agg["MISC"]["partial"]
    assert results_agg["MISC"]["exact"] == expected_agg["MISC"]["exact"]
    assert results['strict'] == expected['strict']
    assert results['ent_type'] == expected['ent_type']
    assert results['partial'] == expected['partial']
    assert results['exact'] == expected['exact']

def test_evaluator_conll_simple_case_filtered_tags():
    """
    Check that tags can be excluded by passing the tags argument
    """
    # MISC entities are present in the data but excluded via the tags argument.
    true = (
        "word\tO\nword\tO\nword\tB-PER\nword\tI-PER\nword\tO\n\n"
        "word\tO\nword\tB-LOC\nword\tI-LOC\nword\tB-LOC\nword\tI-LOC\nword\tO\n\n"
        "word\tO\nword\tB-MISC\nword\tI-MISC\nword\tO\nword\tO\nword\tO\n"
    )
    pred = (
        "word\tO\nword\tO\nword\tB-PER\nword\tI-PER\nword\tO\n\n"
        "word\tO\nword\tB-LOC\nword\tI-LOC\nword\tB-LOC\nword\tI-LOC\nword\tO\n\n"
        "word\tO\nword\tB-MISC\nword\tI-MISC\nword\tO\nword\tO\nword\tO\n"
    )
    evaluator = Evaluator(true, pred, tags=['PER', 'LOC'], loader="conll")
    results, results_agg = evaluator.evaluate()
    metrics = {'correct': 3, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0,
               'possible': 3, 'actual': 3, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
    expected = {'strict': metrics, 'ent_type': metrics, 'partial': metrics, 'exact': metrics}
    assert results['strict'] == expected['strict']
    assert results['ent_type'] == expected['ent_type']
    assert results['partial'] == expected['partial']
    assert results['exact'] == expected['exact']

def test_evaluator_no_entities_in_prediction():
    """
    Case when the model predicts no entities at all.
    """
    true = [
        [{"label": "PER", "start": 2, "end": 4}],
    ]
    pred = [
        [],
    ]
    evaluator = Evaluator(true, pred, tags=['PER'])
    results, results_agg = evaluator.evaluate()
    metrics = {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 1, 'spurious': 0,
               'possible': 1, 'actual': 0, 'precision': 0, 'recall': 0, 'f1': 0}
    expected = {'strict': metrics, 'ent_type': metrics, 'partial': metrics, 'exact': metrics}
    assert results['strict'] == expected['strict']
    assert results['ent_type'] == expected['ent_type']
    assert results['partial'] == expected['partial']
    assert results['exact'] == expected['exact']

def test_evaluator_compare_results_and_results_agg():
    """
    Check that the label level results match the total results.
    """
    true = [
        [{"label": "PER", "start": 2, "end": 4}],
    ]
    pred = [
        [{"label": "PER", "start": 2, "end": 4}],
    ]
    evaluator = Evaluator(true, pred, tags=['PER'])
    results, results_agg = evaluator.evaluate()
    metrics = {'correct': 1, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0,
               'possible': 1, 'actual': 1, 'precision': 1, 'recall': 1, 'f1': 1}
    expected = {'strict': metrics, 'ent_type': metrics, 'partial': metrics, 'exact': metrics}
    expected_agg = {'PER': expected}
    assert results_agg["PER"]["strict"] == expected_agg["PER"]["strict"]
    assert results_agg["PER"]["ent_type"] == expected_agg["PER"]["ent_type"]
    assert results_agg["PER"]["partial"] == expected_agg["PER"]["partial"]
    assert results_agg["PER"]["exact"] == expected_agg["PER"]["exact"]
    assert results['strict'] == expected['strict']
    assert results['ent_type'] == expected['ent_type']
    assert results['partial'] == expected['partial']
    assert results['exact'] == expected['exact']
    assert results['strict'] == expected_agg['PER']['strict']
    assert results['ent_type'] == expected_agg['PER']['ent_type']
    assert results['partial'] == expected_agg['PER']['partial']
    assert results['exact'] == expected_agg['PER']['exact']

def test_evaluator_with_extra_keys_in_true():
    true = [
        [{"label": "PER", "start": 2, "end": 4, "token_start": 0, "token_end": 4}],
        [{"label": "LOC", "start": 1, "end": 2, "token_start": 0, "token_end": 5},
         {"label": "LOC", "start": 3, "end": 4, "token_start": 7, "token_end": 9}],
    ]
    pred = [
        [{"label": "PER", "start": 2, "end": 4}],
        [{"label": "LOC", "start": 1, "end": 2}, {"label": "LOC", "start": 3, "end": 4}],
    ]
    evaluator = Evaluator(true, pred, tags=['LOC', 'PER'])
    results, results_agg = evaluator.evaluate()
    metrics = {'correct': 3, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0,
               'possible': 3, 'actual': 3, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
    expected = {'strict': metrics, 'ent_type': metrics, 'partial': metrics, 'exact': metrics}
    assert results['strict'] == expected['strict']
    assert results['ent_type'] == expected['ent_type']
    assert results['partial'] == expected['partial']
    assert results['exact'] == expected['exact']

def test_evaluator_compare_results_and_results_agg_1():
    """
    Test case when model predicts a label not in the test data.
    """
    true = [
        [],
        [{"label": "ORG", "start": 2, "end": 4}],
        [{"label": "MISC", "start": 2, "end": 4}],
    ]
    pred = [
        [{"label": "PER", "start": 2, "end": 4}],
        [{"label": "ORG", "start": 2, "end": 4}],
        [{"label": "MISC", "start": 2, "end": 4}],
    ]
    evaluator = Evaluator(true, pred, tags=["PER", "ORG", "MISC"])
    results, results_agg = evaluator.evaluate()
    overall = {"correct": 2, "incorrect": 0, "partial": 0, "missed": 0, "spurious": 1,
               "possible": 2, "actual": 3, "precision": 0.6666666666666666, "recall": 1.0, "f1": 0.8}
    expected = {"strict": overall, "ent_type": overall, "partial": overall, "exact": overall}
    # Here the spurious PER prediction is not attributed to the ORG/MISC tags.
    per_tag = {"correct": 1, "incorrect": 0, "partial": 0, "missed": 0, "spurious": 0,
               "possible": 1, "actual": 1, "precision": 1, "recall": 1, "f1": 1}
    expected_agg = {
        "ORG": {"strict": per_tag, "ent_type": per_tag, "partial": per_tag, "exact": per_tag},
        "MISC": {"strict": per_tag, "ent_type": per_tag, "partial": per_tag, "exact": per_tag},
    }
    assert results_agg["ORG"]["strict"] == expected_agg["ORG"]["strict"]
    assert results_agg["ORG"]["ent_type"] == expected_agg["ORG"]["ent_type"]
    assert results_agg["ORG"]["partial"] == expected_agg["ORG"]["partial"]
    assert results_agg["ORG"]["exact"] == expected_agg["ORG"]["exact"]
    assert results_agg["MISC"]["strict"] == expected_agg["MISC"]["strict"]
    assert results_agg["MISC"]["ent_type"] == expected_agg["MISC"]["ent_type"]
    assert results_agg["MISC"]["partial"] == expected_agg["MISC"]["partial"]
    assert results_agg["MISC"]["exact"] == expected_agg["MISC"]["exact"]
    assert results["strict"] == expected["strict"]
    assert results["ent_type"] == expected["ent_type"]
    assert results["partial"] == expected["partial"]
    assert results["exact"] == expected["exact"]

def test_issue_29():
    true = [[
        {"label": "PER", "start": 1, "end": 2},
        {"label": "PER", "start": 3, "end": 10},
    ]]
    pred = [[
        {"label": "PER", "start": 1, "end": 2},
        {"label": "PER", "start": 3, "end": 5},
        {"label": "PER", "start": 6, "end": 10},
    ]]
    evaluator = Evaluator(true, pred, tags=["PER"])
    results, results_agg = evaluator.evaluate()
    # The prediction at 3-5 has the right type but wrong boundaries, so strict
    # and exact count it as incorrect, while partial gives it half credit.
    boundary_error = {"correct": 1, "incorrect": 1, "partial": 0, "missed": 0, "spurious": 1,
                      "possible": 2, "actual": 3, "precision": 0.3333333333333333,
                      "recall": 0.5, "f1": 0.4}
    expected = {
        "strict": boundary_error,
        "ent_type": {"correct": 2, "incorrect": 0, "partial": 0, "missed": 0, "spurious": 1,
                     "possible": 2, "actual": 3, "precision": 0.6666666666666666,
                     "recall": 1.0, "f1": 0.8},
        "partial": {"correct": 1, "incorrect": 0, "partial": 1, "missed": 0, "spurious": 1,
                    "possible": 2, "actual": 3, "precision": 0.5, "recall": 0.75, "f1": 0.6},
        "exact": boundary_error,
    }
    assert results["strict"] == expected["strict"]
    assert results["ent_type"] == expected["ent_type"]
    assert results["partial"] == expected["partial"]
    assert results["exact"] == expected["exact"]

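# Worked arithmetic for the "partial" block above (MUC-style scoring as used
# by nervaluate, where a partial boundary match counts as half a match):
#   precision = (correct + 0.5 * partial) / actual   = (1 + 0.5) / 3 = 0.5
#   recall    = (correct + 0.5 * partial) / possible = (1 + 0.5) / 2 = 0.75
#   f1        = 2 * precision * recall / (precision + recall) = 0.6
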
def test_evaluator_simple_case():
    true = (
        "word\tO\nword\tO\nword\tB-PER\nword\tI-PER\nword\tO\n\n"
        "word\tO\nword\tB-LOC\nword\tI-LOC\nword\tB-LOC\nword\tI-LOC\nword\tO\n"
    )
    pred = (
        "word\tO\nword\tO\nword\tB-PER\nword\tI-PER\nword\tO\n\n"
        "word\tO\nword\tB-LOC\nword\tI-LOC\nword\tB-LOC\nword\tI-LOC\nword\tO\n"
    )
    evaluator = Evaluator(true, pred, tags=['LOC', 'PER'], loader="conll")
    results, results_agg = evaluator.evaluate()
    metrics = {'correct': 3, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0,
               'possible': 3, 'actual': 3, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
    expected = {'strict': metrics, 'ent_type': metrics, 'partial': metrics, 'exact': metrics}
    assert results['strict'] == expected['strict']
    assert results['ent_type'] == expected['ent_type']
    assert results['partial'] == expected['partial']
    assert results['exact'] == expected['exact']