def test_seq_correct_score(self):
    metric = explainaboard.metrics.accuracy.SeqCorrectCountConfig(
        name='SeqCorrectCount'
    ).to_metric()
    true = [
        {
            "start_idx": [8, 17, 39, 46, 58, 65, 65, 80],
            "end_idx": [8, 18, 40, 47, 59, 65, 66, 81],
            "corrections": [
                ["the"],
                ["found"],
                ["other"],
                ["there"],
                ["chickens."],
                ["in"],
                ["which"],
                ["selling"],
            ],
        }
    ]
    pred = [
        {
            "start_idx": [8, 17, 39, 46, 58],
            "end_idx": [8, 18, 40, 47, 59],
            "corrections": [
                ["the"],
                ["found"],
                ["other"],
                ["there"],
                ["chickens."],
            ],
        }
    ]
    result = metric.evaluate(true, pred)
    # All five predicted edits also appear in the reference, so the correct count is 5.
    self.assertAlmostEqual(result.value, 5)
def test_correct_score(self):
    metric = explainaboard.metrics.accuracy.CorrectCountConfig(
        name='CorrectCount'
    ).to_metric()
    true = ['a', 'b', 'a', 'b', 'a', 'b']
    pred = ['a', 'b', 'a', 'b', 'b', 'a']
    result = metric.evaluate(true, pred, conf_value=0.05)
    # Four of the six predictions match the gold labels, so the count is 4.
    self.assertAlmostEqual(result.value, 4)
def test_accuracy(self):
    metric = explainaboard.metrics.accuracy.AccuracyConfig(
        name='Accuracy'
    ).to_metric()
    true = ['a', 'b', 'a', 'b', 'a', 'b']
    pred = ['a', 'b', 'a', 'b', 'b', 'a']
    result = metric.evaluate(true, pred, conf_value=0.05)
    # 4 of the 6 predictions are correct, so accuracy is 2/3.
    self.assertAlmostEqual(result.value, 2.0 / 3.0)
def test_mrr(self):
    metric = explainaboard.metrics.ranking.MeanReciprocalRankConfig(
        name='MRR'
    ).to_metric()
    true = ['a', 'b', 'a', 'b', 'a', 'b']
    pred = [['a', 'b'], ['c', 'd'], ['c', 'a'], ['a', 'c'], ['b', 'a'], ['a', 'b']]
    result = metric.evaluate(true, pred, conf_value=0.05)
    self.assertAlmostEqual(result.value, 2.5 / 6.0)
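# Sketch of the arithmetic behind the expected value in test_mrr, written as a plain-Python
# helper so the computation is visible. It assumes the standard MRR definition (reciprocal
# of the 1-based rank of the gold item in the candidate list, 0 if absent) and is not
# explainaboard's implementation; _reference_mrr is a hypothetical name used only here.
def _reference_mrr(true_labels, ranked_preds):
    total = 0.0
    for label, candidates in zip(true_labels, ranked_preds):
        if label in candidates:
            # Reciprocal of the 1-based position of the gold label.
            total += 1.0 / (candidates.index(label) + 1)
    return total / len(true_labels)

# For the data in test_mrr the reciprocal ranks are 1, 0, 1/2, 0, 1/2, 1/2, so the mean is
# 2.5 / 6, matching the asserted value.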
def test_f1_macro(self):
    metric = explainaboard.metrics.f1_score.F1ScoreConfig(
        name='F1', average='macro'
    ).to_metric()
    true = ['a', 'b', 'a', 'b', 'a', 'a', 'c', 'c']
    pred = ['a', 'b', 'a', 'b', 'b', 'a', 'c', 'a']
    sklearn_f1 = sklearn.metrics.f1_score(true, pred, average='macro')
    result = metric.evaluate(true, pred, conf_value=None)
    self.assertAlmostEqual(result.value, sklearn_f1)
def test_ner_f1(self):
    true = [
        ['O', 'O', 'B-MISC', 'I-MISC', 'B-MISC', 'O', 'O'],
        ['B-PER', 'I-PER', 'O'],
    ]
    pred = [
        ['O', 'O', 'B-MISC', 'I-MISC', 'B-MISC', 'I-MISC', 'O'],
        ['B-PER', 'I-PER', 'O'],
    ]

    metric = explainaboard.metrics.f1_score.SeqF1ScoreConfig(
        name='MicroF1', average='micro', tag_schema='bio'
    ).to_metric()
    result = metric.evaluate(true, pred, conf_value=None)
    # Micro: 2 of the 3 predicted spans are correct and 2 of the 3 gold spans are found,
    # so precision = recall = F1 = 2/3.
    self.assertAlmostEqual(result.value, 2.0 / 3.0)

    metric = explainaboard.metrics.f1_score.SeqF1ScoreConfig(
        name='MacroF1', average='macro', tag_schema='bio'
    ).to_metric()
    result = metric.evaluate(true, pred, conf_value=None)
    # Macro: F1 is 1/2 for MISC and 1 for PER, so the average is 3/4.
    self.assertAlmostEqual(result.value, 3.0 / 4.0)
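# Sketch of how the span counts behind test_ner_f1 can be derived: extract (type, start, end)
# spans from the BIO tags and compare them. This assumes conventional BIO decoding and is not
# explainaboard's implementation; _bio_spans is a hypothetical helper used only for illustration.
def _bio_spans(tags):
    spans, current = [], None
    for i, tag in enumerate(tags):
        if tag.startswith('B-'):
            # A B- tag always opens a new span.
            if current:
                spans.append(current)
            current = (tag[2:], i, i)
        elif tag.startswith('I-') and current and current[0] == tag[2:]:
            # An I- tag extends the open span of the same type.
            current = (current[0], current[1], i)
        else:
            # O tags (or type mismatches) close any open span.
            if current:
                spans.append(current)
            current = None
    if current:
        spans.append(current)
    return spans

# For the first sentence above, the gold spans are MISC(2, 3) and MISC(4, 4), while the
# predicted spans are MISC(2, 3) and MISC(4, 5): one of the two MISC spans matches, and the
# PER span in the second sentence matches exactly, which yields the 2/3 and 3/4 scores.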
def test_hits(self):
    metric = explainaboard.metrics.ranking.HitsConfig(name='Hits').to_metric()
    true = ['a', 'b', 'a', 'b', 'a', 'b']
    pred = [['a', 'b'], ['c', 'd'], ['c', 'a'], ['a', 'c'], ['b', 'a'], ['a', 'b']]
    result = metric.evaluate(true, pred, conf_value=0.05)
    # The true label appears in the candidate list for 4 of the 6 examples.
    self.assertAlmostEqual(result.value, 4.0 / 6.0)
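# Sketch of the Hits arithmetic checked in test_hits: an example counts as a hit when the
# gold label appears anywhere in its candidate list. This is an illustrative assumption about
# the standard Hits metric, not explainaboard's code; _reference_hits is a hypothetical name.
def _reference_hits(true_labels, ranked_preds):
    hits = sum(1 for label, candidates in zip(true_labels, ranked_preds) if label in candidates)
    return hits / len(true_labels)

# For the data above, 4 of the 6 examples are hits, so the score is 4 / 6.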