Example #1
    def __init__(self):
        self.micro = results_pb2.Stats()
        self.macro = _MacroStats()
        # Map from info type name to Stats pb.
        self.per_type = collections.defaultdict(results_pb2.Stats)
        self.typeless_micro = results_pb2.Stats()
        self.typeless_macro = _MacroStats()
Example #2
    def testStrictMatching(self):
        finding = eval_lib.Finding
        findings = set([
            finding('TYPE_A', 0, 3, 'one'),
            finding('TYPE_B', 5, 8, 'two'),
            finding('TYPE_C', 20, 25, 'three'),
            finding('TYPE_D', 30, 34, 'four')
        ])
        golden_findings = set([
            finding('TYPE_A', 0, 3, 'hit'),
            finding('TYPE_B', 7, 10, 'hit'),
            finding('TYPE_C', 25, 29, 'miss'),
            finding('TYPE_E', 30, 34, 'wrong type')
        ])
        result = eval_lib.count_matches(findings,
                                        golden_findings,
                                        record_id='',
                                        strict=True,
                                        ignore_type=False)

        expected_stats = results_pb2.Stats()
        expected_stats.true_positives = 1
        expected_stats.false_positives = 3
        expected_stats.false_negatives = 3
        expected_stats.precision = 0.25
        expected_stats.recall = 0.25
        expected_stats.f_score = 0.25
        self.assertEqual(normalize_floats(expected_stats),
                         normalize_floats(result.stats))

        expected_typeless_stats = results_pb2.Stats()
        expected_typeless_stats.true_positives = 2
        expected_typeless_stats.false_positives = 2
        expected_typeless_stats.false_negatives = 2
        expected_typeless_stats.precision = 0.5
        expected_typeless_stats.recall = 0.5
        expected_typeless_stats.f_score = 0.5
        self.assertEqual(normalize_floats(expected_typeless_stats),
                         normalize_floats(result.typeless))

        a = results_pb2.Stats()
        a.true_positives = 1
        b = results_pb2.Stats()
        b.false_positives = 1
        b.false_negatives = 1
        c = results_pb2.Stats()
        c.false_positives = 1
        c.false_negatives = 1
        d = results_pb2.Stats()
        d.false_positives = 1
        e = results_pb2.Stats()
        e.false_negatives = 1
        expected_per_type = {
            'TYPE_A': a,
            'TYPE_B': b,
            'TYPE_C': c,
            'TYPE_D': d,
            'TYPE_E': e
        }
        self.assertEqual(expected_per_type, result.per_type)
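
The assertions above (and in the later tests) compare protos through a normalize_floats helper that is not reproduced in these examples. A minimal sketch, assuming it simply rounds every floating-point field of a Stats message so that values such as 0.666667 compare equal:

    def normalize_floats(pb):
        """Hypothetical helper: round float fields so protos compare reliably."""
        for desc, value in pb.ListFields():
            if desc.type in (desc.TYPE_FLOAT, desc.TYPE_DOUBLE):
                setattr(pb, desc.name, round(value, 6))
        return pb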
Example #3
    def testCalculateStats(self):
        stats = results_pb2.Stats()
        stats.true_positives = 12
        stats.false_positives = 8
        stats.false_negatives = 3
        eval_lib.calculate_stats(stats)
        self.assertAlmostEqual(.6, stats.precision)
        self.assertAlmostEqual(.8, stats.recall)
        self.assertAlmostEqual(.6857142857142856, stats.f_score)

        stats = results_pb2.Stats()
        eval_lib.calculate_stats(stats)
        self.assertTrue(math.isnan(stats.precision))
        self.assertTrue(math.isnan(stats.recall))
        self.assertTrue(math.isnan(stats.f_score))
        self.assertEqual(
            'Precision has denominator of zero. Recall has denominator of zero. '
            'f-score is NaN', stats.error_message)
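
For reference, the expected values in the first case follow from the standard definitions (a worked check, not the library's own code):

    tp, fp, fn = 12.0, 8.0, 3.0
    precision = tp / (tp + fp)                                # 12 / 20 = 0.6
    recall = tp / (tp + fn)                                   # 12 / 15 = 0.8
    f_score = 2 * precision * recall / (precision + recall)   # ~0.6857142857142856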
Example #4
    def testIntervalsCountNotExactMatch(self):
        finding = eval_lib.Finding
        findings = set([
            finding('NAME', 1, 8, 'he quic'),  # Golden contains.
            finding('NAME', 10, 19, 'brown fox'),  # Golden contained.
            finding('NAME', 20, 30, 'jumps over')  # Intersection.
        ])
        golden_findings = set([
            finding('NAME', 0, 9, 'The quick'),  # Golden contains.
            finding('NAME', 11, 18, 'rown fo'),  # Golden contained.
            finding('NAME', 26, 34, 'over the')  # Intersection.
        ])
        result = eval_lib.intervals_count_compare(findings,
                                                  golden_findings,
                                                  record_id='')

        expected_typeless = results_pb2.Stats()
        expected_typeless.true_positives = 3
        expected_typeless.false_positives = 3
        expected_typeless.false_negatives = 3
        expected_typeless.precision = 0.5
        expected_typeless.recall = 0.5
        expected_typeless.f_score = 0.5
        self.assertEqual(normalize_floats(expected_typeless),
                         normalize_floats(result.typeless))

        expected_total = results_pb2.Stats()
        expected_total.true_positives = 3
        expected_total.false_positives = 3
        expected_total.false_negatives = 3
        expected_total.precision = 0.5
        expected_total.recall = 0.5
        expected_total.f_score = 0.5
        self.assertEqual(normalize_floats(expected_total),
                         normalize_floats(result.stats))

        expected_name = results_pb2.Stats()
        expected_name.true_positives = 3
        expected_name.false_positives = 3
        expected_name.false_negatives = 3
        expected_per_type = {'NAME': expected_name}
        self.assertEqual(expected_per_type, result.per_type)
Example #5
    def testCharactersCountIgnoringNonAlphanumerics(self):
        finding = eval_lib.Finding
        findings = set([
            finding('NAME', 0, 9, 'The quick'),
            finding('ID', 10, 19, 'brown fox'),
            finding('ORGANIZATION', 20, 30, 'jumps over')
        ])
        golden_findings = set([
            finding('NAME', 0, 9, 'The quick'),
            finding('AGE', 10, 19, 'brown fox'),
            finding('DATE', 35, 43, 'lazy dog')
        ])
        result = eval_lib.characters_count_compare(
            findings,
            golden_findings,
            record_id='',
            ignore_nonalphanumerics=True)

        expected_typeless = results_pb2.Stats()
        expected_typeless.true_positives = 16
        expected_typeless.false_positives = 9
        expected_typeless.false_negatives = 7
        expected_typeless.precision = 0.64
        expected_typeless.recall = 0.695652
        expected_typeless.f_score = 0.666667
        self.assertEqual(normalize_floats(expected_typeless),
                         normalize_floats(result.typeless))

        expected_total = results_pb2.Stats()
        expected_total.true_positives = 8
        expected_total.false_positives = 17
        expected_total.false_negatives = 15
        expected_total.precision = 0.32
        expected_total.recall = 0.347826
        expected_total.f_score = 0.333333
        self.assertEqual(normalize_floats(expected_total),
                         normalize_floats(result.stats))

        expected_name = results_pb2.Stats()
        expected_name.true_positives = 8
        expected_id = results_pb2.Stats()
        expected_id.false_positives = 8
        expected_age = results_pb2.Stats()
        expected_age.false_negatives = 8
        expected_org = results_pb2.Stats()
        expected_org.false_positives = 9
        expected_date = results_pb2.Stats()
        expected_date.false_negatives = 7
        expected_per_type = {
            'NAME': expected_name,
            'ID': expected_id,
            'AGE': expected_age,
            'ORGANIZATION': expected_org,
            'DATE': expected_date
        }
        self.assertEqual(expected_per_type, result.per_type)
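
One reading consistent with the expected counts above: after dropping non-alphanumerics, each quoted span contributes its alphanumeric character count.

    # 'The quick' -> 8 chars, 'brown fox' -> 8, 'jumps over' -> 9, 'lazy dog' -> 7.
    # Typeless: NAME matches and ID overlaps AGE -> tp = 8 + 8 = 16;
    #   'jumps over' has no golden overlap -> fp = 9; 'lazy dog' is missed -> fn = 7.
    # Typed: only NAME agrees on type -> tp = 8, fp = 8 + 9 = 17, fn = 8 + 7 = 15.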
Example #6
    def testTypedTokensCount(self):
        finding = eval_lib.Finding
        findings = set([
            finding('NAME', 0, 9, 'The quick'),
            finding('ID', 10, 19, 'brown fox'),
            finding('ORGANIZATION', 20, 30, 'jumps over')
        ])
        golden_findings = set([
            finding('NAME', 0, 9, 'The quick'),
            finding('AGE', 10, 19, 'brown fox'),
            finding('DATE', 35, 43, 'lazy dog')
        ])
        result = eval_lib.typed_token_compare(findings,
                                              golden_findings,
                                              record_id='')

        expected_typeless = results_pb2.Stats()
        expected_typeless.true_positives = 4
        expected_typeless.false_positives = 2
        expected_typeless.false_negatives = 2
        expected_typeless.precision = 0.666667
        expected_typeless.recall = 0.666667
        expected_typeless.f_score = 0.666667
        self.assertEqual(normalize_floats(expected_typeless),
                         normalize_floats(result.typeless))

        expected_total = results_pb2.Stats()
        expected_total.true_positives = 2
        expected_total.false_positives = 4
        expected_total.false_negatives = 4
        expected_total.precision = 0.333333
        expected_total.recall = 0.333333
        expected_total.f_score = 0.333333
        self.assertEqual(normalize_floats(expected_total),
                         normalize_floats(result.stats))

        expected_name = results_pb2.Stats()
        expected_name.true_positives = 2
        expected_id = results_pb2.Stats()
        expected_id.false_positives = 2
        expected_age = results_pb2.Stats()
        expected_age.false_negatives = 2
        expected_org = results_pb2.Stats()
        expected_org.false_positives = 2
        expected_date = results_pb2.Stats()
        expected_date.false_negatives = 2
        expected_per_type = {
            'NAME': expected_name,
            'ID': expected_id,
            'AGE': expected_age,
            'ORGANIZATION': expected_org,
            'DATE': expected_date
        }
        self.assertEqual(expected_per_type, result.per_type)
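
Similarly, each finding here spans two whitespace-delimited tokens, which accounts for the expected token counts:

    # Typeless: NAME matches and ID overlaps AGE -> tp = 2 + 2 = 4;
    #   ORGANIZATION -> fp = 2; DATE -> fn = 2; precision = recall = 4 / 6 = 0.666667.
    # Typed: only NAME agrees on type -> tp = 2, fp = 2 + 2 = 4, fn = 2 + 2 = 4,
    #   precision = recall = 2 / 6 = 0.333333.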
Example #7
    def calculate_stats(self):
        """Generate a results_pb2.Stats message with the macro-averaged results."""
        stats = results_pb2.Stats()
        if not self.count:
            stats.precision = float('NaN')
            stats.recall = float('NaN')
            stats.f_score = float('NaN')
            stats.error_message = 'Averaging over zero results.'
            return stats
        stats.precision = float(self.precision_sum) / self.count
        stats.recall = float(self.recall_sum) / self.count
        stats.f_score = hmean(stats.precision, stats.recall)
        stats.error_message = self.error_message
        return stats
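
The hmean helper used above is not included in these snippets. A minimal sketch, assuming it is the harmonic mean of precision and recall (the F1 score) and returns NaN when the inputs are NaN or sum to zero:

    import math

    def hmean(precision, recall):
        """Hypothetical harmonic-mean helper assumed by calculate_stats above."""
        if math.isnan(precision) or math.isnan(recall) or (precision + recall) == 0:
            return float('NaN')
        return 2 * precision * recall / (precision + recall)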
Example #8
    def __init__(self):
        self.record_id = ''
        self.stats = results_pb2.Stats()
        # Map from info type name to Stats pb.
        self.per_type = collections.defaultdict(results_pb2.Stats)
        self.typeless = results_pb2.Stats()
        self.debug_info = []
Example #9
    def testAccumulateResults(self):
        result1 = eval_lib.IndividualResult()
        result1.stats.true_positives = 30
        result1.stats.false_positives = 20
        result1.stats.false_negatives = 10
        result1.per_type['TypeA'].true_positives = 9
        result1.per_type['TypeA'].false_positives = 8
        result1.per_type['TypeA'].false_negatives = 7
        result1.per_type['TypeB'].true_positives = 6
        result1.per_type['TypeB'].false_positives = 5
        result1.per_type['TypeB'].false_negatives = 4
        result1.typeless.true_positives = 15
        result1.typeless.false_positives = 14
        result1.typeless.false_negatives = 13
        eval_lib.calculate_stats(result1.stats)
        eval_lib.calculate_stats(result1.typeless)

        result2 = eval_lib.IndividualResult()
        result2.stats.true_positives = 3
        result2.stats.false_positives = 2
        result2.stats.false_negatives = 1
        result2.per_type['TypeA'].true_positives = 19
        result2.per_type['TypeA'].false_positives = 18
        result2.per_type['TypeA'].false_negatives = 17
        result2.per_type['TypeB'].true_positives = 16
        result2.per_type['TypeB'].false_positives = 15
        result2.per_type['TypeB'].false_negatives = 14
        result2.typeless.true_positives = 13
        result2.typeless.false_positives = 12
        result2.typeless.false_negatives = 11
        eval_lib.calculate_stats(result2.stats)
        eval_lib.calculate_stats(result2.typeless)

        ar = eval_lib.AccumulatedResults()
        ar.add_result(result1)
        ar.add_result(result2)

        expected_micro = results_pb2.Stats()
        expected_micro.true_positives = 33
        expected_micro.false_positives = 22
        expected_micro.false_negatives = 11
        self.assertEqual(expected_micro, ar.micro)

        expected_macro = results_pb2.Stats()
        expected_macro.precision = 0.6
        expected_macro.recall = 0.75
        expected_macro.f_score = 0.666667
        self.assertEqual(normalize_floats(expected_macro),
                         normalize_floats(ar.macro.calculate_stats()))

        expected_type_a = results_pb2.Stats()
        expected_type_a.true_positives = 28
        expected_type_a.false_positives = 26
        expected_type_a.false_negatives = 24
        expected_type_b = results_pb2.Stats()
        expected_type_b.true_positives = 22
        expected_type_b.false_positives = 20
        expected_type_b.false_negatives = 18
        expected_per_type = {
            'TypeA': expected_type_a,
            'TypeB': expected_type_b
        }
        self.assertEqual(expected_per_type, ar.per_type)

        expected_typeless_micro = results_pb2.Stats()
        expected_typeless_micro.true_positives = 28
        expected_typeless_micro.false_positives = 26
        expected_typeless_micro.false_negatives = 24
        self.assertEqual(expected_typeless_micro, ar.typeless_micro)

        expected_typeless_macro = results_pb2.Stats()
        expected_typeless_macro.precision = 0.518621
        expected_typeless_macro.recall = 0.53869
        expected_typeless_macro.f_score = 0.528465
        self.assertEqual(normalize_floats(expected_typeless_macro),
                         normalize_floats(ar.typeless_macro.calculate_stats()))
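
A reading consistent with the expected values: micro-averaging sums the raw counts across records, while macro-averaging takes the mean of the per-record precision and recall.

    # Micro: tp = 30 + 3 = 33, fp = 20 + 2 = 22, fn = 10 + 1 = 11.
    # Macro: result1 precision = 30 / 50 = 0.6,  recall = 30 / 40 = 0.75;
    #        result2 precision = 3 / 5   = 0.6,  recall = 3 / 4   = 0.75;
    #        macro precision = 0.6, recall = 0.75,
    #        f_score = 2 * 0.6 * 0.75 / 1.35 ~= 0.666667.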