import logging

# AbstractEvaluator, ProteinEvaluation and ProteinMacroEvaluation are
# provided by the surrounding biocreative.evaluation package.


class ProteinEvaluator(AbstractEvaluator):
    """Implementation of the evaluation process for INT and IPT."""

    def reset(self):
        """Reset the internal state to reuse the evaluator."""
        self.primary_eval = ProteinEvaluation()
        self.secondary_eval = ProteinMacroEvaluation()
        self.results = None
        self.gold_standard = None
        self.logger = logging.getLogger("ProteinEvaluator")
        self._dois = None

    def _prepare(self):
        """Prepare the instance for the evaluation run."""
        assert len(self.results) == len(self.gold_standard), \
            "the entries in the evaluation result and the gold standard " \
            "do not match"
        self.primary_eval.set_fn(self.gold_standard.true_items())
        self.logger.debug(
            "INT/IPT evaluation: %i GS annotations" % self.primary_eval.hits.fn
        )

    def _process(self):
        """Process the result set."""
        self._dois = sorted(self.results.keys())
        result_sizes = [
            len(result_list) for result_list in self.results.values()
        ]
        max_rank_in_results = max(result_sizes) if len(result_sizes) else 0
        self.logger.info(
            "longest result set has %i annotations", max_rank_in_results
        )

        if self.cutoff and self.cutoff < max_rank_in_results:
            max_rank_in_results = self.cutoff

        for doi in list(self._dois):
            std_items = self.gold_standard[doi]
            result_doc = ProteinEvaluation(doi=doi, fn=len(std_items))
            self.secondary_eval[doi] = result_doc

        for rank in range(max_rank_in_results):
            for doi in list(self._dois):
                self._process_doi(doi, rank)

            # Calculate & store the average P/R pair at this rank
            # over all documents (macro-averaging).
            self.secondary_eval.store_p_at_current_r()
            # Calculate & store the current P/R value at this rank
            # over all documents (micro-averaging).
            self.primary_eval.store_p_at_current_r()

    def _process_doi(self, doi, rank):
        """Evaluate the result at a given rank for a document."""
        result_items = self.results[doi]
        std_items = self.gold_standard.get(doi)  # special syntax for mocking

        try:
            item = result_items[rank]
        except IndexError:
            # no more results for this DOI
            self._dois.remove(doi)
        else:
            if item.confidence is not None and \
                    item.confidence < self.min_conf:
                # confidence-based cutoff
                self._dois.remove(doi)
            else:
                # evaluate the result at the current rank
                self.primary_eval.evaluate_item(item, std_items)
                self.secondary_eval[doi].evaluate_item(item, std_items)
                self.secondary_eval[doi].store_p_at_current_r()
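# Illustrative driver (a minimal sketch, not part of the original module).
# It assumes the attributes used above: `results` maps a DOI to its ranked
# result items (each carrying a `.confidence`), `gold_standard` maps a DOI
# to its true annotations and offers `true_items()`, and `cutoff`/`min_conf`
# are set on the evaluator before processing. The function name and
# signature below are hypothetical.
def run_int_ipt_evaluation(evaluator, results, gold_standard,
                           cutoff=None, min_conf=0.0):
    """Run one INT/IPT evaluation pass and return micro and macro results."""
    evaluator.reset()
    evaluator.results = results
    evaluator.gold_standard = gold_standard
    evaluator.cutoff = cutoff      # optional rank cutoff
    evaluator.min_conf = min_conf  # confidence-based cutoff
    evaluator._prepare()           # size check, initial FN counts
    evaluator._process()           # rank-by-rank micro/macro averaging
    return evaluator.primary_eval, evaluator.secondary_eval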
# Test dependencies: Mock and patch from the mock library, plus the shared
# CalculationAssertions base class, the constants module C and
# ProteinEvaluation from the package's test helpers (exact import paths
# depend on the test layout).
from mock import Mock, patch


class ProteinEvaluationTest(CalculationAssertions):

    @patch('biocreative.evaluation.calculation.hits.Hits', spec=True)
    def setUp(self, unused):
        self.evaluator = ProteinEvaluation()

        for attr in C.HITS_ATTRIBUTES:
            setattr(self.evaluator.hits, attr, 2)

    def test_f_score(self):
        self.assert_property("f_score", 0.5)

    def test_evaluate_tp_item(self):
        self.evaluator.evaluate_item(1, [0, 1, 2])
        self.assert_hits(self.evaluator.hits, tp=3, fp=2, fn=1)

    def test_evaluate_fp_item(self):
        self.evaluator.evaluate_item(3, [0, 1, 2])
        self.assert_hits(self.evaluator.hits, tp=2, fp=3, fn=2)

    def test_evaluate_item_with_illegal_std_items(self):
        self.assertRaises(
            AssertionError, self.evaluator.evaluate_item, 1, (0, 1, 2)
        )

    def test_evaluate(self):
        self.evaluator.evaluate_item = Mock()
        self.evaluator.store_p_at_current_r = Mock()
        gs_set = [0, 1, 2]
        self.evaluator.evaluate([3, 4, 1, 2], gs_set, 3)
        self.assertTrue(self.evaluator.store_p_at_current_r.called)
        self.assertEqual(self.evaluator.store_p_at_current_r.call_count, 3)
        self.assertTrue(self.evaluator.evaluate_item.called)
        self.assertEqual(self.evaluator.evaluate_item.call_count, 3)
        arg_list = self.evaluator.evaluate_item.call_args_list
        exp_list = (((3, gs_set), {}), ((4, gs_set), {}), ((1, gs_set), {}))

        for call, args in enumerate(arg_list):
            self.assert_values(
                "evaluate_item call %i" % (call + 1), exp_list[call], args
            )

    def test_evaluate_with_illegal_result_items(self):
        self.assertRaises(
            AssertionError, self.evaluator.evaluate,
            (1, 2, 3), set([0, 1, 2]), 2
        )

    def set_up_avrg_p_test(self):
        for hits in (
            {'tp': 1, 'fp': 0, 'fn': 2, 'tn': 0},  # p=1.0, r=0.33
            {'tp': 1, 'fp': 1, 'fn': 2, 'tn': 0},  # p=0.5, r=0.33
            {'tp': 2, 'fp': 1, 'fn': 1, 'tn': 0},  # p=0.66, r=0.66
            {'tp': 2, 'fp': 2, 'fn': 1, 'tn': 0},  # p=0.5, r=0.66
            {'tp': 2, 'fp': 3, 'fn': 1, 'tn': 0},  # p=0.4, r=0.66
            {'tp': 3, 'fp': 3, 'fn': 0, 'tn': 0},  # p=0.5, r=1.0
        ):
            for attr, value in hits.items():
                setattr(self.evaluator.hits, attr, value)

            self.evaluator.store_p_at_current_r()

        self.p_at_full_r = 0.5
        self.avrg_p_values = [(1/1.0, 1/3.0), (2/3.0, 2/3.0), (3/6.0, 3/3.0)]
        self.pr_values = (
            (1/1.0, 1/3.0), (1/2.0, 1/3.0), (2/3.0, 2/3.0),
            (2/5.0, 2/3.0), (3/6.0, 3/3.0),
        )
        self.avrg_p = 0.0
        last_r = 0.0

        for p, r in self.avrg_p_values:
            self.avrg_p += p * (r - last_r)
            last_r = r

    def test_avrg_p_properties(self):
        self.set_up_avrg_p_test()
        self.assert_property("p_at_full_r", self.p_at_full_r)
        self.assert_property("avrg_p", self.avrg_p)

    def test_pr_values(self):
        self.set_up_avrg_p_test()
        pr_values = tuple(self.evaluator.yield_precision_recall_pairs())
        self.assertEqual(pr_values, self.pr_values)
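# Standalone sketch of the average-precision arithmetic that
# set_up_avrg_p_test expects: AP sums each stored precision weighted by the
# recall gained since the previous stored point. The function name is
# illustrative and not part of the tested module.
def average_precision(pr_pairs):
    """Compute AP from (precision, recall) pairs ordered by rank."""
    avrg_p, last_r = 0.0, 0.0
    for p, r in pr_pairs:
        avrg_p += p * (r - last_r)  # weight precision by the recall step
        last_r = r
    return avrg_p

# With the three points used above: AP = 1.0*(1/3) + (2/3)*(1/3) + 0.5*(1/3).
assert abs(average_precision(
    [(1/1.0, 1/3.0), (2/3.0, 2/3.0), (3/6.0, 3/3.0)]
) - (1.0/3 + (2.0/3)/3 + 0.5/3)) < 1e-9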