def test_P(self):
    """Aggregate P@5 over two queries should come out to exactly 0.5."""
    qrels_text = '''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 0 D5 2
'''
    run_text = '''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D3 2 0.7 run
1 0 D4 3 0.3 run
1 0 D2 4 0.4 run
1 0 D0 5 0.1 run
'''
    judgments = list(ir_measures.read_trec_qrels(qrels_text))
    ranking = list(ir_measures.read_trec_run(run_text))
    precision_at_5 = ir_measures.P @ 5
    # The measure object itself exposes calc_aggregate(qrels, run).
    self.assertEqual(precision_at_5.calc_aggregate(judgments, ranking), 0.5)
def main_cli():
    """Command-line entry point: score a run against qrels and emit the results.

    Positional arguments are the qrels source, the run file and one or more
    measure specifications. With ``--by_query`` every per-query result is
    emitted as it is produced, followed by a summary row per measure unless
    ``--no_summary`` is given. Without ``--by_query`` only the aggregate row
    per measure is emitted.

    An invalid flag combination (``--no_summary`` without ``--by_query``) is
    reported through ``parser.error``, which prints the usage message and
    exits, before any input is loaded.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('qrels')
    parser.add_argument('run')
    parser.add_argument('measures', nargs='+')
    # NOTE(review): --places is not read in this function; presumably the
    # output formatter picks it up from ``args`` -- confirm.
    parser.add_argument('--places', '-p', type=int, default=DEFAULT_PLACES)
    parser.add_argument('--by_query', '-q', action='store_true')
    parser.add_argument('--no_summary', '-n', action='store_true')
    parser.add_argument('--output_format', '-o', choices=OUTPUT_FORMATS.keys(), default='tsv')
    parser.add_argument('--provider', choices=ir_measures.providers.registry.keys())
    args = parser.parse_args()

    # Validate the flag combination up front with a real error rather than an
    # ``assert``: asserts vanish under ``python -O`` and previously fired only
    # after the inputs had already been read.
    if args.no_summary and not args.by_query:
        parser.error("--no_summary (-n) only supported with --by_query (-q)")

    qrels = _get_qrels(args)
    run = ir_measures.read_trec_run(args.run)
    measures = _get_measures(args)

    # Default to the top-level package unless a specific provider was requested.
    calc_obj = ir_measures
    if args.provider:
        calc_obj = ir_measures.providers.registry[args.provider]
    output = OUTPUT_FORMATS[args.output_format](args)

    if args.by_query:
        # One aggregator per measure, fed from the per-query results, so the
        # summary rows do not need a second evaluation pass.
        aggs = None if args.no_summary else {m: m.aggregator() for m in measures}
        for result in calc_obj.iter_calc(measures, qrels, run):
            output(result)
            if aggs:
                aggs[result.measure].add(result.value)
        if aggs:
            for measure in measures:
                output(Metric(query_id=SUMMARY_QID, measure=measure, value=aggs[measure].result()))
    else:
        results = calc_obj.calc_aggregate(measures, qrels, run)
        for measure in measures:
            output(Metric(query_id=SUMMARY_QID, measure=measure, value=results[measure]))
def test_NumQ(self):
    """NumQ from the pytrec_eval provider: 1 per query, 2 in aggregate."""
    qrels_text = '''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 0 D5 2
'''
    run_text = '''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D3 2 0.7 run
1 0 D4 3 0.3 run
1 0 D2 4 0.4 run
'''
    qrels = list(ir_measures.read_trec_qrels(qrels_text))
    run = list(ir_measures.read_trec_run(run_text))
    provider = ir_measures.providers.PytrecEvalProvider()
    measure = ir_measures.NumQ

    per_query = list(provider.iter_calc([measure], qrels, run))
    # Each query contributes a count of exactly one.
    for position, query_id in enumerate(("0", "1")):
        self.assertEqual(per_query[position].query_id, query_id)
        self.assertEqual(per_query[position].value, 1)

    aggregate = provider.calc_aggregate([measure], qrels, run)
    self.assertEqual(aggregate[measure], 2)
def test_accuracy(self):
    """Per-query and aggregate Accuracy values from the accuracy provider."""
    qrels_text = '''
0 0 C0_1 0
0 0 B0_1 1
0 0 A0_1 2
1 0 A1_1 2
1 0 A1_2 2
1 0 B1_1 1
1 0 C1_2 0
2 0 B1_1 1
'''
    run_text = '''
0 0 C0_1 1 0.4 run
0 0 A0_1 2 0.3 run
0 0 C0_2 3 0.2 run
0 0 C0_3 4 0.1 run
1 0 C1_1 1 0.8 run
1 0 A1_1 2 0.7 run
1 0 C1_2 3 0.6 run
1 0 B1_1 4 0.5 run
1 0 C1_3 5 0.4 run
2 0 B1_1 2 0.2 run
2 0 C1_1 3 0.1 run
'''
    qrels = list(ir_measures.read_trec_qrels(qrels_text))
    run = list(ir_measures.read_trec_run(run_text))
    provider = ir_measures.accuracy

    accuracy_1 = Accuracy(rel=1)
    results_1 = [('0', 2. / 3.), ('1', .5 * (2 / 3. + 1. / 3)), ('2', 1.)]
    cases = (
        (accuracy_1, results_1),
        (Accuracy(rel=2), [('0', 2. / 3), ('1', 0.75)]),
    )

    for measure, wanted in cases:
        with self.subTest(measure=measure):
            self.assertTrue(provider.supports(measure))
            got = list(provider.iter_calc([measure], qrels, run))
            self.assertEqual(len(got), len(wanted), "result lists length differ")
            # Lengths are equal at this point, so indexing walks the same pairs.
            for index, (query_id, value) in enumerate(wanted):
                self.assertEqual(got[index].query_id, query_id)
                self.assertAlmostEqual(got[index].value, value, delta=1e-9, msg=f"for query {query_id}")

    # The aggregate is compared with the plain mean of the rel=1 per-query values.
    mean_rel_1 = sum(value for _, value in results_1) / len(results_1)
    aggregate = provider.calc_aggregate([accuracy_1], qrels, run)
    self.assertAlmostEqual(aggregate[accuracy_1], mean_rel_1, delta=1e-9)
def test_nDCG(self):
    """gdeval nDCG at cutoffs 20 and 2: per-query, aggregate and reusable evaluator."""
    qrels_text = '''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 0 D5 2
'''
    run_text = '''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D3 2 0.7 run
1 0 D4 3 0.3 run
1 0 D2 4 0.4 run
'''
    qrels = list(ir_measures.read_trec_qrels(qrels_text))
    run = list(ir_measures.read_trec_run(run_text))
    provider = ir_measures.gdeval

    # Cutoff 20: per-query values, then the aggregate through both entry points.
    ndcg_20 = ir_measures.nDCG @ 20
    per_query = list(provider.iter_calc([ndcg_20], qrels, run))
    self.assertEqual(per_query[0].query_id, "0")
    self.assertEqual(per_query[0].value, 0.6201)
    self.assertEqual(per_query[1].query_id, "1")
    self.assertEqual(per_query[1].value, 0.35099)
    direct = provider.calc_aggregate([ndcg_20], qrels, run)
    self.assertEqual(direct[ndcg_20], 0.485545)
    via_evaluator = provider.evaluator([ndcg_20], qrels).calc_aggregate(run)
    self.assertEqual(via_evaluator[ndcg_20], 0.485545)

    # Cutoff 2.
    ndcg_2 = ir_measures.nDCG @ 2
    per_query = list(provider.iter_calc([ndcg_2], qrels, run))
    self.assertEqual(per_query[0].query_id, "0")
    self.assertEqual(per_query[0].value, 0.17377)
    self.assertEqual(per_query[1].query_id, "1")
    self.assertEqual(per_query[1].value, 0.38685)
    direct = provider.calc_aggregate([ndcg_2], qrels, run)
    self.assertEqual(direct[ndcg_2], 0.28031)

    # One evaluator handling both cutoffs must give the same answers when it
    # is invoked a second time.
    ev = provider.evaluator([ir_measures.nDCG @ 20, ir_measures.nDCG @ 2], qrels)
    for _ in range(2):
        res = ev.calc_aggregate(run)
        self.assertEqual(res[ir_measures.nDCG @ 20], 0.485545)
        self.assertEqual(res[ir_measures.nDCG @ 2], 0.28031)
def test_SetAP(self):
    """SetAP from the pytrec_eval provider at relevance thresholds 1, 2 and 3."""
    qrels_text = '''
0 0 D0 0
0 0 D1 1
0 0 D2 2
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 0 D5 0
'''
    run_text = '''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D4 2 0.7 run
1 0 D3 3 0.3 run
1 0 D2 4 0.4 run
'''
    qrels = list(ir_measures.read_trec_qrels(qrels_text))
    run = list(ir_measures.read_trec_run(run_text))
    provider = ir_measures.providers.PytrecEvalProvider()

    # (rel threshold, query "0" value, query "1" value, aggregate)
    expectations = (
        (1, 0.6, 0.125, 0.3625),
        (2, 0.4, 0.25, 0.325),
        (3, 0, 0, 0),
    )
    for rel, value_q0, value_q1, aggregate in expectations:
        measure = ir_measures.SetAP(rel=rel)
        per_query = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(per_query[0].query_id, "0")
        self.assertEqual(per_query[0].value, value_q0)
        self.assertEqual(per_query[1].query_id, "1")
        self.assertEqual(per_query[1].value, value_q1)
        self.assertEqual(provider.calc_aggregate([measure], qrels, run)[measure], aggregate)
def test_measures(self):
    """Every pair of providers supporting a measure must agree to 4 decimal places."""
    qrels_text = '''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 0 D3 2
0 0 D4 0
'''
    run_text = '''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
'''
    qrels = list(ir_measures.read_trec_qrels(qrels_text))
    run = list(ir_measures.read_trec_run(run_text))

    # Fresh list objects are handed to every measure, as in a literal spelling.
    def cutoffs():
        return [1, 5, 10, 20, 50, 100]

    def rels():
        return [1, 2]

    def dcgs():
        return ['log2', 'exp-log2']

    measures = ir_measures.util.flatten_measures([
        ir_measures.P(rel=rels()) @ cutoffs(),
        ir_measures.R(rel=rels()) @ cutoffs(),
        ir_measures.RR(rel=rels()),
        ir_measures.RR(rel=rels()) @ cutoffs(),
        ir_measures.Rprec(rel=rels()),
        ir_measures.AP(rel=rels()),
        ir_measures.AP(rel=rels()) @ cutoffs(),
        ir_measures.nDCG(dcg=dcgs()),
        ir_measures.nDCG(dcg=dcgs()) @ cutoffs(),
        ir_measures.Bpref(rel=rels()),
        ir_measures.Judged @ cutoffs(),
        ir_measures.ERR @ cutoffs(),
        # RBP is intentionally left out of this comparison.
    ])

    # trectools is excluded from the cross-provider comparison.
    providers = []
    for name, candidate in ir_measures.providers.registry.items():
        if name != 'trectools':
            providers.append(candidate)

    for measure in measures:
        # First per-query result from each provider that supports the measure.
        values = []
        for candidate in providers:
            if candidate.supports(measure):
                values.append((next(candidate.iter_calc([measure], qrels, run)), candidate))
        print(measure, len(values))
        for (v1, p1), (v2, p2) in itertools.combinations(values, 2):
            with self.subTest(measure=measure, p1=p1, p2=p2):
                self.assertAlmostEqual(v1.value, v2.value, places=4, msg=str(measure))
def test_measures(self):
    """Smoke test: pyndeval aggregates the full set of diversity measures without raising."""
    qrels_text = '''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 1 D2 1
0 1 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 1 D5 2
'''
    run_text = '''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D3 2 0.7 run
1 0 D4 3 0.3 run
1 0 D2 4 0.4 run
'''
    qrels = list(ir_measures.read_trec_qrels(qrels_text))
    run = list(ir_measures.read_trec_run(run_text))

    diversity_measures = [
        ERR_IA @ 5, ERR_IA(rel=2) @ 10,
        nERR_IA @ 5, nERR_IA(rel=2) @ 10,
        alpha_DCG @ 5,
        alpha_nDCG @ 5,
        NRBP, NRBP(rel=2),
        nNRBP, nNRBP(rel=2),
        AP_IA, AP_IA(rel=2),
        P_IA @ 5, P_IA(rel=2) @ 10,
        StRecall @ 5, StRecall(rel=2) @ 10,
    ]
    # No value is asserted; the call completing is the whole check.
    ir_measures.pyndeval.calc_aggregate(diversity_measures, qrels, run)
def test_nDCG(self):
    """pytrec_eval nDCG with negative judgments: plain, cut off at 3, and with custom gains."""
    qrels_text = '''
0 0 D0 1
0 0 D1 -1
0 0 D2 0
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 0 D4 -1
1 0 D5 0
'''
    run_text = '''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D4 2 0.7 run
1 0 D3 3 0.3 run
1 0 D2 4 0.4 run
'''
    qrels = list(ir_measures.read_trec_qrels(qrels_text))
    run = list(ir_measures.read_trec_run(run_text))
    provider = ir_measures.pytrec_eval

    # (measure, expected value for query '0', expected value for query '1')
    scenarios = (
        (ir_measures.nDCG, 0.76018, 0.32739),
        (ir_measures.nDCG @ 3, 0.76018, 0.0),
        (ir_measures.nDCG(gains={0: 1, 1: 4}), 0.97177, 0.14949),
    )
    for measure, value_q0, value_q1 in scenarios:
        expected = [
            Metric(query_id='0', measure=measure, value=value_q0),
            Metric(query_id='1', measure=measure, value=value_q1),
        ]
        self.assertMetrics(provider.iter_calc([measure], qrels, run), expected)
def test_ERR(self):
    """gdeval ERR at cutoffs 20 and 2, per query and in aggregate."""
    qrels_text = '''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 0 D5 2
'''
    run_text = '''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D3 2 0.7 run
1 0 D4 3 0.3 run
1 0 D2 4 0.4 run
'''
    qrels = list(ir_measures.read_trec_qrels(qrels_text))
    run = list(ir_measures.read_trec_run(run_text))
    provider = ir_measures.gdeval

    # (cutoff, query "0" value, query "1" value, aggregate)
    expectations = (
        (20, 0.10175, 0.09375, 0.09775),
        (2, 0.03125, 0.09375, 0.0625),
    )
    for cutoff, value_q0, value_q1, aggregate in expectations:
        measure = ir_measures.ERR @ cutoff
        per_query = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(per_query[0].query_id, "0")
        self.assertEqual(per_query[0].value, value_q0)
        self.assertEqual(per_query[1].query_id, "1")
        self.assertEqual(per_query[1].value, value_q1)
        self.assertEqual(provider.calc_aggregate([measure], qrels, run)[measure], aggregate)
def test_measures(self):
    """Check Compat scores from the compat provider against reference values.

    Per-query values are compared for p in {0.95, 0.9, 0.8}; the aggregate is
    checked for p=0.95 only. Inputs are the ``compat.qrels`` / ``compat.run``
    files that sit next to this test module.
    """
    here = os.path.dirname(__file__)
    qrels = list(ir_measures.read_trec_qrels(os.path.join(here, 'compat.qrels')))
    run = list(ir_measures.read_trec_run(os.path.join(here, 'compat.run')))
    provider = ir_measures.compat
    # based on a manual execution of https://github.com/claclark/Compatibility
    expected_results = [
        [
            Compat(p=0.95),
            [('31_1', 0.51779512165509), ('31_2', 0.018400100569017922)]
        ],
        [
            Compat(p=0.9),
            [('31_1', 0.3761334522946854), ('31_2', 0.004344079941789211)]
        ],
        [
            Compat(p=0.8),
            [('31_1', 0.16723008845234535), ('31_2', 0.00022806427320561776)]
        ],
    ]
    for measure, expected in expected_results:
        with self.subTest(measure=measure):
            self.assertTrue(provider.supports(measure))
            results = list(provider.iter_calc([measure], qrels, run))
            # zip() below stops at the shorter sequence, so without this check
            # an empty or short result list would pass without comparing anything.
            self.assertEqual(len(results), len(expected), "result lists length differ")
            for result, (query_id, value) in zip(results, expected):
                # Query ids are strings: they need exact equality.
                # assertAlmostEqual would attempt string subtraction and raise
                # TypeError on a mismatch instead of reporting a test failure.
                self.assertEqual(result.query_id, query_id)
                self.assertAlmostEqual(result.value, value, delta=1e-9)
    self.assertAlmostEqual(provider.calc_aggregate([Compat(p=0.95)], qrels, run)[Compat(p=0.95)], 0.268097611, delta=1e-9)
def test_ERR_IA(self):
    """pyndeval ERR_IA@20: per-query and aggregate values, to 4 decimal places."""
    qrels_text = '''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 1 D2 1
0 1 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 1 D5 2
'''
    run_text = '''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D3 2 0.7 run
1 0 D4 3 0.3 run
1 0 D2 4 0.4 run
'''
    qrels = list(ir_measures.read_trec_qrels(qrels_text))
    run = list(ir_measures.read_trec_run(run_text))
    provider = ir_measures.pyndeval
    measure = ir_measures.ERR_IA @ 20

    per_query = list(provider.iter_calc([measure], qrels, run))
    expected_rows = (("0", 0.4659), ("1", 0.1803))
    for position, (query_id, value) in enumerate(expected_rows):
        self.assertEqual(per_query[position].query_id, query_id)
        self.assertAlmostEqual(per_query[position].value, value, places=4)

    aggregate = provider.calc_aggregate([measure], qrels, run)
    self.assertAlmostEqual(aggregate[measure], 0.3231, places=4)
def test_P(self):
    """Smoke test: run a range of measures through ``iter_calc`` and print the results.

    Nothing is asserted; the test only fails if one of the calls raises.
    """
    qrels_text = '''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 0 D3 2
0 0 D4 0
'''
    run_text = '''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
'''
    qrels = list(ir_measures.read_trec_qrels(qrels_text))
    run = list(ir_measures.read_trec_run(run_text))

    # Evaluated and printed one after another, in this order.
    measures_to_try = [
        ir_measures.P @ 5,
        ir_measures.P(rel=2) @ 5,
        ir_measures.RR,
        ir_measures.RR(rel=2),
        ir_measures.Rprec,
        ir_measures.Rprec(rel=2),
        ir_measures.AP,
        ir_measures.AP @ 2,
        ir_measures.AP(rel=2),
        ir_measures.AP(rel=2) @ 2,
        ir_measures.nDCG,
        ir_measures.nDCG @ 2,
        ir_measures.R @ 2,
        ir_measures.R(rel=2) @ 2,
        ir_measures.Bpref,
        ir_measures.Bpref(rel=2),
        ir_measures.P(rel=(1, 2)) @ (1, 5, 10, 20),
        ir_measures.Judged @ 5,
        ir_measures.Judged @ 20,
        ir_measures.ERR @ 2,
        ir_measures.ERR @ 20,
        ir_measures.nDCG(dcg='exp-log2') @ 2,
        ir_measures.nDCG(dcg='exp-log2') @ 5,
    ]
    for measure in measures_to_try:
        print(list(measure.iter_calc(qrels, run)))
def test_P(self):
    """Check the first per-query result of the ranx provider for a range of measures.

    Each measure is evaluated on its own against a single-query fixture and
    the first returned ``Metric`` is compared for exact equality with the
    expected one. The cases run in table order and the test stops at the
    first mismatch.
    """
    qrels = list(ir_measures.read_trec_qrels('''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 0 D3 2
0 0 D4 0
'''))
    run = list(ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
'''))

    # (measure, expected value for query '0')
    cases = [
        (ir_measures.P @ 5, 0.6),
        (ir_measures.P(rel=2) @ 5, 0.2),
        (ir_measures.SetP, 0.6),
        (ir_measures.SetP(rel=2) @ 5, 0.2),
        (ir_measures.R @ 5, 1.0),
        (ir_measures.R(rel=2) @ 5, 1.0),
        (ir_measures.R @ 2, 0.3333333333333333),
        (ir_measures.R(rel=2) @ 2, 0.0),
        (ir_measures.SetR, 1.0),
        (ir_measures.SetR(rel=2) @ 5, 1.0),
        (ir_measures.RR, 0.5),
        (ir_measures.RR(rel=2), 0.3333333333333333),
        (ir_measures.RR @ 10, 0.5),
        (ir_measures.RR(rel=2) @ 10, 0.3333333333333333),
        (ir_measures.RR @ 2, 0.5),
        (ir_measures.RR(rel=2) @ 2, 0.),
        (ir_measures.AP, 0.6388888888888888),
        (ir_measures.AP(rel=2), 0.3333333333333333),
        (ir_measures.AP @ 10, 0.6388888888888888),
        (ir_measures.AP(rel=2) @ 10, 0.3333333333333333),
        (ir_measures.AP @ 2, 0.16666666666666666),
        (ir_measures.AP(rel=2) @ 2, 0.0),
        (ir_measures.Success @ 10, 1.),
        (ir_measures.Success(rel=2) @ 10, 1.),
        (ir_measures.Success @ 2, 1.),
        (ir_measures.Success(rel=2) @ 2, 0.),
        (ir_measures.NumRet(rel=1), 3.),
        # This result used to be computed but never checked. In the fixture
        # only D3 is judged at relevance >= 2 and it is retrieved, hence 1.
        (ir_measures.NumRet(rel=2), 1.),
        (ir_measures.nDCG, 0.6584645692843067),
        (ir_measures.nDCG @ 5, 0.6584645692843067),
        (ir_measures.nDCG @ 2, 0.23981246656813146),
        (ir_measures.nDCG(dcg='exp-log2'), 0.6201040599710453),
        (ir_measures.nDCG(dcg='exp-log2') @ 5, 0.6201040599710453),
        (ir_measures.nDCG(dcg='exp-log2') @ 2, 0.17376534287144002),
        (ir_measures.Rprec, 0.6666666666666666),
        (ir_measures.Rprec(rel=2), 0.),
    ]
    for measure, expected_value in cases:
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, expected_value))
def test_empty(self):
    """Exercise every provider with empty and partial qrels/run combinations.

    For each of five input scenarios, first ``iter_calc`` and then
    ``calc_aggregate`` is checked on each provider in turn: qrels with no
    run, qrels with a partial run, a run with no qrels, a run with partial
    qrels, and neither.
    """
    qrels = list(ir_measures.read_trec_qrels('''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 0 D5 2
'''))
    partial_qrels = [judgment for judgment in qrels if judgment.query_id == '0']
    run = list(ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D3 2 0.7 run
1 0 D4 3 0.3 run
1 0 D2 4 0.4 run
'''))
    partial_run = [scored for scored in run if scored.query_id == '0']
    empty = []

    p5, err5, judged5, rr5 = P@5, ERR@5, Judged@5, RR@5
    compat, accuracy = Compat(p=0.8), Accuracy()
    nan = float('NaN')

    # (provider, measure) pairs, in the order they are checked in every scenario.
    targets = [
        (ir_measures, p5),
        (ir_measures.gdeval, err5),
        (ir_measures.judged, judged5),
        (ir_measures.msmarco, rr5),
        (ir_measures.pytrec_eval, p5),
        (ir_measures.trectools, p5),
        (ir_measures.cwl_eval, p5),
        (ir_measures.compat, compat),
        (ir_measures.accuracy, accuracy),
    ]

    def both(measure, value_q0, value_q1):
        return {Metric('0', measure, value_q0), Metric('1', measure, value_q1)}

    def only_q0(measure, value_q0):
        return {Metric('0', measure, value_q0)}

    cwl_q0 = CwlMetric('0', p5, 0.6000000000000001, 3.0, 1.0, 5.0, 5.0)

    # (qrels, run, container used to collect the results, expected per target)
    per_query_cases = [
        # qrels but no run
        (qrels, empty, set, [
            both(p5, 0., 0.),
            both(err5, 0., 0.),
            both(judged5, 0., 0.),
            both(rr5, 0., 0.),
            both(p5, 0., 0.),
            both(p5, 0., 0.),
            both(p5, 0.0, 0.0),
            both(compat, 0.0, 0.0),
            set(),
        ]),
        # qrels but partial run
        (qrels, partial_run, set, [
            both(p5, 0.6, 0.),
            both(err5, 0.10175, 0.),
            both(judged5, 1., 0.),
            both(rr5, 0.5, 0.),
            both(p5, 0.6, 0.),
            both(p5, 0.6, 0.),
            {cwl_q0, Metric('1', p5, 0.0)},
            both(compat, 0.4744431703672816, 0.0),
            only_q0(accuracy, 0.5),
        ]),
        # run but no qrels
        (empty, run, list, [[] for _ in targets]),
        # run but partial qrels
        (partial_qrels, run, set, [
            only_q0(p5, 0.6),
            only_q0(err5, 0.10175),
            only_q0(judged5, 1.),
            only_q0(rr5, 0.5),
            only_q0(p5, 0.6),
            only_q0(p5, 0.6),
            {cwl_q0},
            only_q0(compat, 0.4744431703672816),
            only_q0(accuracy, 0.5),
        ]),
        # both no run and no qrels
        (empty, empty, list, [[] for _ in targets]),
    ]
    for case_qrels, case_run, collect, expectations in per_query_cases:
        for (provider, measure), expected in zip(targets, expectations):
            self.assertEqual(collect(provider.iter_calc([measure], case_qrels, case_run)), expected)

    # (qrels, run, expected aggregate value per target)
    aggregate_cases = [
        # qrels but no run
        (qrels, empty, [0., 0., 0., 0., 0., 0., 0., 0., nan]),
        # qrels but partial run
        (qrels, partial_run, [
            0.3, 0.050875, 0.5, 0.25, 0.3, 0.3,
            0.30000000000000004, 0.2372215851836408, 0.5,
        ]),
        # run but no qrels
        (empty, run, [nan for _ in targets]),
        # run but partial qrels
        (partial_qrels, run, [
            0.6, 0.10175, 1.0, 0.5, 0.6, 0.6,
            0.6000000000000001, 0.4744431703672816, 0.5,
        ]),
        # both no run and no qrels
        (empty, empty, [nan for _ in targets]),
    ]
    for case_qrels, case_run, expectations in aggregate_cases:
        for (provider, measure), expected in zip(targets, expectations):
            # numpy's assert_equal is used for the aggregate comparison
            # because several expected values are NaN.
            numpy.testing.assert_equal(
                provider.calc_aggregate([measure], case_qrels, case_run),
                {measure: expected})
def test_measures(self):
    """Check per-query cwl_eval provider values against reference output.

    Inputs are the ``cwl.qrels`` / ``cwl.run`` files that sit next to this
    test module. Values are compared with a tolerance of 0.0001.
    """
    here = os.path.dirname(__file__)
    qrels = list(ir_measures.read_trec_qrels(os.path.join(here, 'cwl.qrels')))
    run = list(ir_measures.read_trec_run(os.path.join(here, 'cwl.run')))
    provider = ir_measures.cwl_eval
    # based on a manual execution of cwl-eval
    expected_results = [
        [AP, [('T1', 0.7087), ('T2', 0.7438), ('T3', 0.3068)]],
        [
            BPM(T=1.0, max_rel=1) @ 20,
            [('T1', 1.0000), ('T2', 1.0000), ('T3', 0.1667)]
        ],
        [
            BPM(T=2.0, max_rel=1) @ 10,
            [('T1', 0.6667), ('T2', 1.0000), ('T3', 0.2857)]
        ],
        [
            INSQ(T=1.0, max_rel=1),
            [('T1', 0.5872), ('T2', 0.6629), ('T3', 0.0880)]
        ],
        [
            INSQ(T=2.0, max_rel=1),
            [('T1', 0.4513), ('T2', 0.5068), ('T3', 0.1292)]
        ],
        [
            INST(T=1.0, max_rel=1),
            [('T1', 0.7934), ('T2', 0.9226), ('T3', 0.0888)]
        ],
        [
            INST(T=2.0, max_rel=1),
            [('T1', 0.5994), ('T2', 0.6924), ('T3', 0.1397)]
        ],
        [
            SDCG(max_rel=1) @ 10,
            [('T1', 0.5645), ('T2', 0.6531), ('T3', 0.2848)]
        ],
        [
            NERR8(max_rel=1) @ 10,
            [('T1', 1.0000), ('T2', 1.0000), ('T3', 0.1667)]
        ],
        [
            NERR9(max_rel=1) @ 10,
            [('T1', 1.0000), ('T2', 1.0000), ('T3', 0.0680)]
        ],
        [
            NERR10(p=0.8, max_rel=1),
            [('T1', 1.0000), ('T2', 1.0000), ('T3', 0.0888)]
        ],
        [
            NERR11(T=2.0, max_rel=1),
            [('T1', 1.0000), ('T2', 1.0000), ('T3', 0.0691)]
        ],
        [P @ 1, [('T1', 1.0000), ('T2', 1.0000), ('T3', 0.0000)]],
        [P @ 10, [('T1', 0.5000), ('T2', 0.6000), ('T3', 0.4000)]],
        [P @ 20, [('T1', 0.2500), ('T2', 0.3000), ('T3', 0.2000)]],
        [P @ 5, [('T1', 0.6000), ('T2', 0.6000), ('T3', 0.0000)]],
        [
            RBP(p=0.9, rel=1),
            [('T1', 0.3501), ('T2', 0.3996), ('T3', 0.1988)]
        ],
        [RR, [('T1', 1.0000), ('T2', 1.0000), ('T3', 0.1667)]],
    ]
    for measure, expected in expected_results:
        with self.subTest(measure=measure):
            self.assertTrue(provider.supports(measure))
            results = list(provider.iter_calc([measure], qrels, run))
            # zip() stops at the shorter sequence; make sure every expected
            # row is really compared instead of being silently skipped.
            self.assertGreaterEqual(len(results), len(expected), "fewer results than expected rows")
            for result, (query_id, value) in zip(results, expected):
                # Query ids are strings: they need exact equality.
                # assertAlmostEqual would attempt string subtraction and raise
                # TypeError on a mismatch instead of reporting a test failure.
                self.assertEqual(result.query_id, query_id)
                self.assertAlmostEqual(result.value, value, delta=0.0001)
def test_SetF(self):
    """Exercise SetF through PytrecEvalProvider for several rel/beta settings.

    For each parameterisation the per-query results (query "0" first, then
    query "1") and the aggregate are compared with fixed expected values.
    Finally all four measures are aggregated in one call and looked up by
    freshly constructed, equal measure objects.
    """
    qrels = list(ir_measures.read_trec_qrels('''
0 0 D0 0
0 0 D1 1
0 0 D2 2
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 0 D5 0
'''))
    run = list(ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D4 2 0.7 run
1 0 D3 3 0.3 run
1 0 D2 4 0.4 run
'''))
    provider = ir_measures.providers.PytrecEvalProvider()

    def expect(actual, target, places):
        # places=None requests an exact comparison; any other value is
        # forwarded to assertAlmostEqual as the number of decimal places.
        if places is None:
            self.assertEqual(actual, target)
        else:
            self.assertAlmostEqual(actual, target, places=places)

    # Each row: (SetF keyword arguments, query "0", query "1", aggregate),
    # where every expectation is a (value, places) pair.
    scenarios = [
        ({'rel': 1}, (0.75, 4), (.33333, 4), (0.5417, 4)),
        ({'rel': 1, 'beta': 0.5}, (0.6923, 4), (0.3, None), (0.49615, 4)),
        ({'rel': 1, 'beta': 2.0}, (0.81818, 4), (0.375, None), (0.59659, 4)),
        ({'rel': 3}, (0, None), (0, None), (0, None)),
    ]
    for kwargs, first, second, overall in scenarios:
        measure = ir_measures.SetF(**kwargs)
        per_query = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(per_query[0].query_id, "0")
        expect(per_query[0].value, *first)
        self.assertEqual(per_query[1].query_id, "1")
        expect(per_query[1].value, *second)
        expect(provider.calc_aggregate([measure], qrels, run)[measure],
               *overall)

    # make sure the multiple invocations happen correctly
    requested = [
        ir_measures.SetF(rel=1),
        ir_measures.SetF(rel=1, beta=0.5),
        ir_measures.SetF(rel=1, beta=2.0),
        ir_measures.SetF(rel=3),
    ]
    combined = provider.calc_aggregate(requested, qrels, run)
    self.assertAlmostEqual(
        combined[ir_measures.SetF(rel=1)], 0.5417, places=4)
    self.assertAlmostEqual(
        combined[ir_measures.SetF(rel=1, beta=0.5)], 0.49615, places=4)
    self.assertAlmostEqual(
        combined[ir_measures.SetF(rel=1, beta=2.0)], 0.59659, places=4)
    self.assertEqual(combined[ir_measures.SetF(rel=3)], 0)
def test_define(self):
    """Check measures built with ``ir_measures.define`` from plain functions.

    Two generator functions are wrapped: ``my_p`` yields, per query, the
    fraction of merged rows whose relevance is above zero; ``my_s`` yields
    1.0 when at least one such row exists and 0.0 otherwise. The wrapped
    measures are evaluated with ``@ k`` applied, per query and aggregated.
    """
    def my_p(qrels, run):
        joined = run.merge(qrels, 'left', on=['query_id', 'doc_id'])
        for qid, group in joined.groupby('query_id'):
            positives = (group['relevance'] > 0).sum()
            yield qid, positives / len(group)

    def my_s(qrels, run):
        joined = run.merge(qrels, 'left', on=['query_id', 'doc_id'])
        for qid, group in joined.groupby('query_id'):
            if (group['relevance'] > 0).sum():
                yield qid, 1.
            else:
                yield qid, 0.

    MyP = ir_measures.define(my_p)
    MyS = ir_measures.define(my_s)
    qrels = list(ir_measures.read_trec_qrels('''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 0
1 0 D5 2
'''))
    run = list(ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D3 2 0.7 run
1 0 D4 3 0.3 run
1 0 D2 4 0.4 run
'''))

    # Each row: (defined measure, k, query "0" value, query "1" value,
    # aggregate value); every comparison below is exact.
    checks = [
        (MyP, 1, 0., 0., 0.0),
        (MyP, 2, 0.5, 0., 0.25),
        (MyP, 3, 0.6666666666666666, 0., 0.3333333333333333),
        (MyS, 2, 1., 0., 0.5),
    ]
    for base, k, q0_value, q1_value, mean_value in checks:
        per_query = list((base @ k).iter_calc(qrels, run))
        self.assertEqual(per_query[0].query_id, "0")
        self.assertEqual(per_query[0].value, q0_value)
        self.assertEqual(per_query[1].query_id, "1")
        self.assertEqual(per_query[1].value, q1_value)
        self.assertEqual((base @ k).calc_aggregate(qrels, run), mean_value)
def test_IPrec(self):
    """Check IPrec values from PytrecEvalProvider at several ``@`` levels.

    The first group uses ``IPrec`` with its default parameters and compares
    every value exactly. The second group uses ``IPrec(rel=2)``; there the
    query "0" value and the aggregate are compared to four decimal places
    while the query "1" value is compared exactly.
    """
    qrels = list(ir_measures.read_trec_qrels('''
0 0 D0 1
0 0 D1 1
0 0 D2 2
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 0 D5 0
'''))
    run = list(ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D4 2 0.7 run
1 0 D3 3 0.3 run
1 0 D2 4 0.4 run
'''))
    provider = ir_measures.providers.PytrecEvalProvider()

    # Default parameters. Each row: (level, query "0", query "1", aggregate).
    exact_cases = [
        (0.25, 1.0, .25, 0.625),
        (0.5, 1.0, .25, 0.625),
        (0.75, 1.0, 0, 0.5),
    ]
    for level, q0_value, q1_value, mean_value in exact_cases:
        measure = ir_measures.IPrec @ level
        per_query = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(per_query[0].query_id, "0")
        self.assertEqual(per_query[0].value, q0_value)
        self.assertEqual(per_query[1].query_id, "1")
        self.assertEqual(per_query[1].value, q1_value)
        self.assertEqual(
            provider.calc_aggregate([measure], qrels, run)[measure],
            mean_value)

    # rel=2: the same expectations hold at every level that is checked.
    for level in (0.1, 0.25, 0.5, 0.75):
        measure = ir_measures.IPrec(rel=2) @ level
        per_query = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(per_query[0].query_id, "0")
        self.assertAlmostEqual(per_query[0].value, .6666666, places=4)
        self.assertEqual(per_query[1].query_id, "1")
        self.assertEqual(per_query[1].value, .25)
        self.assertAlmostEqual(
            provider.calc_aggregate([measure], qrels, run)[measure],
            0.4583, places=4)
def test_infAP(self):
    """Compare AP and infAP from PytrecEvalProvider on qrels with -1 labels.

    AP and infAP are evaluated with default parameters and with ``rel=2``.
    For each, the per-query values for queries "0" and "1" and the
    aggregate are checked against fixed expected numbers.
    """
    qrels = list(ir_measures.read_trec_qrels('''
0 0 D0 1
0 0 D1 -1
0 0 D2 0
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 0 D4 -1
1 0 D5 0
'''))
    run = list(ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D4 2 0.7 run
1 0 D3 3 0.3 run
1 0 D2 4 0.4 run
'''))

    def verify(provider, measure, q0_value, q1_value, mean_value, q1_exact):
        # Query "0" and the aggregate are always compared to 4 decimal
        # places; q1_exact selects exact vs. 4-place comparison for "1".
        per_query = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(per_query[0].query_id, "0")
        self.assertAlmostEqual(per_query[0].value, q0_value, places=4)
        self.assertEqual(per_query[1].query_id, "1")
        if q1_exact:
            self.assertEqual(per_query[1].value, q1_value)
        else:
            self.assertAlmostEqual(per_query[1].value, q1_value, places=4)
        self.assertAlmostEqual(
            provider.calc_aggregate([measure], qrels, run)[measure],
            mean_value, places=4)

    provider = ir_measures.providers.PytrecEvalProvider()
    verify(provider, ir_measures.AP,
           0.8333, .125, 0.47916666666666663, q1_exact=True)
    verify(provider, ir_measures.infAP,
           0.8333, 0.1875, 0.510416, q1_exact=True)

    # The rel=2 variants run against a newly created provider instance.
    provider = ir_measures.providers.PytrecEvalProvider()
    verify(provider, ir_measures.AP(rel=2),
           0.33333, 0.25, 0.2917, q1_exact=False)
    verify(provider, ir_measures.infAP(rel=2),
           0.33333, 0.375, 0.3542, q1_exact=False)