예제 #1
0
    def test_NumQ(self):
        qrels = list(
            ir_measures.read_trec_qrels('''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 0 D5 2
'''))
        run = list(
            ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D3 2 0.7 run
1 0 D4 3 0.3 run
1 0 D2 4 0.4 run
'''))
        provider = ir_measures.providers.PytrecEvalProvider()
        measure = ir_measures.NumQ
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertEqual(result[0].value, 1)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, 1)
        self.assertEqual(
            provider.calc_aggregate([measure], qrels, run)[measure], 2)
예제 #2
0
    def test_P(self):
        qrels = list(
            ir_measures.read_trec_qrels('''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 0 D5 2
'''))
        run = list(
            ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D3 2 0.7 run
1 0 D4 3 0.3 run
1 0 D2 4 0.4 run
1 0 D0 5 0.1 run
'''))
        measure = ir_measures.P @ 5
        result = measure.calc_aggregate(qrels, run)
        self.assertEqual(result, 0.5)
예제 #3
0
    def test_accuracy(self):
        qrels = list(
            ir_measures.read_trec_qrels('''
0 0 C0_1 0
0 0 B0_1 1
0 0 A0_1 2

1 0 A1_1 2
1 0 A1_2 2
1 0 B1_1 1
1 0 C1_2 0

2 0 B1_1 1
'''))
        run = list(
            ir_measures.read_trec_run('''
0 0 C0_1 1 0.4 run
0 0 A0_1 2 0.3 run
0 0 C0_2 3 0.2 run
0 0 C0_3 4 0.1 run

1 0 C1_1 1 0.8 run
1 0 A1_1 2 0.7 run
1 0 C1_2 3 0.6 run
1 0 B1_1 4 0.5 run
1 0 C1_3 5 0.4 run

2 0 B1_1 2 0.2 run 
2 0 C1_1 3 0.1 run
'''))
        provider = ir_measures.accuracy

        accuracy_1 = Accuracy(rel=1)
        results_1 = [('0', 2. / 3.), ('1', .5 * (2 / 3. + 1. / 3)), ('2', 1.)]
        expected_results = [
            [accuracy_1, results_1],
            [Accuracy(rel=2), [('0', 2. / 3), ('1', 0.75)]],
        ]
        for measure, expected in expected_results:
            with self.subTest(measure=measure):
                self.assertTrue(provider.supports(measure))
                results = list(provider.iter_calc([measure], qrels, run))
                self.assertEqual(len(results), len(expected),
                                 "result lists length differ")

                for result, (query_id, value) in zip(results, expected):
                    self.assertEqual(result.query_id, query_id)
                    self.assertAlmostEqual(result.value,
                                           value,
                                           delta=1e-9,
                                           msg=f"for query {query_id}")

        expected = sum(value for _, value in results_1) / len(results_1)
        self.assertAlmostEqual(provider.calc_aggregate([accuracy_1], qrels,
                                                       run)[accuracy_1],
                               expected,
                               delta=1e-9)
예제 #4
0
    def test_nDCG(self):
        qrels = list(
            ir_measures.read_trec_qrels('''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 0 D5 2
'''))
        run = list(
            ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D3 2 0.7 run
1 0 D4 3 0.3 run
1 0 D2 4 0.4 run
'''))
        provider = ir_measures.gdeval
        measure = ir_measures.nDCG @ 20
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertEqual(result[0].value, 0.6201)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, 0.35099)
        self.assertEqual(
            provider.calc_aggregate([measure], qrels, run)[measure], 0.485545)
        self.assertEqual(
            provider.evaluator([measure], qrels).calc_aggregate(run)[measure],
            0.485545)

        measure = ir_measures.nDCG @ 2
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertEqual(result[0].value, 0.17377)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, 0.38685)
        self.assertEqual(
            provider.calc_aggregate([measure], qrels, run)[measure], 0.28031)

        ev = provider.evaluator([ir_measures.nDCG @ 20, ir_measures.nDCG @ 2],
                                qrels)
        res = ev.calc_aggregate(run)
        self.assertEqual(res[ir_measures.nDCG @ 20], 0.485545)
        self.assertEqual(res[ir_measures.nDCG @ 2], 0.28031)
        res = ev.calc_aggregate(run)
        self.assertEqual(res[ir_measures.nDCG @ 20], 0.485545)
        self.assertEqual(res[ir_measures.nDCG @ 2], 0.28031)
예제 #5
0
    def test_SetAP(self):
        qrels = list(
            ir_measures.read_trec_qrels('''
0 0 D0 0
0 0 D1 1
0 0 D2 2
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 0 D5 0
'''))
        run = list(
            ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D4 2 0.7 run
1 0 D3 3 0.3 run
1 0 D2 4 0.4 run
'''))
        provider = ir_measures.providers.PytrecEvalProvider()
        measure = ir_measures.SetAP(rel=1)
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertEqual(result[0].value, 0.6)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, 0.125)
        self.assertEqual(
            provider.calc_aggregate([measure], qrels, run)[measure], 0.3625)

        measure = ir_measures.SetAP(rel=2)
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertEqual(result[0].value, 0.4)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, 0.25)
        self.assertEqual(
            provider.calc_aggregate([measure], qrels, run)[measure], 0.325)

        measure = ir_measures.SetAP(rel=3)
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertEqual(result[0].value, 0)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, 0)
        self.assertEqual(
            provider.calc_aggregate([measure], qrels, run)[measure], 0)
예제 #6
0
    def test_measures(self):
        qrels = list(
            ir_measures.read_trec_qrels('''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 0 D3 2
0 0 D4 0
'''))
        run = list(
            ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
'''))
        measures = ir_measures.util.flatten_measures([
            ir_measures.P(rel=[1, 2]) @ [1, 5, 10, 20, 50, 100],
            ir_measures.R(rel=[1, 2]) @ [1, 5, 10, 20, 50, 100],
            ir_measures.RR(rel=[1, 2]),
            ir_measures.RR(rel=[1, 2]) @ [1, 5, 10, 20, 50, 100],
            ir_measures.Rprec(rel=[1, 2]),
            ir_measures.AP(rel=[1, 2]),
            ir_measures.AP(rel=[1, 2]) @ [1, 5, 10, 20, 50, 100],
            ir_measures.nDCG(dcg=['log2', 'exp-log2']),
            ir_measures.nDCG(dcg=['log2', 'exp-log2'])
            @ [1, 5, 10, 20, 50, 100],
            ir_measures.Bpref(rel=[1, 2]),
            ir_measures.Judged @ [1, 5, 10, 20, 50, 100],
            ir_measures.ERR @ [1, 5, 10, 20, 50, 100],
            #disable RBP
            #ir_measures.RBP(p=[0.5, 0.8, 1.0, 1.2, 1.5]),
            #ir_measures.RBP(p=[0.5, 0.8, 1.0, 1.2, 1.5])@[1,5,10,20,50,100],
        ])
        providers = [
            v for k, v in ir_measures.providers.registry.items()
            if k != 'trectools'
        ]
        for measure in measures:
            values = [(next(p.iter_calc([measure], qrels, run)), p)
                      for p in providers if p.supports(measure)]
            print(measure, len(values))
            for (v1, p1), (v2, p2) in itertools.combinations(values, 2):
                with self.subTest(measure=measure, p1=p1, p2=p2):
                    self.assertAlmostEqual(v1.value,
                                           v2.value,
                                           places=4,
                                           msg=str(measure))
예제 #7
0
    def test_measures(self):
        qrels = list(
            ir_measures.read_trec_qrels('''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 1 D2 1
0 1 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 1 D5 2
'''))
        run = list(
            ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D3 2 0.7 run
1 0 D4 3 0.3 run
1 0 D2 4 0.4 run
'''))
        measures = [
            ERR_IA @ 5,
            ERR_IA(rel=2) @ 10,
            nERR_IA @ 5,
            nERR_IA(rel=2) @ 10,
            alpha_DCG @ 5,
            alpha_nDCG @ 5,
            NRBP,
            NRBP(rel=2),
            nNRBP,
            nNRBP(rel=2),
            AP_IA,
            AP_IA(rel=2),
            P_IA @ 5,
            P_IA(rel=2) @ 10,
            StRecall @ 5,
            StRecall(rel=2) @ 10,
        ]
        ir_measures.pyndeval.calc_aggregate(measures, qrels, run)
예제 #8
0
    def test_nDCG(self):
        qrels = list(
            ir_measures.read_trec_qrels('''
0 0 D0 1
0 0 D1 -1
0 0 D2 0
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 0 D4 -1
1 0 D5 0
'''))
        run = list(
            ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D4 2 0.7 run
1 0 D3 3 0.3 run
1 0 D2 4 0.4 run
'''))
        provider = ir_measures.pytrec_eval
        measure = ir_measures.nDCG
        self.assertMetrics(provider.iter_calc([measure], qrels, run), [
            Metric(query_id='0', measure=measure, value=0.76018),
            Metric(query_id='1', measure=measure, value=0.32739)
        ])

        measure = ir_measures.nDCG @ 3
        self.assertMetrics(provider.iter_calc([measure], qrels, run), [
            Metric(query_id='0', measure=measure, value=0.76018),
            Metric(query_id='1', measure=measure, value=0.0)
        ])

        measure = ir_measures.nDCG(gains={0: 1, 1: 4})
        self.assertMetrics(provider.iter_calc([measure], qrels, run), [
            Metric(query_id='0', measure=measure, value=0.97177),
            Metric(query_id='1', measure=measure, value=0.14949)
        ])
예제 #9
0
    def test_ERR(self):
        qrels = list(
            ir_measures.read_trec_qrels('''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 0 D5 2
'''))
        run = list(
            ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D3 2 0.7 run
1 0 D4 3 0.3 run
1 0 D2 4 0.4 run
'''))
        provider = ir_measures.gdeval
        measure = ir_measures.ERR @ 20
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertEqual(result[0].value, 0.10175)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, 0.09375)
        self.assertEqual(
            provider.calc_aggregate([measure], qrels, run)[measure], 0.09775)

        measure = ir_measures.ERR @ 2
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertEqual(result[0].value, 0.03125)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, 0.09375)
        self.assertEqual(
            provider.calc_aggregate([measure], qrels, run)[measure], 0.0625)
예제 #10
0
def _get_qrels(args):
    """Resolve ``args.qrels`` to qrels, trying a file first and ir_datasets second.

    If ``args.qrels`` is an existing path, it is read with
    ``ir_measures.read_trec_qrels``. Otherwise, when ``ir_datasets`` can be
    imported, the value is treated as a dataset ID and that dataset's qrels
    iterator is returned. Every failure path writes a message to stderr and
    terminates the process with ``sys.exit(-1)``.
    """
    target = args.qrels

    # A file on disk always takes priority.
    if os.path.exists(target):
        return ir_measures.read_trec_qrels(target)

    # ir_datasets is optional; its absence is reported but is not fatal by itself.
    try:
        import ir_datasets
    except ImportError:
        ir_datasets = None
        sys.stderr.write('Skipping ir_datasets lookup. To use this feature, install ir_datasets.\n')

    if ir_datasets is not None:
        try:
            dataset = ir_datasets.load(target)
            if dataset.has_qrels():
                return dataset.qrels_iter()
            sys.stderr.write(f'ir_datasets ID {target} found but does not provide qrels.\n')
            sys.exit(-1)
        except KeyError:
            # A KeyError here is reported as an unknown dataset ID.
            sys.stderr.write(f'{target} not found. (checked file and ir_datasets)\n')
            sys.exit(-1)

    sys.stderr.write(f'{target} not found.\n')
    sys.exit(-1)
예제 #11
0
 def test_measures(self):
     """Compat provider: per-query values for three ``p`` settings plus one aggregate.

     The qrels and run are read from ``compat.qrels`` / ``compat.run`` next to
     this test file.
     """
     data_dir = os.path.dirname(__file__)
     qrels = list(
         ir_measures.read_trec_qrels(os.path.join(data_dir, 'compat.qrels')))
     run = list(
         ir_measures.read_trec_run(os.path.join(data_dir, 'compat.run')))
     provider = ir_measures.compat
     # based on a manual execution of https://github.com/claclark/Compatibility
     expected_results = [
         [
             Compat(p=0.95),
             [('31_1', 0.51779512165509), ('31_2', 0.018400100569017922)]
         ],
         [
             Compat(p=0.9),
             [('31_1', 0.3761334522946854), ('31_2', 0.004344079941789211)]
         ],
         [
             Compat(p=0.8),
             [('31_1', 0.16723008845234535),
              ('31_2', 0.00022806427320561776)]
         ],
     ]
     for measure, expected in expected_results:
         with self.subTest(measure=measure):
             self.assertTrue(provider.supports(measure))
             results = list(provider.iter_calc([measure], qrels, run))
             # zip() below would silently ignore missing results, so the
             # lengths are compared first.
             self.assertEqual(len(results), len(expected),
                              "result lists length differ")
             for result, (query_id, value) in zip(results, expected):
                 # Query ids are strings, so they are compared exactly;
                 # assertAlmostEqual would raise TypeError on a mismatch.
                 self.assertEqual(result.query_id, query_id)
                 self.assertAlmostEqual(result.value,
                                        value,
                                        delta=1e-9,
                                        msg=f"for query {query_id}")
     self.assertAlmostEqual(provider.calc_aggregate([Compat(p=0.95)], qrels,
                                                    run)[Compat(p=0.95)],
                            0.268097611,
                            delta=1e-9)
예제 #12
0
    def test_ERR_IA(self):
        qrels = list(
            ir_measures.read_trec_qrels('''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 1 D2 1
0 1 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 1 D5 2
'''))
        run = list(
            ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D3 2 0.7 run
1 0 D4 3 0.3 run
1 0 D2 4 0.4 run
'''))
        provider = ir_measures.pyndeval
        measure = ir_measures.ERR_IA @ 20
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertAlmostEqual(result[0].value, 0.4659, places=4)
        self.assertEqual(result[1].query_id, "1")
        self.assertAlmostEqual(result[1].value, 0.1803, places=4)
        self.assertAlmostEqual(provider.calc_aggregate([measure], qrels,
                                                       run)[measure],
                               0.3231,
                               places=4)
예제 #13
0
    def test_IPrec(self):
        qrels = list(
            ir_measures.read_trec_qrels('''
0 0 D0 1
0 0 D1 1
0 0 D2 2
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 0 D5 0
'''))
        run = list(
            ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D4 2 0.7 run
1 0 D3 3 0.3 run
1 0 D2 4 0.4 run
'''))
        provider = ir_measures.providers.PytrecEvalProvider()
        measure = ir_measures.IPrec @ 0.25
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertEqual(result[0].value, 1.0)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, .25)
        self.assertEqual(
            provider.calc_aggregate([measure], qrels, run)[measure], 0.625)

        measure = ir_measures.IPrec @ 0.5
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertEqual(result[0].value, 1.0)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, .25)
        self.assertEqual(
            provider.calc_aggregate([measure], qrels, run)[measure], 0.625)

        measure = ir_measures.IPrec @ 0.75
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertEqual(result[0].value, 1.0)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, 0)
        self.assertEqual(
            provider.calc_aggregate([measure], qrels, run)[measure], 0.5)

        measure = ir_measures.IPrec(rel=2) @ 0.1
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertAlmostEqual(result[0].value, .6666666, places=4)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, .25)
        self.assertAlmostEqual(provider.calc_aggregate([measure], qrels,
                                                       run)[measure],
                               0.4583,
                               places=4)

        measure = ir_measures.IPrec(rel=2) @ 0.25
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertAlmostEqual(result[0].value, .6666666, places=4)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, .25)
        self.assertAlmostEqual(provider.calc_aggregate([measure], qrels,
                                                       run)[measure],
                               0.4583,
                               places=4)

        measure = ir_measures.IPrec(rel=2) @ 0.5
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertAlmostEqual(result[0].value, .6666666, places=4)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, .25)
        self.assertAlmostEqual(provider.calc_aggregate([measure], qrels,
                                                       run)[measure],
                               0.4583,
                               places=4)

        measure = ir_measures.IPrec(rel=2) @ 0.75
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertAlmostEqual(result[0].value, .6666666, places=4)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, .25)
        self.assertAlmostEqual(provider.calc_aggregate([measure], qrels,
                                                       run)[measure],
                               0.4583,
                               places=4)
예제 #14
0
    def test_P(self):
        qrels = list(
            ir_measures.read_trec_qrels('''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 0 D3 2
0 0 D4 0
'''))
        run = list(
            ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
'''))
        measure = ir_measures.P @ 5
        result = list(measure.iter_calc(qrels, run))
        print(result)
        measure = ir_measures.P(rel=2) @ 5
        result = list(measure.iter_calc(qrels, run))
        print(result)
        measure = ir_measures.RR
        result = list(measure.iter_calc(qrels, run))
        print(result)
        measure = ir_measures.RR(rel=2)
        result = list(measure.iter_calc(qrels, run))
        print(result)
        measure = ir_measures.Rprec
        result = list(measure.iter_calc(qrels, run))
        print(result)
        measure = ir_measures.Rprec(rel=2)
        result = list(measure.iter_calc(qrels, run))
        print(result)
        measure = ir_measures.AP
        result = list(measure.iter_calc(qrels, run))
        print(result)
        measure = ir_measures.AP @ 2
        result = list(measure.iter_calc(qrels, run))
        print(result)
        measure = ir_measures.AP(rel=2)
        result = list(measure.iter_calc(qrels, run))
        print(result)
        measure = ir_measures.AP(rel=2) @ 2
        result = list(measure.iter_calc(qrels, run))
        print(result)
        measure = ir_measures.nDCG
        result = list(measure.iter_calc(qrels, run))
        print(result)
        measure = ir_measures.nDCG @ 2
        result = list(measure.iter_calc(qrels, run))
        print(result)
        measure = ir_measures.R @ 2
        result = list(measure.iter_calc(qrels, run))
        print(result)
        measure = ir_measures.R(rel=2) @ 2
        result = list(measure.iter_calc(qrels, run))
        print(result)
        measure = ir_measures.Bpref
        result = list(measure.iter_calc(qrels, run))
        print(result)
        measure = ir_measures.Bpref(rel=2)
        result = list(measure.iter_calc(qrels, run))
        print(result)
        measure = ir_measures.P(rel=(1, 2)) @ (1, 5, 10, 20)
        result = list(measure.iter_calc(qrels, run))
        print(result)
        measure = ir_measures.Judged @ 5
        result = list(measure.iter_calc(qrels, run))
        print(result)
        measure = ir_measures.Judged @ 20
        result = list(measure.iter_calc(qrels, run))
        print(result)
        measure = ir_measures.ERR @ 2
        result = list(measure.iter_calc(qrels, run))
        print(result)
        measure = ir_measures.ERR @ 20
        result = list(measure.iter_calc(qrels, run))
        print(result)
        measure = ir_measures.nDCG(dcg='exp-log2') @ 2
        result = list(measure.iter_calc(qrels, run))
        print(result)
        measure = ir_measures.nDCG(dcg='exp-log2') @ 5
        result = list(measure.iter_calc(qrels, run))
        print(result)
예제 #15
0
    def test_P(self):
        qrels = list(
            ir_measures.read_trec_qrels('''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 0 D3 2
0 0 D4 0
'''))
        run = list(
            ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
'''))
        measure = ir_measures.P @ 5
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.6))
        measure = ir_measures.P(rel=2) @ 5
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.2))

        measure = ir_measures.SetP
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.6))
        measure = ir_measures.SetP(rel=2) @ 5
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.2))

        measure = ir_measures.R @ 5
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 1.0))
        measure = ir_measures.R(rel=2) @ 5
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 1.0))
        measure = ir_measures.R @ 2
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.3333333333333333))
        measure = ir_measures.R(rel=2) @ 2
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.0))

        measure = ir_measures.SetR
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 1.0))
        measure = ir_measures.SetR(rel=2) @ 5
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 1.0))

        measure = ir_measures.RR
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.5))
        measure = ir_measures.RR(rel=2)
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.3333333333333333))
        measure = ir_measures.RR @ 10
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.5))
        measure = ir_measures.RR(rel=2) @ 10
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.3333333333333333))
        measure = ir_measures.RR @ 2
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.5))
        measure = ir_measures.RR(rel=2) @ 2
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.))

        measure = ir_measures.AP
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.6388888888888888))
        measure = ir_measures.AP(rel=2)
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.3333333333333333))
        measure = ir_measures.AP @ 10
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.6388888888888888))
        measure = ir_measures.AP(rel=2) @ 10
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.3333333333333333))
        measure = ir_measures.AP @ 2
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.16666666666666666))
        measure = ir_measures.AP(rel=2) @ 2
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.0))

        measure = ir_measures.Success @ 10
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 1.))
        measure = ir_measures.Success(rel=2) @ 10
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 1.))
        measure = ir_measures.Success @ 2
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 1.))
        measure = ir_measures.Success(rel=2) @ 2
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.))

        measure = ir_measures.NumRet(rel=1)
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 3.))
        measure = ir_measures.NumRet(rel=2)
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))

        measure = ir_measures.nDCG
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.6584645692843067))
        measure = ir_measures.nDCG @ 5
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.6584645692843067))
        measure = ir_measures.nDCG @ 2
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.23981246656813146))

        measure = ir_measures.nDCG(dcg='exp-log2')
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.6201040599710453))
        measure = ir_measures.nDCG(dcg='exp-log2') @ 5
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.6201040599710453))
        measure = ir_measures.nDCG(dcg='exp-log2') @ 2
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.17376534287144002))

        measure = ir_measures.Rprec
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.6666666666666666))
        measure = ir_measures.Rprec(rel=2)
        result = list(ir_measures.ranx.iter_calc([measure], qrels, run))
        self.assertEqual(result[0], Metric('0', measure, 0.))
예제 #16
0
    def test_empty(self):
        qrels = list(ir_measures.read_trec_qrels('''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 0 D5 2
'''))
        partial_qrels = [q for q in qrels if q.query_id == '0']
        run = list(ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D3 2 0.7 run
1 0 D4 3 0.3 run
1 0 D2 4 0.4 run
'''))
        partial_run = [r for r in run if r.query_id == '0']
        empty = []

        # qrels but no run
        self.assertEqual(set(ir_measures.iter_calc([P@5], qrels, empty)), {Metric('0', P@5, 0.), Metric('1', P@5, 0.)})
        self.assertEqual(set(ir_measures.gdeval.iter_calc([ERR@5], qrels, empty)), {Metric('0', ERR@5, 0.), Metric('1', ERR@5, 0.)})
        self.assertEqual(set(ir_measures.judged.iter_calc([Judged@5], qrels, empty)), {Metric('0', Judged@5, 0.), Metric('1', Judged@5, 0.)})
        self.assertEqual(set(ir_measures.msmarco.iter_calc([RR@5], qrels, empty)), {Metric('0', RR@5, 0.), Metric('1', RR@5, 0.)})
        self.assertEqual(set(ir_measures.pytrec_eval.iter_calc([P@5], qrels, empty)), {Metric('0', P@5, 0.), Metric('1', P@5, 0.)})
        self.assertEqual(set(ir_measures.trectools.iter_calc([P@5], qrels, empty)), {Metric('0', P@5, 0.), Metric('1', P@5, 0.)})
        self.assertEqual(set(ir_measures.cwl_eval.iter_calc([P@5], qrels, empty)), {Metric('0', P@5, 0.0), Metric('1', P@5, 0.0)})
        self.assertEqual(set(ir_measures.compat.iter_calc([Compat(p=0.8)], qrels, empty)), {Metric('0', Compat(p=0.8), 0.0), Metric('1', Compat(p=0.8), 0.0)})
        self.assertEqual(set(ir_measures.accuracy.iter_calc([Accuracy()], qrels, empty)), set())

        # qrels but partial run
        self.assertEqual(set(ir_measures.iter_calc([P@5], qrels, partial_run)), {Metric('0', P@5, 0.6), Metric('1', P@5, 0.)})
        self.assertEqual(set(ir_measures.gdeval.iter_calc([ERR@5], qrels, partial_run)), {Metric('0', ERR@5, 0.10175), Metric('1', ERR@5, 0.)})
        self.assertEqual(set(ir_measures.judged.iter_calc([Judged@5], qrels, partial_run)), {Metric('0', Judged@5, 1.), Metric('1', Judged@5, 0.)})
        self.assertEqual(set(ir_measures.msmarco.iter_calc([RR@5], qrels, partial_run)), {Metric('0', RR@5, 0.5), Metric('1', RR@5, 0.)})
        self.assertEqual(set(ir_measures.pytrec_eval.iter_calc([P@5], qrels, partial_run)), {Metric('0', P@5, 0.6), Metric('1', P@5, 0.)})
        self.assertEqual(set(ir_measures.trectools.iter_calc([P@5], qrels, partial_run)), {Metric('0', P@5, 0.6), Metric('1', P@5, 0.)})
        self.assertEqual(set(ir_measures.cwl_eval.iter_calc([P@5], qrels, partial_run)), {CwlMetric('0', P@5, 0.6000000000000001, 3.0, 1.0, 5.0, 5.0), Metric('1', P@5, 0.0)})
        self.assertEqual(set(ir_measures.compat.iter_calc([Compat(p=0.8)], qrels, partial_run)), {Metric('0', Compat(p=0.8), 0.4744431703672816), Metric('1', Compat(p=0.8), 0.0)})
        self.assertEqual(set(ir_measures.accuracy.iter_calc([Accuracy()], qrels, partial_run)), {Metric('0', Accuracy(), 0.5)})

        # run but no qrels
        self.assertEqual(list(ir_measures.iter_calc([P@5], empty, run)), [])
        self.assertEqual(list(ir_measures.gdeval.iter_calc([ERR@5], empty, run)), [])
        self.assertEqual(list(ir_measures.judged.iter_calc([Judged@5], empty, run)), [])
        self.assertEqual(list(ir_measures.msmarco.iter_calc([RR@5], empty, run)), [])
        self.assertEqual(list(ir_measures.pytrec_eval.iter_calc([P@5], empty, run)), [])
        self.assertEqual(list(ir_measures.trectools.iter_calc([P@5], empty, run)), [])
        self.assertEqual(list(ir_measures.cwl_eval.iter_calc([P@5], empty, run)), [])
        self.assertEqual(list(ir_measures.compat.iter_calc([Compat(p=0.8)], empty, run)), [])
        self.assertEqual(list(ir_measures.accuracy.iter_calc([Accuracy()], empty, run)), [])

        # run but partial qrels
        self.assertEqual(set(ir_measures.iter_calc([P@5], partial_qrels, run)), {Metric('0', P@5, 0.6)})
        self.assertEqual(set(ir_measures.gdeval.iter_calc([ERR@5], partial_qrels, run)), {Metric('0', ERR@5, 0.10175)})
        self.assertEqual(set(ir_measures.judged.iter_calc([Judged@5], partial_qrels, run)), {Metric('0', Judged@5, 1.)})
        self.assertEqual(set(ir_measures.msmarco.iter_calc([RR@5], partial_qrels, run)), {Metric('0', RR@5, 0.5)})
        self.assertEqual(set(ir_measures.pytrec_eval.iter_calc([P@5], partial_qrels, run)), {Metric('0', P@5, 0.6)})
        self.assertEqual(set(ir_measures.trectools.iter_calc([P@5], partial_qrels, run)), {Metric('0', P@5, 0.6)})
        self.assertEqual(set(ir_measures.cwl_eval.iter_calc([P@5], partial_qrels, run)), {CwlMetric('0', P@5, 0.6000000000000001, 3.0, 1.0, 5.0, 5.0)})
        self.assertEqual(set(ir_measures.compat.iter_calc([Compat(p=0.8)], partial_qrels, run)), {Metric('0', Compat(p=0.8), 0.4744431703672816)})
        self.assertEqual(set(ir_measures.accuracy.iter_calc([Accuracy()], partial_qrels, run)), {Metric('0', Accuracy(), 0.5)})

        # both no run and no qrels
        self.assertEqual(list(ir_measures.iter_calc([P@5], empty, empty)), [])
        self.assertEqual(list(ir_measures.gdeval.iter_calc([ERR@5], empty, empty)), [])
        self.assertEqual(list(ir_measures.judged.iter_calc([Judged@5], empty, empty)), [])
        self.assertEqual(list(ir_measures.msmarco.iter_calc([RR@5], empty, empty)), [])
        self.assertEqual(list(ir_measures.pytrec_eval.iter_calc([P@5], empty, empty)), [])
        self.assertEqual(list(ir_measures.trectools.iter_calc([P@5], empty, empty)), [])
        self.assertEqual(list(ir_measures.cwl_eval.iter_calc([P@5], empty, empty)), [])
        self.assertEqual(list(ir_measures.compat.iter_calc([Compat(p=0.8)], empty, empty)), [])
        self.assertEqual(list(ir_measures.accuracy.iter_calc([Accuracy()], empty, empty)), [])

        # qrels but no run
        numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], qrels, empty), {P@5: 0.})
        numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], qrels, empty), {ERR@5: 0.})
        numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], qrels, empty), {Judged@5: 0.})
        numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], qrels, empty), {RR@5: 0.})
        numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], qrels, empty), {P@5: 0.})
        numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], qrels, empty), {P@5: 0.})
        numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], qrels, empty), {P@5: 0.})
        numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], qrels, empty), {Compat(p=0.8): 0.})
        numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], qrels, empty), {Accuracy(): float('NaN')})

        # qrels but partial run
        numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], qrels, partial_run), {P@5: 0.3})
        numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], qrels, partial_run), {ERR@5: 0.050875})
        numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], qrels, partial_run), {Judged@5: 0.5})
        numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], qrels, partial_run), {RR@5: 0.25})
        numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], qrels, partial_run), {P@5: 0.3})
        numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], qrels, partial_run), {P@5: 0.3})
        numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], qrels, partial_run), {P@5: 0.30000000000000004})
        numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], qrels, partial_run), {Compat(p=0.8): 0.2372215851836408})
        numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], qrels, partial_run), {Accuracy(): 0.5})

        # run but no qrels
        numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], empty, run), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], empty, run), {ERR@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], empty, run), {Judged@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], empty, run), {RR@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], empty, run), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], empty, run), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], empty, run), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], empty, run), {Compat(p=0.8): float('NaN')})
        numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], empty, run), {Accuracy(): float('NaN')})

        # run but partial qrels
        numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], partial_qrels, run), {P@5: 0.6})
        numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], partial_qrels, run), {ERR@5: 0.10175})
        numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], partial_qrels, run), {Judged@5: 1.0})
        numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], partial_qrels, run), {RR@5: 0.5})
        numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], partial_qrels, run), {P@5: 0.6})
        numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], partial_qrels, run), {P@5: 0.6})
        numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], partial_qrels, run), {P@5: 0.6000000000000001})
        numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], partial_qrels, run), {Compat(p=0.8): 0.4744431703672816})
        numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], partial_qrels, run), {Accuracy(): 0.5})

        # both no run and no qrels
        numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], empty, empty), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], empty, empty), {ERR@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], empty, empty), {Judged@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], empty, empty), {RR@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], empty, empty), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], empty, empty), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], empty, empty), {P@5: float('NaN')})
        numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], empty, empty), {Compat(p=0.8): float('NaN')})
        numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], empty, empty), {Accuracy(): float('NaN')})
예제 #17
0
    def test_define(self):
        def my_p(qrels, run):
            run = run.merge(qrels, 'left', on=['query_id', 'doc_id'])
            for qid, df in run.groupby('query_id'):
                yield qid, (df['relevance'] > 0).sum() / len(df)

        def my_s(qrels, run):
            run = run.merge(qrels, 'left', on=['query_id', 'doc_id'])
            for qid, df in run.groupby('query_id'):
                yield qid, 1. if (df['relevance'] > 0).sum() else 0.

        MyP = ir_measures.define(my_p)
        MyS = ir_measures.define(my_s)
        qrels = list(
            ir_measures.read_trec_qrels('''
0 0 D0 0
0 0 D1 1
0 0 D2 1
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 0
1 0 D5 2
'''))
        run = list(
            ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D3 2 0.7 run
1 0 D4 3 0.3 run
1 0 D2 4 0.4 run
'''))
        result = list((MyP @ 1).iter_calc(qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertEqual(result[0].value, 0.)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, 0.)
        self.assertEqual((MyP @ 1).calc_aggregate(qrels, run), 0.0)

        result = list((MyP @ 2).iter_calc(qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertEqual(result[0].value, 0.5)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, 0.)
        self.assertEqual((MyP @ 2).calc_aggregate(qrels, run), 0.25)

        result = list((MyP @ 3).iter_calc(qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertEqual(result[0].value, 0.6666666666666666)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, 0.)
        self.assertEqual((MyP @ 3).calc_aggregate(qrels, run),
                         0.3333333333333333)

        result = list((MyS @ 2).iter_calc(qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertEqual(result[0].value, 1.)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, 0.)
        self.assertEqual((MyS @ 2).calc_aggregate(qrels, run), 0.5)
예제 #18
0
    def test_SetF(self):
        qrels = list(
            ir_measures.read_trec_qrels('''
0 0 D0 0
0 0 D1 1
0 0 D2 2
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 0 D5 0
'''))
        run = list(
            ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D4 2 0.7 run
1 0 D3 3 0.3 run
1 0 D2 4 0.4 run
'''))
        provider = ir_measures.providers.PytrecEvalProvider()
        measure = ir_measures.SetF(rel=1)
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertAlmostEqual(result[0].value, 0.75, places=4)
        self.assertEqual(result[1].query_id, "1")
        self.assertAlmostEqual(result[1].value, .33333, places=4)
        self.assertAlmostEqual(provider.calc_aggregate([measure], qrels,
                                                       run)[measure],
                               0.5417,
                               places=4)

        measure = ir_measures.SetF(rel=1, beta=0.5)
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertAlmostEqual(result[0].value, 0.6923, places=4)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, 0.3)
        self.assertAlmostEqual(provider.calc_aggregate([measure], qrels,
                                                       run)[measure],
                               0.49615,
                               places=4)

        measure = ir_measures.SetF(rel=1, beta=2.0)
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertAlmostEqual(result[0].value, 0.81818, places=4)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, 0.375)
        self.assertAlmostEqual(provider.calc_aggregate([measure], qrels,
                                                       run)[measure],
                               0.59659,
                               places=4)

        measure = ir_measures.SetF(rel=3)
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertEqual(result[0].value, 0)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, 0)
        self.assertEqual(
            provider.calc_aggregate([measure], qrels, run)[measure], 0)

        # make sure the multiple invocations hapen correctly
        res = provider.calc_aggregate([
            ir_measures.SetF(rel=1),
            ir_measures.SetF(rel=1, beta=0.5),
            ir_measures.SetF(rel=1, beta=2.0),
            ir_measures.SetF(rel=3)
        ], qrels, run)
        self.assertAlmostEqual(res[ir_measures.SetF(rel=1)], 0.5417, places=4)
        self.assertAlmostEqual(res[ir_measures.SetF(rel=1, beta=0.5)],
                               0.49615,
                               places=4)
        self.assertAlmostEqual(res[ir_measures.SetF(rel=1, beta=2.0)],
                               0.59659,
                               places=4)
        self.assertEqual(res[ir_measures.SetF(rel=3)], 0)
예제 #19
0
    def test_infAP(self):
        qrels = list(
            ir_measures.read_trec_qrels('''
0 0 D0 1
0 0 D1 -1
0 0 D2 0
0 0 D3 2
0 0 D4 0
1 0 D0 1
1 0 D3 2
1 0 D4 -1
1 0 D5 0
'''))
        run = list(
            ir_measures.read_trec_run('''
0 0 D0 1 0.8 run
0 0 D2 2 0.7 run
0 0 D1 3 0.3 run
0 0 D3 4 0.4 run
0 0 D4 5 0.1 run
1 0 D1 1 0.8 run
1 0 D4 2 0.7 run
1 0 D3 3 0.3 run
1 0 D2 4 0.4 run
'''))
        provider = ir_measures.providers.PytrecEvalProvider()
        measure = ir_measures.AP
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertAlmostEqual(result[0].value, 0.8333, places=4)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, .125)
        self.assertAlmostEqual(provider.calc_aggregate([measure], qrels,
                                                       run)[measure],
                               0.47916666666666663,
                               places=4)

        measure = ir_measures.infAP
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertAlmostEqual(result[0].value, 0.8333, places=4)
        self.assertEqual(result[1].query_id, "1")
        self.assertEqual(result[1].value, 0.1875)
        self.assertAlmostEqual(provider.calc_aggregate([measure], qrels,
                                                       run)[measure],
                               0.510416,
                               places=4)

        provider = ir_measures.providers.PytrecEvalProvider()
        measure = ir_measures.AP(rel=2)
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertAlmostEqual(result[0].value, 0.33333, places=4)
        self.assertEqual(result[1].query_id, "1")
        self.assertAlmostEqual(result[1].value, 0.25, places=4)
        self.assertAlmostEqual(provider.calc_aggregate([measure], qrels,
                                                       run)[measure],
                               0.2917,
                               places=4)

        measure = ir_measures.infAP(rel=2)
        result = list(provider.iter_calc([measure], qrels, run))
        self.assertEqual(result[0].query_id, "0")
        self.assertAlmostEqual(result[0].value, 0.33333, places=4)
        self.assertEqual(result[1].query_id, "1")
        self.assertAlmostEqual(result[1].value, 0.375, places=4)
        self.assertAlmostEqual(provider.calc_aggregate([measure], qrels,
                                                       run)[measure],
                               0.3542,
                               places=4)
예제 #20
0
 def test_measures(self):
     """Check cwl_eval provider output against reference per-query values.

     The qrels and run are read from ``cwl.qrels`` / ``cwl.run`` located
     next to this test file.  For every measure the provider must report
     support, and each per-query result is compared with the expected
     ``(query_id, value)`` pair: ids exactly, values within 1e-4.
     """
     here = os.path.dirname(__file__)
     qrels = list(
         ir_measures.read_trec_qrels(os.path.join(here, 'cwl.qrels')))
     run = list(ir_measures.read_trec_run(os.path.join(here, 'cwl.run')))
     provider = ir_measures.cwl_eval
     # based on a manual execution of cwl-eval
     expected_results = [
         [AP, [('T1', 0.7087), ('T2', 0.7438), ('T3', 0.3068)]],
         [
             BPM(T=1.0, max_rel=1) @ 20,
             [('T1', 1.0000), ('T2', 1.0000), ('T3', 0.1667)]
         ],
         [
             BPM(T=2.0, max_rel=1) @ 10,
             [('T1', 0.6667), ('T2', 1.0000), ('T3', 0.2857)]
         ],
         [
             INSQ(T=1.0, max_rel=1),
             [('T1', 0.5872), ('T2', 0.6629), ('T3', 0.0880)]
         ],
         [
             INSQ(T=2.0, max_rel=1),
             [('T1', 0.4513), ('T2', 0.5068), ('T3', 0.1292)]
         ],
         [
             INST(T=1.0, max_rel=1),
             [('T1', 0.7934), ('T2', 0.9226), ('T3', 0.0888)]
         ],
         [
             INST(T=2.0, max_rel=1),
             [('T1', 0.5994), ('T2', 0.6924), ('T3', 0.1397)]
         ],
         [
             SDCG(max_rel=1) @ 10,
             [('T1', 0.5645), ('T2', 0.6531), ('T3', 0.2848)]
         ],
         [
             NERR8(max_rel=1) @ 10,
             [('T1', 1.0000), ('T2', 1.0000), ('T3', 0.1667)]
         ],
         [
             NERR9(max_rel=1) @ 10,
             [('T1', 1.0000), ('T2', 1.0000), ('T3', 0.0680)]
         ],
         [
             NERR10(p=0.8, max_rel=1),
             [('T1', 1.0000), ('T2', 1.0000), ('T3', 0.0888)]
         ],
         [
             NERR11(T=2.0, max_rel=1),
             [('T1', 1.0000), ('T2', 1.0000), ('T3', 0.0691)]
         ],
         [P @ 1, [('T1', 1.0000), ('T2', 1.0000), ('T3', 0.0000)]],
         [P @ 10, [('T1', 0.5000), ('T2', 0.6000), ('T3', 0.4000)]],
         [P @ 20, [('T1', 0.2500), ('T2', 0.3000), ('T3', 0.2000)]],
         [P @ 5, [('T1', 0.6000), ('T2', 0.6000), ('T3', 0.0000)]],
         [
             RBP(p=0.9, rel=1),
             [('T1', 0.3501), ('T2', 0.3996), ('T3', 0.1988)]
         ],
         [RR, [('T1', 1.0000), ('T2', 1.0000), ('T3', 0.1667)]],
     ]
     for measure, expected in expected_results:
         with self.subTest(measure=measure):
             self.assertTrue(provider.supports(measure))
             results = list(provider.iter_calc([measure], qrels, run))
             # NOTE: zip stops at the shorter sequence, so results that
             # are missing entirely are not detected by this loop.
             for result, (query_id, value) in zip(results, expected):
                 # Query ids are strings: compare exactly.  Using
                 # assertAlmostEqual here would raise TypeError on a
                 # mismatch instead of reporting a clean failure.
                 self.assertEqual(result.query_id, query_id)
                 self.assertAlmostEqual(result.value, value, delta=0.0001)