def _query_differences(self, run1, run2, *args, **kwargs):
    """
    :param run1: TREC run. Has the format {qid: {docid: score}, ...}
    :param run2: Same as above
    :param args:
    :param kwargs: Expects a 'dataset' parameter. This is an instance of ir-datasets
    :return: A tuple (query_ids, id2diff, metric_name, id2qrelscores), where query_ids
             are the qids that differ the most in the metric
    """
    assert "dataset" in kwargs, "Dataset object not supplied for qrel measure"
    dataset = kwargs["dataset"]
    assert dataset.has_qrels(), "Dataset object does not have the qrels files"

    overlapping_keys = set(run1.keys()).intersection(set(run2.keys()))
    run1 = {qid: doc_id_to_score for qid, doc_id_to_score in run1.items() if qid in overlapping_keys}
    run2 = {qid: doc_id_to_score for qid, doc_id_to_score in run2.items() if qid in overlapping_keys}

    qrels = dataset.qrels_dict()
    try:
        metric = parse_measure(self.metric)
    except NameError:
        print("Unknown measure: {}. Please provide a measure supported by https://ir-measur.es/".format(self.metric))
        sys.exit(1)

    topk = self.topk
    eval_run_1 = self.convert_to_nested_dict(iter_calc([metric], qrels, run1))
    eval_run_2 = self.convert_to_nested_dict(iter_calc([metric], qrels, run2))

    query_ids = eval_run_1.keys() & eval_run_2.keys()
    query_ids = sorted(query_ids, key=lambda x: abs(eval_run_1[x][metric] - eval_run_2[x][metric]), reverse=True)
    query_ids = query_ids[:topk]
    id2diff = {x: abs(eval_run_1[x][metric] - eval_run_2[x][metric]) for x in query_ids}
    id2qrelscores = {x: [eval_run_1[x][metric], eval_run_2[x][metric]] for x in query_ids}
    return query_ids, id2diff, self.metric, id2qrelscores
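# --- Illustrative sketch (not part of the original class) ---
# Shows how the per-query ranking of metric differences above can be reproduced with
# ir_measures alone. `my_qrels`, `my_run1` and `my_run2` are hypothetical inputs in the
# {qid: {docid: relevance-or-score}} format; the metric string is parsed the same way.
import ir_measures
from ir_measures import parse_measure

def example_top_differing_queries(my_qrels, my_run1, my_run2, metric_str="nDCG@10", topk=5):
    metric = parse_measure(metric_str)
    # per-query scores for each run, keyed by query id
    scores1 = {m.query_id: m.value for m in ir_measures.iter_calc([metric], my_qrels, my_run1)}
    scores2 = {m.query_id: m.value for m in ir_measures.iter_calc([metric], my_qrels, my_run2)}
    shared = scores1.keys() & scores2.keys()
    # qids sorted by absolute difference in the chosen metric, largest first
    return sorted(shared, key=lambda q: abs(scores1[q] - scores2[q]), reverse=True)[:topk]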
def calc_metrics(self, qrels, run, metrics, verbose=False):
    measures = {ir_measures.parse_measure(str(m)): str(m) for m in metrics}
    results = {}
    for metric in ir_measures.iter_calc(list(measures), qrels, run):
        measure = measures[metric.measure]
        if measure not in results:
            results[measure] = {}
        results[measure][metric.query_id] = metric.value
    return results
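# --- Usage sketch (the nested-dict inputs below are hypothetical) ---
# calc_metrics above returns {measure_string: {query_id: value}}; it is equivalent to
# regrouping the Metric tuples yielded by ir_measures.iter_calc, as sketched here.
import ir_measures

example_qrels = {"q1": {"d1": 1, "d2": 0}}
example_run = {"q1": {"d1": 0.9, "d2": 0.4}}
regrouped = {}
for metric in ir_measures.iter_calc([ir_measures.parse_measure("P@5")], example_qrels, example_run):
    regrouped.setdefault(str(metric.measure), {})[metric.query_id] = metric.value
# regrouped == {"P@5": {"q1": 0.2}}  (one relevant doc in the top 5)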
def iter_calc(self, qrels, run):
    self.validate_params()
    return ir_measures.iter_calc([self], qrels, run)
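# --- Usage sketch ---
# This appears to be the Measure-level convenience wrapper: a measure object can compute
# itself directly over qrels and a run. The dict inputs below are hypothetical; qrels and
# runs may also be TREC-formatted files or iterables of namedtuples.
import ir_measures
from ir_measures import nDCG

example_qrels = {"q1": {"d1": 1, "d2": 0}}
example_run = {"q1": {"d2": 1.2, "d1": 0.9}}
for m in (nDCG@10).iter_calc(example_qrels, example_run):
    print(m.query_id, m.measure, m.value)  # one Metric namedtuple per query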
def test_empty(self):
    qrels = list(ir_measures.read_trec_qrels('''
        0 0 D0 0
        0 0 D1 1
        0 0 D2 1
        0 0 D3 2
        0 0 D4 0
        1 0 D0 1
        1 0 D3 2
        1 0 D5 2
    '''))
    partial_qrels = [q for q in qrels if q.query_id == '0']
    run = list(ir_measures.read_trec_run('''
        0 0 D0 1 0.8 run
        0 0 D2 2 0.7 run
        0 0 D1 3 0.3 run
        0 0 D3 4 0.4 run
        0 0 D4 5 0.1 run
        1 0 D1 1 0.8 run
        1 0 D3 2 0.7 run
        1 0 D4 3 0.3 run
        1 0 D2 4 0.4 run
    '''))
    partial_run = [r for r in run if r.query_id == '0']
    empty = []

    # qrels but no run
    self.assertEqual(set(ir_measures.iter_calc([P@5], qrels, empty)), {Metric('0', P@5, 0.), Metric('1', P@5, 0.)})
    self.assertEqual(set(ir_measures.gdeval.iter_calc([ERR@5], qrels, empty)), {Metric('0', ERR@5, 0.), Metric('1', ERR@5, 0.)})
    self.assertEqual(set(ir_measures.judged.iter_calc([Judged@5], qrels, empty)), {Metric('0', Judged@5, 0.), Metric('1', Judged@5, 0.)})
    self.assertEqual(set(ir_measures.msmarco.iter_calc([RR@5], qrels, empty)), {Metric('0', RR@5, 0.), Metric('1', RR@5, 0.)})
    self.assertEqual(set(ir_measures.pytrec_eval.iter_calc([P@5], qrels, empty)), {Metric('0', P@5, 0.), Metric('1', P@5, 0.)})
    self.assertEqual(set(ir_measures.trectools.iter_calc([P@5], qrels, empty)), {Metric('0', P@5, 0.), Metric('1', P@5, 0.)})
    self.assertEqual(set(ir_measures.cwl_eval.iter_calc([P@5], qrels, empty)), {Metric('0', P@5, 0.0), Metric('1', P@5, 0.0)})
    self.assertEqual(set(ir_measures.compat.iter_calc([Compat(p=0.8)], qrels, empty)), {Metric('0', Compat(p=0.8), 0.0), Metric('1', Compat(p=0.8), 0.0)})
    self.assertEqual(set(ir_measures.accuracy.iter_calc([Accuracy()], qrels, empty)), set())

    # qrels but partial run
    self.assertEqual(set(ir_measures.iter_calc([P@5], qrels, partial_run)), {Metric('0', P@5, 0.6), Metric('1', P@5, 0.)})
    self.assertEqual(set(ir_measures.gdeval.iter_calc([ERR@5], qrels, partial_run)), {Metric('0', ERR@5, 0.10175), Metric('1', ERR@5, 0.)})
    self.assertEqual(set(ir_measures.judged.iter_calc([Judged@5], qrels, partial_run)), {Metric('0', Judged@5, 1.), Metric('1', Judged@5, 0.)})
    self.assertEqual(set(ir_measures.msmarco.iter_calc([RR@5], qrels, partial_run)), {Metric('0', RR@5, 0.5), Metric('1', RR@5, 0.)})
    self.assertEqual(set(ir_measures.pytrec_eval.iter_calc([P@5], qrels, partial_run)), {Metric('0', P@5, 0.6), Metric('1', P@5, 0.)})
    self.assertEqual(set(ir_measures.trectools.iter_calc([P@5], qrels, partial_run)), {Metric('0', P@5, 0.6), Metric('1', P@5, 0.)})
    self.assertEqual(set(ir_measures.cwl_eval.iter_calc([P@5], qrels, partial_run)), {CwlMetric('0', P@5, 0.6000000000000001, 3.0, 1.0, 5.0, 5.0), Metric('1', P@5, 0.0)})
    self.assertEqual(set(ir_measures.compat.iter_calc([Compat(p=0.8)], qrels, partial_run)), {Metric('0', Compat(p=0.8), 0.4744431703672816), Metric('1', Compat(p=0.8), 0.0)})
    self.assertEqual(set(ir_measures.accuracy.iter_calc([Accuracy()], qrels, partial_run)), {Metric('0', Accuracy(), 0.5)})

    # run but no qrels
    self.assertEqual(list(ir_measures.iter_calc([P@5], empty, run)), [])
    self.assertEqual(list(ir_measures.gdeval.iter_calc([ERR@5], empty, run)), [])
    self.assertEqual(list(ir_measures.judged.iter_calc([Judged@5], empty, run)), [])
    self.assertEqual(list(ir_measures.msmarco.iter_calc([RR@5], empty, run)), [])
    self.assertEqual(list(ir_measures.pytrec_eval.iter_calc([P@5], empty, run)), [])
    self.assertEqual(list(ir_measures.trectools.iter_calc([P@5], empty, run)), [])
    self.assertEqual(list(ir_measures.cwl_eval.iter_calc([P@5], empty, run)), [])
    self.assertEqual(list(ir_measures.compat.iter_calc([Compat(p=0.8)], empty, run)), [])
    self.assertEqual(list(ir_measures.accuracy.iter_calc([Accuracy()], empty, run)), [])

    # run but partial qrels
    self.assertEqual(set(ir_measures.iter_calc([P@5], partial_qrels, run)), {Metric('0', P@5, 0.6)})
    self.assertEqual(set(ir_measures.gdeval.iter_calc([ERR@5], partial_qrels, run)), {Metric('0', ERR@5, 0.10175)})
    self.assertEqual(set(ir_measures.judged.iter_calc([Judged@5], partial_qrels, run)), {Metric('0', Judged@5, 1.)})
    self.assertEqual(set(ir_measures.msmarco.iter_calc([RR@5], partial_qrels, run)), {Metric('0', RR@5, 0.5)})
    self.assertEqual(set(ir_measures.pytrec_eval.iter_calc([P@5], partial_qrels, run)), {Metric('0', P@5, 0.6)})
    self.assertEqual(set(ir_measures.trectools.iter_calc([P@5], partial_qrels, run)), {Metric('0', P@5, 0.6)})
    self.assertEqual(set(ir_measures.cwl_eval.iter_calc([P@5], partial_qrels, run)), {CwlMetric('0', P@5, 0.6000000000000001, 3.0, 1.0, 5.0, 5.0)})
    self.assertEqual(set(ir_measures.compat.iter_calc([Compat(p=0.8)], partial_qrels, run)), {Metric('0', Compat(p=0.8), 0.4744431703672816)})
    self.assertEqual(set(ir_measures.accuracy.iter_calc([Accuracy()], partial_qrels, run)), {Metric('0', Accuracy(), 0.5)})

    # both no run and no qrels
    self.assertEqual(list(ir_measures.iter_calc([P@5], empty, empty)), [])
    self.assertEqual(list(ir_measures.gdeval.iter_calc([ERR@5], empty, empty)), [])
    self.assertEqual(list(ir_measures.judged.iter_calc([Judged@5], empty, empty)), [])
    self.assertEqual(list(ir_measures.msmarco.iter_calc([RR@5], empty, empty)), [])
    self.assertEqual(list(ir_measures.pytrec_eval.iter_calc([P@5], empty, empty)), [])
    self.assertEqual(list(ir_measures.trectools.iter_calc([P@5], empty, empty)), [])
    self.assertEqual(list(ir_measures.cwl_eval.iter_calc([P@5], empty, empty)), [])
    self.assertEqual(list(ir_measures.compat.iter_calc([Compat(p=0.8)], empty, empty)), [])
    self.assertEqual(list(ir_measures.accuracy.iter_calc([Accuracy()], empty, empty)), [])

    # qrels but no run
    numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], qrels, empty), {P@5: 0.})
    numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], qrels, empty), {ERR@5: 0.})
    numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], qrels, empty), {Judged@5: 0.})
    numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], qrels, empty), {RR@5: 0.})
    numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], qrels, empty), {P@5: 0.})
    numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], qrels, empty), {P@5: 0.})
    numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], qrels, empty), {P@5: 0.})
    numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], qrels, empty), {Compat(p=0.8): 0.})
    numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], qrels, empty), {Accuracy(): float('NaN')})

    # qrels but partial run
    numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], qrels, partial_run), {P@5: 0.3})
    numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], qrels, partial_run), {ERR@5: 0.050875})
    numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], qrels, partial_run), {Judged@5: 0.5})
    numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], qrels, partial_run), {RR@5: 0.25})
    numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], qrels, partial_run), {P@5: 0.3})
    numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], qrels, partial_run), {P@5: 0.3})
    numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], qrels, partial_run), {P@5: 0.30000000000000004})
    numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], qrels, partial_run), {Compat(p=0.8): 0.2372215851836408})
    numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], qrels, partial_run), {Accuracy(): 0.5})

    # run but no qrels
    numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], empty, run), {P@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], empty, run), {ERR@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], empty, run), {Judged@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], empty, run), {RR@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], empty, run), {P@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], empty, run), {P@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], empty, run), {P@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], empty, run), {Compat(p=0.8): float('NaN')})
    numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], empty, run), {Accuracy(): float('NaN')})

    # run but partial qrels
    numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], partial_qrels, run), {P@5: 0.6})
    numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], partial_qrels, run), {ERR@5: 0.10175})
    numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], partial_qrels, run), {Judged@5: 1.0})
    numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], partial_qrels, run), {RR@5: 0.5})
    numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], partial_qrels, run), {P@5: 0.6})
    numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], partial_qrels, run), {P@5: 0.6})
    numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], partial_qrels, run), {P@5: 0.6000000000000001})
    numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], partial_qrels, run), {Compat(p=0.8): 0.4744431703672816})
    numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], partial_qrels, run), {Accuracy(): 0.5})

    # both no run and no qrels
    numpy.testing.assert_equal(ir_measures.calc_aggregate([P@5], empty, empty), {P@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.gdeval.calc_aggregate([ERR@5], empty, empty), {ERR@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.judged.calc_aggregate([Judged@5], empty, empty), {Judged@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.msmarco.calc_aggregate([RR@5], empty, empty), {RR@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.pytrec_eval.calc_aggregate([P@5], empty, empty), {P@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.trectools.calc_aggregate([P@5], empty, empty), {P@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.cwl_eval.calc_aggregate([P@5], empty, empty), {P@5: float('NaN')})
    numpy.testing.assert_equal(ir_measures.compat.calc_aggregate([Compat(p=0.8)], empty, empty), {Compat(p=0.8): float('NaN')})
    numpy.testing.assert_equal(ir_measures.accuracy.calc_aggregate([Accuracy()], empty, empty), {Accuracy(): float('NaN')})
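# --- Minimal sketch of the aggregate conventions the test above encodes ---
# With qrels but an empty run, unretrieved queries score 0 and the aggregate is 0.0;
# with no qrels there is nothing to average, so the aggregate is NaN. The dict inputs
# here are hypothetical stand-ins for the TREC-parsed qrels/run used in the test.
import ir_measures
from ir_measures import P

example_qrels = {"q1": {"d1": 1}}
example_run = {"q1": {"d1": 1.0}}
print(ir_measures.calc_aggregate([P@5], example_qrels, {}))  # expected: {P@5: 0.0}
print(ir_measures.calc_aggregate([P@5], {}, example_run))    # expected: {P@5: nan}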
def _run_and_evaluate(
        system: SYSTEM_OR_RESULTS_TYPE,
        topics: pd.DataFrame,
        qrels: pd.DataFrame,
        metrics: MEASURES_TYPE,
        pbar=None,
        save_mode=None,
        save_file=None,
        perquery: bool = False,
        batch_size=None,
        backfill_qids: Sequence[str] = None):

    from .io import read_results, write_results

    if pbar is None:
        from . import tqdm
        pbar = tqdm(disable=True)

    metrics, rev_mapping = _convert_measures(metrics)
    qrels = qrels.rename(columns={'qid': 'query_id', 'docno': 'doc_id', 'label': 'relevance'})
    from timeit import default_timer as timer
    runtime = 0
    num_q = qrels['query_id'].nunique()

    if save_file is not None and os.path.exists(save_file):
        if save_mode == "reuse":
            system = read_results(save_file)
        elif save_mode == "overwrite":
            os.remove(save_file)
        else:
            raise ValueError("Unknown save_mode argument '%s', valid options are 'reuse' or 'overwrite'" % save_mode)

    # if it's a DataFrame, use it as the results
    if isinstance(system, pd.DataFrame):
        res = system
        res = coerce_dataframe_types(res)
        if len(res) == 0:
            raise ValueError("%d topics, but no results in dataframe" % len(topics))
        evalMeasuresDict = _ir_measures_to_dict(
            ir_measures.iter_calc(metrics, qrels, res.rename(columns=_irmeasures_columns)),
            metrics, rev_mapping, num_q, perquery, backfill_qids)
        pbar.update()

    elif batch_size is None:
        # transformer, evaluate all queries at once
        starttime = timer()
        res = system.transform(topics)
        endtime = timer()
        runtime = (endtime - starttime) * 1000.

        # write results to save_file; we can be sure this file does not exist
        if save_file is not None:
            write_results(res, save_file)

        res = coerce_dataframe_types(res)
        if len(res) == 0:
            raise ValueError("%d topics, but no results received from %s" % (len(topics), str(system)))
        evalMeasuresDict = _ir_measures_to_dict(
            ir_measures.iter_calc(metrics, qrels, res.rename(columns=_irmeasures_columns)),
            metrics, rev_mapping, num_q, perquery, backfill_qids)
        pbar.update()

    else:
        # transformer, evaluate queries in batches
        assert batch_size > 0
        starttime = timer()
        evalMeasuresDict = {}
        remaining_qrel_qids = set(qrels.query_id)
        try:
            for i, (res, batch_topics) in enumerate(system.transform_gen(topics, batch_size=batch_size, output_topics=True)):
                if len(res) == 0:
                    raise ValueError("batch of %d topics, but no results received in batch %d from %s" % (len(batch_topics), i, str(system)))
                endtime = timer()
                runtime += (endtime - starttime) * 1000.

                # write results to save_file; we will append for subsequent batches
                if save_file is not None:
                    write_results(res, save_file, append=True)

                res = coerce_dataframe_types(res)
                batch_qids = set(batch_topics.qid)
                batch_qrels = qrels[qrels.query_id.isin(batch_qids)]  # filter qrels down to just the qids that appear in this batch
                remaining_qrel_qids.difference_update(batch_qids)
                batch_backfill = [qid for qid in backfill_qids if qid in batch_qids] if backfill_qids is not None else None
                evalMeasuresDict.update(
                    _ir_measures_to_dict(
                        ir_measures.iter_calc(metrics, batch_qrels, res.rename(columns=_irmeasures_columns)),
                        metrics, rev_mapping, num_q, perquery=True, backfill_qids=batch_backfill))
                pbar.update()
                starttime = timer()
        except:
            # if an error is thrown, we need to clean up our existing file
            if save_file is not None and os.path.exists(save_file):
                os.remove(save_file)
            raise

        if remaining_qrel_qids:
            # there are some qids in the qrels that were not in the topics.
            # Get the default values for these and update evalMeasuresDict
            missing_qrels = qrels[qrels.query_id.isin(remaining_qrel_qids)]
            empty_res = pd.DataFrame([], columns=['query_id', 'doc_id', 'score'])
            evalMeasuresDict.update(
                _ir_measures_to_dict(
                    ir_measures.iter_calc(metrics, missing_qrels, empty_res),
                    metrics, rev_mapping, num_q, perquery=True))

    if not perquery:
        # aggregate measures if not in per query mode
        aggregators = {rev_mapping.get(m, str(m)): m.aggregator() for m in metrics}
        for q in evalMeasuresDict:
            for metric in metrics:
                s_metric = rev_mapping.get(metric, str(metric))
                aggregators[s_metric].add(evalMeasuresDict[q][s_metric])
        evalMeasuresDict = {m: agg.result() for m, agg in aggregators.items()}

    return (runtime, evalMeasuresDict)
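# --- Usage sketch ---
# _run_and_evaluate is internal to PyTerrier's Experiment machinery; the public entry
# point is pt.Experiment. The system/topics/qrels arguments below are placeholders.
import pyterrier as pt
from ir_measures import nDCG, RR

def example_experiment(system, topics, qrels):
    # eval_metrics accepts ir_measures measure objects alongside trec_eval metric names
    return pt.Experiment(
        [system], topics, qrels,
        eval_metrics=["map", nDCG@10, RR@10],
        perquery=False,   # True returns the per-query dict built above
        batch_size=None,  # an int exercises the batched transform_gen branch
    )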
def _run_and_evaluate(
        system: SYSTEM_OR_RESULTS_TYPE,
        topics: pd.DataFrame,
        qrels: pd.DataFrame,
        metrics: MEASURES_TYPE,
        perquery: bool = False,
        batch_size=None):

    metrics, rev_mapping = _convert_measures(metrics)
    qrels = qrels.rename(columns={'qid': 'query_id', 'docno': 'doc_id', 'label': 'relevance'})
    from timeit import default_timer as timer
    runtime = 0
    num_q = qrels['query_id'].nunique()

    # if it's a DataFrame, use it as the results
    if isinstance(system, pd.DataFrame):
        res = system
        if len(res) == 0:
            raise ValueError("%d topics, but no results in dataframe" % len(topics))
        evalMeasuresDict = _ir_measures_to_dict(
            ir_measures.iter_calc(metrics, qrels, res.rename(columns=_irmeasures_columns)),
            metrics, rev_mapping, num_q, perquery)
    elif batch_size is None:
        # transformer, evaluate all queries at once
        starttime = timer()
        res = system.transform(topics)
        endtime = timer()
        runtime = (endtime - starttime) * 1000.
        if len(res) == 0:
            raise ValueError("%d topics, but no results received from %s" % (len(topics), str(system)))
        evalMeasuresDict = _ir_measures_to_dict(
            ir_measures.iter_calc(metrics, qrels, res.rename(columns=_irmeasures_columns)),
            metrics, rev_mapping, num_q, perquery)
    else:
        # transformer, evaluate queries in batches
        assert batch_size > 0
        starttime = timer()
        evalMeasuresDict = {}
        for i, res in enumerate(system.transform_gen(topics, batch_size=batch_size)):
            if len(res) == 0:
                raise ValueError("batch of %d topics, but no results received in batch %d from %s" % (batch_size, i, str(system)))
            endtime = timer()
            runtime += (endtime - starttime) * 1000.
            localEvalDict = _ir_measures_to_dict(
                ir_measures.iter_calc(metrics, qrels, res.rename(columns=_irmeasures_columns)),
                metrics, rev_mapping, num_q, True)
            evalMeasuresDict.update(localEvalDict)
            starttime = timer()

    if not perquery:
        aggregators = {rev_mapping.get(m, str(m)): m.aggregator() for m in metrics}
        for q in evalMeasuresDict:
            for metric in metrics:
                s_metric = rev_mapping.get(metric, str(metric))
                aggregators[s_metric].add(evalMeasuresDict[q][s_metric])
        evalMeasuresDict = {m: agg.result() for m, agg in aggregators.items()}

    return (runtime, evalMeasuresDict)
def create_query_objects(self, run_1, run_2, qids, qid2diff, metric_name, dataset, qid2qrelscores=None):
    """
    TODO: Needs a better name

    Takes two runs and a list of qids, and constructs a dict for each qid (format below).

    :param run_1: TREC run of the format {qid: {docid: score}, ...}
    :param run_2: TREC run of the format {qid: {docid: score}, ...}
    :param qids: A list of qids (strings)
    :param qid2diff: Dict mapping each qid to its metric difference between the two runs
    :param metric_name: Name of the metric used for the contrast
    :param dataset: Instance of an ir-datasets object
    :param qid2qrelscores: Optional dict mapping each qid to [run_1 score, run_2 score]
    :return: A list of dicts. Each dict has the following format:
        {
            "fields": {"query_id": "qid", "title": "Title query", "desc": "Can be empty",
                       ... everything else in the ir-datasets query},
            "run_1": [
                {
                    "doc_id": "id of the doc",
                    "score": <score>,
                    "relevance": <comes from qrels>,
                    "weights": [
                        [field, start, stop, weight]
                        # ^ Needs more clarity. Return an empty list for now
                    ]
                }
            ],
            "run_2": <same format as run_1>
        }
    """
    assert dataset.has_qrels(), "Cannot determine whether the doc is relevant - need qrels"
    qrels = dataset.qrels_dict()

    measures = [P@1, P@3, P@5, P@10, nDCG@1, nDCG@3, nDCG@5, nDCG@10]
    run1_metrics = defaultdict(lambda: defaultdict(lambda: None))
    for metrics in iter_calc(measures, qrels, run_1):
        run1_metrics[metrics.query_id][str(metrics.measure)] = metrics.value
    if run_2:
        run2_metrics = defaultdict(lambda: defaultdict(lambda: None))
        for metrics in iter_calc(measures, qrels, run_2):
            run2_metrics[metrics.query_id][str(metrics.measure)] = metrics.value

    docstore = dataset.docs_store()
    qids_set = set(qids)  # sets do O(1) lookups
    qid2object = {}
    for query in tqdm(dataset.queries_iter(), desc="analyzing queries"):
        if query.query_id not in qids_set:
            continue

        RESULT_COUNT = 10
        doc_ids = (set(list(run_1[query.query_id])[:RESULT_COUNT] + list(run_2[query.query_id])[:RESULT_COUNT])
                   if run_2 else list(run_1[query.query_id])[:RESULT_COUNT])
        fields = query._asdict()
        fields["contrast"] = {"name": metric_name, "value": qid2diff[query.query_id]}
        if qid2qrelscores:
            fields[f"Run1 {metric_name}"] = qid2qrelscores[query.query_id][0]
            fields[f"Run2 {metric_name}"] = qid2qrelscores[query.query_id][1]
        qrels_for_query = qrels.get(query.query_id, {})

        run_1_for_query = []
        for rank, (doc_id, score) in enumerate(run_1[query.query_id].items()):
            if doc_id not in doc_ids:
                continue
            doc = docstore.get(doc_id)
            weights = self.weight.score_document_regions(query, doc, 0)
            run_1_for_query.append({
                "doc_id": doc_id,
                "score": score,
                "relevance": qrels_for_query.get(doc_id),
                "rank": rank + 1,
                "weights": weights,
                "snippet": self.find_snippet(weights, doc),
            })

        run_2_for_query = []
        if run_2 is not None:
            for rank, (doc_id, score) in enumerate(run_2[query.query_id].items()):
                if doc_id not in doc_ids:
                    continue
                doc = docstore.get(doc_id)
                weights = self.weight.score_document_regions(query, doc, 1)
                run_2_for_query.append({
                    "doc_id": doc_id,
                    "score": score,
                    "relevance": qrels_for_query.get(doc_id),
                    "rank": rank + 1,
                    "weights": weights,
                    "snippet": self.find_snippet(weights, doc),
                })

        qid2object[query.query_id] = {
            "fields": fields,
            "metrics": {
                metric: ([run1_metrics[query.query_id][metric], run2_metrics[query.query_id][metric]]
                         if run_2 else [run1_metrics[query.query_id][metric]])
                for metric in ["P@1", "P@3", "P@5", "P@10", "nDCG@1", "nDCG@3", "nDCG@5", "nDCG@10"]
            },
            "run_1": run_1_for_query,
            "run_2": run_2_for_query,
            "summary": self.create_summary(run_1_for_query, run_2_for_query),
            "mergedWeights": self.merge_weights(run_1_for_query, run_2_for_query),
        }

    return [qid2object[qid] for qid in qids]
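# --- Illustrative sketch of the ir_datasets calls relied on above ---
# The dataset id is only an example; any dataset exposing qrels and a docs store works.
import ir_datasets

def example_dataset_access(dataset_id="antique/test"):
    dataset = ir_datasets.load(dataset_id)
    qrels = dataset.qrels_dict()      # {query_id: {doc_id: relevance}}
    docstore = dataset.docs_store()   # random access to documents by doc_id
    some_qid = next(iter(qrels))
    some_doc_id = next(iter(qrels[some_qid]))
    return docstore.get(some_doc_id)  # namedtuple with the document's fields (e.g. .text)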