def qrel_metrics(qrel_file, run_file, metrics=('ndcg', 'map')):
    """Get metrics (ndcg and map by default) for a run compared to a qrel file.

    Arguments:
        qrel_file -- qrel file with ground truth data
        run_file -- predictions from the run
        metrics -- which metrics to evaluate on, can use any valid metrics
                   that the trec_eval tool accepts

    Returns:
        metric_values -- dictionary of metric values (out of 100), rounded to
                         two decimal places
    """
    with open(qrel_file, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(run_file, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, set(metrics))
    results = evaluator.evaluate(run)

    metric_values = {}
    for measure in sorted(metrics):
        res = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])
        metric_values[measure] = np.round(100 * res, 2)
    return metric_values
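# A minimal usage sketch for qrel_metrics above (it assumes pytrec_eval and
# numpy as np are already imported in this module); the qrel/run file names
# are hypothetical placeholders, and any measure string accepted by trec_eval
# can be passed, e.g. 'P_10' or 'ndcg_cut_10'.
def example_qrel_metrics():
    scores = qrel_metrics('qrels.test.txt', 'run.bm25.txt',
                          metrics=('ndcg', 'map', 'P_10'))
    # scores holds percentages rounded to two decimals, e.g. {'map': 23.41, ...}
    for name, value in sorted(scores.items()):
        print('{:10s}{:.2f}'.format(name, value))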
def get_metric(self, qrels: str, trec: str, metric: str = 'ndcg_cut_10',
               split: dict = None, split_idx: int = -1) -> float:
    with open(qrels, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(trec, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    # partial evaluation: keep only the queries belonging to the requested split
    if split is not None and split_idx >= 0:
        for qid in copy.deepcopy(run):
            if qid not in split[split_idx]:
                _ = run.pop(qid)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, pytrec_eval.supported_measures)
    results = evaluator.evaluate(run)

    # grab the per-query measures of the last query to enumerate measure names
    for query_id, query_measures in sorted(results.items()):
        pass

    # aggregate every measure over all queries, then return the requested one
    mes = {}
    for measure in sorted(query_measures.keys()):
        mes[measure] = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])
    return mes[metric]
def show(self, metrics):
    result = {}
    for metric in metrics:
        res = pytrec_eval.compute_aggregated_measure(
            metric, [user[metric] for user in self.result.values()])
        result[metric] = res
        # print('{}={}'.format(metric, res))
    return result
def cal_ndcg(qrels, trec, k):
    with open(qrels, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(trec, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, pytrec_eval.supported_measures)
    results = evaluator.evaluate(run)

    # grab the per-query measures of the last query to enumerate measure names
    for query_id, query_measures in sorted(results.items()):
        pass

    mes = {}
    for measure in sorted(query_measures.keys()):
        mes[measure] = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])

    metric = 'ndcg_cut_%d' % k
    if metric not in mes:
        print('Depth of NDCG not available.')
        exit()
    ndcg = mes[metric]
    return ndcg
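# Usage sketch for cal_ndcg above; both paths are hypothetical placeholders.
# Note that trec_eval reports ndcg_cut only at its default depths (typically
# 5, 10, 15, 20, 30, 100, 200, 500, 1000), so k should be one of those values;
# otherwise the function prints a message and exits.
def example_cal_ndcg():
    ndcg_at_10 = cal_ndcg('qrels.txt', 'run.trec.txt', k=10)
    print('ndcg_cut_10 = %.4f' % ndcg_at_10)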
def perform_reranking(run, qfield, queries, docnos, doc_embs, word_dict,
                      word_embs, sweep, normalizer, ref_measure, evaluator):
    """Perform re-ranking of the input run w/ the semantic model."""
    # loop over weight values with step size equal to sweep
    for weight in np.arange(0.0, 1.0, sweep):
        # generate combined run with current weight
        combined_run = compute_combined_run(run, qfield, queries, docnos,
                                            doc_embs, word_dict, word_embs,
                                            normalizer, weight)
        # evaluate combined run
        results = evaluator.evaluate(combined_run)
        # compute aggregated measure score
        agg_measure_score = pytrec_eval.compute_aggregated_measure(
            ref_measure, [qscore[ref_measure] for qscore in results.values()])
        # yield aggregated measure score and weight
        yield agg_measure_score, weight
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('qrel')
    parser.add_argument('run')
    args = parser.parse_args()

    assert os.path.exists(args.qrel)
    assert os.path.exists(args.run)

    with open(args.qrel, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(args.run, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, pytrec_eval.supported_measures)
    results = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    for query_id, query_measures in sorted(results.items()):
        for measure, value in sorted(query_measures.items()):
            print_line(measure, query_id, value)

    # Scope hack: use query_measures of last item in previous loop to
    # figure out all unique measure names.
    #
    # TODO(cvangysel): add member to RelevanceEvaluator
    # with a list of measure names.
    for measure in sorted(query_measures.keys()):
        print_line(
            measure, 'all',
            pytrec_eval.compute_aggregated_measure(
                measure,
                [query_measures[measure] for query_measures in results.values()]))
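# A self-contained sketch of the same evaluation flow without input files.
# pytrec_eval also accepts qrels and runs as plain nested dicts
# ({query_id: {doc_id: relevance or score}}), which is exactly what
# parse_qrel/parse_run return; the query ids, doc ids and scores below are
# made up purely for illustration.
def example_in_memory_eval():
    import pytrec_eval

    qrel = {'q1': {'d1': 1, 'd2': 0, 'd3': 2},
            'q2': {'d2': 1, 'd4': 1}}
    run = {'q1': {'d1': 1.2, 'd2': 0.9, 'd3': 0.3},
           'q2': {'d4': 1.5, 'd1': 0.2}}

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, {'map', 'ndcg'})
    results = evaluator.evaluate(run)  # {query_id: {measure: value}}

    for measure in ('map', 'ndcg'):
        agg = pytrec_eval.compute_aggregated_measure(
            measure, [qm[measure] for qm in results.values()])
        print('{:25s}{:8s}{:.4f}'.format(measure, 'all', agg))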
def evaluate(self, metrics):
    if platform.system().lower().startswith("win"):
        print("Cannot evaluate result on Windows platform.")
        self.final_measures = dict({"P_20": 0.1})
        return self.final_measures

    evaluator = pytrec_eval.RelevanceEvaluator(self.qrels, set(self.all_metrics))
    results = evaluator.evaluate(self.predicted_qrels)

    final_measures = dict()
    for measure in metrics:
        final_measures[measure] = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])
    self.final_measures = final_measures
    return self.final_measures
def get_metric(self, qrels: str, trec: str, metric: str = 'ndcg_cut_10') -> float:
    with open(qrels, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(trec, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, pytrec_eval.supported_measures)
    results = evaluator.evaluate(run)

    # grab the per-query measures of the last query to enumerate measure names
    for query_id, query_measures in sorted(results.items()):
        pass

    mes = {}
    for measure in sorted(query_measures.keys()):
        mes[measure] = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])
    return mes[metric]
def evaluate(qrels_df, run_df, aggregated_measures=None):
    # avoid a mutable default argument; these are the measures aggregated by default
    if aggregated_measures is None:
        aggregated_measures = {'recall_1000': '', 'ndcg': '', 'Rprec': '', 'P_10': ''}
    MEASURES_AGGREGATED = aggregated_measures

    evaluator = pytrec_eval.RelevanceEvaluator(
        utils.qrels_to_pytrec_eval(qrels_df), pytrec_eval.supported_measures)
    results = evaluator.evaluate(utils.run_to_pytrec_eval(run_df))

    for measure in MEASURES_AGGREGATED.keys():
        measure_all = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])
        MEASURES_AGGREGATED[measure] = round(measure_all, 4)

    return (results, MEASURES_AGGREGATED)
def __test(self):
    with open(os.path.join(TREC_EVAL_TEST_DIR, ground_truth_filename)) as f_trec_eval:
        trec_eval_output = parse_trec_eval(f_trec_eval)

    # resolve abbreviated measure names to the ones pytrec_eval supports
    measures = set(
        measure if measure in pytrec_eval.supported_measures
        else prefix_match(measure, pytrec_eval.supported_measures)
        for measure in trec_eval_output['all'].keys())

    with open(os.path.join(TREC_EVAL_TEST_DIR, qrel_filename)) as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(os.path.join(TREC_EVAL_TEST_DIR, run_filename)) as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures, **kwargs)
    results = evaluator.evaluate(run)

    expected_measures = trec_eval_output['all']
    for measure in expected_measures:
        agg_measure_value = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measure_values[measure]
             for query_measure_values in results.values()])
        ground_truth_agg_measure_value = trec_eval_output['all'][measure]
        self.assertAlmostEqual(agg_measure_value,
                               ground_truth_agg_measure_value,
                               places=3,
                               msg=measure)
def main():
    os.chdir(os.path.dirname(os.path.realpath('__file__')))
    # set folders
    corpus_folder = 'corpus/' + FLAGS.corpus_name + '/' + FLAGS.corpus_name
    data_folder = 'corpus/' + FLAGS.corpus_name + '/data'
    query_folder = 'corpus/' + FLAGS.corpus_name + '/queries'
    qrels_folder = 'corpus/' + FLAGS.corpus_name + '/qrels'
    rankings_folder = 'corpus/' + FLAGS.corpus_name + '/rankings/' + FLAGS.model_name

    # create folders
    if not os.path.exists(rankings_folder):
        os.makedirs(rankings_folder)
    if not os.path.exists(query_folder) or not os.path.exists(qrels_folder):
        print('folders containing queries and qrels are required - please add them')
        return False

    # parse and store qrels
    if FLAGS.qrels_fname:
        with open(qrels_folder + '/' + FLAGS.qrels_fname + '.txt', 'r') as qrelf:
            qrels = pytrec_eval.parse_qrel(qrelf)
        # initialize evaluator over qrels (evaluate on Precision)
        evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'P'})
    else:
        print('please provide qrels filename')
        return False

    """
    LEXICAL PREPROCESSING
    """
    # parse input run
    print('parse input run')
    with open(FLAGS.run_path, 'r') as runf:
        run = pytrec_eval.parse_run(runf)

    """
    SEMANTIC PREPROCESSING
    """
    # load required data
    print('load processed data required to perform re-ranking over lexical model w/ semantic model')
    with open(data_folder + '/docs.json', 'r') as cf:
        corpus = json.load(cf)
    with open(data_folder + '/idfs.json', 'r') as wf:
        idfs = json.load(wf)
    with open(data_folder + '/cfs.json', 'r') as cff:
        cfs = json.load(cff)
    with open(data_folder + '/word_dict.json', 'r') as wdf:
        word_dict = json.load(wdf)

    # compute reverse word dictionary
    reverse_word_dict = dict(zip(word_dict.values(), word_dict.keys()))
    # store docnos and docs as separate lists
    docnos = list(corpus.keys())
    docs = list(corpus.values())
    del corpus  # free memory space

    # load semantic model
    print('load semantic model')
    with tf.Session() as sess:
        # restore model and get required tensors
        saver = tf.train.import_meta_graph(FLAGS.semantic_model + '.ckpt.meta')
        saver.restore(sess, FLAGS.semantic_model + '.ckpt')
        word_embs = sess.run(
            tf.get_default_graph().get_tensor_by_name('embeddings/word_embs:0'))

    # compute doc embeddings
    doc_embs, filt_ids = tf_utils.compute_doc_embs(docs, word_dict, word_embs, idfs)

    """
    COMPUTE RE-RANKING
    """
    # set random seed
    np.random.seed(FLAGS.seed)
    # load queries
    q = tf_utils.read_ohsu_queries(query_folder + '/' + FLAGS.query_fname)
    # get query ids and shuffle them
    qids = list(q.keys())
    np.random.shuffle(qids)

    if FLAGS.fixed_gamma:
        # perform re-ranking based on a fixed value of gamma
        print('perform re-ranking w/ gamma=%.2f' % (FLAGS.fixed_gamma))
        # initialize combined (output) run
        crun = trec_utils.OnlineTRECRun(
            FLAGS.model_name + '_gamma_' + str(FLAGS.fixed_gamma))
        # combine rankings using fixed gamma
        comb_run = tf_utils.compute_combined_run(
            run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
            SCORE_NORMALIZERS[FLAGS.normalizer], FLAGS.fixed_gamma)
        # store test ranking in combined run
        for qid, doc_ids_and_scores in comb_run.items():
            crun.add_ranking(
                qid, [(score, docno) for docno, score in doc_ids_and_scores.items()])
        # close and store run
        crun.close_and_write(
            out_path=rankings_folder + '/' + FLAGS.model_name + '_gamma_' +
            str(FLAGS.fixed_gamma) + '.txt',
            overwrite=True)
        print('combined run stored in {}'.format(rankings_folder))
        # evaluate combined run
        print('evaluate run combined w/ gamma=%.2f' % (FLAGS.fixed_gamma))
        tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                          FLAGS.model_name + '_gamma_' + str(FLAGS.fixed_gamma),
                          qrels_folder, FLAGS.qrels_fname)
    else:
        # learn optimal weight to combine runs
        print('learn optimal weight to combine runs with sweep: {}'.format(FLAGS.sweep))
        # set variable to store scores and weights
        scores_and_weights = []
        # initialize kfold with FLAGS.num_folds
        kfold = sklearn.model_selection.KFold(n_splits=FLAGS.num_folds)
        for fold, (train_qids, test_qids) in enumerate(kfold.split(qids)):
            print('fold n. {}'.format(fold))
            # restrict queries to train_qids and test_qids
            qtrain = {qids[ix]: q[qids[ix]] for ix in train_qids}
            qtest = {qids[ix]: q[qids[ix]] for ix in test_qids}
            # obtain best combination on training queries
            train_score, best_train_weight = max(
                tf_utils.perform_reranking(
                    run, FLAGS.qfield, qtrain, docnos, doc_embs, word_dict,
                    word_embs, FLAGS.sweep, SCORE_NORMALIZERS[FLAGS.normalizer],
                    FLAGS.ref_measure, evaluator))
            print('fold %d: best_train_weight=%.2f, %s=%.4f' %
                  (fold, best_train_weight, FLAGS.ref_measure, train_score))
            # compute combined run with best combination on test queries
            test_crun = tf_utils.compute_combined_run(
                run, FLAGS.qfield, qtest, docnos, doc_embs, word_dict, word_embs,
                SCORE_NORMALIZERS[FLAGS.normalizer], best_train_weight)
            # evaluate test run
            test_res = evaluator.evaluate(test_crun)
            # compute aggregated measure score for test queries
            test_score = pytrec_eval.compute_aggregated_measure(
                FLAGS.ref_measure,
                [qscore[FLAGS.ref_measure] for qscore in test_res.values()])
            # store averaged scores w/ best weights
            scores_and_weights.append(
                (np.mean([train_score, test_score]), best_train_weight))

        # get the weight that produces the highest averaged score
        best_score, best_weight = max(scores_and_weights)
        print('found best weight=%.2f' % (best_weight))
        # initialize combined (output) run
        crun = trec_utils.OnlineTRECRun(
            FLAGS.model_name + '_best_weight_' + str(best_weight))
        # compute combined run based on the learned best weight
        comb_run = tf_utils.compute_combined_run(
            run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
            SCORE_NORMALIZERS[FLAGS.normalizer], best_weight)
        # store ranking in crun
        for qid, doc_ids_and_scores in comb_run.items():
            crun.add_ranking(
                qid, [(score, doc_id) for doc_id, score in doc_ids_and_scores.items()])
        # close and store run
        crun.close_and_write(
            out_path=rankings_folder + '/' + FLAGS.model_name + '_best_weight_' +
            str(best_weight) + '.txt',
            overwrite=True)
        print('combined run stored in {}'.format(rankings_folder))
        # evaluate combined run
        print('evaluate run combined w/ {}-fold cross validation and best weight={}'
              .format(FLAGS.num_folds, best_weight))
        tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                          FLAGS.model_name + '_best_weight_' + str(best_weight),
                          qrels_folder, FLAGS.qrels_fname)
def main():
    os.chdir(os.path.dirname(os.path.realpath('__file__')))
    # set folders
    corpus_folder = 'corpus/' + FLAGS.corpus_name + '/' + FLAGS.corpus_name
    index_folder = 'corpus/' + FLAGS.corpus_name + '/index'
    # model_folder = 'corpus/' + FLAGS.corpus_name + '/models/' + FLAGS.model_name
    data_folder = 'corpus/' + FLAGS.corpus_name + '/data'
    query_folder = 'corpus/' + FLAGS.corpus_name + '/queries'
    qrels_folder = 'corpus/' + FLAGS.corpus_name + '/qrels'
    rankings_folder = 'corpus/' + FLAGS.corpus_name + '/rankings/' + FLAGS.model_name

    # create folders
    if not os.path.exists(rankings_folder):
        os.makedirs(rankings_folder)
    # if not os.path.exists(model_folder):
    #     os.makedirs(model_folder)
    if not os.path.exists(query_folder) or not os.path.exists(qrels_folder):
        print('folders containing queries and qrels are required - please add them')
        return False

    # set random seed - enable reproducibility
    np.random.seed(FLAGS.seed)
    # establish connection with UMLS db
    umls_lookup = umls.UMLSLookup()

    # load required data
    print('load processed data required to retrofit word vectors and perform retrieval tasks')
    with open(data_folder + '/docs.json', 'r') as df:
        corpus = json.load(df)
    with open(data_folder + '/idfs.json', 'r') as wf:
        idfs = json.load(wf)
    with open(data_folder + '/cfs.json', 'r') as cff:
        cfs = json.load(cff)
    with open(data_folder + '/word_dict.json', 'r') as wdf:
        word_dict = json.load(wdf)

    # compute reverse word dict
    reverse_word_dict = dict(zip(word_dict.values(), word_dict.keys()))
    # store docnos and docs as separate lists
    docnos = list(corpus.keys())
    docs = list(corpus.values())
    del corpus  # free memory space

    # pre-process relational data
    if not os.path.exists(data_folder + '/term2cui.json'):
        # map terms to cuis using QuickUMLS
        term2cui = tf_utils.get_term2cui(word_dict, data_folder,
                                         threshold=FLAGS.threshold,
                                         stypes_fname=FLAGS.stypes_fname)
    else:
        # load (term, cui) pairs
        print('load (term, cui) pairs')
        with open(data_folder + '/term2cui.json', 'r') as tcf:
            term2cui = json.load(tcf)

    """
    SEMANTIC PROCESSING
    """
    # load semantic model
    print('load semantic model')
    with tf.Session() as sess:
        # restore model and get required tensors
        saver = tf.train.import_meta_graph(FLAGS.semantic_model + '.ckpt.meta')
        saver.restore(sess, FLAGS.semantic_model + '.ckpt')
        word_embs = sess.run(
            tf.get_default_graph().get_tensor_by_name('embeddings/word_embs:0'))

    """
    RETROFITTING
    """
    if FLAGS.retrofit:
        # get synonyms for each word within vocabulary
        print('get synonyms')
        syns = tf_utils.get_syns(term2cui, word_dict, umls_lookup)
        if FLAGS.syn_weights:
            # convert collection frequencies from list to dict
            cfs = dict(cfs)
        else:
            cfs = None
        # retrofit word vectors
        print('retrofit word vectors for {} iterations'.format(FLAGS.iterations))
        word_embs = retrofit(word_embs, syns, reverse_word_dict, FLAGS.iterations,
                             alpha=1.0, beta=FLAGS.beta, cfs=cfs)

    # compute doc embeddings
    print('compute document vectors w/ retrofitted word vectors')
    doc_embs, filt_ids = tf_utils.compute_doc_embs(docs, word_dict, word_embs, idfs)

    if not FLAGS.reranking:
        """
        RETRIEVAL
        """
        print('perform retrieval over the entire collection')
        # load queries
        q = tf_utils.read_ohsu_queries(query_folder + '/' + FLAGS.query_fname)
        # set query embs and ids
        q_embs = []
        q_ids = []
        # loop over queries and generate rankings
        for qid, qtext in q.items():
            # prepare queries for semantic matching
            q_proj = tf_utils.prepare_query(qtext[FLAGS.qfield], word_dict, word_embs)
            if q_proj is None:
                print('query {} does not contain known terms'.format(qid))
            else:
                q_embs.append(q_proj)
                q_ids.append(qid)
        q_embs = np.array(q_embs)

        # perform search and evaluate model effectiveness
        tf_utils.semantic_search(docnos, doc_embs, q_ids, q_embs,
                                 rankings_folder, FLAGS.model_name)
        scores = tf_utils.evaluate(['Rprec', 'P_5', 'P_10', 'P_20', 'ndcg', 'map'],
                                   rankings_folder, FLAGS.model_name,
                                   qrels_folder, FLAGS.qrels_fname)
    else:
        """
        RE-RANKING
        """
        print('perform re-ranking over top 1000 documents from a baseline run')
        # parse and store qrels
        with open(qrels_folder + '/' + FLAGS.qrels_fname + '.txt', 'r') as qrelf:
            qrels = pytrec_eval.parse_qrel(qrelf)
        # initialize evaluator over qrels (evaluate on Precision)
        evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'P'})

        # parse input run
        print('parse input run')
        with open(FLAGS.run_path, 'r') as runf:
            run = pytrec_eval.parse_run(runf)

        # load queries
        q = tf_utils.read_ohsu_queries(query_folder + '/' + FLAGS.query_fname)
        # get query ids and shuffle them
        qids = list(q.keys())
        np.random.shuffle(qids)

        if FLAGS.fixed_gamma:
            # perform re-ranking based on a fixed value of gamma
            print('perform re-ranking w/ gamma=%.2f' % (FLAGS.fixed_gamma))
            # initialize combined (output) run
            crun = trec_utils.OnlineTRECRun(
                FLAGS.model_name + '_gamma_' + str(FLAGS.fixed_gamma))
            # combine rankings using fixed gamma
            comb_run = tf_utils.compute_combined_run(
                run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
                SCORE_NORMALIZERS[FLAGS.normalizer], FLAGS.fixed_gamma)
            # store test ranking in combined run
            for qid, doc_ids_and_scores in comb_run.items():
                crun.add_ranking(
                    qid, [(score, docno) for docno, score in doc_ids_and_scores.items()])
            # close and store run
            crun.close_and_write(
                out_path=rankings_folder + '/' + FLAGS.model_name + '_gamma_' +
                str(FLAGS.fixed_gamma) + '.txt',
                overwrite=True)
            print('combined run stored in {}'.format(rankings_folder))
            # evaluate combined run
            print('evaluate run combined w/ gamma=%.2f' % (FLAGS.fixed_gamma))
            tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                              FLAGS.model_name + '_gamma_' + str(FLAGS.fixed_gamma),
                              qrels_folder, FLAGS.qrels_fname)
        else:
            # learn optimal weight to combine runs
            print('learn optimal weight to combine runs with sweep: {}'.format(FLAGS.sweep))
            # set variable to store scores and weights
            scores_and_weights = []
            # initialize kfold with FLAGS.num_folds
            kfold = sklearn.model_selection.KFold(n_splits=FLAGS.num_folds)
            for fold, (train_qids, test_qids) in enumerate(kfold.split(qids)):
                print('fold n. {}'.format(fold))
                # restrict queries to train_qids and test_qids
                qtrain = {qids[ix]: q[qids[ix]] for ix in train_qids}
                qtest = {qids[ix]: q[qids[ix]] for ix in test_qids}
                # obtain best combination on training queries
                train_score, best_train_weight = max(
                    tf_utils.perform_reranking(
                        run, FLAGS.qfield, qtrain, docnos, doc_embs, word_dict,
                        word_embs, FLAGS.sweep, SCORE_NORMALIZERS[FLAGS.normalizer],
                        FLAGS.ref_measure, evaluator))
                print('fold %d: best_train_weight=%.2f, %s=%.4f' %
                      (fold, best_train_weight, FLAGS.ref_measure, train_score))
                # compute combined run with best combination on test queries
                test_crun = tf_utils.compute_combined_run(
                    run, FLAGS.qfield, qtest, docnos, doc_embs, word_dict, word_embs,
                    SCORE_NORMALIZERS[FLAGS.normalizer], best_train_weight)
                # evaluate test run
                test_res = evaluator.evaluate(test_crun)
                # compute aggregated measure score for test queries
                test_score = pytrec_eval.compute_aggregated_measure(
                    FLAGS.ref_measure,
                    [qscore[FLAGS.ref_measure] for qscore in test_res.values()])
                # store averaged scores w/ best weights
                scores_and_weights.append(
                    (np.mean([train_score, test_score]), best_train_weight))

            # get the weight that produces the highest averaged score
            best_score, best_weight = max(scores_and_weights)
            print('found best weight=%.2f' % (best_weight))
            # initialize combined (output) run
            crun = trec_utils.OnlineTRECRun(
                FLAGS.model_name + '_best_weight_' + str(best_weight))
            # compute combined run based on the learned best weight
            comb_run = tf_utils.compute_combined_run(
                run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
                SCORE_NORMALIZERS[FLAGS.normalizer], best_weight)
            # store ranking in crun
            for qid, doc_ids_and_scores in comb_run.items():
                crun.add_ranking(
                    qid, [(score, doc_id) for doc_id, score in doc_ids_and_scores.items()])
            # close and store run
            crun.close_and_write(
                out_path=rankings_folder + '/' + FLAGS.model_name + '_best_weight_' +
                str(best_weight) + '.txt',
                overwrite=True)
            print('combined run stored in {}'.format(rankings_folder))
            # evaluate combined run
            print('evaluate run combined w/ {}-fold cross validation and best weight={}'
                  .format(FLAGS.num_folds, best_weight))
            tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                              FLAGS.model_name + '_best_weight_' + str(best_weight),
                              qrels_folder, FLAGS.qrels_fname)
def eval_ranking_pred(label_file, pred_file):
    # load labels
    labels = []
    with jsonlines.open(label_file, mode='r') as reader:
        for file in reader:
            labels.append(file)

    label_dict = {}
    for label in labels:
        label_dict.update({label.get('guid'): label.get('label')})
    print(len(labels))
    print(len(label_dict))

    # load predictions
    with open(pred_file, 'r') as reader:
        content = reader.read().splitlines()
    predictions = [ast.literal_eval(file) for file in content]
    print(len(predictions))

    pred_dict = {}
    pos = []
    for pred in predictions:
        # for binary labels this is 1 instead of 0 (for MSELoss output)
        pred_dict.update({pred.get('guid'):
                          max(pred.get('res')) + 100 * np.argmax(pred.get('res')) + 100})
        # for binary labels this is 1 instead of 0 (for MSELoss)
        pos.append(min(pred.get('res')))
    print(min(pos))
    assert abs(min(pos)) < 100

    files = list(label_dict.keys())
    files.sort()

    # build qrels dict: {query_id: {doc_id: relevance}}
    qrels = {}
    for file in files:
        qrels.update({file.split('_')[0]: {}})
    for file in files:
        qrels.get(file.split('_')[0]).update({file.split('_')[1]: label_dict.get(file)})

    # build run dict: {query_id: {doc_id: score}}
    run = {}
    for file in files:
        run.update({file.split('_')[0]: {}})
    for file in files:
        if pred_dict.get(file):
            run.get(file.split('_')[0]).update({file.split('_')[1]: pred_dict.get(file)})
        else:
            run.get(file.split('_')[0]).update({file.split('_')[1]: 0})

    # trec eval
    evaluator = pytrec_eval.RelevanceEvaluator(
        qrels, {'map', 'P_1', 'recall_1', 'P_2', 'recall_2'})
    results = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    def write_line(measure, scope, value):
        return '{:25s}{:8s}{:.4f}'.format(measure, scope, value)

    for query_id, query_measures in sorted(results.items()):
        for measure, value in sorted(query_measures.items()):
            print_line(measure, query_id, value)

    # Scope hack: use query_measures of last item in previous loop to
    # figure out all unique measure names.
    #
    # TODO(cvangysel): add member to RelevanceEvaluator
    # with a list of measure names.
    for measure in sorted(query_measures.keys()):
        print_line(
            measure, 'all',
            pytrec_eval.compute_aggregated_measure(
                measure,
                [query_measures[measure] for query_measures in results.values()]))

    with open(pred_file.split('.txt')[0] + '_eval_200_3.txt', 'w') as output:
        for measure in sorted(query_measures.keys()):
            output.write(write_line(
                measure, 'all',
                pytrec_eval.compute_aggregated_measure(
                    measure,
                    [query_measures[measure] for query_measures in results.values()])) + '\n')
def eval_ranking_bm25(label_file, bm25_folder):
    labels = []
    with jsonlines.open(label_file, mode='r') as reader:
        for file in reader:
            labels.append(file)

    label_dict = {}
    for label in labels:
        label_dict.update({label.get('guid'): label.get('label')})

    files = list(label_dict.keys())
    files.sort()

    # build qrels dict: {query_id: {doc_id: relevance}}
    qrels = {}
    for file in files:
        qrels.update({file.split('_')[0]: {}})
    for file in files:
        print(file.split('_')[0])
        qrels.get(file.split('_')[0]).update({file.split('_')[1]: label_dict.get(file)})

    # build run dict from the BM25 top-50 rankings: {query_id: {doc_id: score}}
    run = {}
    for file in files:
        run.update({file.split('_')[0]: {}})
    for key in list(run.keys()):
        # use .xml instead of .txt for the CLEF-IP corpus (top-50 files, different splitting as well)
        with open(os.path.join(bm25_folder, 'bm25_top50_{}.txt'.format(key)), 'r') as out:
            # text = [text.split('-')[0] + '-' + text.split('-')[1] for text in
            #         [text.split('\n')[0].strip() for text in out.readlines()]]
            text = [text.split('_')[1] for text in
                    [text.split('\n')[0].strip() for text in out.readlines()]]
        for file in files:
            if file.split('_')[1] in text:
                run.get(key).update(
                    {text[text.index(file.split('_')[1])]:
                     len(text) - text.index(file.split('_')[1])})
            else:
                run.get(key).update({file.split('_')[1]: 0})

    # trec eval
    evaluator = pytrec_eval.RelevanceEvaluator(
        qrels, {'map', 'P_1', 'recall_1', 'P_2', 'recall_2'})  # or pytrec_eval.supported_measures
    results = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    def write_line(measure, scope, value):
        return '{:25s}{:8s}{:.4f}'.format(measure, scope, value)

    for query_id, query_measures in sorted(results.items()):
        for measure, value in sorted(query_measures.items()):
            print_line(measure, query_id, value)

    # Scope hack: use query_measures of last item in previous loop to
    # figure out all unique measure names.
    #
    # TODO(cvangysel): add member to RelevanceEvaluator
    # with a list of measure names.
    for measure in sorted(query_measures.keys()):
        print_line(
            measure, 'all',
            pytrec_eval.compute_aggregated_measure(
                measure,
                [query_measures[measure] for query_measures in results.values()]))

    with open(os.path.join(bm25_folder, 'eval_bm25_200_3.txt'), 'w') as output:
        for measure in sorted(query_measures.keys()):
            output.write(write_line(
                measure, 'all',
                pytrec_eval.compute_aggregated_measure(
                    measure,
                    [query_measures[measure] for query_measures in results.values()])) + '\n')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--qrel',
        default='/Users/woffee/www/emse-apiqa/QA2021/data/QA2021_stackoverflow4_qrel.txt')
    parser.add_argument(
        '--run',
        default='/Users/woffee/www/emse-apiqa/QA2021/data/pyltr_pred.txt')
    args = parser.parse_args()

    print("args.qrel:", args.qrel)
    print("args.run:", args.run)
    assert os.path.exists(args.qrel)
    assert os.path.exists(args.run)

    final_auc, final_accuracy = calc_auc(args.qrel, args.run)

    with open(args.qrel, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(args.run, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, pytrec_eval.supported_measures)
    results = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    total = len(results.items())
    sum_map = 0.0
    for query_id, query_measures in sorted(results.items()):
        for measure, value in sorted(query_measures.items()):
            # print_line(measure, query_id, value)
            pass

    # Scope hack: use query_measures of last item in previous loop to
    # figure out all unique measure names.
    #
    # TODO(cvangysel): add member to RelevanceEvaluator
    # with a list of measure names.
    print("==========")
    selected_measures = [
        'map', 'recip_rank', 'P_5', 'P_10', 'P_15', 'P_20',
        'recall_5', 'recall_10', 'recall_15', 'recall_20', 'ndcg'
    ]
    eva_values = {}
    for measure in selected_measures:
        eva_values[measure] = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])
        # print_line(measure, 'all', eva_values[measure])
    for measure in selected_measures:
        print_line(measure, 'all', eva_values[measure])

    print("%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f" %
          (final_auc, final_accuracy, eva_values['map'], eva_values['recip_rank'],
           eva_values['P_5'], eva_values['P_10'], eva_values['P_15'], eva_values['P_20'],
           eva_values['recall_5'], eva_values['recall_10'], eva_values['recall_15'],
           eva_values['recall_20'], eva_values['ndcg']))