def plotDifferenceWith(targetRun, otherRuns, qrels, measure, outputFile, style=PLOT_STYLE):
    """Plots, for each run in otherRuns, its per-topic difference in measure
    from targetRun, and saves the chart into outputFile."""
    avg_baseline, baseline_scores = pytrec_eval.evaluate(targetRun, qrels, measure, True)
    bar_chart = pygal.Bar()
    bar_chart.style = style
    allTopics = list(qrels.getTopicIds())
    bar_chart.label_font_size = 8
    bar_chart.legend_at_bottom = True
    bar_chart.legend_font_size = 10
    bar_chart.legend_box_size = 8
    bar_chart.x_label_rotation = 90
    bar_chart.x_labels = allTopics
    bar_chart.x_title = 'Topic Id'
    bar_chart.y_title = ('Difference from ' + targetRun.name + ' (' +
                         pytrec_eval.METRICS_NAMES[measure] + ')')
    for otherRun in otherRuns:
        _, other_scores = pytrec_eval.evaluate(otherRun, qrels, measure, True)
        # Per-topic difference from the baseline; a topic missing from either run counts as 0.
        points = [(other_scores[topicId] if topicId in other_scores else 0) -
                  (baseline_scores[topicId] if topicId in baseline_scores else 0)
                  for topicId in allTopics]
        bar_chart.add(otherRun.name, points)
    bar_chart.render_to_file(outputFile)
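# Usage sketch for plotDifferenceWith (not from the original code): it assumes it runs in the
# same module as the function above, that pytrec_eval exposes the TrecRun/QRels loaders and the
# ndcg measure used elsewhere in this file, and that the file paths below are hypothetical.
import pytrec_eval

baseline = pytrec_eval.TrecRun('runs/baseline.txt')        # hypothetical run file
variants = [pytrec_eval.TrecRun('runs/variant_a.txt'),     # hypothetical run files
            pytrec_eval.TrecRun('runs/variant_b.txt')]
qrels = pytrec_eval.QRels('qrels/topics.qrels')            # hypothetical qrels file

# One bar series per variant, showing its per-topic gain/loss in NDCG over the baseline;
# pygal writes the chart as an SVG file.
plotDifferenceWith(baseline, variants, qrels, pytrec_eval.ndcg, 'diff_vs_baseline.svg')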
def plotEvaluation(trecRun, qrels, measure, outputFile=None, showPlot=True):
    """ Plots a histogram with one bar per topic. Each bar represents the difference
    between the measure computed on that topic and the average measure over all topics.
    If outputFile is a string naming a file, the plot is saved into that file.
    If showPlot is True, the plot is shown to the user (but not necessarily stored into a file). """
    plt.clf()
    avg, details = pytrec_eval.evaluate(trecRun, qrels, measure, True)
    # sort by topic id so that labels and scores stay aligned
    lstDetails = [(qId, score) for qId, score in details.items()]
    lstDetails.sort(key=lambda x: x[0])
    qIds = [qId for qId, _ in lstDetails]
    scores = [score - avg for _, score in lstDetails]
    plt.figure(1)
    x = list(range(len(qIds)))
    plt.bar(x, scores, width=0.6)
    plt.xticks(x, qIds, rotation=90, size=5)
    plt.xlim(right=len(qIds))  # `xmax=` is no longer accepted by current Matplotlib
    plt.xlabel('Topic Id')
    plt.ylabel('Difference of ' + pytrec_eval.METRICS_NAMES[measure] + ' from Average')
    # save before show(): with some backends the figure is empty once the window is closed
    if outputFile is not None:
        plt.savefig(outputFile, bbox_inches=0)
    if showPlot:
        plt.show()
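# Usage sketch for the matplotlib variant above (not from the original code); the paths are
# hypothetical and pytrec_eval.ndcg is assumed to be a valid measure, as in the last snippet
# of this file.
import pytrec_eval

run = pytrec_eval.TrecRun('runs/my_system.txt')     # hypothetical run file
qrels = pytrec_eval.QRels('qrels/topics.qrels')     # hypothetical qrels file

# Save the per-topic deviation-from-average chart to a file without opening a window.
plotEvaluation(run, qrels, pytrec_eval.ndcg, outputFile='per_topic_ndcg.png', showPlot=False)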
def plotEvaluationAll(trecRuns, qrels, measure, outputFile, style=PLOT_STYLE):
    """ Plots a histogram with one bar per topic and one series per run. Each bar shows
    the measure computed on that topic (0 if the run has no score for the topic).
    OutputFile is a string specifying the name of the file the plot is saved into. """
    qIds = list(qrels.getTopicIds())
    qIds.sort()
    bar_chart = pygal.Bar()
    # bar_chart.spacing = 50
    bar_chart.label_font_size = 8
    bar_chart.style = style
    bar_chart.x_label_rotation = 90
    bar_chart.x_labels = qIds
    bar_chart.x_title = 'Topic Id'
    bar_chart.legend_at_bottom = True
    bar_chart.legend_font_size = 10
    bar_chart.legend_box_size = 8
    bar_chart.y_title = pytrec_eval.METRICS_NAMES[measure]
    for trecRun in trecRuns:
        avg, details = pytrec_eval.evaluate(trecRun, qrels, measure, True)
        lstDetails = [details[topicId] if topicId in details else 0 for topicId in qIds]
        bar_chart.add(trecRun.name, lstDetails)
    bar_chart.render_to_file(outputFile)
def plotEvaluation(trecRun, qrels, measure, outputFile, style=PLOT_STYLE):
    """ Plots a histogram with one bar per topic. Each bar shows the measure computed
    on that topic. OutputFile is a string specifying the name of the file the plot
    is saved into. """
    avg, details = pytrec_eval.evaluate(trecRun, qrels, measure, True)
    bar_chart = pygal.Bar()
    bar_chart.style = style
    # sort by topic id so that labels and scores stay aligned
    lstDetails = [(qId, score) for qId, score in details.items()]
    lstDetails.sort(key=lambda x: x[0])
    qIds = [qId for qId, _ in lstDetails]
    scores = [score for _, score in lstDetails]
    bar_chart.add(trecRun.name, scores)
    bar_chart.label_font_size = 8
    bar_chart.legend_at_bottom = True
    bar_chart.legend_font_size = 10
    bar_chart.legend_box_size = 8
    bar_chart.x_label_rotation = 90
    bar_chart.x_labels = qIds
    bar_chart.x_title = 'query ids'
    bar_chart.y_title = pytrec_eval.METRICS_NAMES[measure]
    bar_chart.render_to_file(outputFile)
def rankRuns(runs, qrels, measure):
    """Ranks the runs based on measure.
    Returns a list of pairs (run, score) ordered by score descending.
    """
    rank = [(run, pytrec_eval.evaluate(run, qrels, [measure])[0]) for run in runs]
    rank.sort(key=lambda x: x[1], reverse=True)
    return rank
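# Usage sketch for rankRuns (not from the original code): the run files are hypothetical and
# pytrec_eval.ndcg is assumed to be a valid measure, as in the last snippet of this file.
import pytrec_eval

runs = [pytrec_eval.TrecRun(path) for path in
        ('runs/bm25.txt', 'runs/rm3.txt', 'runs/sdm.txt')]   # hypothetical run files
qrels = pytrec_eval.QRels('qrels/topics.qrels')              # hypothetical qrels file

for run, score in rankRuns(runs, qrels, pytrec_eval.ndcg):
    print(run.name, score)   # best average score first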
def ttest(victim_run, allTheOther_runs, qrels, metric):
    """ Computes a t-test between victim_run and each run in allTheOther_runs,
    using the relevance judgements in qrels to compute the specified metric.
    Returns a dictionary d[otherRunName] = p-value """
    victimAvg, victimDetails = evaluation.evaluate(victim_run, qrels, metric, True)
    # read the per-topic scores always in the same order
    keyList = list(victimDetails.keys())
    victimScores = [victimDetails[k] for k in keyList]
    result = {}
    for othertrun in allTheOther_runs:
        otherAvg, otherDetails = evaluation.evaluate(othertrun, qrels, metric, True)
        otherScores = [otherDetails[k] for k in keyList]
        # independent two-sample t-test over the per-topic scores
        _, p = stats.ttest_ind(victimScores, otherScores)
        result[othertrun.name] = p
    return result
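# Usage sketch for ttest (not from the original code). Like the function above, it assumes
# `from scipy import stats` and an `evaluation` module with the same evaluate() signature as
# pytrec_eval.evaluate; the run and qrels paths are hypothetical. Note that stats.ttest_ind
# treats the two per-topic score lists as independent samples.
import pytrec_eval

victim = pytrec_eval.TrecRun('runs/my_system.txt')           # hypothetical run file
rivals = [pytrec_eval.TrecRun('runs/baseline_a.txt'),        # hypothetical run files
          pytrec_eval.TrecRun('runs/baseline_b.txt')]
qrels = pytrec_eval.QRels('qrels/topics.qrels')              # hypothetical qrels file

for name, p in ttest(victim, rivals, qrels, pytrec_eval.ndcg).items():
    print(name, p, '(significant at 0.05)' if p < 0.05 else '')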
from pytrec_eval import TrecRun
from pytrec_eval import QRels
from pytrec_eval import evaluate
from pytrec_eval import metrics
from pytrec_eval import precisionAt

run = TrecRun("/home/smsarwar/PycharmProjects/deep-siamese-text-similarity/results/results_multitask.txt")
qrel = QRels('/home/smsarwar/PycharmProjects/deep-siamese-text-similarity/results/qrel.txt')

print(qrel)
print(evaluate(run, qrel, metrics.recall))
print(evaluate(run, qrel, metrics.precisionAt(10)))
print(evaluate(run, qrel, metrics.avgPrec))
# print(metrics.recall(run, qrel, detailed=True))
# Fragment of what appears to be a sweep over the interpolation weights lambda1 and lambda2.
# The imports (pytrec_eval, time) and the names lambda1, lambda2, topic_number, scores,
# sdm_docs, get_scores(), normalize(), temp_results, best_score, best_results, output,
# start_load and start_project come from the enclosing script.
sdm_doc_score_map = get_scores(sdm_docs[str(topic_number)])
cur_topic_subgraph = scores[str(topic_number)]
doc_count = 1
for doc, score in sorted(cur_topic_subgraph.items(), key=lambda item: item[1], reverse=True):
    sdm_score = float(sdm_doc_score_map[doc])
    try:
        centrality_score = normalize(score, cur_topic_subgraph.values())
    except:
        centrality_score = 0  # fall back to 0 if normalization fails
    combined_score = lambda1 * sdm_score + lambda2 * centrality_score
    temp_results.append(str(topic_number) + " Q0 " + doc + " " + str(doc_count) +
                        " " + str(combined_score) + " STANDARD")
    doc_count += 1

# Write the interpolated run in TREC format and evaluate it with NDCG.
with open("temp_file.test", "w") as outfile:
    outfile.write("\n".join(temp_results))

run = pytrec_eval.TrecRun('temp_file.test')
qrels = pytrec_eval.QRels('qrels_file.test')
curr_result = pytrec_eval.evaluate(run, qrels, [pytrec_eval.ndcg])[0]

# Keep the best-scoring weight combination seen so far.
if curr_result > best_score:
    if best_results is not None:
        best_results.clear()
    best_score = curr_result
    best_results = list(temp_results)

print("Run completed with lambda1=" + str(lambda1) + ", lambda2=" + str(lambda2) +
      " and NDCG=" + str(curr_result) + ". Took: " + str(time() - start_load) + " s")

for result in best_results:
    output.append(result)

with open("results_file.test", "w") as outfile:
    outfile.write("\n".join(output))

print("Results took %.2f seconds to run." % (time() - start_project))
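# A minimal, self-contained sketch (not from the original code) of one way to enumerate the
# (lambda1, lambda2) pairs that drive the fragment above; the 0.1 step and the constraint
# lambda1 + lambda2 == 1 are assumptions, not taken from the original script.
def weight_grid(step=0.1):
    """Yield (lambda1, lambda2) pairs on a grid with lambda1 + lambda2 == 1."""
    n = int(round(1.0 / step))
    for i in range(n + 1):
        lambda1 = round(i * step, 10)
        yield lambda1, round(1.0 - lambda1, 10)

for lambda1, lambda2 in weight_grid():
    print(lambda1, lambda2)   # feed each pair into the re-scoring loop above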