## Assumed imports: mean and harmonic_mean as in the statistics module,
## defaultdict from collections, sys for sys.exit; in the original source
## these (and get_all_subtopics, get_best_subtopics, relevance_metrics)
## may be project-local helpers defined elsewhere in the module.
import sys
from collections import defaultdict
from statistics import mean, harmonic_mean


def reciprocal_rank_at_recall(run, label_store):
    scores_by_topic = dict()
    ## score for each topic
    for topic_id, results in run['results'].items():
        ## get all subtopics for the topic
        subtopic_ids = set(get_all_subtopics(label_store, topic_id))
        seen_subtopics = set()
        for idx, result in enumerate(results):
            assert idx == result['rank'] - 1
            # check off seen subtopics
            for subtopic, conf in get_best_subtopics(result['subtopics']):
                seen_subtopics.add(subtopic)
            if len(seen_subtopics) == len(subtopic_ids):
                break
        ## reciprocal of the rank where the loop stopped (full subtopic
        ## recall, or the end of the ranking)
        scores_by_topic[topic_id] = 1 / (idx + 1)
    ## macro average over all the topics
    macro_avg = mean(scores_by_topic.values())
    run['scores']['reciprocal_rank_at_recall'] = \
        {'scores_by_topic': scores_by_topic,
         'macro_average': macro_avg}
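## A minimal, self-contained sketch of what reciprocal_rank_at_recall
## computes for a single topic, on hypothetical data (the subtopic labels
## and ranking below are made up for illustration; the real scorer reads
## them from label_store via get_all_subtopics / get_best_subtopics).
def _rr_at_recall_example():
    all_subtopics = {'sub1', 'sub2'}
    # subtopics confirmed by each ranked result, in rank order
    ranked_subtopics = [{'sub1'}, {'sub1'}, {'sub2'}]
    seen = set()
    for idx, subs in enumerate(ranked_subtopics):
        seen.update(subs)
        if seen == all_subtopics:
            break
    # full subtopic recall is first reached at rank 3, so the score is 1/3
    return 1 / (idx + 1)

assert abs(_rr_at_recall_example() - 1 / 3) < 1e-9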
def precision_at_recall(run, label_store):
    scores_by_topic = dict()
    ## score for each topic
    for topic_id, results in run['results'].items():
        ## get all subtopics for the topic
        subtopic_ids = set(get_all_subtopics(label_store, topic_id))
        seen_subtopics = set()
        relevant_docs = 0
        for idx, result in enumerate(results):
            assert idx == result['rank'] - 1
            result_subtopics = \
                {subtopic for subtopic, conf
                 in get_best_subtopics(result['subtopics'])}
            if result['on_topic']:
                relevant_docs += 1
            seen_subtopics.update(result_subtopics)
            if len(seen_subtopics) == len(subtopic_ids):
                break
        ## precision is the fraction of documents that are relevant
        ## at the stopping point
        p = relevant_docs / (idx + 1)
        scores_by_topic[topic_id] = p
    ## macro average over all the topics
    macro_avg = mean(scores_by_topic.values())
    run['scores']['precision_at_recall'] = \
        {'scores_by_topic': scores_by_topic,
         'macro_average': macro_avg}
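## A minimal sketch of precision_at_recall on the same hypothetical data:
## precision over the prefix of the ranking needed to cover every subtopic
## (here ranks 1-3, of which two results are on-topic, giving 2/3).
## The on_topic flags are made up for illustration.
def _precision_at_recall_example():
    all_subtopics = {'sub1', 'sub2'}
    # (subtopics confirmed, on_topic flag) for each ranked result
    ranked = [({'sub1'}, True), (set(), False), ({'sub2'}, True)]
    seen = set()
    relevant = 0
    for idx, (subs, on_topic) in enumerate(ranked):
        if on_topic:
            relevant += 1
        seen.update(subs)
        if seen == all_subtopics:
            break
    return relevant / (idx + 1)

assert abs(_precision_at_recall_example() - 2 / 3) < 1e-9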
def average_err(run, label_store, mean_type='arithmetic',
                relevance_metric='graded'):
    '''
    mean_type can be 'arithmetic' or 'harmonic'
    '''
    scores_by_topic = dict()
    relevance_func = relevance_metrics[relevance_metric]
    ## score for each topic
    for topic_id, results in run['results'].items():
        ## get all subtopics for the topic
        subtopic_ids = list(set(get_all_subtopics(label_store, topic_id)))
        # for each subtopic, track a running continuation probability
        # and an ERR score
        p_continue = defaultdict(lambda: 1)
        score = defaultdict(float)
        for idx, result in enumerate(results):
            assert idx == result['rank'] - 1
            for subtopic, conf in get_best_subtopics(result['subtopics']):
                rel = relevance_func(conf)
                p_stop_here = p_continue[subtopic] * rel
                score[subtopic] += p_stop_here / (idx + 1)
                ## update stopping probabilities
                p_continue[subtopic] *= (1 - rel)
        ## aggregate the per-subtopic ERR scores for the topic
        if mean_type == 'arithmetic':
            scores_by_topic[topic_id] = mean(score.values())
        elif mean_type == 'harmonic':
            scores_by_topic[topic_id] = harmonic_mean(score.values())
        else:
            sys.exit('Error: invalid mean type specified.')
    ## macro average over all the topics
    macro_avg = mean(scores_by_topic.values())
    scorer_name = 'average_err_%s' % mean_type
    run['scores'][scorer_name] = \
        {'scores_by_topic': scores_by_topic,
         'macro_average': macro_avg}
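## A minimal sketch of the per-subtopic ERR cascade that average_err then
## averages over subtopics: ERR = sum_r (1/r) * R_r * prod_{i<r} (1 - R_i),
## where R_r is the graded relevance in [0, 1] that relevance_func assigns
## to the result at rank r. The relevance grades below are hypothetical.
def _err_example(relevances):
    p_continue = 1.0
    err = 0.0
    for idx, rel in enumerate(relevances):
        # probability the user stops at this rank, discounted by rank
        err += p_continue * rel / (idx + 1)
        # probability the user keeps reading past this rank
        p_continue *= (1 - rel)
    return err

# a highly relevant hit at rank 1 dominates; later hits contribute little
assert abs(_err_example([0.9, 0.5, 0.1])
           - (0.9 + 0.1 * 0.5 / 2 + 0.1 * 0.5 * 0.1 / 3)) < 1e-9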