Exemplo n.º 1
0
def evaluate(ranking, gold):
    for qid in gold:
        gold_sorted = sorted(gold[qid], key=itemgetter(2), reverse=True)
        pred_sorted = ranking[qid]
        pred_sorted = sorted(pred_sorted, key=itemgetter(2), reverse=True)

        gold[qid], ranking[qid] = [], []
        for i, row in enumerate(gold_sorted):
            relevant, gold_score, aid = row
            gold[qid].append((relevant, gold_score, aid))

            pred_score = pred_sorted[i][1]
            ranking[qid].append((relevant, pred_score, aid))

    for qid in gold:
        # Sort by IR score.
        gold_sorted = sorted(gold[qid], key=itemgetter(1), reverse=True)

        # Sort by SVM prediction score.
        pred_sorted = ranking[qid]
        pred_sorted = sorted(pred_sorted, key=itemgetter(1), reverse=True)

        gold[qid] = [rel for rel, score, aid in gold_sorted]
        ranking[qid] = [rel for rel, score, aid in pred_sorted]

    map_gold = metrics.map(gold, 10)
    map_pred = metrics.map(ranking, 10)
    return map_gold, map_pred
Exemplo n.º 2
0
def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred", 
                  format="trec",
                  th=50, 
                  verbose=False,
                  reranking_th=-100.0,
                  ignore_noanswer=False, ignore_allanswer=False):
	ir, svm = read_res_pred_files(res_fname, pred_fname, format, verbose, 
		                              reranking_th=reranking_th, 
		                              ignore_noanswer=ignore_noanswer,
								  	ignore_allanswer=ignore_allanswer)

	# evaluate IR
	prec_se = metrics.recall_of_1(ir, th)
	acc_se = metrics.accuracy(ir, th)
	acc_se1 = metrics.accuracy1(ir, th)
	acc_se2 = metrics.accuracy2(ir, th)

	# evaluate SVM
	prec_svm = metrics.recall_of_1(svm, th)
	acc_svm = metrics.accuracy(svm, th)
	acc_svm1 = metrics.accuracy1(svm, th)
	acc_svm2 = metrics.accuracy2(svm, th)

	mrr_se = metrics.mrr(ir, th)
	mrr_svm = metrics.mrr(svm, th)
	map_se = metrics.map(ir)
	map_svm = metrics.map(svm)

	avg_acc1_svm = metrics.avg_acc1(svm, th)
	avg_acc1_ir = metrics.avg_acc1(ir, th)

	'''
	print "%13s %5s" %("IR", "SVM")
	print "MRR: %5.2f %5.2f" %(mrr_se, mrr_svm)
	print "MAP: %5.4f %5.4f" %(map_se, map_svm)
	print "AvgRec: %5.2f %5.2f" %(avg_acc1_ir, avg_acc1_svm)
	print "%16s %6s  %14s %6s  %14s %6s  %12s %4s" % ("IR", "SVM", "IR", "SVM", "IR", "SVM", "IR", "SVM")
	'''
	rec1_se =-10
	rec1_svm = -10
	for i, (p_se, p_svm, a_se, a_svm, a_se1, a_svm1, a_se2, a_svm2) in enumerate(zip(prec_se, prec_svm, acc_se, acc_svm, acc_se1, acc_svm1, acc_se2, acc_svm2), 1):
		#print "REC-1@%02d: %6.2f %6.2f  ACC@%02d: %6.2f %6.2f  AC1@%02d: %6.2f %6.2f  AC2@%02d: %4.0f %4.0f" %(i, p_se, p_svm, i, a_se, a_svm, i, a_se1, a_svm1, i, a_se2, a_svm2)
		if (rec1_se<-5):
			rec1_se = p_se
			rec1_svm = p_svm


	'''
	print "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks were questions have at most one correct answer)"
	print "ACC   - accuracy, i.e. number of correct answers retrieved at rank @X normalized by the rank and the total number of questions"
	print "AC1   - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)"
	print "AC2   - the absolute number of correct answers at @X"
	'''

	print "Table view"
	print "	MRR	MAP	P@1"
	print "REF_FILE	%5.2f	%5.2f	%5.2f" % (mrr_se, map_se*100, rec1_se)
	print "SVM	%5.2f	%5.2f	%5.2f" % (mrr_svm, map_svm*100, rec1_svm)
Exemplo n.º 3
0
def eval_reranker(res_fname="svm.test.res",
                  pred_fname="svm.train.pred",
                  format="trec",
                  th=10,
                  verbose=False,
                  reranking_th=0.0,
                  ignore_noanswer=False):
    ir, svm, conf_matrix = read_res_pred_files(res_fname,
                                               pred_fname,
                                               format,
                                               verbose,
                                               reranking_th=reranking_th,
                                               ignore_noanswer=ignore_noanswer)
    # Calculate standard P, R, F1, Acc
    acc = 1.0 * (
        conf_matrix['true']['true'] + conf_matrix['false']['false']) / (
            conf_matrix['true']['true'] + conf_matrix['false']['false'] +
            conf_matrix['true']['false'] + conf_matrix['false']['true'])
    p = 0
    if (conf_matrix['true']['true'] + conf_matrix['false']['true']) > 0:
        p = 1.0 * (conf_matrix['true']['true']) / (
            conf_matrix['true']['true'] + conf_matrix['false']['true'])
    r = 0
    if (conf_matrix['true']['true'] + conf_matrix['true']['false']) > 0:
        r = 1.0 * (conf_matrix['true']['true']) / (
            conf_matrix['true']['true'] + conf_matrix['true']['false'])
    f1 = 0
    if (p + r) > 0:
        f1 = 2.0 * p * r / (p + r)

    # evaluate IR
    prec_se = metrics.recall_of_1(ir, th)
    acc_se = metrics.accuracy(ir, th)
    acc_se1 = metrics.accuracy1(ir, th)
    acc_se2 = metrics.accuracy2(ir, th)

    # evaluate SVM
    prec_svm = metrics.recall_of_1(svm, th)
    acc_svm = metrics.accuracy(svm, th)
    acc_svm1 = metrics.accuracy1(svm, th)
    acc_svm2 = metrics.accuracy2(svm, th)

    mrr_se = metrics.mrr(ir, th)
    mrr_svm = metrics.mrr(svm, th)
    map_se = metrics.map(ir, th)
    map_svm = metrics.map(svm, th)

    avg_acc1_svm = metrics.avg_acc1(svm, th)
    avg_acc1_ir = metrics.avg_acc1(ir, th)

    print "acc\tf1\tMAP\tMRR\tAvgRec"
    print "%.4f %4.4f %4.4f %4.4f %4.4f" % (acc, f1, map_svm, mrr_svm,
                                            avg_acc1_svm)
Exemplo n.º 4
0
def eval_search_engine(res_fname, format, th=50):
    ir = read_res_file(res_fname, format)

    # evaluate IR
    rec = metrics.recall_of_1(ir, th)
    acc = metrics.accuracy(ir, th)
    acc1 = metrics.accuracy1(ir, th)
    acc2 = metrics.accuracy2(ir, th)

    mrr = metrics.mrr(ir, th)

    # MAP
    map_ir = metrics.map(ir)

    print "%10s" % "IR"
    print "MRR: %5.2f" % mrr
    print "MAP: %5.2f" % map_ir
    for i, (r, a, a1, a2) in enumerate(zip(rec, acc, acc1, acc2), 1):
        print "REC-1@%02d: %6.2f  ACC@%02d: %6.2f  AC1@%02d: %6.2f  AC2@%02d: %4.0f" % (
            i, r, i, a, i, a1, i, a2)
    print
    print "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks were questions have at most one correct answer)"
    print "ACC   - accuracy, i.e. number of correct answers retrieved at rank @X normalized by the rank and the total number of questions"
    print "AC1   - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)"
    print "AC2   - the absolute number of correct answers at @X"
Exemplo n.º 5
0
def eval_search_engine(res_fname, format, th=50):
	ir = read_res_file(res_fname, format)		

	# evaluate IR
	rec = metrics.recall_of_1(ir, th)
	acc = metrics.accuracy(ir, th)
	acc1 = metrics.accuracy1(ir, th)
	acc2 = metrics.accuracy2(ir, th)

	mrr = metrics.mrr(ir, th)

  # MAP
	map_ir = metrics.map(ir)
  

	print "%10s" %"IR"
	print "MRR: %5.2f" % mrr
	print "MAP: %5.2f" % map_ir
	for i, (r, a, a1, a2) in enumerate(zip(rec, acc, acc1, acc2), 1):
		print "REC-1@%02d: %6.2f  ACC@%02d: %6.2f  AC1@%02d: %6.2f  AC2@%02d: %4.0f" %(i, r, i, a, i, a1, i, a2)
	print
	print "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks were questions have at most one correct answer)"
	print "ACC   - accuracy, i.e. number of correct answers retrieved at rank @X normalized by the rank and the total number of questions"
	print "AC1   - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)"
	print "AC2   - the absolute number of correct answers at @X"
Exemplo n.º 6
0
def eval_reranker(resPredIterable,
                  th=10,
                  reranking_th=0.0,
                  ignore_noanswer=False):
    ir, svm, conf_matrix = read_res_pred_files(resPredIterable, reranking_th=reranking_th,
                                      ignore_noanswer=ignore_noanswer)        
    # Calculate standard P, R, F1, Acc
    acc = 1.0 * (conf_matrix['true']['true'] + conf_matrix['false']['false']) / (conf_matrix['true']['true'] + conf_matrix['false']['false'] + conf_matrix['true']['false'] + conf_matrix['false']['true'])
    p = 0
    if (conf_matrix['true']['true'] + conf_matrix['false']['true']) > 0:
        p = 1.0 * (conf_matrix['true']['true']) / (conf_matrix['true']['true'] + conf_matrix['false']['true'])
    r = 0
    if (conf_matrix['true']['true'] + conf_matrix['true']['false']) > 0:
        r = 1.0 * (conf_matrix['true']['true']) / (conf_matrix['true']['true'] + conf_matrix['true']['false'])
    f1 = 0
    if (p + r) > 0:
        f1 = 2.0 * p * r / (p + r)

    mrr_svm = metrics.mrr(svm, th)
    map_svm = metrics.map(svm, th)

    scores = {}
    scores['map'] = map_svm
    scores['accuracy'] = acc
    scores['precision'] = p
    scores['recall'] = r
    scores['f1'] = f1
    scores['mrr'] = mrr_svm
    return scores
    def check_engine_quality(self, query_num, list_of_docs):
        """
        :param query_num:
        :param list_of_docs:
        :return: no return. prints metrics of the query. precision, recall, map.
        """

        benchmark_path = "data\\benchmark_lbls_train.csv"
        df = pd.read_csv(benchmark_path)

        df_prec = df[df['query'] == query_num]
        df_prec = df_prec[df_prec['tweet'].isin(list_of_docs)]
        dict_for_data = df_prec.set_index('tweet')['y_true'].to_dict()

        rmv_lst = []

        ranking = []
        # Add to list for rank
        for doc in list_of_docs:
            try:
                ranking.append(dict_for_data[int(doc)])
            except:
                rmv_lst.append(doc)
        for d in rmv_lst:
            list_of_docs.remove(d)

        data_df = pd.DataFrame({
            'query': query_num,
            'tweet': list_of_docs,
            'y_true': ranking
        })

        df_rec = df[df['query'] == query_num]
        recall_total = len(df_rec[df_rec['y_true'] == 1.0])

        # print("total Relevant doc found with tag 1 :" , len (data_df[data_df['y_true'] == 1.0]))
        # print("total NON relevant doc found with tag 0 :" , len (data_df[data_df['y_true'] == 0]))
        # print("found total of", len(df_prec), "tagged docs")
        # Calculate and print
        prec5 = metrics.precision_at_n(data_df, query_num, 5)
        prec10 = metrics.precision_at_n(data_df, query_num, 10)
        prec50 = metrics.precision_at_n(data_df, query_num, 50)
        prec_total = metrics.precision(data_df, True, query_number=query_num)
        map_of_query = metrics.map(data_df)
        recall_val = metrics.recall_single(data_df, recall_total, query_num)
        self.map_list.append(map_of_query)
        self.prec5_list.append(prec5)
        self.prec10_list.append(prec10)
        self.prec50_list.append(prec50)
        self.prec_total_list.append(prec_total)
        self.recall_list.append(recall_val)

        print()
        print("precision at 5 of query", query_num, "is :", prec5)
        print("precision at 10 of query", query_num, "is :", prec10)
        print("precision at 50 of query", query_num, "is :", prec50)
        print("precision of query", query_num, "is :", prec_total)
        print("recall of query", query_num, "is :", recall_val)
        print("map of query", query_num, "is :", map_of_query)
Exemplo n.º 8
0
def get_cv_evaluation_results(qid_aid_label_list, y_pred):
    predictions_dict = get_cv_ranked_predictions_dict(qid_aid_label_list,
                                                      y_pred)
    logging.debug("Num of questions: %d" % (len(predictions_dict)))

    mrr_score = m.mrr(predictions_dict, 1000)
    map_score = m.map(predictions_dict) * 100
    p1_score = m.recall_of_1(predictions_dict, 1000)[0]

    return mrr_score, map_score, p1_score
Exemplo n.º 9
0
def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred", 
                  format="trec",
                  th=50, 
                  verbose=False,
                  reranking_th=0.0,
                  ignore_noanswer=False):
	ir, svm = read_res_pred_files(res_fname, pred_fname, format, verbose, 
		                              reranking_th=reranking_th, 
		                              ignore_noanswer=ignore_noanswer)		
	# evaluate IR
	prec_se = metrics.recall_of_1(ir, th)
	acc_se = metrics.accuracy(ir, th)
	acc_se1 = metrics.accuracy1(ir, th)
	acc_se2 = metrics.accuracy2(ir, th)

	# evaluate SVM
	prec_svm = metrics.recall_of_1(svm, th)
	acc_svm = metrics.accuracy(svm, th)
	acc_svm1 = metrics.accuracy1(svm, th)
	acc_svm2 = metrics.accuracy2(svm, th)

	mrr_se = metrics.mrr(ir, th)
	mrr_svm = metrics.mrr(svm, th)
	map_se = metrics.map(ir)
	map_svm = metrics.map(svm)

	avg_acc1_svm = metrics.avg_acc1(svm, th)
	avg_acc1_ir = metrics.avg_acc1(ir, th)

	print "%13s %5s" %("IR", "SVM")
	print "MRR: %5.2f %5.2f" %(mrr_se, mrr_svm)
	print "MAP: %5.4f %5.4f" %(map_se, map_svm)
	print "AvgRec: %5.2f %5.2f" %(avg_acc1_ir, avg_acc1_svm)
	print "%16s %6s  %14s %6s  %14s %6s  %12s %4s" % ("IR", "SVM", "IR", "SVM", "IR", "SVM", "IR", "SVM")
	for i, (p_se, p_svm, a_se, a_svm, a_se1, a_svm1, a_se2, a_svm2) in enumerate(zip(prec_se, prec_svm, acc_se, acc_svm, acc_se1, acc_svm1, acc_se2, acc_svm2), 1):
		print "REC-1@%02d: %6.2f %6.2f  ACC@%02d: %6.2f %6.2f  AC1@%02d: %6.2f %6.2f  AC2@%02d: %4.0f %4.0f" %(i, p_se, p_svm, i, a_se, a_svm, i, a_se1, a_svm1, i, a_se2, a_svm2)
	print
	print "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks were questions have at most one correct answer)"
	print "ACC   - accuracy, i.e. number of correct answers retrieved at rank @X normalized by the rank and the total number of questions"
	print "AC1   - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)"
	print "AC2   - the absolute number of correct answers at @X"
Exemplo n.º 10
0
def get_evaluation_results(df,
                           y_pred,
                           skip_all_positives_and_all_negatives=True):
    predictions_dict = get_ranked_predictions_dict(
        df,
        y_pred,
        skip_all_positives_and_all_negatives=
        skip_all_positives_and_all_negatives)
    logging.debug("Num of questions: %d" % (len(predictions_dict)))

    mrr_score = m.mrr(predictions_dict, 1000)
    map_score = m.map(predictions_dict) * 100
    p1_score = m.recall_of_1(predictions_dict, 1000)[0]

    return mrr_score, map_score, p1_score
Exemplo n.º 11
0
def cal_score(ref_lines, probs):
    reranking_th = 0.0
    line_count = 0
    pred_lines = defaultdict(list)
    for ref_line in ref_lines:
        qid, aid, lbl = ref_line[0], ref_line[1], ref_line[2]
        pred_lines[qid].append((lbl, probs[line_count][0], aid))
        line_count += 1
    # for qid in pred_lines.keys():
    #     candidates = pred_lines[qid]
    #     if all(relevant == "false" for relevant, _, _ in candidates):
    #         del pred_lines[qid]
    for qid in pred_lines.keys():
        pred_sorted = pred_lines[qid]
        max_score = max([score for rel, score, aid in pred_sorted])
        if max_score >= reranking_th:
            pred_sorted = sorted(pred_sorted, key=itemgetter(1), reverse=True)
        pred_lines[qid] = [rel for rel, score, aid in pred_sorted]
    MAP = metrics.map(pred_lines, 10)
    MRR = metrics.mrr(pred_lines, 10)
    return MAP, MRR
Exemplo n.º 12
0
def test(engine, options):
    queries = pd.read_csv(os.path.join('data', 'queries_train.tsv'), sep='\t')
    bench_lbls = pd.read_csv(os.path.join('data', 'benchmark_lbls_train.csv'),
                             dtype={
                                 'query': int,
                                 'tweet': str,
                                 'y_true': int
                             })
    q2n_relevant = bench_lbls.groupby('query')['y_true'].sum().to_dict()
    queries_results = []
    q_times = []
    for i, row in queries.iterrows():
        q_id = row['query_id']
        q_keywords = row['keywords']
        start_time = time.time()
        q_n_res, q_res = engine.search(q_keywords, options['methods'])
        end_time = time.time()
        q_time = end_time - start_time
        q_times.append(q_time)
        queries_results.extend([(q_id, str(doc_id)) for doc_id in q_res])
        if q_time > 10:
            print(f'Query time exceeded: {options}')
    queries_results = pd.DataFrame(queries_results, columns=['query', 'tweet'])
    q_results_labeled = pd.merge(queries_results,
                                 bench_lbls,
                                 on=['query', 'tweet'],
                                 how='inner',
                                 suffixes=('_result', '_bench'))
    options['max_q_time'] = max(q_times)
    options['avg_q_time'] = sum(q_times) / len(q_times)
    options['MAP'] = metrics.map(q_results_labeled)
    options['precision'] = metrics.precision(q_results_labeled)
    options['precision@5'] = metrics.precision(
        q_results_labeled.groupby('query').head(5))
    options['precision@10'] = metrics.precision(
        q_results_labeled.groupby('query').head(10))
    options['precision@50'] = metrics.precision(
        q_results_labeled.groupby('query').head(50))
    options['recall'] = metrics.recall(q_results_labeled, q2n_relevant)
    save_to_csv(options)
Exemplo n.º 13
0
    def _run_evaluation_on_selected_users(self, recommender_object,
                                          usersToEvaluate):

        start_time = time.time()
        start_time_print = time.time()

        results_dict = {}

        for cutoff in self.cutoff_list:
            results_dict[cutoff] = create_empty_metrics_dict(
                self.n_items, self.n_users, recommender_object.URM_train,
                self.ignore_items_ID, self.ignore_users_ID, cutoff,
                self.diversity_object)

        n_users_evaluated = 0

        for test_user in usersToEvaluate:

            # Being the URM CSR, the indices are the non-zero column indexes
            relevant_items = self.get_user_relevant_items(test_user)

            n_users_evaluated += 1

            recommended_items = recommender_object.recommend(
                test_user,
                remove_seen_flag=self.exclude_seen,
                cutoff=self.max_cutoff,
                remove_top_pop_flag=False,
                remove_CustomItems_flag=self.ignore_items_flag)

            is_relevant = np.in1d(recommended_items,
                                  relevant_items,
                                  assume_unique=True)

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                is_relevant_current_cutoff = is_relevant[0:cutoff]
                recommended_items_current_cutoff = recommended_items[0:cutoff]

                results_current_cutoff[
                    EvaluatorMetrics.ROC_AUC.value] += roc_auc(
                        is_relevant_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.PRECISION.value] += precision(
                        is_relevant_current_cutoff, len(relevant_items))
                results_current_cutoff[
                    EvaluatorMetrics.RECALL.value] += recall(
                        is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.RECALL_TEST_LEN.
                                       value] += recall_min_test_len(
                                           is_relevant_current_cutoff,
                                           relevant_items)
                results_current_cutoff[EvaluatorMetrics.MAP.value] += map(
                    is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.MRR.value] += rr(
                    is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(
                    recommended_items_current_cutoff,
                    relevant_items,
                    relevance=self.get_user_test_ratings(test_user),
                    at=cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.HIT_RATE.
                    value] += is_relevant_current_cutoff.sum()
                results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(
                    is_relevant_current_cutoff)

                results_current_cutoff[
                    EvaluatorMetrics.NOVELTY.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(
                        recommended_items_current_cutoff, test_user)
                results_current_cutoff[
                    EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.
                    value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff)

                if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_SIMILARITY.
                        value].add_recommendations(
                            recommended_items_current_cutoff)

            if time.time() - start_time_print > 30 or n_users_evaluated == len(
                    self.usersToEvaluate):
                print(
                    "SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}"
                    .format(
                        n_users_evaluated, 100.0 * float(n_users_evaluated) /
                        len(self.usersToEvaluate),
                        time.time() - start_time,
                        float(n_users_evaluated) / (time.time() - start_time)))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_print = time.time()

        return results_dict, n_users_evaluated
Exemplo n.º 14
0
                                                 bench_lbls,
                                                 on=['query', 'tweet'],
                                                 how='inner',
                                                 suffixes=('_result',
                                                           '_bench'))
                    # q_results_labeled.rename(columns={'y_true': 'label'})
                    zero_recall_qs = [q_id for q_id, rel in q2n_relevant.items() \
                                      if metrics.recall_single(q_results_labeled, rel, q_id) == 0]
                    if len(zero_recall_qs) > 0:
                        logging.warning(
                            f"{engine_module}'s recall for the following queries was zero {zero_recall_qs}."
                        )

                if q_results_labeled is not None:
                    # test that MAP > 0
                    results_map = metrics.map(q_results_labeled)
                    logging.debug(
                        f"{engine_module} results have MAP value of {results_map}."
                    )
                    if results_map <= 0 or results_map > 1:
                        logging.error(
                            f'{engine_module} results MAP value is out of range (0,1).'
                        )

                    # test that the average across queries of precision,
                    # precision@5, precision@10, precision@50, and recall
                    # is in [0,1].
                    prec, p5, p10, p50, recall = \
                        metrics.precision(q_results_labeled), \
                        metrics.precision(q_results_labeled.groupby('query').head(5)), \
                        metrics.precision(q_results_labeled.groupby('query').head(10)), \
    relevant_items = test[test_user].indices
    if len(relevant_items) > 0:
        neval += 1
        #
        # TODO: Here you can write to file the recommendations for each user in the test split.
        # WARNING: there is a catch with the item idx!
        #
        # this will rank *all* items
        recommended_items = recommender.recommend(user_profile,
                                                  exclude_seen=True)
        # use this to have the *top-k* recommended items (warning: this can underestimate ROC-AUC for small k)
        # recommended_items = recommender.recommend(user_profile, k=at, exclude_seen=True)
        roc_auc_ += roc_auc(recommended_items, relevant_items)
        precision_ += precision(recommended_items, relevant_items, at=at)
        recall_ += recall(recommended_items, relevant_items, at=at)
        map_ += map(recommended_items, relevant_items, at=at)
        mrr_ += rr(recommended_items, relevant_items, at=at)
        ndcg_ += ndcg(recommended_items,
                      relevant_items,
                      relevance=test[test_user].data,
                      at=at)
roc_auc_ /= neval
precision_ /= neval
recall_ /= neval
map_ /= neval
mrr_ /= neval
ndcg_ /= neval

logger.info('Ranking quality')
logger.info('ROC-AUC: {:.4f}'.format(roc_auc_))
logger.info('Precision@{}: {:.4f}'.format(at, precision_))
Exemplo n.º 16
0
def stats_cv(path=".",  format="trec", prefix="svm", th=50, verbose=False):
  mrrs_se = []
  mrrs_svm = []
  abs_mrrs = []
  rel_mrrs = []

  maps_se = []
  maps_svm = []
  abs_maps = []
  rel_maps = []

  recalls1_se = []
  recalls1_svm = []
  abs_recalls = []
  rel_recalls = []

  oracle_mrrs = []
  oracle_maps = []
  oracle_recs1 = []

  num_folds = 0
  
  print "%13s %5s %7s %7s" %("IR", "SVM", "(abs)", "(rel)")
  for fold in sorted(os.listdir(path)):
    currentFold = os.path.join(path, fold)
    if not os.path.isdir(currentFold):
      continue
    if not fold.startswith("fold"):
      logging.warn("Directories containing CV folds should start with 'fold'")
      continue
    print fold

    # Relevancy file
    res_fname = os.path.join(currentFold, "%s.test.res" % prefix)
    if not os.path.exists(res_fname):
      logging.error("Relevancy file not found: %s", res_fname)
      sys.exit(1)

    # Predictions file
    pred_fname = os.path.join(currentFold, "%s.pred" % prefix)
    if not os.path.exists(pred_fname):
      logging.error("SVM prediction file not found: %s", pred_fname)
      sys.exit(1)

    try:
      ir, svm = read_res_pred_files(res_fname, pred_fname, format, verbose)   
    except:
      logging.error("Failed to process input files: %s %s", res_fname, pred_fname)
      logging.error("Check that the input file format is correct")
      sys.exit(1)

    # MRR
    mrr_se = metrics.mrr(ir, th) or 1
    mrr_svm = metrics.mrr(svm, th) 
    mrrs_se.append(mrr_se)
    mrrs_svm.append(mrr_svm) 

    # improvement
    abs_mrr_diff = mrr_svm - mrr_se
    rel_mrr_diff = (mrr_svm - mrr_se)*100/mrr_se
    abs_mrrs.append(abs_mrr_diff)
    rel_mrrs.append(rel_mrr_diff)

    print "MRR: %5.2f %5.2f %+6.2f%% %+6.2f%%" % (mrr_se, mrr_svm, abs_mrr_diff, rel_mrr_diff) 

    # MAP
    map_se = metrics.map(ir) or 1
    map_svm = metrics.map(svm)
    maps_se.append(map_se) 
    maps_svm.append(map_svm)

    # improvement
    abs_map_diff = map_svm - map_se
    rel_map_diff = (map_svm - map_se)*100/map_se
    abs_maps.append(abs_map_diff)
    rel_maps.append(rel_map_diff)
    print "MAP: %5.2f %5.2f %+6.2f%% %+6.2f%%" % (map_se, map_svm, abs_map_diff, rel_map_diff) 

    # Recall-of-1@1
    rec_se = metrics.recall_of_1(ir, th)[0] or 1
    rec_svm = metrics.recall_of_1(svm, th)[0]
    recalls1_se.append(rec_se)
    recalls1_svm.append(rec_svm)

    # improvement
    abs_rec_diff = rec_svm - rec_se
    rel_rec_diff = (rec_svm - rec_se)*100/rec_se
    abs_recalls.append(abs_rec_diff)
    rel_recalls.append(rel_rec_diff)

    print "P@1: %5.2f %5.2f %+6.2f%% %+6.2f%%" % (rec_se, rec_svm, abs_rec_diff, rel_rec_diff)   

    num_folds += 1

    '''
    mrr_oracle = metrics.oracle_mrr(ir, th)
    map_oracle = metrics.oracle_map(ir)
    prec_oracle = metrics.oracle_precision(ir, th)[0]
    rec1_oracle = metrics.oracle_recall_of_1(ir, th)[0]

    oracle_mrrs.append(mrr_oracle)
    oracle_maps.append(map_oracle)
    oracle_recs1.append(rec1_oracle)

    print "Oracle MRR: %5.2f, Oracle MAP: %5.2f, Oracle prec: %5.2f, Oracle rec@1: %5.2f" % (mrr_oracle, map_oracle, prec_oracle, rec1_oracle)
    '''
  # mrrs
  avg_mrr_se, std_mrr_se = mean_and_std(mrrs_se)
  avg_mrr_svm, std_mrr_svm = mean_and_std(mrrs_svm)
  avg_abs_impr_mrr, std_abs_impr_mrr = mean_and_std(abs_mrrs)
  avg_rel_impr_mrr, std_rel_impr_mrr = mean_and_std(rel_mrrs)
  #oracle_avg_mrr, std_oracle_avg_mrr = mean_and_std(oracle_mrrs)

  # maps
  avg_map_se, std_map_se = mean_and_std(maps_se)
  avg_map_svm, std_map_svm = mean_and_std(maps_svm)
  avg_abs_impr_map, std_abs_impr_map = mean_and_std(abs_maps)
  avg_rel_impr_map, std_rel_impr_map = mean_and_std(rel_maps)
  #oracle_avg_map, std_oracle_avg_map = mean_and_std(oracle_maps)

  # recall
  avg_rec1_se, std_rec1_se = mean_and_std(recalls1_se)  # se 
  avg_rec1_svm, std_rec1_svm = mean_and_std(recalls1_svm)  # svm
  avg_abs_impr_rec1, std_abs_impr_rec1 = mean_and_std(abs_recalls)  # absolute
  avg_rel_impr_rec1, std_rel_impr_rec1 = mean_and_std(rel_recalls)  # relative
  #oracle_avg_rec1, std_oracle_avg_rec1 = mean_and_std(oracle_recs1)

  FMT = u"%3s: %5.2f \u00B1 %4.2f %5.2f \u00B1 %4.2f %+6.2f%% \u00B1 %4.2f %+6.2f%% \u00B1 %4.2f"
  #ORACLE_FMT = u"Oracle MRR: %5.2f \u00B1 %4.2f, Oracle MAP: %5.2f \u00B1 %4.2f, Oracle P@1: %5.2f \u00B1 %4.2f"
  print
  print "Averaged over %s folds" % num_folds
  print "%17s %12s %14s %14s" %("IR", "SVM", "(abs)", "(rel)")
  print FMT % ("MRR", avg_mrr_se, std_mrr_se, avg_mrr_svm, std_mrr_svm, avg_abs_impr_mrr, std_abs_impr_mrr, avg_rel_impr_mrr, std_rel_impr_mrr)
  print FMT % ("MAP", avg_map_se, std_map_se, avg_map_svm, std_map_svm, avg_abs_impr_map, std_abs_impr_map, avg_rel_impr_map, std_rel_impr_map)
  print FMT % ("P@1", avg_rec1_se, std_rec1_se, avg_rec1_svm, std_rec1_svm, avg_abs_impr_rec1, std_abs_impr_rec1, avg_rel_impr_rec1, std_rel_impr_rec1)
Exemplo n.º 17
0
def stats_cv(path=".",
             format="trec",
             prefix="svm",
             th=50,
             suf="",
             verbose=False,
             truth_file=None,
             ignore_noanswer=False,
             cut_truth_map_at_N=None):
    mrrs_se = []
    mrrs_svm = []
    abs_mrrs = []
    rel_mrrs = []

    maps_se = []
    maps_svm = []
    abs_maps = []
    rel_maps = []

    recalls1_se = []
    recalls1_svm = []
    abs_recalls = []
    rel_recalls = []

    num_folds = 0
    truth = read_truth_file(truth_file, format, cut_truth_map_at_N)
    print "%13s %5s %7s %7s" % ("IR", "SVM", "(abs)", "(rel)")
    for fold in sorted(os.listdir(path)):
        currentFold = os.path.join(path, fold)
        if not os.path.isdir(currentFold):
            continue
        if not fold.startswith("fold"):
            logging.warn(
                "Directories containing CV folds should start with 'fold'")
            continue
        print fold

        # Relevancy file
        res_fname = os.path.join(currentFold, "%s.relevancy" % prefix)
        if not os.path.exists(res_fname):
            logging.error("Relevancy file not found: %s", res_fname)
            sys.exit(1)

        # Predictions file
        pred_fname = os.path.join(currentFold, "%s.pred" % (prefix + suf))
        if not os.path.exists(pred_fname):
            logging.error("SVM prediction file not found: %s", pred_fname)
            sys.exit(1)

        try:
            ir, svm = read_res_pred_files(res_fname,
                                          pred_fname,
                                          format,
                                          verbose,
                                          ignore_noanswer=ignore_noanswer,
                                          truth_map=truth)
        except:
            logging.error("Failed to process input files: %s %s", res_fname,
                          pred_fname)
            logging.error("Check that the input file format is correct")
            sys.exit(1)

        # MRR
        mrr_se = metrics.mrr(ir, th)
        mrr_svm = metrics.mrr(svm, th)
        mrrs_se.append(mrr_se)
        mrrs_svm.append(mrr_svm)

        # improvement
        abs_mrr_diff = mrr_svm - mrr_se
        rel_mrr_diff = (mrr_svm - mrr_se) * 100 / mrr_se
        abs_mrrs.append(abs_mrr_diff)
        rel_mrrs.append(rel_mrr_diff)

        print "MRR: %5.2f %5.2f %+6.2f%% %+6.2f%%" % (
            mrr_se, mrr_svm, abs_mrr_diff, rel_mrr_diff)

        # MAP
        map_se = metrics.map(ir)
        map_svm = metrics.map(svm)
        maps_se.append(map_se)
        maps_svm.append(map_svm)
        # improvement
        abs_map_diff = map_svm - map_se
        rel_map_diff = (map_svm - map_se) * 100 / map_se
        abs_maps.append(abs_map_diff)
        rel_maps.append(rel_map_diff)
        print "MAP: %5.2f %5.2f %+6.2f%% %+6.2f%%" % (
            map_se * 100, map_svm * 100, abs_map_diff, rel_map_diff)

        # Recall-of-1@1
        rec_se = metrics.recall_of_1(ir, th)[0]
        rec_svm = metrics.recall_of_1(svm, th)[0]
        recalls1_se.append(rec_se)
        recalls1_svm.append(rec_svm)

        # improvement
        abs_rec_diff = rec_svm - rec_se
        rel_rec_diff = (rec_svm - rec_se) * 100 / rec_se
        abs_recalls.append(abs_rec_diff)
        rel_recalls.append(rel_rec_diff)

        print "P@1: %5.2f %5.2f %+6.2f%% %+6.2f%%" % (
            rec_se, rec_svm, abs_rec_diff, rel_rec_diff)

        num_folds += 1

    # mrrs
    avg_mrr_se, std_mrr_se = mean_and_std(mrrs_se)
    avg_mrr_svm, std_mrr_svm = mean_and_std(mrrs_svm)
    avg_abs_impr_mrr, std_abs_impr_mrr = mean_and_std(abs_mrrs)
    avg_rel_impr_mrr, std_rel_impr_mrr = mean_and_std(rel_mrrs)

    # maps
    avg_map_se, std_map_se = mean_and_std(maps_se)
    avg_map_svm, std_map_svm = mean_and_std(maps_svm)
    avg_abs_impr_map, std_abs_impr_map = mean_and_std(abs_maps)
    avg_rel_impr_map, std_rel_impr_map = mean_and_std(rel_maps)

    # recall
    avg_rec1_se, std_rec1_se = mean_and_std(recalls1_se)  # se
    avg_rec1_svm, std_rec1_svm = mean_and_std(recalls1_svm)  # svm
    avg_abs_impr_rec1, std_abs_impr_rec1 = mean_and_std(
        abs_recalls)  # absolute
    avg_rel_impr_rec1, std_rel_impr_rec1 = mean_and_std(
        rel_recalls)  # relative

    FMT = u"%3s: %5.2f \u00B1 %4.2f %5.2f \u00B1 %4.2f %+6.2f%% \u00B1 %4.2f %+6.2f%% \u00B1 %4.2f"
    print
    print "Averaged over %s folds" % num_folds
    print "%17s %12s %14s %14s" % ("IR", "SVM", "(abs)", "(rel)")
    print FMT % ("MRR", avg_mrr_se, std_mrr_se, avg_mrr_svm, std_mrr_svm,
                 avg_abs_impr_mrr, std_abs_impr_mrr, avg_rel_impr_mrr,
                 std_rel_impr_mrr)
    print FMT % ("MAP", avg_map_se * 100, std_map_se, avg_map_svm * 100,
                 std_map_svm, avg_abs_impr_map, std_abs_impr_map,
                 avg_rel_impr_map, std_rel_impr_map)
    print FMT % ("P@1", avg_rec1_se, std_rec1_se, avg_rec1_svm, std_rec1_svm,
                 avg_abs_impr_rec1, std_abs_impr_rec1, avg_rel_impr_rec1,
                 std_rel_impr_rec1)
    print "Table view"
    print "	MRR	MAP	P@1"
    print u"IR	%5.2f\u00B1%4.2f	%5.2f\u00B1%4.2f	 %5.2f\u00B1%4.2f" % (
        avg_mrr_se, std_mrr_se, avg_map_se * 100, std_map_se * 100,
        avg_rec1_se, std_rec1_se)
    print u"SVM	%5.2f\u00B1%4.2f	%5.2f\u00B1%4.2f	 %5.2f\u00B1%4.2f" % (
        avg_mrr_svm, std_mrr_svm, avg_map_svm * 100, std_map_svm * 100,
        avg_rec1_svm, std_rec1_svm)
Exemplo n.º 18
0
def eval_reranker(res_fname="svm.test.res",
                  pred_fname="svm.train.pred",
                  format="trec",
                  th=10,
                  verbose=False,
                  reranking_th=0.0,
                  ignore_noanswer=False):
    ir, svm, conf_matrix = read_res_pred_files(res_fname,
                                               pred_fname,
                                               format,
                                               verbose,
                                               reranking_th=reranking_th,
                                               ignore_noanswer=ignore_noanswer)
    # Calculate standard P, R, F1, Acc
    acc = 1.0 * (
        conf_matrix['true']['true'] + conf_matrix['false']['false']) / (
            conf_matrix['true']['true'] + conf_matrix['false']['false'] +
            conf_matrix['true']['false'] + conf_matrix['false']['true'])
    p = 0
    if (conf_matrix['true']['true'] + conf_matrix['false']['true']) > 0:
        p = 1.0 * (conf_matrix['true']['true']) / (
            conf_matrix['true']['true'] + conf_matrix['false']['true'])
    r = 0
    if (conf_matrix['true']['true'] + conf_matrix['true']['false']) > 0:
        r = 1.0 * (conf_matrix['true']['true']) / (
            conf_matrix['true']['true'] + conf_matrix['true']['false'])
    f1 = 0
    if (p + r) > 0:
        f1 = 2.0 * p * r / (p + r)

    # evaluate IR
    prec_se = metrics.recall_of_1(ir, th)
    acc_se = metrics.accuracy(ir, th)
    acc_se1 = metrics.accuracy1(ir, th)
    acc_se2 = metrics.accuracy2(ir, th)

    # evaluate SVM
    prec_svm = metrics.recall_of_1(svm, th)
    acc_svm = metrics.accuracy(svm, th)
    acc_svm1 = metrics.accuracy1(svm, th)
    acc_svm2 = metrics.accuracy2(svm, th)

    mrr_se = metrics.mrr(ir, th)
    mrr_svm = metrics.mrr(svm, th)
    map_se = metrics.map(ir, th)
    map_svm = metrics.map(svm, th)

    avg_acc1_svm = metrics.avg_acc1(svm, th)
    avg_acc1_ir = metrics.avg_acc1(ir, th)

    print("")
    print("*** Official score (MAP for SYS): %5.4f" % (map_svm))
    print("")
    print("")
    print("******************************")
    print("*** Classification results ***")
    print("******************************")
    print("")
    print("Acc = %5.4f" % (acc))
    print("P   = %5.4f" % (p))
    print("R   = %5.4f" % (r))
    print("F1  = %5.4f" % (f1))
    print("")
    print("")
    print("********************************")
    print("*** Detailed ranking results ***")
    print("********************************")
    print("")
    print("IR  -- Score for the output of the IR system (baseline).")
    print("SYS -- Score for the output of the tested system.")
    print("")
    print("%13s %5s" % ("IR", "SYS"))
    print("MAP   : %5.4f %5.4f" % (map_se, map_svm))
    print("AvgRec: %5.4f %5.4f" % (avg_acc1_ir, avg_acc1_svm))
    print("MRR   : %6.2f %6.2f" % (mrr_se, mrr_svm))
    print("%16s %6s  %14s %6s  %14s %6s  %12s %4s" %
          ("IR", "SYS", "IR", "SYS", "IR", "SYS", "IR", "SYS"))
    for i, (p_se, p_svm, a_se, a_svm, a_se1, a_svm1, a_se2,
            a_svm2) in enumerate(
                zip(prec_se, prec_svm, acc_se, acc_svm, acc_se1, acc_svm1,
                    acc_se2, acc_svm2), 1):
        print(
            "REC-1@%02d: %6.2f %6.2f  ACC@%02d: %6.2f %6.2f  AC1@%02d: %6.2f %6.2f  AC2@%02d: %4.0f %4.0f"
            % (i, p_se, p_svm, i, a_se, a_svm, i, a_se1, a_svm1, i, a_se2,
               a_svm2))
    print()
    print(
        "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks where questions have at most one correct answer)"
    )
    print(
        "ACC   - accuracy, i.e., number of correct answers retrieved at rank @X normalized by the rank and the total number of questions"
    )
    print(
        "AC1   - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)"
    )
    print("AC2   - the absolute number of correct answers at @X")
Exemplo n.º 19
0
    def _run_evaluation_on_selected_users(self,
                                          recommender_object,
                                          usersToEvaluate,
                                          block_size=1000):

        start_time = time.time()
        start_time_print = time.time()

        results_dict = {}

        for cutoff in self.cutoff_list:
            results_dict[cutoff] = create_empty_metrics_dict(
                self.n_items, self.n_users, recommender_object.get_URM_train(),
                self.ignore_items_ID, self.ignore_users_ID, cutoff,
                self.diversity_object)

        n_users_evaluated = 0

        # Start from -block_size to ensure it to be 0 at the first block
        user_batch_start = 0
        user_batch_end = 0

        while user_batch_start < len(self.usersToEvaluate):

            user_batch_end = user_batch_start + block_size
            user_batch_end = min(user_batch_end, len(usersToEvaluate))

            test_user_batch_array = np.array(
                usersToEvaluate[user_batch_start:user_batch_end])
            user_batch_start = user_batch_end

            # Compute predictions for a batch of users using vectorization, much more efficient than computing it one at a time
            recommended_items_batch_list = recommender_object.recommend(
                test_user_batch_array,
                remove_seen_flag=self.exclude_seen,
                cutoff=self.max_cutoff,
                remove_top_pop_flag=False,
                remove_CustomItems_flag=self.ignore_items_flag)

            # Compute recommendation quality for each user in batch
            for batch_user_index in range(len(recommended_items_batch_list)):

                user_id = test_user_batch_array[batch_user_index]
                recommended_items = recommended_items_batch_list[
                    batch_user_index]

                # Being the URM CSR, the indices are the non-zero column indexes
                relevant_items = self.get_user_relevant_items(user_id)
                is_relevant = np.in1d(recommended_items,
                                      relevant_items,
                                      assume_unique=True)

                n_users_evaluated += 1

                for cutoff in self.cutoff_list:

                    results_current_cutoff = results_dict[cutoff]

                    is_relevant_current_cutoff = is_relevant[0:cutoff]
                    recommended_items_current_cutoff = recommended_items[
                        0:cutoff]

                    results_current_cutoff[
                        EvaluatorMetrics.ROC_AUC.value] += roc_auc(
                            is_relevant_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.PRECISION.value] += precision(
                            is_relevant_current_cutoff, len(relevant_items))
                    results_current_cutoff[
                        EvaluatorMetrics.RECALL.value] += recall(
                            is_relevant_current_cutoff, relevant_items)
                    results_current_cutoff[EvaluatorMetrics.RECALL_TEST_LEN.
                                           value] += recall_min_test_len(
                                               is_relevant_current_cutoff,
                                               relevant_items)
                    results_current_cutoff[EvaluatorMetrics.MAP.value] += map(
                        is_relevant_current_cutoff, relevant_items)
                    results_current_cutoff[EvaluatorMetrics.MRR.value] += rr(
                        is_relevant_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.NDCG.value] += ndcg(
                            recommended_items_current_cutoff,
                            relevant_items,
                            relevance=self.get_user_test_ratings(user_id),
                            at=cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.HIT_RATE.
                        value] += is_relevant_current_cutoff.sum()
                    results_current_cutoff[
                        EvaluatorMetrics.ARHR.value] += arhr(
                            is_relevant_current_cutoff)

                    results_current_cutoff[
                        EvaluatorMetrics.NOVELTY.value].add_recommendations(
                            recommended_items_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_GINI.
                        value].add_recommendations(
                            recommended_items_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.SHANNON_ENTROPY.
                        value].add_recommendations(
                            recommended_items_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.COVERAGE_ITEM.
                        value].add_recommendations(
                            recommended_items_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.COVERAGE_USER.
                        value].add_recommendations(
                            recommended_items_current_cutoff, user_id)
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.
                        value].add_recommendations(
                            recommended_items_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_HERFINDAHL.
                        value].add_recommendations(
                            recommended_items_current_cutoff)

                    if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                        results_current_cutoff[
                            EvaluatorMetrics.DIVERSITY_SIMILARITY.
                            value].add_recommendations(
                                recommended_items_current_cutoff)

                if time.time(
                ) - start_time_print > 30 or n_users_evaluated == len(
                        self.usersToEvaluate):
                    print(
                        "SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}"
                        .format(
                            n_users_evaluated,
                            100.0 * float(n_users_evaluated) /
                            len(self.usersToEvaluate),
                            time.time() - start_time,
                            float(n_users_evaluated) /
                            (time.time() - start_time)))

                    sys.stdout.flush()
                    sys.stderr.flush()

                    start_time_print = time.time()

        return results_dict, n_users_evaluated
Exemplo n.º 20
0
def eval_reranker(res_fname="svm.test.res",
                  pred_fname="svm.train.pred",
                  format="trec",
                  th=10,
                  verbose=False,
                  reranking_th=0.0,
                  ignore_noanswer=False):
    ir, svm, conf_matrix = read_res_pred_files(res_fname,
                                               pred_fname,
                                               format,
                                               verbose,
                                               reranking_th=reranking_th,
                                               ignore_noanswer=ignore_noanswer)
    # Calculate standard P, R, F1, Acc
    acc = 1.0 * (
        conf_matrix['true']['true'] + conf_matrix['false']['false']) / (
            conf_matrix['true']['true'] + conf_matrix['false']['false'] +
            conf_matrix['true']['false'] + conf_matrix['false']['true'])
    p = 0
    if (conf_matrix['true']['true'] + conf_matrix['false']['true']) > 0:
        p = 1.0 * (conf_matrix['true']['true']) / (
            conf_matrix['true']['true'] + conf_matrix['false']['true'])
    r = 0
    if (conf_matrix['true']['true'] + conf_matrix['true']['false']) > 0:
        r = 1.0 * (conf_matrix['true']['true']) / (
            conf_matrix['true']['true'] + conf_matrix['true']['false'])
    f1 = 0
    if (p + r) > 0:
        f1 = 2.0 * p * r / (p + r)

    # evaluate IR
    prec_se = metrics.recall_of_1(ir, th)
    acc_se = metrics.accuracy(ir, th)
    acc_se1 = metrics.accuracy1(ir, th)
    acc_se2 = metrics.accuracy2(ir, th)

    # evaluate SVM
    prec_svm = metrics.recall_of_1(svm, th)
    acc_svm = metrics.accuracy(svm, th)
    acc_svm1 = metrics.accuracy1(svm, th)
    acc_svm2 = metrics.accuracy2(svm, th)

    mrr_se = metrics.mrr(ir, th)
    mrr_svm = metrics.mrr(svm, th)
    map_se = metrics.map(ir, th)
    map_svm = metrics.map(svm, th)

    avg_acc1_svm = metrics.avg_acc1(svm, th)
    avg_acc1_ir = metrics.avg_acc1(ir, th)

    #print ""
    #print "*** Official score (MAP for SYS): %5.4f" %(map_svm)
    #print ""
    #print ""
    #print "******************************"
    #print "*** Classification results ***"
    #print "******************************"
    #print ""
    #print "Acc = %5.4f" %(acc)
    #print "P   = %5.4f" %(p)
    #print "R   = %5.4f" %(r)
    #print "F1  = %5.4f" %(f1)
    #print ""
    #print ""
    #print "********************************"
    #print "*** Detailed ranking results ***"
    #print "********************************"
    #print ""
    #print "IR  -- Score for the output of the IR system (baseline)."
    #print "SYS -- Score for the output of the tested system."
    #print ""
    #print "%13s %5s" %("IR", "SYS")
    #print "MAP   : %5.4f %5.4f" %(map_se, map_svm)
    #print "AvgRec: %5.4f %5.4f" %(avg_acc1_ir, avg_acc1_svm)
    #print "MRR   : %6.2f %6.2f" %(mrr_se, mrr_svm)
    print "MAP   : %5.4f\tMRR   : %5.4f\tAvgRec: %5.4f" % (map_svm, mrr_svm,
                                                           avg_acc1_svm)
    #print "Acc   : %5.4f" %(acc)
    #print "P     : %5.4f" %(p)
    #print "R     : %5.4f" %(r)
    #print "F1    : %5.4f" %(f1)
    """
Exemplo n.º 21
0
    def evaluateRecommender(self, recommender_object):
        """
        :param recommender_object: the trained recommender object, a Recommender subclass
        :param URM_test_list: list of URMs to test the recommender against, or a single URM object
        :param cutoff_list: list of cutoffs to be use to report the scores, or a single cutoff
        """

        results_dict = {}

        for cutoff in self.cutoff_list:
            results_dict[cutoff] = create_empty_metrics_dict(
                self.n_items, self.n_users, recommender_object.URM_train,
                self.ignore_items_ID, self.ignore_users_ID, cutoff,
                self.diversity_object)

        start_time = time.time()
        start_time_print = time.time()

        n_eval = 0

        self.__all_items = np.arange(0, self.n_items, dtype=np.int)
        self.__all_items = set(self.__all_items)

        if self.ignore_items_flag:
            recommender_object.set_items_to_ignore(self.ignore_items_ID)

        for test_user in self.usersToEvaluate:

            # Being the URM CSR, the indices are the non-zero column indexes
            relevant_items = self.get_user_relevant_items(test_user)

            n_eval += 1

            self.user_specific_remove_items(recommender_object, test_user)

            # recommended_items = recommender_object.recommend(np.array(test_user), remove_seen_flag=self.exclude_seen,
            #                                                  cutoff = self.max_cutoff, remove_top_pop_flag=False, remove_CustomItems_flag=self.ignore_items_flag)
            recommended_items = recommender_object.recommend(
                np.atleast_1d(test_user),
                remove_seen_flag=self.exclude_seen,
                cutoff=self.max_cutoff,
                remove_top_pop_flag=False,
                remove_CustomItems_flag=self.ignore_items_flag)

            recommended_items = np.array(recommended_items[0])

            recommender_object.reset_items_to_ignore()

            is_relevant = np.in1d(recommended_items,
                                  relevant_items,
                                  assume_unique=True)

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                is_relevant_current_cutoff = is_relevant[0:cutoff]
                recommended_items_current_cutoff = recommended_items[0:cutoff]

                results_current_cutoff[
                    EvaluatorMetrics.ROC_AUC.value] += roc_auc(
                        is_relevant_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.PRECISION.value] += precision(
                        is_relevant_current_cutoff, len(relevant_items))
                results_current_cutoff[
                    EvaluatorMetrics.RECALL.value] += recall(
                        is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.RECALL_TEST_LEN.
                                       value] += recall_min_test_len(
                                           is_relevant_current_cutoff,
                                           relevant_items)
                results_current_cutoff[EvaluatorMetrics.MAP.value] += map(
                    is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.MRR.value] += rr(
                    is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(
                    recommended_items_current_cutoff,
                    relevant_items,
                    relevance=self.get_user_test_ratings(test_user),
                    at=cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.HIT_RATE.
                    value] += is_relevant_current_cutoff.sum()
                results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(
                    is_relevant_current_cutoff)

                results_current_cutoff[
                    EvaluatorMetrics.NOVELTY.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(
                        recommended_items_current_cutoff, test_user)
                results_current_cutoff[
                    EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.
                    value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff)

                if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_SIMILARITY.
                        value].add_recommendations(
                            recommended_items_current_cutoff)

            if time.time() - start_time_print > 30 or n_eval == len(
                    self.usersToEvaluate):
                print(
                    "SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}"
                    .format(n_eval,
                            100.0 * float(n_eval) / len(self.usersToEvaluate),
                            time.time() - start_time,
                            float(n_eval) / (time.time() - start_time)))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_print = time.time()

        if (n_eval > 0):

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                for key in results_current_cutoff.keys():

                    value = results_current_cutoff[key]

                    if isinstance(value, Metrics_Object):
                        results_current_cutoff[key] = value.get_metric_value()
                    else:
                        results_current_cutoff[key] = value / n_eval

                precision_ = results_current_cutoff[
                    EvaluatorMetrics.PRECISION.value]
                recall_ = results_current_cutoff[EvaluatorMetrics.RECALL.value]

                if precision_ + recall_ != 0:
                    results_current_cutoff[EvaluatorMetrics.F1.value] = 2 * (
                        precision_ * recall_) / (precision_ + recall_)

        else:
            print(
                "WARNING: No users had a sufficient number of relevant items")

        if self.ignore_items_flag:
            recommender_object.reset_items_to_ignore()

        results_run_string = self.get_result_string(results_dict)

        return (results_dict, results_run_string)
Exemplo n.º 22
0
def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred",
                  format="trec",
                  th=10,
                  verbose=False,
                  reranking_th=0.0,
                  ignore_noanswer=False):
    ir, svm, conf_matrix = read_res_pred_files(res_fname, pred_fname, format, verbose,
                                      reranking_th=reranking_th,
                                      ignore_noanswer=ignore_noanswer)
    # Calculate standard P, R, F1, Acc
    acc = 1.0 * (conf_matrix['true']['true'] + conf_matrix['false']['false']) / (conf_matrix['true']['true'] + conf_matrix['false']['false'] + conf_matrix['true']['false'] + conf_matrix['false']['true'])
    p = 0
    if (conf_matrix['true']['true'] + conf_matrix['false']['true']) > 0:
        p = 1.0 * (conf_matrix['true']['true']) / (conf_matrix['true']['true'] + conf_matrix['false']['true'])
    r = 0
    if (conf_matrix['true']['true'] + conf_matrix['true']['false']) > 0:
        r = 1.0 * (conf_matrix['true']['true']) / (conf_matrix['true']['true'] + conf_matrix['true']['false'])
    f1 = 0
    if (p + r) > 0:
        f1 = 2.0 * p * r / (p + r)

    # evaluate IR
    prec_se = metrics.recall_of_1(ir, th)
    acc_se = metrics.accuracy(ir, th)
    acc_se1 = metrics.accuracy1(ir, th)
    acc_se2 = metrics.accuracy2(ir, th)

    # evaluate SVM
    prec_svm = metrics.recall_of_1(svm, th)
    acc_svm = metrics.accuracy(svm, th)
    acc_svm1 = metrics.accuracy1(svm, th)
    acc_svm2 = metrics.accuracy2(svm, th)

    mrr_se = metrics.mrr(ir, th)
    mrr_svm = metrics.mrr(svm, th)
    map_se = metrics.map(ir, th)
    map_svm = metrics.map(svm, th)

    avg_acc1_svm = metrics.avg_acc1(svm, th)
    avg_acc1_ir = metrics.avg_acc1(ir, th)

    print ("")
    print ("*** Official score (MAP for SYS): %5.4f" %(map_svm))
    print ("")
    print ("")
    print( "******************************")
    print( "*** Classification results ***")
    print( "******************************")
    print( "")
    print( "Acc = %5.4f" %(acc))
    print( "P   = %5.4f" %(p))
    print( "R   = %5.4f" %(r))
    print( "F1  = %5.4f" %(f1))
    print( "")
    print( "")
    print( "********************************")
    print( "*** Detailed ranking results ***")
    print( "********************************")
    print( "")
    print( "IR  -- Score for the output of the IR system (baseline).")
    print( "SYS -- Score for the output of the tested system.")
    print( "")
    print( "%13s %5s" %("IR", "SYS"))
    print( "MAP   : %5.4f %5.4f" %(map_se, map_svm))
    print( "AvgRec: %5.4f %5.4f" %(avg_acc1_ir, avg_acc1_svm))
    print( "MRR   : %6.2f %6.2f" %(mrr_se, mrr_svm))
    print( "%16s %6s  %14s %6s  %14s %6s  %12s %4s" % ("IR", "SYS", "IR", "SYS", "IR", "SYS", "IR", "SYS"))
    for i, (p_se, p_svm, a_se, a_svm, a_se1, a_svm1, a_se2, a_svm2) in enumerate(zip(prec_se, prec_svm, acc_se, acc_svm, acc_se1, acc_svm1, acc_se2, acc_svm2), 1):
        print( "REC-1@%02d: %6.2f %6.2f  ACC@%02d: %6.2f %6.2f  AC1@%02d: %6.2f %6.2f  AC2@%02d: %4.0f %4.0f" %(i, p_se, p_svm, i, a_se, a_svm, i, a_se1, a_svm1, i, a_se2, a_svm2))

    print( "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks where questions have at most one correct answer)")
    print( "ACC   - accuracy, i.e., number of correct answers retrieved at rank @X normalized by the rank and the total number of questions")
    print( "AC1   - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)")
    print( "AC2   - the absolute number of correct answers at @X")

    return map_svm
    def calcQueryEvaluation(q_res, num_q):
        queries_results = pd.DataFrame(q_res, columns=['query', 'tweet'])

        if bench_lbls is not None and len(queries_results) > 0:
            q_results_labeled = pd.merge(queries_results,
                                         bench_lbls,
                                         on=['query', 'tweet'],
                                         how='inner',
                                         suffixes=('_result', '_bench'))
        # q_results_labeled.rename(columns={'y_true': 'label'})
        zero_recall_qs = [q_id for q_id, rel in q2n_relevant.items() \
                          if metrics.recall_single(q_results_labeled, rel, q_id) == 0]
        if len(zero_recall_qs) > 0:
            logging.warning(
                f"{engine_module}'s recall for the following queries was zero {zero_recall_qs}."
            )

        if q_results_labeled is not None:
            # test that MAP > 0
            results_map = metrics.map(q_results_labeled)
            logging.debug(
                f"{engine_module} results have            MAP value of {results_map}."
            )
            if results_map <= 0 or results_map > 1:
                logging.error(
                    f'{engine_module} results MAP value is out of range (0,1).'
                )
            prec, p5, p10, p50, recall = \
                metrics.precision(q_results_labeled), \
                metrics.precision(q_results_labeled.groupby('query').head(5)), \
                metrics.precision(q_results_labeled.groupby('query').head(10)), \
                metrics.precision(q_results_labeled.groupby('query').head(50)), \
                metrics.recall_single(q_results_labeled, q2n_relevant[num_q], num_q)
            # logging.debug(f"{engine_module} on query {num_q} results produced average precision of {prec}.")
            # logging.debug(f"{engine_module} on query {num_q} results produced average precision@5 of {p5}.")
            # logging.debug(f"{engine_module} on query {num_q} results produced average precision@10 of {p10}.")
            # logging.debug(f"{engine_module} on query {num_q} results produced average precision@50 of {p50}.")
            # logging.debug(f"{engine_module} on query {num_q} results produced average recall of {recall}.")
            if prec < 0 or prec > 1:
                logging.error(
                    f"The average precision for {engine_module} is out of range [0,1]."
                )
            if p5 < 0 or p5 > 1:
                logging.error(
                    f"The average precision@5 for {engine_module} is out of range [0,1]."
                )
            if p5 < 0 or p5 > 1:
                logging.error(
                    f"The average precision@5 for {engine_module} is out of range [0,1]."
                )
            if p50 < 0 or p50 > 1:
                logging.error(
                    f"The average precision@50 for {engine_module} is out of range [0,1]."
                )
            if recall < 0 or recall > 1:
                logging.error(
                    f"The average recall for {engine_module} is out of range [0,1]."
                )
            precision_list.append(prec)
            precisiion_5.append(p5)
            precisiom_10.append(p10)
            presicion_50.append(p50)
            recall_list.append(recall)