def generate_reward(gold_index_list, answer_index_list):
    """Score a predicted ranking against graded gold labels.

    gold_index_list: graded relevance per candidate, indexed by the candidate's
        position in the reference list.
    answer_index_list: those positions in the order the model ranked them.
    Returns [reward, AP, MRR, NDCG@min(10, size), DCG@5,
             P@{1,3,5,10}, NDCG@{1,3,5,10}].
    """
    reward = 0
    ap = 0
    reciprocal_rank = 0
    answer_list = list(answer_index_list)
    size = len(answer_index_list)
    true = sum(gold_index_list > 0)  # relevant candidates overall
    # relevance of each returned candidate, in the model's ranking order
    inp = np.zeros(size)
    for rank, val in enumerate(gold_index_list):
        if val and rank in answer_list:
            inp[answer_list.index(rank)] = val
    maxk = sum(inp > 0)  # relevant candidates actually returned
    if true:
        # AP over the returned list, scaled by the fraction of relevant items found
        ap = average_precision(inp) * (maxk / true)
    reciprocal_rank = mean_reciprocal_rank([inp])
    ndcg = ndcg_at_k(inp, min(10, size))
    dcg_five = dcg_at_k(inp, 5)
    reward = (ap + reciprocal_rank + ndcg + dcg_five) / 4
    ranks = [1, 3, 5, 10]
    reward_tuple = [reward, ap, reciprocal_rank, ndcg, dcg_five]
    for r in ranks:
        reward_tuple.append(precision_at_k(inp, min(r, len(inp))))
    for r in ranks:
        reward_tuple.append(ndcg_at_k(inp, min(r, len(inp))))
    return reward_tuple
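A usage sketch (not from the original source), assuming numpy and the rank_metrics helpers used above (average_precision, mean_reciprocal_rank, ndcg_at_k, dcg_at_k, precision_at_k) are importable:

import numpy as np

gold = np.array([0, 2, 0, 1, 0])   # graded relevance per candidate
answer = [1, 3, 0, 2, 4]           # candidates in the model's ranking order
scores = generate_reward(gold, answer)
# scores = [reward, AP, MRR, NDCG, DCG@5,
#           P@1, P@3, P@5, P@10, NDCG@1, NDCG@3, NDCG@5, NDCG@10]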
Example #2
def evalResults(results, trueRelevance, noveltyList, trainModelIDs, rev_dict,
                uid, alg, params, rec, outFile, diversity, novelty):
    params = [str(i) for i in params]
    #calculate rating precision
    mmScaler = MinMaxScaler(copy=True)
    results = mmScaler.fit_transform(results.reshape(-1, 1))
    results = results.reshape((-1, ))
    r2Sc = r2_score(trueRelevance, results)
    mae = mean_absolute_error(trueRelevance, results)

    #calculate ranking scores
    idx = (-results).argsort()

    if diversity == "yes":
        reranked = mmr_sorted(range(len(results)), 0.8, results, rev_dict, 10)
        idx1 = [k for k, v in reranked.items()]
        idx2 = [i for i in idx if i not in idx1]
        idx1.extend(idx2)
        idx = idx1

    rankedRelevance = trueRelevance[idx]
    rankedNovelty = noveltyList[idx]

    #print(rankedRelevance)

    map = rank_metrics.average_precision(rankedRelevance)
    aucSc = roc_auc_score(trueRelevance, results)
    nDCG10 = rank_metrics.ndcg_at_k(rankedRelevance, 10)
    nDCG100 = rank_metrics.ndcg_at_k(rankedRelevance, 100)
    nDCG = rank_metrics.ndcg_at_k(rankedRelevance, len(rankedRelevance))

    p5 = prec_at_n(rankedRelevance, 5)
    r5 = rec_at_n(rankedRelevance, 5)
    n5 = meanNovelty_at_n(rankedNovelty, 5)
    un5 = user_novelty_at_n(idx, trainModelIDs, 5)
    ild5 = ild_at_n(idx, rev_dict, 5)
    p10 = prec_at_n(rankedRelevance, 10)
    r10 = rec_at_n(rankedRelevance, 10)
    n10 = meanNovelty_at_n(rankedNovelty, 10)
    ild10 = ild_at_n(idx, rev_dict, 10)
    un10 = user_novelty_at_n(idx, trainModelIDs, 10)

    mrr = rank_metrics.mean_reciprocal_rank([rankedRelevance])

    #print((uid, alg, ",".join(params), rec, r2Sc, mae, map, aucSc, mrr, p5, p10, r5, r10, nDCG10, nDCG100, nDCG))

    txt = "%s;%s;%s;%s;%s;%s;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f;%.6f\n" % (
        uid, alg, ",".join(params), rec, diversity, novelty, r2Sc, mae, map,
        aucSc, mrr, p5, p10, r5, r10, nDCG10, nDCG100, nDCG, n5, n10, un5,
        un10, ild5, ild10)
    outFile.write(txt)
    return (r2Sc, mae, map, aucSc, mrr, p5, p10, r5, r10, nDCG10, nDCG100,
            nDCG, n5, n10, ild5, ild10)
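mmr_sorted above is external to this snippet; judging from the call mmr_sorted(range(len(results)), 0.8, results, rev_dict, 10), it returns an ordered mapping of the ten re-ranked item indices. A hypothetical sketch of a compatible greedy Maximal Marginal Relevance re-ranker, assuming item feature vectors for the redundancy term (the names and the cosine-similarity choice are assumptions, not the project's code):

from collections import OrderedDict
import numpy as np

def mmr_sorted_sketch(candidates, lambda_, scores, item_vectors, k):
    # greedy MMR: trade relevance (scores) off against similarity to items already picked
    def cos(i, j):
        a = np.asarray(item_vectors[i], dtype=float)
        b = np.asarray(item_vectors[j], dtype=float)
        return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))

    selected = OrderedDict()
    remaining = set(candidates)
    while remaining and len(selected) < k:
        best, best_val = None, -np.inf
        for i in remaining:
            redundancy = max((cos(i, j) for j in selected), default=0.0)
            mmr = lambda_ * scores[i] - (1 - lambda_) * redundancy
            if mmr > best_val:
                best, best_val = i, mmr
        selected[best] = best_val
        remaining.remove(best)
    return selected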
Example #3
def generate_reward(gold_index_list, answer_index_list, reward_type):
    reward = 0
    ap = 0.
    reciprocal_rank = 0
    answer_list = list(answer_index_list)
    size = len(answer_index_list)
    true = sum(gold_index_list > 0)
    inp = np.zeros(size)
    for rank, val in enumerate(gold_index_list):
        if val and rank in answer_list:
            inp[answer_list.index(rank)] = val
    maxk = sum(inp > 0)
    if true:
        ap = average_precision(inp) * (maxk / true)
    reciprocal_rank = mean_reciprocal_rank([inp])
    ndcg = ndcg_at_k(inp, min(10, size))
    dcg_five = dcg_at_k(inp, 5)
    reward = rewards[reward_type - 1](inp, ap, reciprocal_rank, ndcg, dcg_five)
    return reward, ap, reciprocal_rank, ndcg, dcg_five
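The rewards table indexed on the last line is defined elsewhere in the original module. A hypothetical stand-in matching the call signature; a later generate_reward variant in this listing builds rewards = [(ap + reciprocal_rank) / 2, dcg_at_k(inp, size)], which this mirrors:

# hypothetical module-level table; the real definitions are not part of this snippet
rewards = [
    lambda inp, ap, rr, ndcg, dcg5: (ap + rr) / 2,            # reward_type 1: mean of AP and MRR
    lambda inp, ap, rr, ndcg, dcg5: dcg_at_k(inp, len(inp)),  # reward_type 2: full-list DCG
]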
Example #4
def evaluate_results(qids_rs, Y, k):
    values = defaultdict(list)
    for qid, r in qids_rs:
        gold = harvest(Y, qid)
        gold_topk = gold[argtopk(gold, k)]
        R = np.count_nonzero(gold_topk)
        # real ndcg
        idcg = rm.dcg_at_k(gold_topk, k)
        ndcg = rm.dcg_at_k(r, k) / idcg
        values["ndcg"].append(ndcg)
        # Verified

        # MAP@k
        ap = rm.average_precision(r)
        values["MAP"].append(ap)

        # MRR - compute by hand
        ind = np.asarray(r).nonzero()[0]
        mrr = (1. / (ind[0] + 1)) if ind.size else 0.
        values["MRR"].append(mrr)

        # R precision
        # R = min(R, k)  # ok, let's be fair: you can't get more than k
        # we don't need that anymore, since we chop off the remainder
        # before computing R
        recall = rm.recall(r, R)
        values["recall"].append(recall)

        # precision = rm.precision_at_k(pad(scored_result, k), k)
        precision = rm.precision(r)
        values["precision"].append(precision)

        f1 = f1_score(precision, recall)
        values["f1_score"].append(f1)

        # Safe variant does not fail if len(r) < k
        p_at_5 = rm.safe_precision_at_k(r, 5)
        values["precision@5"].append(p_at_5)

        p_at_10 = rm.safe_precision_at_k(r, 10)
        values["precision@10"].append(p_at_10)
    return values
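harvest and argtopk come from the surrounding project rather than from rank_metrics and are not shown here. A minimal sketch of the behaviour the code relies on from argtopk (indices of the k largest gold values, best first); an assumption, not the project's actual helper:

import numpy as np

def argtopk_sketch(a, k=None):
    # indices of the k largest entries of a, highest first; all of them when k is None
    order = np.argsort(-np.asarray(a))
    return order if k is None else order[:k]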
Example #5
 def summarize(self):
     """Give summary statistics about the tournament."""
     res = self.run()
     # res = self.results
     # champ should be undefeated
     champ = list(np.where(res.strength == max(res.strength))[0])
     copeland = (res.wins[champ] == self.n_rounds)
     # top-k
     ranks = pd.DataFrame(data=np.transpose([
         res.strength.rank(ascending=False),
         res.wins.rank(ascending=False), res.wins
     ]),
                          columns=["str_rank", "win_rank", "wins"])
     ranks['relevant'] = ranks['str_rank'] <= self.k
     borda = (ranks.win_rank[champ] == ranks.win_rank.min())
     top_k_df = ranks.loc[ranks['str_rank'] <= self.k]
     top_k = sum(top_k_df['wins'] >= self.n_rounds - 2) / self.k
     tau, k_p = scipy.stats.kendalltau(ranks.str_rank, ranks.win_rank)
     rho, sp_p = scipy.stats.spearmanr(ranks.str_rank, ranks.win_rank)
     ranks = ranks.sort_values(by="win_rank")  # sort_values returns a copy; keep it so rel_vec is in win-rank order
     # using rank_metrics
     rel_vec = ranks.relevant.values
     prec = rank_metrics.r_precision(rel_vec)
     prec_at_k = rank_metrics.precision_at_k(rel_vec, self.k)
     avg_prec = rank_metrics.average_precision(rel_vec)
     dcg = rank_metrics.dcg_at_k(rel_vec, self.k)
     ndcg = rank_metrics.ndcg_at_k(rel_vec, self.k)
     df = pd.DataFrame(data=[
         list([
             int(copeland),
             int(borda),
             float(top_k), prec, prec_at_k, avg_prec, dcg, ndcg,
             float(tau),
             float(rho)
         ])
     ],
                       columns=[
                           'undef_champ', 'top_champ', 'top_k_found',
                           'precision', 'precision_at_k', 'avg_prec', 'dcg',
                           'ndcg', 'tau', 'rho'
                       ])
     return df
Example #6
def generate_reward(gold_index_list, answer_index_list, reward_type=1):
    reward = 0
    ap = 0
    reciprocal_rank = 0
    answer_list = list(deepcopy(answer_index_list))
    size = len(answer_index_list)
    true = sum(gold_index_list)
    inp = np.zeros(size)
    for rank, val in enumerate(gold_index_list):
        if val and rank in answer_list:
            inp[answer_list.index(rank)] = 2
    if true:
        ap = average_precision(inp) * (sum(inp > 0) / true)
    reciprocal_rank = mean_reciprocal_rank([inp])
    #ndcg = ndcg_at_k(inp,size)
    #if reward_type==1:
    #    reward = (ap+reciprocal_rank)/2
    #elif reward_type ==2 :
    #    reward = dcg_at_k(inp,size)
    rewards = [(ap + reciprocal_rank) / 2, dcg_at_k(inp, size)]
    return rewards[reward_type - 1], ap, reciprocal_rank, (inp[0] > 0)
Example #7
def compute_metrics(ranked_judgements, pr_atk, threshold_grade):
    """
    Given the ranked judgements compute the metrics for a query.
    :param ranked_judgements: list(int); graded or binary relevances in rank order.
    :param pr_atk: int; the @K value to use for computing precision and recall.
    :param threshold_grade: int; Assuming 0-3 graded relevances, threshold at some point
        and convert graded to binary relevance.
    :return: dict mapping each metric name to its value for this query.
    """
    graded_judgements = ranked_judgements
    ranked_judgements = [
        1 if rel >= threshold_grade else 0 for rel in graded_judgements
    ]
    # Use the full set of candidates, not the pr_atk cutoff.
    ndcg = rm.ndcg_at_k(graded_judgements, len(ranked_judgements))
    ndcg_pr = rm.ndcg_at_k(graded_judgements,
                           int(0.20 * len(ranked_judgements)))
    ndcg_20 = rm.ndcg_at_k(graded_judgements, 20)
    max_total_relevant = sum(ranked_judgements)
    recall = recall_at_k(ranked_rel=ranked_judgements,
                         atk=pr_atk,
                         max_total_relevant=max_total_relevant)
    precision = rm.precision_at_k(r=ranked_judgements, k=pr_atk)
    r_precision = rm.r_precision(r=ranked_judgements)
    f1 = 2 * precision * recall / (precision + recall) if (precision +
                                                           recall) > 0 else 0.0
    av_precision = rm.average_precision(r=ranked_judgements)
    reciprocal_rank = rm.mean_reciprocal_rank(rs=[ranked_judgements])
    metrics = {
        'recall': float(recall),
        'precision': float(precision),
        'f1': float(f1),
        'r_precision': float(r_precision),
        'av_precision': float(av_precision),
        'reciprocal_rank': float(reciprocal_rank),
        'ndcg': ndcg,
        'ndcg@20': ndcg_20,
        'ndcg%20': ndcg_pr  # NDCG over the top 20% of the ranking
    }
    return metrics
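recall_at_k is defined elsewhere in the source project. A minimal stand-in matching the keyword arguments used above (assuming the usual definition: relevant items retrieved in the top k over all relevant items), followed by a toy call; rm is the rank_metrics module used throughout:

def recall_at_k(ranked_rel, atk, max_total_relevant):
    # assumed definition; the project's own implementation may differ
    if max_total_relevant == 0:
        return 0.0
    return sum(ranked_rel[:atk]) / max_total_relevant

# graded judgements 0-3 in rank order; threshold_grade=2 binarises them to [1, 0, 0, 1, 0, 1]
metrics = compute_metrics([3, 1, 0, 2, 1, 3], pr_atk=5, threshold_grade=2)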
Example #8
    def evaluate(self, ratings: Dict[int, List[int]], negatives: Dict[int, List[int]], topN: int):
        """
        evaluate performance of models
        :param ratings: key: user, value: list of positive items
        :param negatives: key: user, value: list of negative items
        :param topN: int
        :return: dict with mean ndcg, map and recall over users, plus the per-user lists.
        """
        ndcgs, apks, recalls = [], [], []
        for user in sorted(ratings.keys()):
            pos_items = ratings[user]
            neg_items = negatives[user]
            assert type(pos_items) == list and type(neg_items) == list

            items = neg_items + pos_items
            users = np.full(len(items), user, dtype=np.int64)
            items = np.asarray(items)
            predictions = self.predict(users, items)
            labels = [0.0] * len(neg_items) + [1.0] * len(pos_items)
            labels = np.array(labels)
            # compute metric here

            indices = np.argsort(-predictions)[:topN]  # indices of items with highest scores
            ranklist = labels[indices]
            ndcg = rank_metrics.ndcg_at_k(ranklist, topN)
            _, recall = rank_metrics._compute_precision_recall(ranklist, topN)
            apk = rank_metrics.average_precision(ranklist[:topN])
            ndcgs.append(ndcg)
            apks.append(apk)
            recalls.append(recall)

        results = {}
        results["ndcg"] = np.nanmean(ndcgs)
        results["ndcg_list"] = ndcgs
        results["map"] = np.nanmean(apks)
        results["maps_list"] = apks
        results["recall"] = np.nanmean(recalls)
        results["recalls_list"] = recalls

        return results
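The per-user ranking step above, in isolation, with toy data (only numpy is needed; ndcg_at_k, _compute_precision_recall and average_precision are then applied to the resulting ranklist):

import numpy as np

predictions = np.array([0.1, 0.9, 0.3, 0.7])  # model scores for four candidate items
labels = np.array([0.0, 1.0, 0.0, 1.0])       # 1.0 marks the held-out positives
topN = 3
ranklist = labels[np.argsort(-predictions)[:topN]]  # relevance of the top-N items
# ranklist == array([1., 1., 0.])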
Example #9
            #a=a[0:np.int(cutter/10)]
            #score=score[0:np.int(cutter/10)]

            # read the relevant ids for this query and drop empty fields
            list1 = next(r)
            filter_object = filter(lambda x: x != "", list1)
            list1 = list(filter_object)
            list1 = np.unique(np.array(list1, dtype=int))
            # binary relevance of the ranked candidate list a
            binary = np.isin(a, list1).astype(int)
            den = np.argwhere(binary == 1)
            if np.array(den).size < 1:
                mrr = 0
            else:
                mrr = (1 / (den[0] + 1))  # reciprocal rank of the first hit
            map = rm.average_precision(binary)
            # hit indicators at several cutoffs
            top1 = 0
            top5 = 0
            top10 = 0
            top20 = 0
            top100 = 0
            if 1 in binary[:100]:
                top100 = 1
            if 1 in binary[:20]:
                top20 = 1
            if 1 in binary[:10]:
                top10 = 1
            if 1 in binary[:5]:
                top5 = 1
            if 1 in binary[:1]:
                top1 = 1
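The five cutoff checks can be written more compactly; an equivalent sketch:

hits = {n: int(1 in binary[:n]) for n in (1, 5, 10, 20, 100)}
# hits[1], hits[5], ... equal top1, top5, ... computed above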
Example #10
    def evaluate(self, X, Y, k=20, verbose=0, replacement=0, n_jobs=1):
        """
        :X: [(qid, str)] query id, query string pairs
        :Y: pandas dataseries with qid,docid index or [dict]
        :k: Limit the result for all metrics to this value, the models are also
        given a hint of how many they should return.
        :replacement: 0 means that (query, doc) pairs not prevalent in Y will
        not be considered relevant, None means that those are not considered
        (skipped).
        """
        # rs = []

        # if n_jobs > 1:
        #     return process_and_evaluate(self, X, Y, k, n_jobs)
        values = defaultdict(list)
        for qid, query in X:
            # execute query
            if verbose > 0:
                print(qid, ":", query)
            t0 = timer()
            # if replacement is None, we need to drop after querying
            result = self.query(query, k=(None if replacement is None else k))
            values["time_per_query"].append(timer() - t0)
            # if verbose > 0:
            #     print(result[:k])
            # result = result[:k]  # TRIM HERE
            # soak the generator
            scored_result = [
                harvest(Y, qid, docid, replacement) for docid in result
            ]
            if replacement is None:
                scored_result, notfound = filter_none(scored_result)
                values["gold_not_found"].append(notfound)

            if k is not None:
                # don't let the models cheat by returning more than k
                r = scored_result[:k]
            else:
                # if k is None, consider all
                r = scored_result

            # if verbose > 0:
            #     print(r)

            # gold = np.array(list(Y[qid].values()))
            gold = harvest(Y, qid)
            import sys
            # print(gold, file=sys.stderr)
            topk_indices = argtopk(gold, k)
            print(topk_indices, file=sys.stderr)
            gold_topk = gold[topk_indices]
            # print('Top k in gold standard:', gold_topk, file=sys.stderr)
            R = np.count_nonzero(gold_topk)
            if verbose > 0:
                print("Retrieved {} relevant out of {} possible.".format(
                    np.count_nonzero(r), R))

            # real ndcg
            idcg = rm.dcg_at_k(gold_topk, k)
            ndcg = rm.dcg_at_k(scored_result, k) / idcg
            values["ndcg"].append(ndcg)
            # Verified

            # MAP@k
            ap = rm.average_precision(r)
            values["MAP"].append(ap)

            # MRR - compute by hand
            ind = np.asarray(r).nonzero()[0]
            mrr = (1. / (ind[0] + 1)) if ind.size else 0.
            values["MRR"].append(mrr)

            # R precision
            # R = min(R, k)  # ok, let's be fair: you can't get more than k
            # we don't need that anymore, since we chop off the remainder
            # before computing R
            recall = rm.recall(r, R)
            values["recall"].append(recall)

            # precision = rm.precision_at_k(pad(scored_result, k), k)
            precision = rm.precision(r)
            values["precision"].append(precision)

            f1 = f1_score(precision, recall)
            values["f1_score"].append(f1)

            # Safe variant does not fail if len(r) < k
            p_at_5 = rm.safe_precision_at_k(r, 5)
            values["precision@5"].append(p_at_5)

            p_at_10 = rm.safe_precision_at_k(r, 10)
            values["precision@10"].append(p_at_10)

            # rs.append(r)
            if verbose > 0:
                # print("Precision: {:.4f}".format(precision))
                # print("Recall: {:.4f}".format(recall))
                # print("F1-Score: {:.4f}".format(f1))
                print("AP: {:.4f}".format(ap))
                print("RR: {:.4f}".format(mrr))
                print("NDCG: {:.4f}".format(ndcg))

        return values
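f1_score and filter_none are small helpers from the surrounding project, not from rank_metrics. Minimal stand-ins consistent with how they are called above (assumed; the project's own definitions may differ):

def f1_score(precision, recall):
    # harmonic mean of precision and recall, 0 when both are 0
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

def filter_none(scored):
    # drop entries with no gold judgement and report how many were dropped
    kept = [s for s in scored if s is not None]
    return kept, len(scored) - len(kept)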
Example #11
            r = []
            for item in sim_rank:
                r.append(eval_query[ent][item[0]])
            if len(r) > 1:
                tmp_n1 = rm.ndcg_at_k(r, 1, 1)
            else:
                tmp_n1 = rm.ndcg_at_k(r, len(r), 1)
            if len(r) > 5:
                tmp_n5 = rm.ndcg_at_k(r, 5, 1)
            else:
                tmp_n5 = rm.ndcg_at_k(r, len(r), 1)
            if len(r) > 10:
                tmp_n10 = rm.ndcg_at_k(r, 10, 1)
            else:
                tmp_n10 = rm.ndcg_at_k(r, len(r), 1)
            tmp_ap = rm.average_precision(r)
            ndcg1_sum += tmp_n1
            ndcg5_sum += tmp_n5
            ndcg10_sum += tmp_n10
            map_sum += tmp_ap
            can_count += tmp_can_count
        else:
            ent_skip_count += 1
act_ent_count = len(eval_query) - ent_skip_count

with codecs.open(log_file, 'a', encoding='UTF-8') as fout_log:
    fout_log.write("**********************************\n")
    fout_log.write("eval %d(%d) entities with %d(%d) candidate entities for %s!\n" %
                   (act_ent_count, len(eval_query), can_count / act_ent_count,
                    relatedness_pair_num / len(eval_query), entity_vec_file))
    fout_log.write("ndcg1 : %f, ndcg5 : %f, ndcg10 : %f, map : %f\n" %
                   (float(ndcg1_sum / act_ent_count), float(ndcg5_sum / act_ent_count),
                    float(ndcg10_sum / act_ent_count), float(map_sum / act_ent_count)))
    fout_log.write("**********************************\n")
Example #12
        #scr[i, :] = 0.5 * (np.maximum(scr1, 0) ** 0.5 + np.maximum(scr2, 0) ** 0.5)
        #scr[i, :] = 0.5 * (np.maximum(scr1, 0) + np.maximum(scr2, 0))
        scr[i, :] = 0.5 * (scr1 + scr2)

    # --------------------------------------------------------------------------

    print("computing tag-centric scores ..")
    ap = []
    for i, tag in enumerate(tags):
        # rank images
        idxs = np.argsort(scr[:, i])[::-1]

        # compute AP(tag)
        relevant = [im for im in tag2im[tag] if im in images_test]
        r = [int(images_test[j] in relevant) for j in idxs]
        ap.append(average_precision(r))

        print("  {} {:.2f}".format(tag, 100 * ap[-1]))

    print("done")

    # --------------------------------------------------------------------------

    print("computing image-centric scores ..")
    iap = []
    for i, im in enumerate(images_test):
        # rank tags
        idxs = np.argsort(scr[i, :])[::-1]

        # compute AP(image)
        relevant = list(im2tag[im])