Example #1
0
def evaluate_results(qids_rs, Y, k):
    """Aggregate per-query IR evaluation metrics over pre-scored rankings.

    :qids_rs: iterable of (qid, r) pairs, where r is the sequence of
        relevance scores of the retrieved documents in ranked order.
    :Y: gold standard, in whatever form ``harvest`` accepts (qid-indexed).
    :k: cutoff applied to the gold standard and to the NDCG computation.
    :returns: dict mapping metric name -> list of per-query values
        ("ndcg", "MAP", "MRR", "recall", "precision", "f1_score",
        "precision@5", "precision@10").
    """
    values = defaultdict(list)
    for qid, r in qids_rs:
        # Ideal top-k relevance scores for this query; R is the number of
        # actually relevant documents within that ideal top-k.
        gold = harvest(Y, qid)
        gold_topk = gold[argtopk(gold, k)]
        R = np.count_nonzero(gold_topk)

        # NDCG@k. Guard against queries with no relevant gold documents:
        # idcg == 0 would otherwise raise ZeroDivisionError.
        idcg = rm.dcg_at_k(gold_topk, k)
        ndcg = (rm.dcg_at_k(r, k) / idcg) if idcg > 0 else 0.
        values["ndcg"].append(ndcg)

        # MAP@k
        ap = rm.average_precision(r)
        values["MAP"].append(ap)

        # MRR, computed by hand: reciprocal rank of the first relevant
        # result, or 0 if nothing relevant was retrieved.
        ind = np.asarray(r).nonzero()[0]
        mrr = (1. / (ind[0] + 1)) if ind.size else 0.
        values["MRR"].append(mrr)

        # Recall relative to the R relevant documents in the top-k gold.
        # No min(R, k) clamp needed: the gold list is already cut at k
        # before R is computed.
        recall = rm.recall(r, R)
        values["recall"].append(recall)

        precision = rm.precision(r)
        values["precision"].append(precision)

        f1 = f1_score(precision, recall)
        values["f1_score"].append(f1)

        # Safe variants do not fail if len(r) < cutoff
        p_at_5 = rm.safe_precision_at_k(r, 5)
        values["precision@5"].append(p_at_5)

        p_at_10 = rm.safe_precision_at_k(r, 10)
        values["precision@10"].append(p_at_10)
    return values
Example #2
0
def evaluate_results(qids_rs, Y, k):
    """Aggregate per-query IR evaluation metrics over pre-scored rankings.

    :qids_rs: iterable of (qid, r) pairs, where r is the sequence of
        relevance scores of the retrieved documents in ranked order.
    :Y: gold standard, in whatever form ``harvest`` accepts (qid-indexed).
    :k: cutoff applied to the gold standard and to the NDCG computation.
    :returns: dict mapping metric name -> list of per-query values
        ("ndcg", "MAP", "MRR", "recall", "precision", "f1_score",
        "precision@5", "precision@10").
    """
    values = defaultdict(list)
    for qid, r in qids_rs:
        # Ideal top-k relevance scores for this query; R is the number of
        # actually relevant documents within that ideal top-k.
        gold = harvest(Y, qid)
        gold_topk = gold[argtopk(gold, k)]
        R = np.count_nonzero(gold_topk)

        # NDCG@k. Guard against queries with no relevant gold documents:
        # idcg == 0 would otherwise raise ZeroDivisionError.
        idcg = rm.dcg_at_k(gold_topk, k)
        ndcg = (rm.dcg_at_k(r, k) / idcg) if idcg > 0 else 0.
        values["ndcg"].append(ndcg)

        # MAP@k
        ap = rm.average_precision(r)
        values["MAP"].append(ap)

        # MRR, computed by hand: reciprocal rank of the first relevant
        # result, or 0 if nothing relevant was retrieved.
        ind = np.asarray(r).nonzero()[0]
        mrr = (1. / (ind[0] + 1)) if ind.size else 0.
        values["MRR"].append(mrr)

        # Recall relative to the R relevant documents in the top-k gold.
        # No min(R, k) clamp needed: the gold list is already cut at k
        # before R is computed.
        recall = rm.recall(r, R)
        values["recall"].append(recall)

        precision = rm.precision(r)
        values["precision"].append(precision)

        f1 = f1_score(precision, recall)
        values["f1_score"].append(f1)

        # Safe variants do not fail if len(r) < cutoff
        p_at_5 = rm.safe_precision_at_k(r, 5)
        values["precision@5"].append(p_at_5)

        p_at_10 = rm.safe_precision_at_k(r, 10)
        values["precision@10"].append(p_at_10)
    return values
Example #3
0
    def evaluate(self, X, Y, k=20, verbose=0, replacement=0, n_jobs=1):
        """Run all queries in X and compute IR metrics against gold Y.

        :X: [(qid, str)] query id, query string pairs
        :Y: pandas dataseries with qid,docid index or [dict]
        :k: Limit the result for all metrics to this value, the models are also
        given a hint of how many they should return.
        :replacement: 0 means that (query, doc) pairs not prevalent in Y will
        not be considered relevant, None means that those are not considered
        (skipped).
        :returns: dict mapping metric name -> list of per-query values.
        """
        # n_jobs is currently unused; parallel evaluation is disabled:
        # if n_jobs > 1:
        #     return process_and_evaluate(self, X, Y, k, n_jobs)
        values = defaultdict(list)
        for qid, query in X:
            if verbose > 0:
                print(qid, ":", query)
            t0 = timer()
            # if replacement is None, unjudged documents are dropped after
            # querying, so ask the model for an unlimited result list
            result = self.query(query, k=(None if replacement is None else k))
            values["time_per_query"].append(timer() - t0)
            # soak the generator: look up the gold relevance of each docid
            scored_result = [
                harvest(Y, qid, docid, replacement) for docid in result
            ]
            if replacement is None:
                scored_result, notfound = filter_none(scored_result)
                values["gold_not_found"].append(notfound)

            if k is not None:
                # dont let the models cheat by returning more than k
                r = scored_result[:k]
            else:
                # if k is None, consider all
                r = scored_result

            # Ideal top-k relevance scores for this query; R is the number
            # of actually relevant documents within that ideal top-k.
            gold = harvest(Y, qid)
            gold_topk = gold[argtopk(gold, k)]
            R = np.count_nonzero(gold_topk)
            if verbose > 0:
                print("Retrieved {} relevant out of {} possible.".format(
                    np.count_nonzero(r), R))

            # NDCG@k (dcg_at_k applies the k cutoff itself). Guard against
            # queries with no relevant gold documents: idcg == 0 would
            # otherwise raise ZeroDivisionError.
            idcg = rm.dcg_at_k(gold_topk, k)
            dcg = rm.dcg_at_k(scored_result, k)
            ndcg = (dcg / idcg) if idcg > 0 else 0.
            values["ndcg"].append(ndcg)

            # MAP@k
            ap = rm.average_precision(r)
            values["MAP"].append(ap)

            # MRR, computed by hand: reciprocal rank of the first relevant
            # result, or 0 if nothing relevant was retrieved.
            ind = np.asarray(r).nonzero()[0]
            mrr = (1. / (ind[0] + 1)) if ind.size else 0.
            values["MRR"].append(mrr)

            # Recall relative to the R relevant documents in the top-k gold.
            # No min(R, k) clamp needed: the gold list is already cut at k
            # before R is computed.
            recall = rm.recall(r, R)
            values["recall"].append(recall)

            precision = rm.precision(r)
            values["precision"].append(precision)

            f1 = f1_score(precision, recall)
            values["f1_score"].append(f1)

            # Safe variants do not fail if len(r) < cutoff
            p_at_5 = rm.safe_precision_at_k(r, 5)
            values["precision@5"].append(p_at_5)

            p_at_10 = rm.safe_precision_at_k(r, 10)
            values["precision@10"].append(p_at_10)

            if verbose > 0:
                print("AP: {:.4f}".format(ap))
                print("RR: {:.4f}".format(mrr))
                print("NDCG: {:.4f}".format(ndcg))

        return values
Example #4
0
    def evaluate(self, X, Y, k=20, verbose=0, replacement=0, n_jobs=1):
        """Run all queries in X and compute IR metrics against gold Y.

        :X: [(qid, str)] query id, query string pairs
        :Y: pandas dataseries with qid,docid index or [dict]
        :k: Limit the result for all metrics to this value, the models are also
        given a hint of how many they should return.
        :replacement: 0 means that (query, doc) pairs not prevalent in Y will
        not be considered relevant, None means that those are not considered
        (skipped).
        :returns: dict mapping metric name -> list of per-query values.
        """
        # n_jobs is currently unused; parallel evaluation is disabled:
        # if n_jobs > 1:
        #     return process_and_evaluate(self, X, Y, k, n_jobs)
        values = defaultdict(list)
        for qid, query in X:
            if verbose > 0:
                print(qid, ":", query)
            t0 = timer()
            # if replacement is None, unjudged documents are dropped after
            # querying, so ask the model for an unlimited result list
            result = self.query(query, k=(None if replacement is None else k))
            values["time_per_query"].append(timer() - t0)
            # soak the generator: look up the gold relevance of each docid
            scored_result = [harvest(Y, qid, docid, replacement)
                             for docid in result]
            if replacement is None:
                scored_result, notfound = filter_none(scored_result)
                values["gold_not_found"].append(notfound)

            if k is not None:
                # dont let the models cheat by returning more than k
                r = scored_result[:k]
            else:
                # if k is None, consider all
                r = scored_result

            # Ideal top-k relevance scores for this query; R is the number
            # of actually relevant documents within that ideal top-k.
            gold = harvest(Y, qid)
            gold_topk = gold[argtopk(gold, k)]
            R = np.count_nonzero(gold_topk)
            if verbose > 0:
                print("Retrieved {} relevant out of {} possible."
                      .format(np.count_nonzero(r), R))

            # NDCG@k (dcg_at_k applies the k cutoff itself). Guard against
            # queries with no relevant gold documents: idcg == 0 would
            # otherwise raise ZeroDivisionError.
            idcg = rm.dcg_at_k(gold_topk, k)
            dcg = rm.dcg_at_k(scored_result, k)
            ndcg = (dcg / idcg) if idcg > 0 else 0.
            values["ndcg"].append(ndcg)

            # MAP@k
            ap = rm.average_precision(r)
            values["MAP"].append(ap)

            # MRR, computed by hand: reciprocal rank of the first relevant
            # result, or 0 if nothing relevant was retrieved.
            ind = np.asarray(r).nonzero()[0]
            mrr = (1. / (ind[0] + 1)) if ind.size else 0.
            values["MRR"].append(mrr)

            # Recall relative to the R relevant documents in the top-k gold.
            # No min(R, k) clamp needed: the gold list is already cut at k
            # before R is computed.
            recall = rm.recall(r, R)
            values["recall"].append(recall)

            precision = rm.precision(r)
            values["precision"].append(precision)

            f1 = f1_score(precision, recall)
            values["f1_score"].append(f1)

            # Safe variants do not fail if len(r) < cutoff
            p_at_5 = rm.safe_precision_at_k(r, 5)
            values["precision@5"].append(p_at_5)

            p_at_10 = rm.safe_precision_at_k(r, 10)
            values["precision@10"].append(p_at_10)

            if verbose > 0:
                print("AP: {:.4f}".format(ap))
                print("RR: {:.4f}".format(mrr))
                print("NDCG: {:.4f}".format(ndcg))

        return values