Example #1
import json
from math import log

# search, util and calc_optimal_dcg are project-internal helpers and are assumed
# to be importable wherever these snippets live.


def precision_at_k(k,
                   num_relevant,
                   query,
                   search_type,
                   operation_type,
                   input=False,
                   incorporate_pr="yes"):
    '''
    :param k: number of documents to retrieve
    :param num_relevant: number of top results from util.get_nbest_results to treat as relevant
    :param query: the search query
    :param search_type: forwarded to search()
    :param operation_type: forwarded to search()
    :param input: forwarded to search() as type_input
    :param incorporate_pr: forwarded to search()
    :return: fraction of the k retrieved documents that are relevant (precision@k)
    '''
    retrieved = search(query,
                       search_type,
                       operation_type,
                       type_input=input,
                       incorporate_pr=incorporate_pr,
                       verbose=False,
                       num_results=k)

    # getting the relevant documents (ground truth)
    relevant = util.get_nbest_results(query, num_relevant)
    print("{} relevant documents obtained".format(len(relevant)))

    # getting number of relevant documents that were retrieved
    relevant_retrieved_docs = [doc for doc in retrieved if doc in relevant]
    num_relevant_retrieved = len(relevant_retrieved_docs)
    print("{} relevant documents retrieved".format(num_relevant_retrieved))

    return num_relevant_retrieved / k
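

# For reference, the precision@k arithmetic is easy to check by hand. The snippet
# below recomputes it on invented document ids, without the project-specific
# search() and util.get_nbest_results() calls; the lists are made up for illustration.
retrieved_example = ["d1", "d7", "d3", "d9", "d2"]       # pretend top-5 from search()
relevant_example = ["d1", "d2", "d3", "d4", "d5", "d6"]  # pretend ground truth
hits = [doc for doc in retrieved_example if doc in relevant_example]
print(hits, len(hits) / 5)  # ['d1', 'd3', 'd2'] 0.6

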
def ndcg(query, retrieved, test_set, num_relevant=100):
    '''
    Normalized DCG of an already-retrieved ranking, scored against either the
    judgements in google_results.json ('google') or util.get_nbest_results ('ucl').
    '''

    # getting relevant documents
    if test_set == 'google':
        with open('google_results.json') as res_google:
            google_dict = json.load(res_google)
            relevant = google_dict[query]
    elif test_set == 'ucl':
        relevant = util.get_nbest_results(query, num_relevant)
    else:
        print('Not a valid dataset')
        exit(1)

    print("{} relevant documents obtained".format(len(relevant)))

    # define the first 10 documents as highly relevant (relevance 2),
    # the remaining documents as relevant (relevance 1),
    # and everything else as irrelevant (relevance 0);
    # these cut-offs may need to be adjusted appropriately
    num_highly_relevant = 10
    num_relevant = num_relevant - 10

    highly_relevant = relevant[:num_highly_relevant]
    relevant = relevant[num_highly_relevant:]

    dcg = 0
    rank = 1
    for doc in retrieved:
        # find relevance of retrieved doc
        if doc in highly_relevant:
            rel_doc = 2
        elif doc in relevant:
            rel_doc = 1
        else:
            rel_doc = 0

        # calculate discount based on rank of document
        if rank == 1:
            discount = 1
        else:
            discount = log(rank, 2)

        # add discounted gain to DCG value
        dcg += rel_doc / discount

        # update rank
        rank += 1

    # the number of retrieved documents is the list length (it may be shorter than requested)
    num_retrieved = len(retrieved)
    opt_dcg = calc_optimal_dcg(num_retrieved, num_highly_relevant,
                               num_relevant)

    # divide by optimal value to get normalized DCG
    ndcg = dcg / opt_dcg

    return ndcg, len(relevant)
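

# calc_optimal_dcg() is referenced above but not included in these snippets. The
# sketch below is a guess at what it computes, reconstructed from how ndcg() uses
# it: the DCG of an ideal ranking that lists the highly relevant documents
# (gain 2) first, then the relevant ones (gain 1), with the same log2(rank) discount.
def calc_optimal_dcg_sketch(num_retrieved, num_highly_relevant, num_relevant):
    opt_dcg = 0
    for rank in range(1, num_retrieved + 1):
        # gain of the document an ideal ranking would place at this rank
        if rank <= num_highly_relevant:
            gain = 2
        elif rank <= num_highly_relevant + num_relevant:
            gain = 1
        else:
            gain = 0
        discount = 1 if rank == 1 else log(rank, 2)
        opt_dcg += gain / discount
    return opt_dcg
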
Example #3
def precision_at_k(k,
                   num_relevant,
                   query,
                   search_type,
                   operation_type,
                   incorporate_pr,
                   test_set,
                   input=False):
    '''
    :param k: number of documents to retrieve
    :param num_relevant: number of top results to treat as relevant when test_set is 'ucl'
    :param query: the search query
    :param search_type: forwarded to search()
    :param operation_type: forwarded to search()
    :param incorporate_pr: forwarded to search()
    :param test_set: 'google' (google_results.json) or 'ucl' (util.get_nbest_results)
    :param input: forwarded to search() as type_input
    :return: precision@k, the search run time, and the number of relevant documents
    '''
    retrieved, run_time = search(query,
                                 search_type,
                                 operation_type,
                                 type_input=input,
                                 incorporate_pr=incorporate_pr,
                                 verbose=False,
                                 num_results=k)

    # getting relevant documents
    if test_set == 'google':
        with open('google_results.json') as res_google:
            google_dict = json.load(res_google)
            relevant = google_dict[query]
    elif test_set == 'ucl':
        relevant = util.get_nbest_results(query, num_relevant)
    else:
        print('Not a valid dataset')
        exit(1)

    print("{} relevant documents obtained".format(len(relevant)))

    # getting number of relevant documents that were retrieved
    relevant_retrieved_docs = [doc for doc in retrieved if doc in relevant]
    #print(relevant_retrieved_docs)
    num_relevant_retrieved = len(relevant_retrieved_docs)
    #print("{} relevant documents retrieved".format(num_relevant_retrieved))

    return num_relevant_retrieved / k, run_time, len(relevant)
def precision_at_k(k, query, retrieved, test_set, num_relevant=100):
    '''
    Precision@k of an already-retrieved ranking, scored against either the
    judgements in google_results.json ('google') or util.get_nbest_results ('ucl').
    '''

    # getting relevant documents
    if test_set == 'google':
        with open('google_results.json') as res_google:
            google_dict = json.load(res_google)
            relevant = google_dict[query]
    elif test_set == 'ucl':
        relevant = util.get_nbest_results(query, num_relevant)
    else:
        print('Not a valid dataset')
        exit(1)

    print("{} relevant documents obtained".format(len(relevant)))

    # getting number of relevant documents that were retrieved
    relevant_retrieved_docs = [doc for doc in retrieved if doc in relevant]
    #print(relevant_retrieved_docs)
    num_relevant_retrieved = len(relevant_retrieved_docs)
    #print("{} relevant documents retrieved".format(num_relevant_retrieved))

    return num_relevant_retrieved / k, len(relevant)
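

# The offline variants above assume google_results.json maps each query string to a
# list of relevant document identifiers, presumably something like
#
#     {"UCL": ["doc_412", "doc_87", "doc_1034"], "machine learning": ["doc_55", "doc_9"]}
#
# (the file name comes from the code; the queries and ids here are invented).
# A call against such a file, using the second precision_at_k above (the one that
# takes a precomputed retrieved list), would then look like:
#
#     p_at_5, num_rel = precision_at_k(k=5,
#                                      query="UCL",
#                                      retrieved=["doc_412", "doc_3", "doc_87"],
#                                      test_set='google')
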
Example #5
def ndcg_at_k(k,
              num_relevant,
              query,
              search_type,
              operation_type,
              input=False,
              incorporate_pr="yes"):
    '''
    :param k: number of documents to retrieve
    :param num_relevant: number of top results from util.get_nbest_results to treat as relevant
    :param query: the search query
    :param search_type: forwarded to search()
    :param operation_type: forwarded to search()
    :param input: forwarded to search() as type_input
    :param incorporate_pr: forwarded to search()
    :return: normalized DCG value at k
    '''

    retrieved = search(query,
                       search_type,
                       operation_type,
                       type_input=input,
                       incorporate_pr=incorporate_pr,
                       verbose=False,
                       num_results=k)

    relevant = util.get_nbest_results(query, num_relevant)
    print("{} relevant documents obtained".format(len(relevant)))

    # define the first 10 documents as highly relevant (relevance 2),
    # the remaining documents as relevant (relevance 1),
    # and everything else as irrelevant (relevance 0);
    # these cut-offs may need to be adjusted appropriately
    num_highly_relevant = 10
    num_relevant = num_relevant - 10

    highly_relevant = relevant[:num_highly_relevant]
    relevant = relevant[num_highly_relevant:]

    dcg = 0
    rank = 1
    for doc in retrieved:
        # find relevance of retrieved doc
        if doc in highly_relevant:
            rel_doc = 2
        elif doc in relevant:
            rel_doc = 1
        else:
            rel_doc = 0

        # calculate discount based on rank of document
        if rank == 1:
            discount = 1
        else:
            discount = log(rank, 2)

        # add discounted gain to DCG value
        dcg += rel_doc / discount

        # update rank
        rank += 1

    # the number of retrieved documents is the list length (it may be smaller than k)
    num_retrieved = len(retrieved)
    opt_dcg = calc_optimal_dcg(num_retrieved, num_highly_relevant,
                               num_relevant)

    # divide by optimal value to get normalized DCG
    ndcg = dcg / opt_dcg

    return ndcg


#print(precision_at_k(k=10,num_relevant=200,query="UCL",search_type="tfidf",operation_type="and"))
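

# Quick sanity check of the discount/normalisation logic shared by ndcg() and
# ndcg_at_k(): recompute DCG for an invented gain vector and normalise it by the
# DCG of its ideal (descending) ordering; a best-first ranking should score exactly 1.0.
def _toy_dcg(gains):
    return sum(g / (1 if r == 1 else log(r, 2))
               for r, g in enumerate(gains, start=1))


#toy_gains = [2, 2, 1, 1, 0]
#print(_toy_dcg(toy_gains) / _toy_dcg(sorted(toy_gains, reverse=True)))  # prints 1.0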