def precision_at_k(k, num_relevant, query, search_type, operation_type, input=False, incorporate_pr="yes"):
    """Precision@k against the UCL n-best ground truth.

    Runs the search engine for ``query`` and measures what fraction of the
    top-``k`` results appear in the ground-truth relevant set.

    :param k: number of documents to retrieve
    :param num_relevant: size of the ground-truth relevant set to fetch
    :param query: query string handed to ``search``
    :param search_type: ranking model used by ``search``
    :param operation_type: boolean operation used by ``search``
    :param input: forwarded to ``search`` as ``type_input``
    :param incorporate_pr: forwarded to ``search`` (presumably toggles
        PageRank in the ranking — TODO confirm against ``search``)
    :return: out of the k retrieved documents, how many are relevant,
        divided by k
    """
    hits = search(query, search_type, operation_type, type_input=input,
                  incorporate_pr=incorporate_pr, verbose=False, num_results=k)

    # Ground-truth relevant documents for this query.
    ground_truth = util.get_nbest_results(query, num_relevant)
    print("{} relevant documents obtained".format(len(ground_truth)))

    # Count the overlap between retrieved and relevant.
    overlap = [doc for doc in hits if doc in ground_truth]
    num_relevant_retrieved = len(overlap)
    print("{} relevant documents retrieved".format(num_relevant_retrieved))

    # Note: divides by k even if fewer than k documents came back.
    return num_relevant_retrieved / k
def ndcg(query, retrieved, test_set, num_relevant=100):
    """Normalized DCG of an already-retrieved ranking.

    :param query: query string used to look up the ground truth
    :param retrieved: ranked list of retrieved document identifiers
    :param test_set: ``'google'`` (ground truth from google_results.json)
        or ``'ucl'`` (ground truth from ``util.get_nbest_results``);
        anything else prints an error and exits the process
    :param num_relevant: size of the ground-truth relevant set
    :return: tuple ``(ndcg_value, count)`` where ``count`` is the size of
        the *merely*-relevant slice (see NOTE below)
    """
    # Fetch the ground-truth relevant documents.
    if test_set == 'google':
        with open('google_results.json') as res_google:
            relevant = json.load(res_google)[query]
    elif test_set == 'ucl':
        relevant = util.get_nbest_results(query, num_relevant)
    else:
        print('Not a valid dataset')
        exit(1)
    print("{} relevant documents obtained".format(len(relevant)))

    # Graded relevance: first 10 ground-truth docs count as highly
    # relevant (gain 2), the rest as relevant (gain 1), everything
    # else as irrelevant (gain 0). These cutoffs may need tuning.
    num_highly_relevant = 10
    num_relevant = num_relevant - 10
    highly_relevant = relevant[:num_highly_relevant]
    relevant = relevant[num_highly_relevant:]

    dcg = 0
    for rank, doc in enumerate(retrieved, start=1):
        if doc in highly_relevant:
            gain = 2
        elif doc in relevant:
            gain = 1
        else:
            gain = 0
        # Classic DCG discount: rank 1 undiscounted, log2(rank) after.
        discount = 1 if rank == 1 else log(rank, 2)
        dcg += gain / discount

    # The retrieved list may be shorter than requested, so use its
    # actual length when computing the ideal DCG.
    num_retrieved = len(retrieved)
    opt_dcg = calc_optimal_dcg(num_retrieved, num_highly_relevant, num_relevant)

    # NOTE(review): ``relevant`` was sliced above, so the second return
    # value is num_relevant - 10, NOT the full ground-truth size — the
    # precision_at_k variants return the full count; confirm intended.
    return dcg / opt_dcg, len(relevant)
def precision_at_k(k, num_relevant, query, search_type, operation_type, incorporate_pr, test_set, input=False):
    """Precision@k, running the search engine and timing it.

    :param k: number of retrieved documents
    :param num_relevant: size of the ground-truth relevant set
    :param query: query string handed to ``search``
    :param search_type: ranking model used by ``search``
    :param operation_type: boolean operation used by ``search``
    :param incorporate_pr: forwarded to ``search`` (presumably toggles
        PageRank in the ranking — TODO confirm against ``search``)
    :param test_set: ``'google'`` or ``'ucl'`` ground-truth source;
        anything else prints an error and exits the process
    :param input: forwarded to ``search`` as ``type_input``
    :return: tuple ``(precision, run_time, num_relevant_found)`` — out of
        the k retrieved documents, how many are relevant (divided by k),
        the search runtime, and the ground-truth set size
    """
    hits, run_time = search(query, search_type, operation_type, type_input=input,
                            incorporate_pr=incorporate_pr, verbose=False, num_results=k)

    # Fetch the ground-truth relevant documents.
    if test_set == 'google':
        with open('google_results.json') as res_google:
            ground_truth = json.load(res_google)[query]
    elif test_set == 'ucl':
        ground_truth = util.get_nbest_results(query, num_relevant)
    else:
        print('Not a valid dataset')
        exit(1)
    print("{} relevant documents obtained".format(len(ground_truth)))

    # Overlap between retrieved and relevant; divides by k even if
    # fewer than k documents came back.
    overlap = [doc for doc in hits if doc in ground_truth]
    return len(overlap) / k, run_time, len(ground_truth)
def precision_at_k(k, query, retrieved, test_set, num_relevant=100):
    """Precision@k of an already-retrieved ranking.

    :param k: number of retrieved documents (the denominator)
    :param query: query string used to look up the ground truth
    :param retrieved: ranked list of retrieved document identifiers
    :param test_set: ``'google'`` (ground truth from google_results.json)
        or ``'ucl'`` (ground truth from ``util.get_nbest_results``);
        anything else prints an error and exits the process
    :param num_relevant: size of the ground-truth relevant set
    :return: tuple ``(precision, num_relevant_found)``
    """
    # Fetch the ground-truth relevant documents.
    if test_set == 'google':
        with open('google_results.json') as res_google:
            ground_truth = json.load(res_google)[query]
    elif test_set == 'ucl':
        ground_truth = util.get_nbest_results(query, num_relevant)
    else:
        print('Not a valid dataset')
        exit(1)
    print("{} relevant documents obtained".format(len(ground_truth)))

    # Overlap between retrieved and relevant; divides by k even if
    # fewer than k documents came back.
    overlap = [doc for doc in retrieved if doc in ground_truth]
    return len(overlap) / k, len(ground_truth)
def ndcg_at_k(k, num_relevant, query, search_type, operation_type, input=False, incorporate_pr="yes"):
    """Run the search engine and compute normalized DCG at k.

    :param k: number of documents to retrieve
    :param num_relevant: size of the ground-truth relevant set
    :param query: query string handed to ``search``
    :param search_type: ranking model used by ``search``
    :param operation_type: boolean operation used by ``search``
    :param input: forwarded to ``search`` as ``type_input``
    :param incorporate_pr: forwarded to ``search`` (presumably toggles
        PageRank in the ranking — TODO confirm against ``search``)
    :return: normalized DCG value at k
    """
    hits = search(query, search_type, operation_type, type_input=input,
                  incorporate_pr=incorporate_pr, verbose=False, num_results=k)

    ground_truth = util.get_nbest_results(query, num_relevant)
    print("{} relevant documents obtained".format(len(ground_truth)))

    # Graded relevance: first 10 ground-truth docs count as highly
    # relevant (gain 2), the rest as relevant (gain 1), everything
    # else as irrelevant (gain 0). These cutoffs may need tuning.
    num_highly_relevant = 10
    num_relevant = num_relevant - 10
    highly_relevant = ground_truth[:num_highly_relevant]
    merely_relevant = ground_truth[num_highly_relevant:]

    dcg = 0
    for rank, doc in enumerate(hits, start=1):
        if doc in highly_relevant:
            gain = 2
        elif doc in merely_relevant:
            gain = 1
        else:
            gain = 0
        # Classic DCG discount: rank 1 undiscounted, log2(rank) after.
        discount = 1 if rank == 1 else log(rank, 2)
        dcg += gain / discount

    # The retrieved list may be shorter than k, so use its actual
    # length when computing the ideal DCG.
    num_retrieved = len(hits)
    opt_dcg = calc_optimal_dcg(num_retrieved, num_highly_relevant, num_relevant)

    # Normalize by the ideal DCG for this many retrieved documents.
    return dcg / opt_dcg