def xQuAD(basepath, topicID, cutoff=50):
    """Return a ranking list built with the xQuAD framework."""
    document_selected = []  # documents already placed in the ranking list
    # prepare the output files for evaluation
    out_file = open(basepath + "rank/" + topicID + "/xquad_result", "w")
    runlist_file = open(basepath + "rank/" + topicID + "/runlist", "w")
    print >> runlist_file, "xquad_result"
    runlist_file.close()
    documents, subtopicID_word, query, document_term_frequency, average_document_len, word2id_weight = data_preprocessing_xquad(basepath, topicID)
    sid_doc_related_probility = similarity_computing(documents, subtopicID_word, average_document_len, query, document_term_frequency)
    # iteratively select the best document to build the ranking list
    for i in range(cutoff):
        print "%s-th best document." % i
        best_document = best_document_select(documents, subtopicID_word, query, document_term_frequency, average_document_len, word2id_weight, document_selected, sid_doc_related_probility)
        document_selected.append(best_document)
        if len(document_selected) == len(documents):  # all documents are in the ranking list
            break
    # write the ranking result to the evaluation file
    for index, document in enumerate(document_selected):
        documentID = document.get_id()
        document_rank = index + 1
        document_score = document.get_ranking_score()
        print >> out_file, topicID + " Q0 " + documentID + " " + str(document_rank) + " " + str(document_score) + " TESTRUN"
    out_file.close()
    # call the evaluation function to evaluate the run result
    r = call_eval_for_result(basepath, topicID, 10)
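# --- Illustrative sketch only, NOT the project's best_document_select implementation ---
# The standard xQuAD greedy objective mixes relevance with subtopic coverage:
#   score(d) = (1 - lam) * P(d|q)
#              + lam * sum_s P(s|q) * P(d|s) * prod_{d' already selected} (1 - P(d'|s))
# In this sketch, rel_scores[d] stands for P(d|q), subtopic_probs[s][d] for P(d|s),
# and subtopic_weights[s] for P(s|q); these names and data shapes are assumptions
# made for illustration and do not appear in the project code.
def xquad_select_sketch(candidates, selected, rel_scores, subtopic_probs, subtopic_weights, lam=0.5):
    best_doc, best_score = None, float("-inf")
    for d in candidates:
        if d in selected:
            continue
        diversity = 0.0
        for s, weight in subtopic_weights.items():
            # probability that subtopic s is NOT yet covered by the selected documents
            coverage = 1.0
            for d_prev in selected:
                coverage *= (1.0 - subtopic_probs[s].get(d_prev, 0.0))
            diversity += weight * subtopic_probs[s].get(d, 0.0) * coverage
        score = (1.0 - lam) * rel_scores.get(d, 0.0) + lam * diversity
        if score > best_score:
            best_doc, best_score = d, score
    return best_doc, best_score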
def run_ranking(basepath, topics, sparse_parameters):
    for topicID in topics:
        if topicID == "10001":
            continue
        runlist = open(basepath + "rank/" + topicID + "/runlist", "w")
        sparse_base_ranking(basepath, topicID, sparse_parameters, runlist)
        runlist.close()
        r = call_eval_for_result(basepath, topicID, 10)
def MMR_ranking(basepath, topicID, cutoff):
    """Return a ranking list built with Maximal Marginal Relevance (MMR)."""
    docs, subtopicID_word, topic_words, document_term_frequency, average_document_len, word2id_weight = data_preprocessing_mmr(basepath, topicID)
    query = topic_words
    first_score = -1000.0
    first_doc_id = -1
    document_collection = []
    doc_selected = []
    # prepare the output files for evaluation
    out_file = open(basepath + "rank/" + topicID + "/mmr_result", "w")
    runlist_file = open(basepath + "rank/" + topicID + "/runlist", "w")
    print >> runlist_file, "mmr_result"
    runlist_file.close()
    # select the first document by relevance alone, and build each document's tf-idf vector
    for dindex, doc in enumerate(docs):
        sim = similarity(query, doc.get_term_vec(), document_term_frequency, average_document_len, len(docs))
        sum_weight = 0.0
        vec = [0.0 for i in range(len(word2id_weight))]
        for word in doc.get_term_vec():
            wid = word2id_weight[word][0]
            vec[wid] = word2id_weight[word][1] * 1.0
            sum_weight += word2id_weight[word][1] * 1.0
        if sum_weight > 0:
            doc.set_tfidf_vec([vec[i] / sum_weight for i in range(len(vec))])
        else:
            doc.set_tfidf_vec(vec)
        if first_score < sim:
            first_doc_id = dindex
            first_score = sim
    docs[first_doc_id].set_ranking_score(first_score)
    doc_selected.append(docs[first_doc_id])
    sim_dict = similar_documents(docs)
    # iteratively select the best document to build the ranking list
    for i in range(cutoff):
        print "%2d-th document selecting for ranking" % (i + 1)
        doc_selected.append(select_best_documnt(docs, doc_selected, sim_dict))
        if len(doc_selected) == len(docs):  # all documents are in the ranking list
            break
    # write the ranking result to the evaluation file
    for index, document in enumerate(doc_selected):
        documentID = document.get_id()
        document_rank = index + 1
        document_score = document.get_ranking_score()
        print >> out_file, topicID + " Q0 " + documentID + " " + str(document_rank) + " " + str(document_score) + " TESTRUN"
    out_file.close()
    # call the evaluation function to evaluate the run result
    r = call_eval_for_result(basepath, topicID, 10)
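# --- Illustrative sketch only, NOT the project's select_best_documnt implementation ---
# Maximal Marginal Relevance picks the candidate that balances relevance to the
# query against redundancy with the documents already ranked:
#   MMR(d) = lam * sim(d, q) - (1 - lam) * max_{d' in selected} sim(d, d')
# Here rel_scores[d] stands for sim(d, q) and sim_dict[(a, b)] for a pairwise
# document similarity (as precomputed by similar_documents above); both names and
# data shapes are assumptions made for this sketch.
def mmr_select_sketch(candidates, selected, rel_scores, sim_dict, lam=0.7):
    best_doc, best_score = None, float("-inf")
    for d in candidates:
        if d in selected:
            continue
        # redundancy = similarity to the most similar already-selected document
        redundancy = 0.0
        for d_prev in selected:
            redundancy = max(redundancy, sim_dict.get((d, d_prev), 0.0))
        score = lam * rel_scores.get(d, 0.0) - (1.0 - lam) * redundancy
        if score > best_score:
            best_doc, best_score = d, score
    return best_doc, best_score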
def run_ranking(basepath, topics, methods):
    for topicID in topics:
        for method in methods:
            if method == "mine":
                filename1 = "xquad_mine_less"
                filename2 = "xquad_mine_all"
            else:
                filename1 = "xquad_standard_less"
                filename2 = "xquad_standard_all"
            xQuAD(basepath, topicID, filename1, 30, method)
            xQuAD(basepath, topicID, filename2, 30, method)
        r = call_eval_for_result(basepath, topicID, 10)
# -*- coding: utf8 -*-
import os
import sys
sys.path.append("..")
from result.result_analysis import DR_result_analysis
from util.evaluation import call_eval_for_result

if __name__ == "__main__":
    print "start..."
    files = os.listdir("/users/songwei/xuwenbin/diversification/ntcir09/rank/")
    for file_name in files:
        # register the "ideal" run in this topic's runlist
        out1 = open("/users/songwei/xuwenbin/diversification/ntcir09/rank/" + file_name + "/runlist", "w")
        out = open("/users/songwei/xuwenbin/diversification/ntcir09/rank/" + file_name + "/ideal", "w")
        print >> out1, "ideal"
        # copy the document lines (those starting with "0") of the ideal ranking into the run file
        with open("/users/songwei/xuwenbin/diversification/ntcir09/rank/" + file_name + "/ideal.rank") as lines:
            for line in lines:
                if line.startswith("0"):
                    print >> out, line.replace("\n", "")
        out.close()
        out1.close()
        # evaluate the ideal run for this topic
        call_eval_for_result("/users/songwei/xuwenbin/diversification/ntcir09/", file_name, 10)
    # analyse the evaluation results of the ideal runs
    DR_result_analysis("/users/songwei/xuwenbin/diversification/ntcir09/", "", "ideal")
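# --- Illustrative sketch only (hypothetical helper, not part of the project) ---
# The run files written above all share one TREC-style line format, which is
# presumably what call_eval_for_result consumes:
#   <topicID> Q0 <documentID> <rank> <score> <runTag>
# e.g. "0001 Q0 DOC-123 1 12.5 TESTRUN"   (the document ID here is a placeholder)
def format_run_line(topicID, documentID, rank, score, tag="TESTRUN"):
    return topicID + " Q0 " + documentID + " " + str(rank) + " " + str(score) + " " + tag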