Example #1
0
def xQuAD(basepath,topicID,cutoff= 50):
    """
    return a ranking list using the framework of xQuAD.
    """
    document_selected =[]  #docs ranking list
    
    out_file = open(basepath+"rank/"+topicID+"/xquad_result","w")
    runlist_file = open(basepath+"rank/"+topicID+"/runlist","w")
    print >>runlist_file,"xquad_result"
    runlist_file.close()
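    # data_preprocessing_xquad presumably loads, for this topic, the candidate documents,
    # the subtopic terms, the topic query, per-term document frequencies, the average
    # document length, and a term -> (id, weight) lookup table.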
    documents,subtopicID_word,query,document_term_frequency,average_document_len,word2id_weight = data_preprocessing_xquad(basepath,topicID)

    sid_doc_related_probility = similarity_computing(documents,subtopicID_word,average_document_len,query,document_term_frequency)
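    # sid_doc_related_probility presumably holds the per-subtopic relevance estimates
    # P(d|q_i) used by xQuAD's greedy objective: at each step pick the document d maximising
    #   (1 - lambda) * P(d|q) + lambda * sum_i P(q_i|q) * P(d|q_i) * prod_{d' in selected} (1 - P(d'|q_i)).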
   
    # iteratively select the best document to build the ranking list
    for i in range(cutoff):
        print "%s-th best document."%i
        best_document = best_document_select(documents,subtopicID_word,query,document_term_frequency,average_document_len,word2id_weight,document_selected,sid_doc_related_probility)
        document_selected.append(best_document)
        if len(document_selected) == len(documents):
            # all documents are already in the ranking list
            break
        
    # print the ranking result to the evaluation files
    for index,document in enumerate(document_selected):
        documentID = document.get_id()
        document_rank = index+1
        document_score = document.get_ranking_score()
        print >>out_file,topicID +" Q0 "+documentID+" "+str(document_rank) +" "+str(document_score)+ " TESTRUN"
    out_file.close()
        
    # call the evaluation function to evaluate the run result
    r = call_eval_for_result(basepath,topicID,10)
Example #2
0
def run_ranking(basepath,topics,sparse_parameters):    
    
    for topicID in topics:
        if topicID == "10001":continue
        runlist = open(basepath+"rank/"+topicID+"/runlist","w")
        sparse_base_ranking(basepath,topicID,sparse_parameters,runlist)
        runlist.close()
        r = call_eval_for_result(basepath,topicID,10) 
Example #3
0
def run_ranking(basepath, topics, sparse_parameters):

    for topicID in topics:
        if topicID == "10001": continue
        runlist = open(basepath + "rank/" + topicID + "/runlist", "w")
        sparse_base_ranking(basepath, topicID, sparse_parameters, runlist)
        runlist.close()
        r = call_eval_for_result(basepath, topicID, 10)
Example #4
0
def MMR_ranking(basepath, topicID, cutoff):
    ""
    docs, subtopicID_word, topic_words, document_term_frequency, average_document_len, word2id_weight = data_preprocessing_mmr(
        basepath, topicID)
    query = topic_words
    first_score = -1000.0
    first_doc_id = -1
    document_collection = []
    doc_selected = []
    # prepare the output files for evaluation
    out_file = open(basepath + "rank/" + topicID + "/mmr_result", "w")
    runlist_file = open(basepath + "rank/" + topicID + "/runlist", "w")
    print >> runlist_file, "mmr_result"
    runlist_file.close()

    #select the first document by relevance
    for dindex, doc in enumerate(docs):
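        # For each candidate document: score its relevance to the topic query (similarity()
        # looks like a BM25-style function, given its term-frequency and average-document-length
        # arguments), and build an L1-normalised term-weight vector for later pairwise comparison.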
        sim = similarity(query, doc.get_term_vec(), document_term_frequency,
                         average_document_len, len(docs))
        sum_weight = 0.0
        vec = [0.0 for i in range(len(word2id_weight))]
        for word in doc.get_term_vec():
            wid = word2id_weight[word][0]
            vec[wid] = word2id_weight[word][1] * 1.0
            sum_weight += word2id_weight[word][1] * 1.0
        if sum_weight > 0:
            doc.set_tfidf_vec([vec[i] / sum_weight for i in range(len(vec))])
        else:
            doc.set_tfidf_vec(vec)
        #doc.set_ranking_score(0.0)
        if first_score < sim:
            first_doc_id = dindex
            first_score = sim
    docs[first_doc_id].set_ranking_score(first_score)
    doc_selected.append(docs[first_doc_id])
    sim_dict = similar_documents(docs)
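    # MMR criterion: select_best_documnt presumably picks, at each step, the document
    # maximising lambda * sim(q, d) - (1 - lambda) * max_{d' in selected} sim(d, d'),
    # balancing relevance against redundancy; sim_dict caches the pairwise document similarities.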

    # iteratively select the best document to build the ranking list
    for i in range(cutoff):
        print "%2d-th document selecting for ranking" % (i + 1)
        doc_selected.append(select_best_documnt(docs, doc_selected, sim_dict))
        if len(doc_selected) == len(docs):
            # all documents are already in the ranking list
            break

    # print the ranking result to the evaluation files
    for index, document in enumerate(doc_selected):

        documentID = document.get_id()
        document_rank = index + 1
        document_score = document.get_ranking_score()
        print >> out_file, topicID + " Q0 " + documentID + " " + str(
            document_rank) + " " + str(document_score) + " TESTRUN"
    out_file.close()

    # call the evaluation function to evaluate the run result
    r = call_eval_for_result(basepath, topicID, 10)
Example #5
0
def MMR_ranking(basepath,topicID,cutoff):
    ""
    docs,subtopicID_word,topic_words,document_term_frequency,average_document_len,word2id_weight= data_preprocessing_mmr(basepath,topicID)
    query = topic_words
    first_score = -1000.0
    first_doc_id = -1
    document_collection = []
    doc_selected = []
    # prepare the output files for evaluation
    out_file = open(basepath+"rank/"+topicID+"/mmr_result","w")
    runlist_file = open(basepath+"rank/"+topicID+"/runlist","w")
    print >>runlist_file,"mmr_result"
    runlist_file.close()
    
    
    #select the first document by relevance
    for dindex,doc in enumerate(docs):
        sim = similarity(query,doc.get_term_vec(),document_term_frequency,average_document_len,len(docs))
        sum_weight = 0.0
        vec = [0.0 for i in range(len(word2id_weight))]
        for word in doc.get_term_vec():
            wid = word2id_weight[word][0]
            vec[wid] = word2id_weight[word][1]*1.0
            sum_weight += word2id_weight[word][1]*1.0
        if sum_weight>0:
            doc.set_tfidf_vec([vec[i]/sum_weight for i in range(len(vec))])
        else:
            doc.set_tfidf_vec(vec)
        #doc.set_ranking_score(0.0)
        if first_score< sim:
            first_doc_id = dindex
            first_score = sim
    docs[first_doc_id].set_ranking_score(first_score)
    doc_selected.append(docs[first_doc_id])
    sim_dict = similar_documents(docs)
    
    # iteratively select the best document to build the ranking list
    for i in range(cutoff):
        print "%2d-th document selecting for ranking"%(i+1)
        doc_selected.append(select_best_documnt(docs,doc_selected,sim_dict))
        if len(doc_selected) == len(docs):
            # all documents are already in the ranking list
            break
        
    # print the ranking result to the evaluation files
    for index,document in enumerate(doc_selected):
       
        documentID = document.get_id()
        document_rank = index+1
        document_score = document.get_ranking_score()
        print >>out_file,topicID +" Q0 "+documentID+" "+str(document_rank) +" "+str(document_score)+ " TESTRUN"
    out_file.close()
    
    # call the evaluation function to evaluate the run result
    r = call_eval_for_result(basepath,topicID,10)    
Example #6
0
def run_ranking(basepath, topics, methods):

    for topicID in topics:
        for method in methods:
            if method == "mine":
                filename1 = "xquad_mine_less"
                filename2 = "xquad_mine_all"
            else:
                filename1 = "xquad_standard_less"
                filename2 = "xquad_standard_all"

            xQuAD(basepath, topicID, filename1, 30, method)
            xQuAD(basepath, topicID, filename2, 30, method)
        r = call_eval_for_result(basepath, topicID, 10)
Example #7
0
def run_ranking(basepath,topics,methods):    
    
    for topicID in topics:
        for method in methods:
            if method =="mine":
                filename1= "xquad_mine_less"
                filename2 = "xquad_mine_all"
            else:
                filename1= "xquad_standard_less"
                filename2 = "xquad_standard_all"
            
            xQuAD(basepath,topicID,filename1,30,method)
            xQuAD(basepath,topicID,filename2,30,method)
        r = call_eval_for_result(basepath,topicID,10)     
Example #8
0
def xQuAD(basepath, topicID, cutoff=50):
    """
    return a ranking list using the framework of xQuAD.
    """
    document_selected = []  #docs ranking list

    out_file = open(basepath + "rank/" + topicID + "/xquad_result", "w")
    runlist_file = open(basepath + "rank/" + topicID + "/runlist", "w")
    print >> runlist_file, "xquad_result"
    runlist_file.close()
    documents, subtopicID_word, query, document_term_frequency, average_document_len, word2id_weight = data_preprocessing_xquad(
        basepath, topicID)

    sid_doc_related_probility = similarity_computing(documents,
                                                     subtopicID_word,
                                                     average_document_len,
                                                     query,
                                                     document_term_frequency)

    # iteratively select the best document to build the ranking list
    for i in range(cutoff):
        print "%s-th best document." % i
        best_document = best_document_select(documents, subtopicID_word, query,
                                             document_term_frequency,
                                             average_document_len,
                                             word2id_weight, document_selected,
                                             sid_doc_related_probility)
        document_selected.append(best_document)
        if len(document_selected) == len(documents):
            # all documents are already in the ranking list
            break

    # print the ranking result to the evaluation files
    for index, document in enumerate(document_selected):
        documentID = document.get_id()
        document_rank = index + 1
        document_score = document.get_ranking_score()
        print >> out_file, topicID + " Q0 " + documentID + " " + str(
            document_rank) + " " + str(document_score) + " TESTRUN"
    out_file.close()

    # call the evaluation function to evaluate the run result
    r = call_eval_for_result(basepath, topicID, 10)
# -*- coding: utf-8 -*-
import os
import sys
sys.path.append("..")
from result.result_analysis import DR_result_analysis
from util.evaluation import call_eval_for_result





if __name__=="__main__":
    print "start..."
    files = os.listdir("/users/songwei/xuwenbin/diversification/ntcir09/rank/")
    for file_name in files:
        out1 = open("/users/songwei/xuwenbin/diversification/ntcir09/rank/"+file_name+"/runlist","w")
        out = open("/users/songwei/xuwenbin/diversification/ntcir09/rank/"+file_name+"/ideal","w")
        print >>out1,"ideal"
        with open("/users/songwei/xuwenbin/diversification/ntcir09/rank/"+file_name+"/ideal.rank") as lines:
            for line in lines:
                if line.startswith("0"):
                    print>>out,line.replace("\n","")
        out.close()
        out1.close()
        call_eval_for_result("/users/songwei/xuwenbin/diversification/ntcir09/",file_name,10) 
    DR_result_analysis("/users/songwei/xuwenbin/diversification/ntcir09/","","ideal")