Example #1
0
def xQuAD(basepath, topicID, filename, cutoff=50, method="mine"):
    """
    return a ranking list using the framework of xQuAD.
    """
    document_selected = []

    out_file = open(basepath + "rank/" + topicID + "/" + filename, "w")
    runlist_file = open(basepath + "rank/" + topicID + "/runlist", "w")
    print >> runlist_file, "xquad_mine_less"
    print >> runlist_file, "xquad_mine_all"
    print >> runlist_file, "xquad_standard_less"
    print >> runlist_file, "xquad_standard_all"
    runlist_file.close()

    documents, subtopicID_word, query_prob, document_term_frequency, average_document_len, word2id_weight, topic_words = data_preprocessing_xquad(
        basepath, topicID, method)
    subtopic_word = {}
    keys = subtopicID_word.keys()
    keys.sort()
    for key in keys:
        if filename == "xquad_mine_less" or filename == "xquad_standard_less":
            if key == keys[-1]: continue
            if key == keys[-2]: continue
#         if key == 3:continue
        if key not in subtopic_word:
            subtopic_word[key] = subtopicID_word[key]
#     print len(subtopicID_word.keys()),len(subtopic_word.keys())
    if filename == "xquad_mine_less" or filename == "xquad_standard_less":
        prob_sum = 0.0
        for key in query_prob.keys():
            if key == keys[-1]:
                query_prob[key] = 0
            elif key == keys[-2] and len(query_prob.keys()) > 3:
                query_prob[key] = 0
            else:
                prob_sum += query_prob[key]
        for key in query_prob.keys():
            #             print key,query_prob[key]
            query_prob[key] /= prob_sum

    document_subtopic_rel, rel_subtopic_ave = document_subtopic_relevance(
        subtopic_word, documents, average_document_len, topic_words,
        document_term_frequency)
    filem = open("middle/" + topicID + "div", "w")
    # iteratively select the best document to build the ranking list
    for i in range(cutoff):
        print topicID, "%2d-th best document." % (i + 1)
        best_document = select_best_document(document_subtopic_rel,
                                             rel_subtopic_ave, query_prob,
                                             subtopic_word, document_selected,
                                             filem)
        document_selected.append(best_document)
        if len(document_selected) == len(documents) - 1:
            # all documents are already in the ranking list
            break
    filem.close()

    # write the ranking result to the evaluation file
    for index, document in enumerate(document_selected):
        documentID = document.get_id()
        document_rank = index + 1
        document_score = document.get_ranking_score()
        print >> out_file, topicID + " Q0 " + documentID + " " + str(
            document_rank) + " " + str(document_score) + " TESTRUN"
    out_file.close()
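The select_best_document helper is not shown in any of these examples. For orientation, below is a minimal, self-contained sketch of the greedy xQuAD selection step it presumably implements; the lam mixing weight, the dictionary layouts, and every name in the sketch are illustrative assumptions, not the actual helper used above.

def xquad_select_sketch(candidates, rel, subtopic_rel, subtopic_prob, selected, lam=0.5):
    """Illustrative greedy xQuAD step (assumed, not the helper used above):
    score(d) = (1 - lam) * P(d|q)
               + lam * sum_i P(q_i|q) * P(d|q_i) * prod_{d' in selected} (1 - P(d'|q_i))
    """
    best_doc, best_score = None, float("-inf")
    for d in candidates:
        if d in selected:
            continue
        diversity = 0.0
        for qi, p_qi in subtopic_prob.items():
            novelty = 1.0
            for prev in selected:  # discount subtopics already covered by earlier picks
                novelty *= 1.0 - subtopic_rel[prev].get(qi, 0.0)
            diversity += p_qi * subtopic_rel[d].get(qi, 0.0) * novelty
        score = (1.0 - lam) * rel[d] + lam * diversity
        if score > best_score:
            best_doc, best_score = d, score
    return best_doc

Each call returns one document; appending it to selected before the next call is what gives the diminishing-returns behaviour the ranking loops above rely on.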
Example #2
0
def xQuAD(basepath,topicID,cutoff=50):
    """
    Return a ranking list built with the xQuAD framework.
    """
    document_selected =[]  #docs ranking list
    
    out_file = open(basepath+"rank/"+topicID+"/xquad_result","w")
    runlist_file = open(basepath+"rank/"+topicID+"/runlist","w")
    print >>runlist_file,"xquad_result"
    runlist_file.close()
    documents,subtopicID_word,query,document_term_frequency,average_document_len,word2id_weight = data_preprocessing_xquad(basepath,topicID)

    sid_doc_related_probability = similarity_computing(documents,subtopicID_word,average_document_len,query,document_term_frequency)
   
    # iteratively select the best document to build the ranking list
    for i in range(cutoff):
        print "%d-th best document." % (i + 1)
        best_document = best_document_select(documents,subtopicID_word,query,document_term_frequency,average_document_len,word2id_weight,document_selected,sid_doc_related_probability)
        document_selected.append(best_document)
        if len(document_selected) == len(documents):
            # all documents are already in the ranking list
            break
        
    # write the ranking result to the evaluation file
    for index,document in enumerate(document_selected):
        documentID = document.get_id()
        document_rank = index+1
        document_score = document.get_ranking_score()
        print >>out_file,topicID +" Q0 "+documentID+" "+str(document_rank) +" "+str(document_score)+ " TESTRUN"
    out_file.close()
        
    # call the evaluation function to evaluate the run result
    r = call_eval_for_result(basepath,topicID,10)
Example #3
0
def xQuAD(basepath,topicID,filename,cutoff=50,method="mine"):
    """
    Return a ranking list built with the xQuAD framework.
    """
    document_selected =[]  
    
    out_file = open(basepath+"rank/"+topicID+"/"+filename,"w")
    runlist_file = open(basepath+"rank/"+topicID+"/runlist","w")
    print >>runlist_file,"xquad_mine_less"
    print >>runlist_file,"xquad_mine_all"
    print >>runlist_file,"xquad_standard_less"
    print >>runlist_file,"xquad_standard_all"
    runlist_file.close()

    documents,subtopicID_word,query_prob,document_term_frequency,average_document_len,word2id_weight,topic_words = data_preprocessing_xquad(basepath,topicID,method)
    subtopic_word={}
    keys = subtopicID_word.keys()
    keys.sort()
    for key in keys:
        if filename == "xquad_mine_less" or filename =="xquad_standard_less":
            if key==keys[-1]:continue
            if key==keys[-2]:continue
#         if key == 3:continue
        if key not in subtopic_word:
            subtopic_word[key] = subtopicID_word[key]
#     print len(subtopicID_word.keys()),len(subtopic_word.keys())
    if filename == "xquad_mine_less" or filename =="xquad_standard_less": 
        prob_sum = 0.0
        for key in query_prob.keys():
            if key == keys[-1]:
                query_prob[key]=0
            elif key == keys[-2] and len(query_prob.keys())>3:
                query_prob[key]=0
            else:
                prob_sum+=query_prob[key]
        for key in query_prob.keys():
#             print key,query_prob[key]
            query_prob[key] /= prob_sum
            
    document_subtopic_rel,rel_subtopic_ave = document_subtopic_relevance(subtopic_word,documents,average_document_len,topic_words,document_term_frequency)
    filem= open("middle/"+topicID+"div","w")
    # iteratively select the best document to build the ranking list
    for i in range(cutoff):
        print topicID,"%2d-th best document."%(i+1)
        best_document =  select_best_document(document_subtopic_rel,rel_subtopic_ave,query_prob,subtopic_word,document_selected,filem)
        document_selected.append(best_document)
        if len(document_selected) == len(documents)-1:
            # all documents are already in the ranking list
            break
    filem.close()

    # write the ranking result to the evaluation file
    for index,document in enumerate(document_selected):
        documentID = document.get_id()
        document_rank = index+1
        document_score = document.get_ranking_score()
        print >>out_file,topicID +" Q0 "+documentID+" "+str(document_rank) +" "+str(document_score)+ " TESTRUN"
    out_file.close()
Example #4
0
def xQuAD(basepath, topicID, cutoff=50):
    """
    return a ranking list using the framework of xQuAD.
    """
    document_selected = []  #docs ranking list

    out_file = open(basepath + "rank/" + topicID + "/xquad_result", "w")
    runlist_file = open(basepath + "rank/" + topicID + "/runlist", "w")
    print >> runlist_file, "xquad_result"
    runlist_file.close()
    documents, subtopicID_word, query, document_term_frequency, average_document_len, word2id_weight = data_preprocessing_xquad(
        basepath, topicID)

    sid_doc_related_probability = similarity_computing(
        documents, subtopicID_word, average_document_len, query,
        document_term_frequency)

    # iteratively select the best document to build the ranking list
    for i in range(cutoff):
        print "%d-th best document." % (i + 1)
        best_document = best_document_select(
            documents, subtopicID_word, query, document_term_frequency,
            average_document_len, word2id_weight, document_selected,
            sid_doc_related_probability)
        document_selected.append(best_document)
        if len(document_selected) == len(documents):
            # all documents are already in the ranking list
            break

    # write the ranking result to the evaluation file
    for index, document in enumerate(document_selected):
        documentID = document.get_id()
        document_rank = index + 1
        document_score = document.get_ranking_score()
        print >> out_file, topicID + " Q0 " + documentID + " " + str(
            document_rank) + " " + str(document_score) + " TESTRUN"
    out_file.close()

    # call the evaluation function to evaluate the run result
    r = call_eval_for_result(basepath, topicID, 10)
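For reference, every line these examples write to out_file follows the standard six-column TREC run format: topic ID, the literal string Q0, document ID, rank, score, and a run tag. A run-file line would look like the following (the topic ID, document ID, and score here are hypothetical placeholders for illustration):

0001 Q0 DOC-0042 1 2.7316 TESTRUN

Whatever sits behind call_eval_for_result presumably parses this format when scoring the run.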