Example #1
def _extract_document_summations(cls):
    print("\nExtracting document tf-idf summations for use in Vector Space Cosine...")
    if ENV.PROGRESS_BAR:
        util.update_progress(0)
    # for every term in our posting list
    for idx, term in enumerate(cls.posting_list):
        if ENV.PROGRESS_BAR:
            util.update_progress(float(idx) / float(len(cls.posting_list)))
        docs = cls.posting_list[term]
        # run through the documents for each term and add the additional tf-idf to an accumulation in the dict
        for doc in docs:
            tfidf_addition = qp.calculate_tf_idf(doc[1], cls.get_df_by_term_id(term), len(cls.doc_list))
            tfidf_addition_squared = np.square(tfidf_addition)
            if 'tf_idf_sum' in cls.doc_list[doc[0]]:
                cls.doc_list[doc[0]]['tf_idf_sum'] += tfidf_addition_squared
            else:
                cls.doc_list[doc[0]]['tf_idf_sum'] = tfidf_addition_squared
    if ENV.PROGRESS_BAR:
        util.update_progress(1)

    print("\nExtracting document weight summations for use in Vector Space Cosine...")
    if ENV.PROGRESS_BAR:
        util.update_progress(0)
    # Again, we run through each term in our posting list
    for idx, term in enumerate(cls.posting_list):
        if ENV.PROGRESS_BAR:
            util.update_progress(float(idx) / float(len(cls.posting_list)))
        docs = cls.posting_list[term]
        # each doc within each term has the VS weight calculated for the terms to find a summation
        for doc in docs:
            weight_addition = float(qp.calculate_tf_idf(doc[1], cls.get_df_by_term_id(term), len(cls.doc_list))) / float(cls.doc_list[doc[0]]['tf_idf_sum'])
            weight_addition_squared = np.square(weight_addition)
            if 'sum_weight' in cls.doc_list[doc[0]]:
                cls.doc_list[doc[0]]['sum_weight'] += weight_addition_squared
            else:
                cls.doc_list[doc[0]]['sum_weight'] = weight_addition_squared
    if ENV.PROGRESS_BAR:
        util.update_progress(1)
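Both passes above lean on qp.calculate_tf_idf, which is not shown in these snippets. A minimal sketch of what such a helper could look like, assuming a plain tf times log-scaled idf weighting (the project's actual formula may differ):

import numpy as np

def calculate_tf_idf(tf, df, collection_size):
    # Assumed weighting: raw term frequency scaled by log10(N / df).
    # A term that appears in no document contributes nothing.
    if df == 0:
        return 0.0
    return float(tf) * np.log10(float(collection_size) / float(df))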
Example #2
def extract_vector_space_cosine_scores(query, index):
    # get terms and term frequencies from the query in format {term: tf}
    if ENV.QUERY_PROCESSING_INDEX == 'PHRASE':
        q_term_info_dict = query.extractValidPhrases(ENV.STOP_TERMS)
    else:
        q_term_info_dict = query.extractTermInformation()
    q_tid_info_dict = {}
    for term_name in q_term_info_dict:
        tid = index.get_term_id_by_term(term_name)
        if tid is not None:
            q_tid_info_dict[tid] = q_term_info_dict[term_name]
    
    # retrieve posting list entries for all term ids in format:
    # { termId: [[doc1, tf], [doc2, tf]]}
    relevant_posting_entries = index.get_posting_entries_by_terms(q_term_info_dict.keys())

    # calculate aggregate query term weight summation (for use in query weight function)
    query_total_summation = 0.0
    for term_id in relevant_posting_entries:
        query_total_summation += np.square(qp.calculate_tf_idf(q_tid_info_dict[term_id], index.get_df_by_term_id(term_id), index.get_collection_size()))
    
    document_weights = {}
    for term_id in relevant_posting_entries:
        term_df = index.get_df_by_term_id(term_id)
        query_term_weight = calculate_term_weight(q_tid_info_dict[term_id], term_df, index.get_collection_size(), query_total_summation)
        # a term with zero query weight contributes nothing to any cosine score, so skip it
        if query_term_weight == 0:
            continue
        for doc in relevant_posting_entries[term_id]:
            doc_id = doc[0]
            doc_tf = doc[1]
            document_term_weight = calculate_term_weight(doc_tf, term_df, index.get_collection_size(), index.get_document_weight_summation(doc_id))
            added_weight = [query_term_weight, document_term_weight]
            if doc_id in document_weights:
                document_weights[doc_id].append(added_weight)
            else:
                document_weights[doc_id] = [added_weight]
    final_scores = []
    # for each document, we sum the product of all the weights
    for doc in document_weights:
        final_scores.append([doc, calculate_vector_space_cosine(document_weights[doc], index.get_document_weight_summation2(doc))])
    final_scores.sort(key=operator.itemgetter(1), reverse=True)
    return final_scores
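calculate_vector_space_cosine is called above but defined elsewhere (the function as listed also assumes operator, numpy as np, qp, and ENV are imported in the module). A minimal sketch, assuming each entry in the weight list is a [query_weight, document_weight] pair and the second argument is the document's sum of squared weights (Example #1's 'sum_weight'):

import math

def calculate_vector_space_cosine(weight_pairs, document_weight_summation):
    # Dot product of the query and document weight vectors over shared terms.
    dot_product = sum(q_w * d_w for q_w, d_w in weight_pairs)
    if document_weight_summation == 0:
        return 0.0
    # Normalise by the document vector length; the query length is the same
    # for every document and so does not change the ranking.
    return dot_product / math.sqrt(document_weight_summation)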
Example #3
def calculate_term_weight(tf, df, collection_size, document_tf_idf_summation):
    return qp.calculate_tf_idf(tf, df, collection_size) / document_tf_idf_summation
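As a quick sanity check on the formula, here is a self-contained version of the same computation with illustrative numbers (the log10-based tf-idf below is an assumption, matching the sketch after Example #1):

import math

def tf_idf(tf, df, collection_size):
    # assumed tf * log10(N / df) weighting
    return float(tf) * math.log10(float(collection_size) / float(df)) if df else 0.0

# e.g. term frequency 3, document frequency 20, 1000 documents,
# and a document whose accumulated tf-idf summation is 42.5
print(tf_idf(3, 20, 1000) / 42.5)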