Пример #1
0
def get_tf_idf_documents_ntc(documents_ids, query_words, centroid=None):
    """
    Compute tf-idf weights of the query words for every given document.

    A query word that occurs in a document is weighted by its raw term
    frequency multiplied by ``log10(N / df)``, where ``N`` is
    ``len(wl.inverted_index)`` and ``df`` is the value returned by
    ``get_df_word``. No length normalisation is applied in this function.

    :param documents_ids: ids of the documents to weight; each id is converted with ``int``
    :param query_words: the words in the given query
    :param centroid: optional sequence indexed by the position of a word in
        ``c_tools.get_inverted_index_keys()``; when it is non-empty, a query
        word that is missing from a document takes its weight from here.
        ``None`` (the default) behaves like an empty sequence.
    :return: a list with one ``[doc_id, {word: weight}]`` entry per document, in input order
    """
    if centroid is None:
        centroid = []
    # len() rather than truthiness, so array-like centroids work as well.
    has_centroid = len(centroid) != 0

    # NOTE(review): N is the number of entries in wl.inverted_index; confirm
    # that this is the collection size intended for the idf formula.
    length_inverted_index = len(wl.inverted_index)

    # Neither the idf nor the centroid position of a word depends on the
    # document, so each is computed at most once per word for the whole call.
    idf_by_word = {}
    position_by_word = {}

    document_term_tf_idf = []
    for doc_id in documents_ids:
        doc_id = int(doc_id)
        doc_dic = {}
        for word in query_words:
            # Looked up per word so an empty query never touches the table.
            term_frequencies = wl.documents_terms_frequency[doc_id]
            if word in term_frequencies:
                tf = term_frequencies.get(word)
                if word not in idf_by_word:
                    idf_by_word[word] = math.log(
                        length_inverted_index / get_df_word(word), 10)
                doc_dic[word] = tf * idf_by_word[word]
            elif has_centroid:
                if word not in position_by_word:
                    position_by_word[word] = (
                        c_tools.get_inverted_index_keys().index(word))
                doc_dic[word] = centroid[position_by_word[word]]

        document_term_tf_idf.append([doc_id, doc_dic])
    return document_term_tf_idf
Пример #2
0
def calculate_cosine_similarity_cluster_query(query_dict, centroid):
    """
    Score a query against a cluster centroid.

    The numerator accumulates, for every term of the query, the query weight
    times the centroid component found at the term's position in
    ``c_tools.get_inverted_index_keys()``. The denominator is the stored
    centroid length times the length of the query weights as computed by
    ``rank_tools.calculate_vector_length``.

    :param query_dict: mapping from query term to its weight
    :param centroid: mapping with a ``'centroid'`` entry (the component
        sequence) and a ``'length'`` entry (its precomputed length)
    :return: numerator divided by denominator, or ``0`` when the denominator is zero
    """
    query_norm = rank_tools.calculate_vector_length(list(query_dict.values()))

    dot_product = 0
    for term, weight in query_dict.items():
        position = c_tools.get_inverted_index_keys().index(term)
        dot_product = dot_product + weight * centroid['centroid'][position]

    norm_product = centroid['length'] * query_norm
    # A zero length on either side would divide by zero; score it as 0.
    return 0 if norm_product == 0 else dot_product / norm_product
Пример #3
0
def prepare_frequency_vector(term_freq_dictionary, type_cal='None'):
    """
    Spread a term -> frequency mapping over a dictionary-sized vector.

    Each term is written at its position in
    ``c_tools.get_inverted_index_keys()``; every other position stays 0.

    :param term_freq_dictionary: mapping from term to its frequency
    :param type_cal: ``'None'`` stores the frequency unchanged; ``'Log'``
        stores ``1 + log10(frequency)``, or 0 for a zero frequency. Any other
        value leaves the vector untouched (the position lookup still runs).
    :return: a list of length ``config.dictionary_size``
    """
    weights = [0] * config.dictionary_size
    for term, frequency in term_freq_dictionary.items():
        slot = c_tools.get_inverted_index_keys().index(term)
        if type_cal == 'Log':
            # log10(0) is undefined, so a zero frequency keeps a zero weight.
            weights[slot] = 0 if frequency == 0 else 1 + math.log(frequency, 10)
        elif type_cal == 'None':
            weights[slot] = frequency
    return weights
Пример #4
0
def prepare_one_document_vector(words):
    """
    Build the log-scaled term-frequency vector of one document.

    Every word of the document is written at its position in
    ``c_tools.get_inverted_index_keys()`` as ``1 + log10(count)``. Positions
    of words the document does not contain, and of words whose count is 0,
    stay 0.

    :param words: mapping from each word of the document to its count
    :return: a list whose length is equal to ``config.dictionary_size``
    """
    vector = [0] * config.dictionary_size
    # The key list does not change while the vector is filled; fetch it once.
    dictionary_keys = c_tools.get_inverted_index_keys()
    for key in words:
        index = dictionary_keys.index(key)
        count = words[key]
        # log10(0) is undefined; treat a zero count like an absent word
        # instead of failing with a math domain error.
        vector[index] = 0 if count == 0 else 1 + math.log(count, 10)

    return vector
Пример #5
0
def get_tf_idf_documents_ltn(documents_ids, query_words, centroid=None):
    """
    Compute log-tf-idf weights of the query words for every given document.

    A query word that occurs in a document is weighted by
    ``(1 + log10(tf)) * log10(N / df)``, where ``tf`` is the word's entry in
    ``wl.documents_terms_frequency[doc_id]``, ``N`` is
    ``len(wl.inverted_index)`` and ``df`` is the value returned by
    ``get_df_word``. No length normalisation is applied in this function.

    :param documents_ids: ids of the documents to weight; each id is converted with ``int``
    :param query_words: the words in the given query
    :param centroid: optional sequence indexed by the position of a word in
        ``c_tools.get_inverted_index_keys()``; when it is non-empty, a query
        word that is missing from a document takes its weight from here.
        ``None`` (the default) behaves like an empty sequence.
    :return: a list with one ``[doc_id, {word: weight}]`` entry per document, in input order
    """
    if centroid is None:
        centroid = []
    # len() rather than truthiness, so array-like centroids work as well.
    has_centroid = len(centroid) != 0

    # NOTE(review): N is the number of entries in wl.inverted_index; confirm
    # that this is the collection size intended for the idf formula.
    length_inverted_index = len(wl.inverted_index)

    # Neither the idf nor the centroid position of a word depends on the
    # document, so each is computed at most once per word for the whole call.
    idf_by_word = {}
    position_by_word = {}

    document_term_tf_idf = []
    for doc_id in documents_ids:
        doc_id = int(doc_id)
        doc_dic = {}
        for word in query_words:
            # Looked up per word so an empty query never touches the table.
            term_frequencies = wl.documents_terms_frequency[doc_id]
            if word in term_frequencies:
                tf = 1 + math.log(term_frequencies.get(word), 10)
                if word not in idf_by_word:
                    idf_by_word[word] = math.log(
                        length_inverted_index / get_df_word(word), 10)
                doc_dic[word] = tf * idf_by_word[word]
            elif has_centroid:
                if word not in position_by_word:
                    position_by_word[word] = (
                        c_tools.get_inverted_index_keys().index(word))
                doc_dic[word] = centroid[position_by_word[word]]

        document_term_tf_idf.append([doc_id, doc_dic])
    return document_term_tf_idf