import math

# wl, c_tools, config, rank_tools and get_df_word are project-level modules /
# helpers assumed to be imported elsewhere in this module.


def get_tf_idf_documents_ntc(documents_ids, query_words, centroid=[]):
    """
    Calculate tf-idf weights for documents using the ntc scheme
    (raw term frequency * idf).
    :param centroid: optional centroid vector used as a fallback weight for
        query terms the document does not contain
    :param documents_ids: union of the documents that contain any word of the given query
    :param query_words: the words in the given query
    :return: a list of [doc_id, {term: tf-idf}] pairs
    """
    document_term_tf_idf = []
    length_inverted_index = len(wl.inverted_index)
    for doc_id in documents_ids:
        doc_id = int(doc_id)
        doc_dic = {}
        for word in query_words:
            if word in wl.documents_terms_frequency[doc_id]:
                tf = wl.documents_terms_frequency[doc_id].get(word)
                df = get_df_word(word)
                idf = math.log(length_inverted_index / df, 10)
                doc_dic[word] = tf * idf
            elif len(centroid) != 0:
                doc_dic[word] = centroid[c_tools.get_inverted_index_keys().index(word)]
        document_term_tf_idf.append([doc_id, doc_dic])
    return document_term_tf_idf
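
# A minimal, self-contained sketch of the ntc-style weight computed above:
# raw term frequency times log10(N / df). The numbers below are hypothetical
# and only illustrate the arithmetic; the real function reads tf from
# wl.documents_terms_frequency, df from get_df_word and N from the inverted index.
def _example_ntc_weight():
    tf = 3                              # hypothetical raw term frequency in one document
    df = 50                             # hypothetical document frequency of the term
    n = 1000                            # hypothetical size of the inverted index
    return tf * math.log(n / df, 10)    # 3 * log10(20) ~= 3.903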

def calculate_cosine_similarity_cluster_query(query_dict, centroid):
    """
    Calculate the cosine similarity between a query and a cluster centroid.
    :param query_dict: dictionary of {term: weight} for the query
    :param centroid: dictionary with a dense 'centroid' vector and its precomputed 'length'
    :return: cosine similarity, or 0 if either vector has zero length
    """
    numerator = 0
    query_vector_length = rank_tools.calculate_vector_length(list(query_dict.values()))
    for k, v in query_dict.items():
        index = c_tools.get_inverted_index_keys().index(k)
        numerator += v * centroid['centroid'][index]
    denominator = centroid['length'] * query_vector_length
    if denominator == 0:
        return 0
    return numerator / denominator
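
# A self-contained sketch of the cosine similarity computed above, using a
# hypothetical three-term vocabulary in place of c_tools.get_inverted_index_keys().
# The centroid dict mirrors the expected shape (a dense 'centroid' vector plus a
# precomputed 'length'), and query_length recomputes what
# rank_tools.calculate_vector_length is presumably expected to return (the Euclidean norm).
def _example_cluster_query_cosine():
    vocabulary = ['apple', 'banana', 'cherry']                  # hypothetical dictionary
    query_dict = {'apple': 1.0, 'cherry': 2.0}                  # hypothetical query weights
    centroid = {'centroid': [0.5, 0.1, 0.25], 'length': 0.57}   # hypothetical cluster
    numerator = sum(v * centroid['centroid'][vocabulary.index(k)]
                    for k, v in query_dict.items())
    query_length = math.sqrt(sum(v ** 2 for v in query_dict.values()))
    denominator = centroid['length'] * query_length
    return numerator / denominator if denominator != 0 else 0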

def prepare_frequency_vector(term_freq_dictionary, type_cal='None'):
    """
    Build a dense term-frequency vector of dictionary size from a {term: frequency} dictionary.
    :param term_freq_dictionary: dictionary of {term: raw frequency}
    :param type_cal: 'None' keeps raw frequencies, 'Log' applies 1 + log10(frequency)
    :return: a vector whose length is equal to the dictionary size
    """
    vector = [0] * config.dictionary_size
    for k, v in term_freq_dictionary.items():
        index = c_tools.get_inverted_index_keys().index(k)
        if type_cal == 'None':
            vector[index] = v
        elif type_cal == 'Log':
            if v == 0:
                vector[index] = 0
            else:
                vector[index] = 1 + math.log(v, 10)
    return vector
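
# A self-contained sketch of the two weighting modes above ('None' keeps raw
# counts, 'Log' applies 1 + log10(tf)), using a hypothetical four-term
# dictionary in place of config.dictionary_size / c_tools.get_inverted_index_keys().
def _example_frequency_vector():
    vocabulary = ['apple', 'banana', 'cherry', 'date']   # hypothetical dictionary
    term_freqs = {'banana': 4, 'date': 1}                # hypothetical document counts
    raw = [0] * len(vocabulary)
    logged = [0] * len(vocabulary)
    for term, freq in term_freqs.items():
        index = vocabulary.index(term)
        raw[index] = freq                                            # type_cal == 'None'
        logged[index] = 1 + math.log(freq, 10) if freq > 0 else 0    # type_cal == 'Log'
    return raw, logged    # ([0, 4, 0, 1], [0, 1.602..., 0, 1.0])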

def prepare_one_document_vector(words):
    """
    Create a vector in which each element is either 0 (the document does not
    contain the term at that index of the dictionary) or the log-scaled
    frequency 1 + log10(tf) (the document contains that term).
    :param words: dictionary of {term: raw frequency} for the terms the document contains
    :return: a vector whose length is equal to the dictionary size
    """
    vector = [0] * config.dictionary_size
    # vector = np.zeros(config.dictionary_size)
    for key in words:
        index = c_tools.get_inverted_index_keys().index(key)
        tf = 1 + math.log(words[key], 10)
        vector[index] = tf
    return vector
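
# The 1 + log10(tf) transform used above dampens repeated terms: a term that
# occurs 100 times is weighted 3x a term that occurs once, not 100x. The counts
# below are hypothetical and only illustrate the scale.
def _example_log_tf_dampening():
    return {tf: 1 + math.log(tf, 10) for tf in (1, 10, 100)}   # {1: 1.0, 10: 2.0, 100: 3.0}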

def get_tf_idf_documents_ltn(documents_ids, query_words, centroid=[]):
    """
    Calculate tf-idf weights for documents using the ltn scheme
    ((1 + log10(tf)) * idf).
    :param centroid: optional centroid vector used as a fallback weight for
        query terms the document does not contain
    :param documents_ids: union of the documents that contain any word of the given query
    :param query_words: the words in the given query
    :return: a list of [doc_id, {term: tf-idf}] pairs
    """
    document_term_tf_idf = []
    length_inverted_index = len(wl.inverted_index)
    for doc_id in documents_ids:
        doc_id = int(doc_id)
        doc_dic = {}
        for word in query_words:
            if word in wl.documents_terms_frequency[doc_id]:
                tf = 1 + math.log(wl.documents_terms_frequency[doc_id].get(word), 10)
                df = get_df_word(word)
                idf = math.log(length_inverted_index / df, 10)
                doc_dic[word] = tf * idf
            elif len(centroid) != 0:
                doc_dic[word] = centroid[c_tools.get_inverted_index_keys().index(word)]
        document_term_tf_idf.append([doc_id, doc_dic])
    return document_term_tf_idf
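
# A sketch contrasting the ltn weight above with the ntc weight from
# get_tf_idf_documents_ntc on the same hypothetical numbers: ltn applies the
# 1 + log10(tf) dampening before multiplying by the shared idf term.
def _example_ltn_vs_ntc_weight():
    tf, df, n = 3, 50, 1000                 # hypothetical tf, df and index size
    idf = math.log(n / df, 10)              # log10(20) ~= 1.301
    ltn = (1 + math.log(tf, 10)) * idf      # ~= 1.477 * 1.301 ~= 1.922
    ntc = tf * idf                          # ~= 3.903
    return ltn, ntc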