def calculate_tf_dic(dic_doc, dic_len, average_length, k, b):
    """Compute the BM25 term-frequency component for every document.

    For each document the weight of a term with raw frequency f is
        f / (f + k * (1 - b + b * doc_len / average_length))
    i.e. the saturation/length-normalization part of BM25.

    Args:
        dic_doc: dict mapping document id -> token list.
        dic_len: dict mapping document id -> document length.
        average_length: average document length over the collection.
        k, b: BM25 free parameters.

    Returns:
        dict mapping document id -> {term: bm25-tf weight}.
    """
    weights_by_doc = {}
    for doc_id, tokens in dic_doc.items():
        # The length-normalization denominator term is constant per document,
        # so compute it once outside the per-term comprehension.
        norm = k * (1 - b + (b * dic_len[doc_id] / average_length))
        term_freqs = pre_processer.calculate_freq(tokens)
        weights_by_doc[doc_id] = {
            term: freq / (freq + norm) for term, freq in term_freqs.items()
        }
    return weights_by_doc
def calculate_tf_dic(dic_doc):
    """Compute max-normalized term frequencies per document.

    Each term's raw count is divided by the count of the most frequent
    term in that same document, so weights fall in (0, 1].

    Args:
        dic_doc: dict mapping document id -> token list.

    Returns:
        dict mapping document id -> {term: normalized tf}.
    """
    tf_term_doc = {}
    for doc_id, tokens in dic_doc.items():
        term_counts = pre_processer.calculate_freq(tokens)
        peak = max(term_counts.values())
        tf_term_doc[doc_id] = {
            term: count / peak for term, count in term_counts.items()
        }
    return tf_term_doc
def calculate_tf_dic_sent(list_sent, sent_len, sent_average_length, k, b):
    """Compute the BM25 term-frequency component for every sentence.

    Mirrors the document-level BM25 tf computation, but keys the result
    by the space-joined sentence text rather than by a document id.

    Args:
        list_sent: list of sentences, each a list of tokens.
        sent_len: dict mapping joined sentence text -> sentence length.
        sent_average_length: average sentence length.
        k, b: BM25 free parameters.

    Returns:
        dict mapping joined sentence text -> {term: bm25-tf weight}.
    """
    bm_tf_dic = {}
    for sentence in list_sent:
        key = ' '.join(sentence)
        # Length normalization is constant per sentence; compute it once.
        norm = k * (1 - b + (b * sent_len[key] / sent_average_length))
        term_freqs = pre_processer.calculate_freq(sentence)
        bm_tf_dic[key] = {
            term: freq / (freq + norm) for term, freq in term_freqs.items()
        }
    return bm_tf_dic
def obtain_query_weights(queries, idf_dic):
    """Build a tf-idf weight vector for each query.

    tf is max-normalized within the query; a term that never occurred in
    the indexed collection has no idf entry and gets weight 0.

    Args:
        queries: iterable of tokenized queries (token lists).
        idf_dic: dict mapping term -> idf value.

    Returns:
        list of per-query weight vectors (one float per query term).
    """
    query_weights = []
    for query in queries:
        query_vector = []
        freq_query = pre_processer.calculate_freq(query)
        maximum = max(freq_query.values())
        for term, freq in freq_query.items():
            try:
                query_vector.append((freq / maximum) * idf_dic[term])
            # Only a missing idf entry is expected here; a bare `except:`
            # previously swallowed every error (even KeyboardInterrupt)
            # and could hide real bugs.
            except KeyError:
                query_vector.append(0)
        query_weights.append(query_vector)
    return query_weights
def calculate_query_weights(queries, doc_len, average_length_query, bm_idf_dic, k, b):
    """Build a BM25 weight vector for each query.

    Each query term gets bm25-tf (with the query's own length used for
    normalization) times its idf. Terms absent from ``bm_idf_dic`` get
    weight 0.

    Args:
        queries: iterable of tokenized queries (token lists).
        doc_len: dict mapping query index -> query length.
        average_length_query: average query length.
        bm_idf_dic: dict mapping term -> BM25 idf value.
        k, b: BM25 free parameters.

    Returns:
        list of per-query weight vectors.
    """
    bm_query_weights = []
    for n, query in enumerate(queries):
        query_vector = []
        freq_query = pre_processer.calculate_freq(query)
        for term, f in freq_query.items():
            try:
                query_vector.append(
                    f / (f + k * (1 - b + (b * doc_len[n] / average_length_query)))
                    * bm_idf_dic[term])
            # Only a missing dict entry (idf or length) is expected; the
            # previous bare `except:` masked every other failure mode.
            except KeyError:
                query_vector.append(0)
        bm_query_weights.append(query_vector)
    return bm_query_weights
def main():
    """Run the full retrieval pipeline: basic tf-idf ranking, BM25
    re-ranking over several k values, and a BM25 sentence ranker
    evaluated with MRR.

    NOTE(review): this body was recovered from whitespace-mangled source;
    the extent of the ``for k in k_list`` loop below is reconstructed —
    confirm against the original file.
    """
    dic_raw_doc = pre_processer.read_xml('./data/trec_documents.xml')
    #Total number of documents.
    N = len(dic_raw_doc.keys())
    #Dictionary where the tokenized documents will be stored.
    dic_doc = {}
    #Dictionary where the counts for for each term in all documents will be stored.
    dic_count_doc = {}
    #Tokenize and count number of documents in which each token is.
    for k, v in dic_raw_doc.items():
        #Tokenize document in 2 steps.
        value_splitted = v.split('\n')
        value_tokenized = pre_processer.tokenizer(value_splitted)
        #Store tokenized document in dictionary.
        dic_doc[k] = value_tokenized
        #Loop over the unique set of tokens.
        for word in set(value_tokenized):
            #Add one to the count if already on dictionary.
            if word in dic_count_doc:
                dic_count_doc[word] += 1
            #Add to dictionary
            else:
                dic_count_doc[word] = 1
    # --- Part 1: basic tf-idf ranking and evaluation ---
    idf_dic = basic_tfidf.calculate_idf_dic(dic_count_doc, N)
    tf_dic = basic_tfidf.calculate_tf_dic(dic_doc)
    queries = basic_tfidf.obtain_queries('./data/test_questions.txt')
    query_weights = basic_tfidf.obtain_query_weights(queries, idf_dic)
    doc_query_weights = basic_tfidf.obtain_doc_query_weights(
        queries, tf_dic, idf_dic)
    tf_idf_cos_similarity = cos_similarity.calculate_cos_similarity(
        query_weights, doc_query_weights)
    output_per_query = evaluation_results.select_output_query(
        tf_idf_cos_similarity)
    answers = evaluation_results.get_answers('./data/patterns.txt')
    scores = evaluation_results.precision_f_score(answers, output_per_query,
                                                  dic_raw_doc)
    evaluation_results.write_results(scores[0], scores[1], scores[2],
                                     scores[3],
                                     './results/results_basic_tf-idf.txt')
    # Top-1000 tf-idf candidates kept for BM25 re-ranking below.
    output_per_query_1000 = evaluation_results.select_doc_query(
        tf_idf_cos_similarity)
    # --- Part 2: BM25 re-ranking, sweeping the k parameter ---
    bm_idf_dic = bm_implementation.calculate_idf_dic(dic_count_doc, N)
    doc_len = bm_implementation.create_dic_lengths(dic_doc)
    average_length = sum(doc_len.values()) / (len(doc_len.keys()))
    k_list = [1, 2, 3, 4, 5]
    # NOTE(review): k_results is never appended to — apparently vestigial.
    k_results = []
    for k in k_list:
        # b is fixed at 0.1 for document tf while the query side uses 0.6.
        bm_tf_dic = bm_implementation.calculate_tf_dic(dic_doc, doc_len,
                                                       average_length, k, 0.1)
        # Query-length statistics; these do not depend on k, so they could
        # be hoisted out of the loop — left in place to preserve behavior.
        dic_len_queries = {n: len(q) for n, q in enumerate(queries)}
        query_average_length = sum(dic_len_queries.values()) / (len(
            dic_len_queries.keys()))
        bm_query_weights = bm_implementation.calculate_query_weights(
            queries, dic_len_queries, query_average_length, bm_idf_dic, 1, 0.6)
        bm_doc_query_weights = bm_implementation.obtain_doc_query_weights(
            queries, bm_tf_dic, bm_idf_dic)
        bm_cos_similarity = cos_similarity.calculate_cos_similarity(
            bm_query_weights, bm_doc_query_weights)
        bm_output_per_query = evaluation_results.select_output_query(
            bm_cos_similarity)
        #This two are just for comparing normal use of bm 25 to tf-idf, not for final results.
        #bm_scores = evaluation_results.precision_f_score(answers,bm_output_per_query,dic_raw_doc)
        #evaluation_results.write_results(bm_scores[0],bm_scores[1],bm_scores[2],bm_scores[3],bm_scores[4],'./results/results_bm.txt')
        # Re-rank only the tf-idf top-1000 candidates with the BM25 scores.
        rerank_bm_doc_query_weights = bm_implementation.reweight_matrix(
            bm_cos_similarity, output_per_query_1000)
        rerank_bm_output_per_query = evaluation_results.select_output_query(
            rerank_bm_doc_query_weights)
        rerank_bm_scores = evaluation_results.precision_f_score(
            answers, rerank_bm_output_per_query, dic_raw_doc)
        path = './results/results_rerank_bm' + '_k_value_' + str(k) + '.txt'
        #print(rerank_bm_scores[0],rerank_bm_scores[1],rerank_bm_scores[2],rerank_bm_scores[3],rerank_bm_scores[4])
        evaluation_results.write_results(rerank_bm_scores[0],
                                         rerank_bm_scores[1],
                                         rerank_bm_scores[2],
                                         rerank_bm_scores[3], path)
    # Part 3: Sentence Ranker
    # NOTE(review): placed after the k loop, so it uses bm_output_per_query
    # from the LAST iteration (k=5) — confirm this matches the original.
    # Get the top 50 documents and split them into sentences
    top_50_docs, list_sent, list_words = bm_implementation.get_top_50_docs(
        bm_output_per_query, dic_raw_doc)
    #Total number of documents
    N_sent = len(list_sent)
    # Calculate idf and saved it in a dictionary doc:(term,tf)
    sent_idf_dic = bm_implementation.calculate_sent_idf_dic(
        list_sent, list_words, N_sent)
    # Calculate tf and saved it in a dictionary doc:(term,tf)
    # NOTE(review): this raw-frequency build of tf_term_sent is dead code —
    # it is overwritten by calculate_tf_dic_sent a few lines below.
    tf_term_sent = {}
    for sent in list_sent:
        tf_term_sent[' '.join(sent)] = pre_processer.calculate_freq(sent)
    # NOTE(review): dl_dic is an unused alias of sent_len.
    sent_len = dl_dic = {
        ' '.join(list_sent[x]): len(list_sent[x])
        for x in range(len(list_sent))
    }
    sent_average_length = sum(sent_len.values()) / (len(sent_len.keys()))
    tf_term_sent = bm_implementation.calculate_tf_dic_sent(
        list_sent, sent_len, sent_average_length, 1, 0.6)
    sent_query_weights = bm_implementation.calculate_query_weights(
        queries, dic_len_queries, query_average_length, sent_idf_dic, 1, 0.6)
    bm_sent_query_weights = bm_implementation.obtain_doc_query_weights(
        queries, tf_term_sent, sent_idf_dic)
    sent_cos_similarity = cos_similarity.calculate_cos_similarity(
        sent_query_weights, bm_sent_query_weights)
    sent_output_per_query = evaluation_results.select_output_query(
        sent_cos_similarity)
    print('The MRR is',
          evaluation_results.calculate_mrr(answers, sent_output_per_query))