def reset(self):
    """Drop the module-level dictionaries and reset every pipeline component.

    NOTE(review): clears globals shared across the module; `self` is unused.
    """
    global cache_dictionary, final_dictionary, documents_dictionary
    cache_dictionary = final_dictionary = documents_dictionary = None
    # Reset collaborators in the same order as before.
    for component in (Indexer, Writer, Stemmer, Reader):
        component.reset()
def merge_all_posting(stemming_mode, posting_id, number_doc_in_corpus,
                      the_final_terms_dictionary, cach_dictionary, all_city,
                      max_doc_city):
    """K-way merge of all temporary posting files into the final postings.

    Repeatedly takes the lexicographically smallest term across the current
    head lines of every open temp posting file, aggregates its tf / df /
    document dict, resolves capitalization (an uppercase term is folded into
    its lowercase/stemmed form when one exists in the stem dictionary), and
    flushes merged entries via calculations_and_income_to_final_dictionary.

    :param stemming_mode: 'yes' or 'no' — selects the stemmed or unstemmed
        term dictionary (also rebound below by init_path's return value).
    :param posting_id: number of temp posting files, numbered 1..posting_id.
    :param number_doc_in_corpus: corpus size, forwarded to the final calculation.
    :param the_final_terms_dictionary: out-param dict of final term entries.
    :param cach_dictionary: out-param cache dict; cleared here, then filled
        with term -> summed tf.
    :param all_city: city data forwarded to the final calculation.
    :param max_doc_city: city data forwarded to the final calculation.
    """
    #check_uppercase()
    # NOTE: init_path also returns a (possibly normalized) stemming_mode,
    # rebinding the parameter.
    path_folder_posting, path_folder_abc_posting, stemming_mode, city_path = init_path(
        stemming_mode)
    print("merge_all_posting")
    finish = False
    number_of_line_in_abc_posting = {}  # per final-posting-file line counters
    all_final_posting_path = create_final_posting(
        path_folder_abc_posting, number_of_line_in_abc_posting, city_path)
    term_first_line_postings = {}  # file index -> term at that file's head line
    freq_sum_doc_first_line_postings = {}  # file index -> (tf, df, doc dict) of head line
    the_open_posting_file = {}  # file index -> open file handle
    stemm_dictionary_values = []
    if stemming_mode == 'yes':
        stemm_dictionary = Stemmer.get_dictionary()  # all stemming_term
        stemm_dictionary_values = Stemmer.get_dictionary_value()
    elif stemming_mode == 'no':
        stemm_dictionary = Stemmer.get_dictionary_without_stemming(
        )  # all stemming_term
    # NOTE(review): stemm_dictionary is unbound when stemming_mode is neither
    # 'yes' nor 'no' -- confirm callers only ever pass these two values.
    cach_dictionary.clear()
    terms_to_updated = {}  # The terms are in lower case letters
    close_file = {}  # file index -> True once that temp posting is exhausted
    # save the first line of each temp posting
    for index_file_of_posting in range(1, posting_id + 1):
        # NOTE(review): "\T" is not a valid escape sequence (Windows-style
        # separator survives only by accident) -- consider os.path.join or a
        # raw string.
        file_path = path_folder_posting + "\TempPostings" + str(
            index_file_of_posting) + '.txt'
        curr_posting_file = open(file_path, "r")
        the_open_posting_file[index_file_of_posting] = curr_posting_file
        close_file[index_file_of_posting] = False
        find_first_line(curr_posting_file, index_file_of_posting,
                        term_first_line_postings,
                        freq_sum_doc_first_line_postings, close_file)
    while not finish:
        #min_temp_posting = min(term_first_line_postings.keys(), key=(lambda index_post: term_first_line_postings[index_post]))
        # Smallest head term across all still-open temp postings.
        min_term = min(term_first_line_postings.values())
        all_posting_file_with_equal_term = []
        list_doc = {}
        sum_tf = 0
        df = 0
        # Aggregate tf / df / documents from every file whose head line
        # carries exactly min_term.
        for index, term in term_first_line_postings.items():
            if min_term == term:
                all_posting_file_with_equal_term.append(index)
                sum_tf = sum_tf + int(
                    (freq_sum_doc_first_line_postings[index])[0])
                df = df + int((freq_sum_doc_first_line_postings[index])[1])
                list_doc.update((freq_sum_doc_first_line_postings[index])[2])
        # Handling capitalization
        if min_term[0].isupper():  # Party # The
            lowercase_term = min_term.lower()  # party # the
            if lowercase_term in stemm_dictionary:
                if stemming_mode == 'yes':
                    lowercase_term_after_stemm = stemm_dictionary[
                        lowercase_term]  # parti # the
                else:
                    lowercase_term_after_stemm = lowercase_term
                # Defer the capitalized term's counts into the pending
                # lowercase entry; flushed when the lowercase term itself
                # reaches the head of the merge.
                if lowercase_term_after_stemm in terms_to_updated:
                    sum_tf = sum_tf + terms_to_updated[
                        lowercase_term_after_stemm][0]
                    list_doc.update(
                        terms_to_updated[lowercase_term_after_stemm][1])
                    terms_to_updated[lowercase_term_after_stemm] = (sum_tf,
                                                                    list_doc)
                else:
                    terms_to_updated[lowercase_term_after_stemm] = (sum_tf,
                                                                    list_doc)
            elif stemming_mode == 'yes' and lowercase_term in stemm_dictionary_values:
                # Lowercase form is itself a stem output: defer under it.
                if lowercase_term in terms_to_updated:
                    sum_tf = sum_tf + terms_to_updated[lowercase_term][0]
                    list_doc.update(terms_to_updated[lowercase_term][1])
                    terms_to_updated[lowercase_term] = (sum_tf, list_doc)
                else:
                    terms_to_updated[lowercase_term] = (sum_tf, list_doc)
            else:
                # No lowercase form exists anywhere in the corpus: keep the
                # capitalized spelling and write it out now.
                cach_dictionary[min_term] = sum_tf
                calculations_and_income_to_final_dictionary(
                    list_doc, sum_tf, df, number_doc_in_corpus, min_term,
                    all_final_posting_path, number_of_line_in_abc_posting,
                    the_final_terms_dictionary, all_city, max_doc_city)
        else:
            if min_term in terms_to_updated:  # parti #the
                # Merge in counts deferred from earlier capitalized variants.
                # NOTE(review): df is not carried in terms_to_updated, so the
                # deferred variants' df is dropped here -- confirm intended.
                sum_tf = sum_tf + terms_to_updated[min_term][0]
                cach_dictionary[min_term] = sum_tf
                list_doc.update(terms_to_updated[min_term][1])
                #print("final posting: " + min_term)
                calculations_and_income_to_final_dictionary(
                    list_doc, sum_tf, df, number_doc_in_corpus, min_term,
                    all_final_posting_path, number_of_line_in_abc_posting,
                    the_final_terms_dictionary, all_city, max_doc_city)
            else:
                #print("final posting: " + min_term)
                cach_dictionary[min_term] = sum_tf
                calculations_and_income_to_final_dictionary(
                    list_doc, sum_tf, df, number_doc_in_corpus, min_term,
                    all_final_posting_path, number_of_line_in_abc_posting,
                    the_final_terms_dictionary, all_city, max_doc_city)
        # Advance every file that just contributed its head line.
        for i in all_posting_file_with_equal_term:
            find_first_line(the_open_posting_file[i], i,
                            term_first_line_postings,
                            freq_sum_doc_first_line_postings, close_file)
        finish = check_if_finish(close_file)
    ## out while
    close_all_files(all_final_posting_path)
    Stemmer.reset()
    reset_temp_posting()
    # NOTE(review): sum_numbers is not defined anywhere in this function --
    # presumably a module-level global; verify, otherwise this raises
    # NameError at the end of a long merge.
    return sum_numbers