def fn_search(self, query, expand_mode, stem_mode, folder_path, summ_mode):
    path = folder_path + path_delimiter + "stop_words.txt"
    Parser.set_stop_words_file(path)
    Parser.set_stemmer_mode(stem_mode)
    start_search = time.time()  # Time elapsed from query received until results come back
    searcher = Searcher.Searcher()
    if expand_mode == 0 and summ_mode == 0:  # Regular, manual query search
        results = searcher.search(stem_mode, folder_path, final_dictionary,
                                  cache_dictionary, documents_dictionary, query)
        file_num = len(results)  # The number of results
        end_search = time.time()
        total_time = end_search - start_search
        self.display_results(total_time, file_num, results)
    elif summ_mode == 1:  # Summarization: the query is treated as a document id
        doc_id = query
        results = searcher.find_popular_sentences(
            stem_mode, folder_path, final_dictionary, cache_dictionary,
            documents_dictionary, doc_id,
            get_text_from_document(doc_id, folder_path))
    else:  # We need to expand the query
        results = searcher.expand(stem_mode, folder_path, final_dictionary,
                                  cache_dictionary, documents_dictionary, query)
        file_num = len(results)  # The number of results
        end_search = time.time()
        total_time = end_search - start_search
        self.display_results(total_time, file_num, results)
def handle_files(file_list_ref, documents_dictionary_ref):
    terms_dictionary = {}
    if stem_mode:
        # Take each document's text from the list and parse & stem it
        for value in file_list_ref:
            doc_id = value[0]
            file_name = value[2]
            after_stemming = Stemmer.stemWithCache(Parser.start(value[1]))
            # Update the document's parameters and merge its dictionary
            # into the dictionary for the whole part
            __update_and_merge_dictionaries(doc_id, file_name, terms_dictionary,
                                            documents_dictionary_ref, after_stemming)
    else:
        # Take each document's text from the list and only parse it
        for value in file_list_ref:
            doc_id = value[0]
            file_name = value[2]
            after_parse = Parser.start(value[1])
            # Update the document's parameters and merge its dictionary
            # into the dictionary for the whole part
            __update_and_merge_dictionaries(doc_id, file_name, terms_dictionary,
                                            documents_dictionary_ref, after_parse)
    # Create a new temporary posting file for this part
    Indexer.create_temp_posting_file(terms_dictionary)
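# A minimal sketch (not part of the engine) of the merge step that
# __update_and_merge_dictionaries presumably performs: folding one document's
# term -> tf dictionary into the accumulated dictionary for the whole part.
# The helper name and the exact per-term bookkeeping are assumptions.
def _merge_term_counts(part_terms, doc_id, doc_terms):
    """Fold doc_terms (term -> tf in doc_id) into part_terms (term -> {doc_id: tf})."""
    for term, tf in doc_terms.items():
        part_terms.setdefault(term, {})[doc_id] = tf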
def data_set_Path(self, path):
    global corpus_path
    print("Received corpus folder..")
    corpus_path = path + "/corpus"
    print("Received stopwords filename..")
    global __stopwords_path
    __stopwords_path = path + "/stop_words.txt"
    Parser.set_stop_words_file(__stopwords_path)
def start_evaluating_qry(self, searcher, q_file_path, semantic_model,
                         str_single_qry, mode_semantic, stemmer):
    self.init_helpers()
    qry_parser = Parser(self.hash_stopwords, self.hash_keywords_months,
                        self.hash_keywords_prices, self.hash_punc,
                        self.hash_punc_middle, self.hash_alphabet,
                        self.stemmer, self.hash_qry_stopwords)
    if str_single_qry == '':  # Evaluate every query in the query file
        file_path = q_file_path
        skip_one = 0
        with open(file_path, 'r') as file:
            q_counter = 0
            data = file.read()
            data_list = data.split("<top>")
            del data
            for qry in data_list:
                if skip_one == 1:
                    q_counter += 1
                    qry = "<top>" + qry  # Restore the tag removed by split()
                    qry_parser.start_parse(qry, 0, semantic_model, 0,
                                           mode_semantic, stemmer)
                else:
                    skip_one = 1  # Skip the text that precedes the first <top>
    else:  # Evaluate a single query string
        qry_parser.start_parse(str_single_qry, 0, semantic_model, 1,
                               mode_semantic, stemmer)
    hash_titles = qry_parser.hash_titles
    hash_qry_terms = qry_parser.hash_terms
    # searcher.ranker.set_params(5, 0.05, 5, 1)
    searcher.search(hash_qry_terms, hash_titles)
def fn_run_query_file(self, query_file_path, stem_mode, folder_path):
    start_search = time.time()  # Time elapsed from query received until results come back
    path = folder_path + path_delimiter + "stop_words.txt"
    Parser.set_stop_words_file(path)
    Parser.set_stemmer_mode(stem_mode)
    # The ranking weights are currently identical in both branches; the split
    # is kept so the stemmed and unstemmed modes can be tuned separately.
    if stem_mode:
        Searcher.idf_weight = 0.8
        Searcher.df_weight = 0.7
        Searcher.denominator = 1.3
        Ranker.bm25_weight = 0.1
        Ranker.cos_sim_weight = 0.9
        Ranker.b = 0.6
        Ranker.k = 1.85
        Ranker.bm25_lambda = 0.25
        Ranker.bm25_idf = 0.5
    else:
        Searcher.idf_weight = 0.8
        Searcher.df_weight = 0.7
        Searcher.denominator = 1.3
        Ranker.bm25_weight = 0.1
        Ranker.cos_sim_weight = 0.9
        Ranker.b = 0.6
        Ranker.k = 1.85
        Ranker.bm25_lambda = 0.25
        Ranker.bm25_idf = 0.5
    searcher = Searcher.Searcher()
    results = searcher.multi_search(stem_mode, folder_path, final_dictionary,
                                    cache_dictionary, documents_dictionary,
                                    Reader.extract_queries(query_file_path))
    file_num = len(results)  # The number of results
    end_search = time.time()
    total_time = end_search - start_search
    self.display_query_file_results(total_time, file_num, results)
def parse_file(self, file_path):
    if len(self.hash_stopwords) == 0:
        self.init_helpers()
    global f_counter
    p = None
    file_terms = {}
    p_name = "#NUM_" + str(f_counter.value)
    with f_counter.get_lock():
        f_counter.value += 1
    f_start = time.time()
    p = Parser(self.hash_stopwords, self.hash_keywords_months,
               self.hash_keywords_prices, self.hash_punc,
               self.hash_punc_middle, self.hash_alphabet, self.stemmer, None)
    self.get_doc_from_file(file_path, p)
    for c in self.final_solution:
        if c in p.hash_terms:  # Dict keys are unique, so a single delete suffices
            del p.hash_terms[c]
    for term in self.hash_stopwords:
        # Remove the stopword in both its lower- and upper-case forms; deleting
        # only the lower-case key would raise KeyError when just the upper-case
        # form is present
        if term in p.hash_terms:
            del p.hash_terms[term]
        if term.upper() in p.hash_terms:
            del p.hash_terms[term.upper()]
    if '' in p.hash_terms:
        del p.hash_terms['']
    if len(p.hash_terms) > 1:
        with open(self.post_path + '/Engine_Data/temp_hash_objects/file_hash_'
                  + p_name + '.pkl', 'wb') as output:
            pickle.dump(p.hash_terms, output, pickle.HIGHEST_PROTOCOL)
        with open(self.post_path + '/Engine_Data/Cities_hash_objects/hash_cities'
                  + p_name + '.pkl', 'wb') as output:
            pickle.dump(p.hash_cities, output, pickle.HIGHEST_PROTOCOL)
        with open(self.post_path + '/Engine_Data/Docs_hash_objects/hash_docs'
                  + p_name + '.pkl', 'wb') as output:
            pickle.dump(p.hash_docs, output, pickle.HIGHEST_PROTOCOL)
    file_terms = {}
    self.vocabulary = {}
    f_end = time.time()
    time_to_file = f_end - f_start
    # Report indexing progress every 20 files
    if f_counter.value % 20 == 0:
        p_c = int(float(f_counter.value) * 100 / self.number_of_files)
        if p_c != self.percent:
            self.percent = p_c
            self.print_prog(p_c)
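# Usage sketch: the per-file hashes pickled above can be reloaded the same way,
# e.g. when the temporary parts are later merged. The path mirrors the dump
# path above; 'post_path' and 'p_name' as free parameters are assumptions.
import pickle

def _load_file_hash(post_path, p_name):
    with open(post_path + '/Engine_Data/temp_hash_objects/file_hash_'
              + p_name + '.pkl', 'rb') as f:
        return pickle.load(f)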
def multi_search(self, stem_mode, folder_path, final_dictionary,
                 cache_dictionary, documents_dictionary, query_list):
    if final_dictionary is None or cache_dictionary is None or documents_dictionary is None:
        return []
    list_for_return = []
    self.__set_folder_path(folder_path)
    self.__set_stem_mode(stem_mode)
    for query_tuple in query_list:  # Iterate over the queries in the given query file
        query_num = query_tuple[0]
        title = query_tuple[1]
        description = query_tuple[2]
        narrative = query_tuple[3]
        # Keep only the narrative sentences that describe relevant documents
        narrative_sentences = Parser.sentences(narrative)
        new_narrative = ''
        for sentence in narrative_sentences:
            if "not relevant" not in sentence:
                new_narrative += " " + sentence
            elif stem_mode and "are relevant" in sentence:
                new_narrative += " " + sentence
        narrative = new_narrative
        # Parse and stem the title and the description + narrative, defining
        # the words we want to avoid in the parsed/stemmed result
        terms_dictionary = self.__parse_stem(title)
        description_dictionary = self.__parse_stem(description + ' ' + narrative)
        if self.stem_mode:
            words_to_avoid = {"relev", "document", "discuss", "consid",
                              "i.e", "issu"}
        else:
            words_to_avoid = {"relevant", "documents", "document", "discuss",
                              "discussing", "information", "considered",
                              "i.e", "issues"}
        description_dictionary_other = {}
        for key, value in description_dictionary.items():
            if key in final_dictionary and key not in words_to_avoid:
                # Weighted combination of the term's idf and its query tf
                term_idf = final_dictionary[key][0][0]
                description_dictionary_other[key] = (self.idf_weight * term_idf) + \
                                                    (self.df_weight * value)
        results_num = int(len(description_dictionary_other) / self.denominator)
        description_dictionary_other = dict(
            Counter(description_dictionary_other).most_common(results_num))
        # Merge the strongest description terms into the title terms
        for key, value in description_dictionary_other.items():
            if key in terms_dictionary:
                terms_dictionary[key] += value
            else:
                terms_dictionary[key] = value
        dictionary = self.ranker.rank(self.folder_path, final_dictionary,
                                      cache_dictionary, documents_dictionary,
                                      terms_dictionary, REGULAR_RESULTS_NUMBER,
                                      stem_mode)
        list_for_return.append((query_num, list(dictionary.keys())))
    return list_for_return
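# Worked toy example (illustrative only) of the description-term weighting used
# in multi_search: score = idf_weight * idf + df_weight * query_tf, then keep
# the top len/denominator terms. All numbers and names below are made up.
from collections import Counter

def _top_description_terms(desc_tf, idf, idf_weight=0.8, df_weight=0.7,
                           denominator=1.3):
    scored = {t: idf_weight * idf[t] + df_weight * tf
              for t, tf in desc_tf.items() if t in idf}
    keep = int(len(scored) / denominator)
    return dict(Counter(scored).most_common(keep))

# _top_description_terms({'ship': 2, 'ocean': 1}, {'ship': 3.1, 'ocean': 4.7})
# -> keeps the int(2 / 1.3) = 1 highest-scoring term: {'ocean': 4.46}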
def __parse_stem(self, text):
    if self.stem_mode:  # Stem mode is on: parse, then stem with the cache
        after_dictionary = Stemmer.stemWithCache(Parser.start(text))
    else:  # Stem mode is off: parse only
        after_dictionary = Parser.start(text)
    return after_dictionary
def find_popular_sentences(self, stem_mode, folder_path, final_dictionary,
                           cache_dictionary, documents_dictionary, doc_id, text):
    # If the dictionaries are empty, we can't continue
    if final_dictionary is None or cache_dictionary is None or documents_dictionary is None:
        return []
    self.__set_folder_path(folder_path)
    self.__set_stem_mode(stem_mode)
    # Parse and stem the text again, because fetching the values
    # from the posting file is slow
    after_dictionary = self.__parse_stem(text)
    # Normalize each term's tf by the document's max_tf
    max_tf = documents_dictionary[doc_id][0]
    for key, value in after_dictionary.items():
        after_dictionary[key] = value / max_tf
    # Get the list of sentences from the Parser
    list_of_sentence = Parser.sentences(text)
    # Initialize the score of every sentence to zero
    sentence_dictionary = {}
    for sentence_index in range(0, len(list_of_sentence)):
        sentence_dictionary[sentence_index] = 0
    # For every sentence, parse (and stem) it and keep only the terms
    # that appear in the final dictionary
    term_with_after_dictionary = {}
    sentence_index = 0
    for sentence in list_of_sentence:
        terms_dictionary = self.__parse_stem(sentence)
        keys_to_delete = []
        for term in terms_dictionary.keys():
            if term not in final_dictionary:
                keys_to_delete.append(term)
        for term in keys_to_delete:
            terms_dictionary.pop(term)
        # Save the parsed/stemmed sentence dictionary for later use
        term_with_after_dictionary[sentence_index] = terms_dictionary
        sentence_index += 1
    # Score every sentence with the formula tf * idf * freq
    for sentence_index, terms_dictionary in term_with_after_dictionary.items():
        if terms_dictionary is not None:
            for term, value in terms_dictionary.items():
                freq = value  # The term's frequency within the sentence
                idf = final_dictionary[term][0][0]
                tf = after_dictionary[term]  # Normalized tf within the document
                sentence_dictionary[sentence_index] += tf * idf * freq
    # Keep only the SENTENCE_NUMBER best sentences
    list_of_best_sentence = list(
        dict(Counter(sentence_dictionary).most_common(SENTENCE_NUMBER)).keys())
    # Replace each sentence index with the sentence itself
    sentence_index = 0
    for key in list_of_best_sentence:
        list_of_best_sentence[sentence_index] = list_of_sentence[key]
        sentence_index += 1
    return list_of_best_sentence
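# Toy example (illustrative only) of the tf * idf * freq sentence score above:
# tf is the term's document frequency normalized by max_tf, idf comes from the
# final dictionary, and freq is the term's count inside the sentence.
def _sentence_score(sentence_terms, doc_tf, idf):
    """sentence_terms: term -> freq in sentence; doc_tf: normalized tf; idf: term idf."""
    return sum(doc_tf[t] * idf[t] * f for t, f in sentence_terms.items())

# _sentence_score({'storm': 2}, {'storm': 0.5}, {'storm': 2.0})
# -> 0.5 * 2.0 * 2 = 2.0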
def Mega_Test(self, query_file_path, stem_mode, folder_path):
    path = folder_path + path_delimiter + "stop_words.txt"
    Parser.set_stop_words_file(path)
    Parser.set_stemmer_mode(stem_mode)
    searcher = Searcher.Searcher()
    results_dict = {}
    # Dictionary mapping each query id (key) to the set of RELEVANT documents (value)
    query_rel_docs = {}
    file_types = (("Comma Separated Value Document", "*.csv"),)
    rel_doc_path = askopenfilename(title="Choose result csv file",
                                   filetypes=file_types)
    file_types = (("Any file", "*.*"),)
    save_path = asksaveasfilename(title="Choose where to save BM25 results",
                                  filetypes=file_types,
                                  initialfile="mega_test.csv")
    with open(rel_doc_path) as rel_file:
        docs = rel_file.read()
    docs = docs.split('\n')
    docs = docs[:-1]  # Drop the trailing empty line
    for entry in docs:
        parts = entry.split(',')
        qid = parts[0]
        doc = parts[1]
        if qid not in query_rel_docs:
            query_rel_docs[qid] = set()
        query_rel_docs[qid].add(doc)
    Searcher.denominator = 1.3  # Range [1 : 4]
    Searcher.df_weight = 0.7  # Range (0 : 1]
    Searcher.idf_weight = 0.2
    Ranker.bm25_weight = 0.05
    Ranker.cos_sim_weight = 0.95  # Doesn't need a loop
    Ranker.bm25_k = 1.2  # Range: [1.2 - 2.0]
    Ranker.bm25_b = 0.1  # Range: 0.45 - 1.0
    Ranker.bm25_lambda = 0.2  # Range: 0 - 1.0
    columns = ('bm25 weight, cos sim weight, bm25 k, bm25 b, bm25 lambda, '
               'Denominator, searcher idf, searcher df, score\n')
    with open(save_path, 'a') as srp:
        srp.write(columns)
    # Grid search over bm25_k and bm25_b; each combination is scored by the
    # number of returned documents that also appear in the relevance judgments
    while Ranker.bm25_k <= 2.01:
        Ranker.bm25_b = 0.1
        while Ranker.bm25_b <= 1.01:
            results = searcher.multi_search(
                stem_mode, folder_path, final_dictionary, cache_dictionary,
                documents_dictionary, Reader.extract_queries(query_file_path))
            score = 0
            for entry in results:
                qid = entry[0]
                rel_set = query_rel_docs[qid]
                returned_set = set(entry[1])
                score += len(rel_set.intersection(returned_set))
            result_score = "%f, %f, %f, %f, %f, %f, %f, %f, %d\n" % \
                (Ranker.bm25_weight, Ranker.cos_sim_weight, Ranker.bm25_k,
                 Ranker.bm25_b, Ranker.bm25_lambda, Searcher.denominator,
                 Searcher.idf_weight, Searcher.df_weight, score)
            with open(save_path, 'a') as srp:
                srp.write(result_score)
            Ranker.bm25_b += 0.1
        Ranker.bm25_k += 0.1
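# Hedged sketch of the textbook Okapi BM25 term weight whose k and b parameters
# the grid search above sweeps. How Ranker actually combines BM25 with cosine
# similarity (bm25_weight / cos_sim_weight) is not shown here; the idf below is
# the common non-negative (Lucene-style) variant, and all names are hypothetical.
import math

def _bm25_term(tf, df, num_docs, doc_len, avg_doc_len, k=1.2, b=0.75):
    idf = math.log((num_docs - df + 0.5) / (df + 0.5) + 1)
    return idf * (tf * (k + 1)) / (tf + k * (1 - b + b * doc_len / avg_doc_len))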
def startSearch(self, stemBool):
    global stem_mode
    stem_mode = stemBool
    Indexer.set_stemmer_mode(stemBool)
    Parser.set_stemmer_mode(stemBool)
    self.run()  # Begin indexing!
def find(self, query, stem_mode):
    total_value = 0
    for x in range(0, 5):  # Use this value (0 + 1 + 2 + 3 + 4 = 10) for the weights
        total_value += x
    sum_of_df = 0
    wiki_wiki = wikipediaapi.Wikipedia('en')  # The language to search the term in
    page_py = wiki_wiki.page(query)  # Look the query up on Wikipedia
    query_dictionary = {}  # The dictionary we will return to the user
    if page_py.exists():
        line = page_py.summary  # Collect the summary of the wiki page
        # If wiki didn't return a specific term, ask for the sections instead
        if len(line) < 300:
            line = print_sections(page_py.sections)
        if stem_mode:
            stop_set = {'disambigu'}  # Popular words we want to avoid
            query_after = Stemmer.stemWithCache(Parser.start(query))
            terms_dictionary = Stemmer.stemWithCache(Parser.start(line))
        else:
            stop_set = {'Disambiguation'}  # Popular words we want to avoid
            query_after = Parser.start(query)
            terms_dictionary = Parser.start(line)
        concept = {}
        links = page_py.links  # Collect the links from the wiki page
        for title in sorted(links.keys()):
            if stem_mode:
                term = Stemmer.stemWithCache(Parser.start(links[title].title))
            else:
                term = Parser.start(links[title].title)
            # For each term in the link's title, keep it only if it also appears
            # in the summary and is neither part of the query nor a stop word
            for t, value in term.items():
                if links[title].ns == 0 and t in terms_dictionary and \
                        t not in query_after and t not in stop_set:
                    if t not in concept:
                        concept[t] = value
                    else:
                        concept[t] += value  # Accumulate the df in the dictionary
        # Keep only the most common query results
        query_dictionary = dict(Counter(concept).most_common(number_of_results))
        for term, value in query_dictionary.items():
            sum_of_df += value
        for term, value in query_dictionary.items():
            positive_value = int(total_value * value / sum_of_df) + 1
            if positive_value == 0:
                positive_value = 1
            query_dictionary[term] = positive_value
        if len(query_after) != 0:  # != instead of 'is not': identity checks on ints are a bug
            query = list(query_after.keys())[0]
        else:
            print("Invalid query")
        query_dictionary[query] = number_of_results
    return query_dictionary
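# Usage sketch (illustrative; the expander instance and output values are made
# up): find() returns a dictionary mapping related Wikipedia link terms to
# small positive weights derived from total_value * df / sum_of_df, with the
# original (parsed/stemmed) query term itself weighted at number_of_results so
# it dominates the expanded query.
#
#   expander.find("jaguar", stem_mode=False)
#   -> {'cat': 3, 'car': 2, ..., 'jaguar': number_of_results}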