def _lauch_engine(self): # Create boolean or vectorial search engine if self.config['Research_engine']['type'] == 'vectorial': research_engine = Vectorial_search( reverse_index=self.reverse_index, similarity=self.config['Vectorial_search']['similarity'], ) query_processor = Process_query('sources/common_words', 'vectorial') elif self.config['Research_engine']['type'] == 'boolean': research_engine = Boolean_search( reverse_index=self.reverse_index, p_norm=self.config['Boolean_search']['p_norm'], default_similarity=self.config['Boolean_search'] ['default_similarity']) query_processor = Process_query('sources/common_words', 'boolean') elif self.config['Research_engine']['type'] == 'probabilistic': research_engine = Probabilistic_search( reverse_index=self.reverse_index, rsv_relevant_method=self.config['Probabilistic_search'] ['rsv_relevant_method']) query_processor = Process_query('sources/common_words', 'probabilistic') else: raise ValueError('Unsupported research engine type!') max_results_number = self.config['Research_engine'][ 'max_results_number'] while 1: query = raw_input('Enter your query: ') t0 = time.time() results = research_engine.do_search( query_processor.format_query(query)) print len(results), 'results in', time.time() - t0, 'seconds' if max_results_number > 0 and len(results) > max_results_number: results = results[:max_results_number] print 'printing only the first', max_results_number, 'results: \n' print 'document id \t score' print '-----------------------------' for (document_id, score) in results: print document_id, '\t\t', score
def _lauch_engine(self): # Create boolean or vectorial search engine if self.config['Research_engine']['type'] == 'vectorial': research_engine = Vectorial_search( reverse_index=self.reverse_index, similarity=self.config['Vectorial_search']['similarity'], ) query_processor = Process_query('sources/common_words', 'vectorial') elif self.config['Research_engine']['type'] == 'boolean': research_engine = Boolean_search( reverse_index=self.reverse_index, p_norm=self.config['Boolean_search']['p_norm'], default_similarity=self.config['Boolean_search']['default_similarity'] ) query_processor = Process_query('sources/common_words', 'boolean') elif self.config['Research_engine']['type'] == 'probabilistic': research_engine = Probabilistic_search( reverse_index=self.reverse_index, rsv_relevant_method=self.config['Probabilistic_search']['rsv_relevant_method'] ) query_processor = Process_query('sources/common_words', 'probabilistic') else: raise ValueError('Unsupported research engine type!') max_results_number = self.config['Research_engine']['max_results_number'] while 1: query = raw_input('Enter your query: ') t0 = time.time() results = research_engine.do_search(query_processor.format_query(query)) print len(results), 'results in', time.time()-t0, 'seconds' if max_results_number > 0 and len(results) > max_results_number: results = results[:max_results_number] print 'printing only the first', max_results_number, 'results: \n' print 'document id \t score' print '-----------------------------' for (document_id, score) in results: print document_id, '\t\t', score
def run_testing(self): print 'Launching tests!' print 'Loading documents...', reverse_index_builder = Reverse_index_builder( ponderation_method=self.config['Reverse_index']['ponderation'], index_type=self.config['Reverse_index']['index_type'], save_folder_path=self.config['Reverse_index']['save_folder_path'] ) reverse_index = reverse_index_builder.create_reverse_index('sources/cacm.all', 'sources/common_words') print ' Done' print 'Loading test data...', # {query: [answer1, answer2...]} self.query_answer = self._parse_queries_answers(self.queries_filename, self.answers_filename) print ' Done' print 'Initializing variables...', time_parsing_queries = 0. time_doing_researches = 0. precision = [] recall = [] r_measure = [] f_measure = [] average_precision = [] if self.config['Research_engine']['type'] == 'vectorial': search_engine = Vectorial_search(reverse_index, self.similarity_method) elif self.config['Research_engine']['type'] == 'boolean': search_engine = Boolean_search(reverse_index, self.p_norm, self.default_similarity) elif self.config['Research_engine']['type'] == 'probabilistic': search_engine = Probabilistic_search(reverse_index, self.rsv_relevant_method) query_processor = Process_query(stop_list_filename='sources/cacm.all', format_type=self.config['Research_engine']['type']) print ' Done' t0 = time.time() print 'Let\'s get to it! (this may take 5-10 seconds)' for query in self.query_answer: expected_answers = self.query_answer[query] t_init = time.time() processed_query = query_processor.format_query(query) t_parse = time.time() time_parsing_queries += t_parse - t_init answers_with_score = search_engine.do_search(processed_query) answers = map(lambda (x, y): x, answers_with_score) t_query = time.time() time_doing_researches += t_query - t_parse precision.append(self._compute_precision(answers, expected_answers)) recall.append(self._compute_recall(answers, expected_answers)) r_measure.append(self._compute_r_measure(answers, expected_answers)) f_measure.append(self._compute_f_measure(precision[-1], recall[-1])) average_precision.append(self._compute_average_precision(answers, expected_answers)) number_of_tests = float(len(self.query_answer)) print 'Number of queries tested:', int(number_of_tests), 'in', round(time.time() - t0, 2), 'seconds' print 'Average time spent on query processing:', time_parsing_queries / number_of_tests, 'seconds', print ', doing the research:', time_doing_researches / number_of_tests, 'seconds' print 'Average time spent on a query (total):', (time_doing_researches + time_parsing_queries) / number_of_tests, 'seconds' print """ ################################### # PERFORMANCE MEASURES # ###################################""" print 'Max Precision:', max(precision), 'average:', reduce(lambda x, y: x + y, precision) / float(len(precision)) print 'Max Recall:', max(recall), 'average:', reduce(lambda x, y: x + y, recall) / float(len(recall)) print 'Max F-measure', max(f_measure), 'average:', reduce(lambda x, y: x + y, f_measure) / float(len(f_measure)) print 'Min E-measure', 1 - max(f_measure), 'average:', 1 - reduce(lambda x, y: (x + y), f_measure) / float(len(f_measure)) print 'Max R-measure', max(r_measure), 'average:', reduce(lambda x, y: x + y, r_measure) / float(len(r_measure)) print 'Mean Average Precision (MAP)', reduce(lambda x, y: x + y, average_precision) / float(len(average_precision))