def save_snippets(self, qno, max_snippets=2):
    """Write one HTML snippet page per ranked document of a query.

    Pages are written to ``<snippet_dir>/<qno>/Rank_<rank>_<doc>.html``
    where ``rank`` is the 1-based position of the document in
    ``self.doc_scores``. Each page contains the document title followed by
    up to ``max_snippets`` snippets; a word is wrapped in ``<b>`` tags when
    ``parse_stuff(word.lower())`` is a member of ``self.significant_words``.

    Documents whose entry in ``self.snippets`` is missing or empty are
    skipped and reported on stdout in a single message after the loop.

    Args:
        qno: Query identifier; its ``str()`` form names the output
            sub-directory.
        max_snippets: Maximum number of snippets written per page.
            Defaults to 2, the limit that used to be hard-coded; ``None``
            writes every available snippet.

    Raises:
        KeyError: If a document that has snippets has no entry in
            ``self.titles``.
    """
    # The output directory does not depend on the document, so build it once.
    out_dir = os.path.join(self.snippet_dir, str(qno))
    create_dir(out_dir)

    notfound = []
    for rank, entry in enumerate(self.doc_scores, start=1):
        doc = entry[0]
        snips = self.snippets.get(doc)
        if not snips:
            notfound.append(doc)
            continue

        # Assemble the whole page in memory before touching the disk, so a
        # failure while formatting (e.g. a missing title) cannot leave a
        # truncated file behind.
        # NOTE(review): title and snippet text are emitted without HTML
        # escaping; confirm upstream text cannot contain markup characters.
        parts = ['<html><pre><h2>' + self.titles[doc] + '</h2>']
        for number, snip in enumerate(snips[:max_snippets], start=1):
            parts.append('<div>Snippet ' + str(number) + ':<p>')
            # The first element of a snippet is its text; newlines become
            # stand-alone ' <br>' tokens so they survive the split().
            for word in snip[0].replace('\n', ' <br>').split():
                if parse_stuff(word.lower()) in self.significant_words:
                    parts.append('<b>' + word + '</b> ')
                else:
                    parts.append(word + ' ')
            parts.append('</p></div>')
        parts.append('</pre></html>')

        file_name = 'Rank_' + str(rank) + '_' + doc + '.html'
        with open(os.path.join(out_dir, file_name), 'w') as f:
            f.write(''.join(parts))

    if notfound:
        print("Snippets not found for {} ... {}".format(
            len(notfound), notfound))
def __init__(self):
    """Resolve corpus paths from the configuration and prepare output dirs.

    Reads the ``DIRS`` and ``FILES`` sections of the object returned by
    ``load_config()``, stores the resolved raw, parsed and stemmed locations
    on the instance, lists the raw document directory, and passes the parsed
    and stem directories to ``create_dir``.
    """
    settings = load_config()
    corpus_root = settings.get('DIRS', 'corpus_dir')

    def corpus_path(option):
        # Resolve a DIRS option relative to the corpus directory.
        return abspath(corpus_root, settings.get('DIRS', option))

    self.raw_docs = corpus_path('raw_docs')
    self.parsed_dir = corpus_path('parsed_dir')
    self.data_parser = DataParser()
    create_dir(self.parsed_dir)

    self.parsed_content = ""
    self.raw_corpus = os.listdir(self.raw_docs)

    self.stem_dir = corpus_path('stem_dir')
    data_root = settings.get('DIRS', 'data_dir')
    stemmed_name = settings.get('FILES', 'stemmed_docs')
    self.stem_file = abspath(data_root, stemmed_name)
    create_dir(self.stem_dir)

    self.docs = []
def eval_to_file(self):
    """Write every evaluation measure of this run to its own text file.

    Output files live in ``self.eval_dir_path`` and are named
    ``<run_name>_<measure>.txt``. Precision and recall go through
    ``Evaluator.pr_to_file``, P@5 and P@20 through
    ``Evaluator.p_at_k_to_file``, and MAP together with MRR through
    ``Evaluator.map_mrr_to_file``.
    """
    create_dir(self.eval_dir_path)

    def output_path(suffix):
        # Build the output file path for one measure of this run.
        return abspath(self.eval_dir_path, self.run_name + suffix)

    # All paths are resolved before any file is written.
    (precision_path,
     recall_path,
     p_at_5_path,
     p_at_20_path,
     map_mrr_path) = [
        output_path(suffix)
        for suffix in ('_precision.txt',
                       '_recall.txt',
                       '_p_at_5.txt',
                       '_p_at_20.txt',
                       '_map_mrr.txt')
    ]

    Evaluator.pr_to_file(self.precision, precision_path)
    Evaluator.pr_to_file(self.recall, recall_path)
    Evaluator.p_at_k_to_file(self.p_at_5, p_at_5_path)
    Evaluator.p_at_k_to_file(self.p_at_20, p_at_20_path)
    Evaluator.map_mrr_to_file(self.map, self.mrr, map_mrr_path)
def __init__(self, query, scores):
    """Set up snippet generation state for one query.

    Args:
        query: Query text; it is split on whitespace and every term that is
            not in the stoplist becomes a significant word.
        scores: Mapping from document to score. Only the 100 highest-scoring
            ``(document, score)`` pairs are kept, in descending score order.
    """
    settings = load_config()
    corpus_root = settings.get('DIRS', 'corpus_dir')
    self.raw_docs = abspath(corpus_root, settings.get('DIRS', 'raw_docs'))
    self.parsed_dir = abspath(corpus_root,
                              settings.get('DIRS', 'parsed_dir'))

    self.stoplist = get_stoplist()
    self.significant_words = {
        term for term in query.split() if term not in self.stoplist
    }

    self.dataparser = DataParser()
    self.snippets = {}

    results_root = settings.get('DIRS', 'results')
    snippet_subdir = settings.get('DIRS', 'snippet_dir')
    self.snippet_dir = abspath(results_root, snippet_subdir)
    create_dir(self.snippet_dir)

    # Stable descending sort by score, then keep the top 100 entries.
    ranked = list(scores.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    self.doc_scores = ranked[:100]

    self.titles = {}
def __init__(self):
    """Resolve the ranking results directory and pass it to ``create_dir``.

    The location is built from the ``results`` and ``ranking`` options of
    the ``DIRS`` section of the object returned by ``load_config()``.
    """
    settings = load_config()
    results_root = settings.get('DIRS', 'results')
    ranking_subdir = settings.get('DIRS', 'ranking')
    self.results_dir = abspath(results_root, ranking_subdir)
    create_dir(self.results_dir)
def save_index(self):
    """Persist ``self.index`` to ``self.index_file`` and report the path.

    ``self.index_dir`` is handed to ``create_dir`` first, then the index is
    written with ``dict_to_file`` and a confirmation line is printed.
    """
    create_dir(self.index_dir)
    destination = self.index_file
    dict_to_file(self.index, destination)
    message = 'Index saved to ' + destination
    print(message)