Exemplo n.º 1
0
 def save_snippets(self, qno):
     """Write one HTML snippet page per ranked document for query ``qno``.

     Walks ``self.doc_scores`` in order. For every document that has a
     non-empty entry in ``self.snippets``, a file named
     ``Rank_<rank>_<doc>.html`` is written under ``<snippet_dir>/<qno>/``.
     The page holds the document title from ``self.titles`` followed by at
     most the first two snippets. A word is wrapped in ``<b>`` tags when
     ``parse_stuff(word.lower())`` is in ``self.significant_words``.

     Documents without snippets are skipped and reported on stdout once
     all pages have been written.

     Args:
         qno: Query identifier; its ``str()`` form names the output
             sub-directory.

     Raises:
         KeyError: If a document with snippets has no entry in
             ``self.titles``. The page is rendered before the file is
             opened, so no empty file is left behind in that case.
     """
     out_dir = os.path.join(self.snippet_dir, str(qno))
     create_dir(out_dir)
     notfound = []
     # Ranks are 1-based positions in doc_scores, so skipped documents
     # leave gaps in the file numbering.
     for rank, entry in enumerate(self.doc_scores, start=1):
         doc = entry[0]
         snips = self.snippets.get(doc)
         if not snips:
             notfound.append(doc)
             continue
         # NOTE(review): title and words are written unescaped into the
         # HTML -- confirm the snippet text never contains markup.
         parts = ['<html><pre><h2>' + self.titles[doc] + '</h2>']
         for number, snip in enumerate(snips[:2], start=1):
             parts.append('<div>Snippet ' + str(number) + ':<p>')
             # Newlines become standalone ' <br>' tokens so that line
             # breaks survive the whitespace split below.
             for word in snip[0].replace('\n', ' <br>').split():
                 if parse_stuff(word.lower()) in self.significant_words:
                     parts.append('<b>' + word + '</b> ')
                 else:
                     parts.append(word + ' ')
             parts.append('</p></div>')
         parts.append('</pre></html>')
         file_name = 'Rank_' + str(rank) + '_' + doc + '.html'
         with open(os.path.join(out_dir, file_name), 'w') as f:
             f.writelines(parts)
     if notfound:
         print("Snippets not found for {} ... {}".format(
             len(notfound), notfound))
Exemplo n.º 2
0
 def __init__(self):
     """Resolve corpus paths from the configuration and set up state.

     Reads the ``DIRS`` and ``FILES`` sections of the object returned by
     ``load_config()``, hands the parsed and stem directories to
     ``create_dir``, and stores the listing of the raw-documents directory
     in ``self.raw_corpus``.
     """
     settings = load_config()

     def dir_setting(key):
         # Shorthand for a lookup in the 'DIRS' section.
         return settings.get('DIRS', key)

     corpus_root = dir_setting('corpus_dir')
     self.raw_docs = abspath(corpus_root, dir_setting('raw_docs'))
     self.parsed_dir = abspath(corpus_root, dir_setting('parsed_dir'))
     self.data_parser = DataParser()
     create_dir(self.parsed_dir)
     self.parsed_content = ""
     self.raw_corpus = os.listdir(self.raw_docs)

     self.stem_dir = abspath(corpus_root, dir_setting('stem_dir'))
     data_root = dir_setting('data_dir')
     stemmed_name = settings.get('FILES', 'stemmed_docs')
     self.stem_file = abspath(data_root, stemmed_name)
     create_dir(self.stem_dir)
     self.docs = []
Exemplo n.º 3
0
 def eval_to_file(self):
     """Write this run's evaluation measures to files in the eval dir.

     Five files are produced under ``self.eval_dir_path``, each named
     ``<run_name>`` plus a fixed suffix: precision, recall, P@5, P@20 and
     a combined MAP/MRR file. The actual writing is delegated to the
     ``Evaluator`` ``*_to_file`` helpers.
     """
     create_dir(self.eval_dir_path)

     def target(suffix):
         # Output path for this run with the given file-name suffix.
         return abspath(self.eval_dir_path, self.run_name + suffix)

     # All paths are resolved before anything is written.
     precision_path = target('_precision.txt')
     recall_path = target('_recall.txt')
     p5_path = target('_p_at_5.txt')
     p20_path = target('_p_at_20.txt')
     map_mrr_path = target('_map_mrr.txt')

     Evaluator.pr_to_file(self.precision, precision_path)
     Evaluator.pr_to_file(self.recall, recall_path)
     Evaluator.p_at_k_to_file(self.p_at_5, p5_path)
     Evaluator.p_at_k_to_file(self.p_at_20, p20_path)
     Evaluator.map_mrr_to_file(self.map, self.mrr, map_mrr_path)
Exemplo n.º 4
0
    def __init__(self, query, scores):
        """Prepare snippet-generation state for one query.

        Args:
            query: Query string; it is split on whitespace and every term
                not present in the stoplist becomes a significant word.
            scores: Mapping whose items are ordered by value, highest
                first; only the first 100 items are kept in
                ``self.doc_scores``.
        """
        cfg = load_config()

        def dir_setting(key):
            # Shorthand for a lookup in the 'DIRS' section.
            return cfg.get('DIRS', key)

        self.raw_docs = abspath(dir_setting('corpus_dir'),
                                dir_setting('raw_docs'))
        self.parsed_dir = abspath(dir_setting('corpus_dir'),
                                  dir_setting('parsed_dir'))
        self.stoplist = get_stoplist()
        self.significant_words = {
            token for token in query.split()
            if token not in self.stoplist
        }

        self.dataparser = DataParser()
        self.snippets = {}
        self.snippet_dir = abspath(dir_setting('results'),
                                   dir_setting('snippet_dir'))
        create_dir(self.snippet_dir)

        # Stable descending sort on the score, then keep the top 100.
        ranked = list(scores.items())
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        self.doc_scores = ranked[:100]
        self.titles = {}
Exemplo n.º 5
0
 def __init__(self):
     """Resolve the ranking results directory and pass it to create_dir.

     The path is built from the ``results`` and ``ranking`` entries of the
     ``DIRS`` configuration section.
     """
     settings = load_config()
     results_root = settings.get('DIRS', 'results')
     ranking_subdir = settings.get('DIRS', 'ranking')
     self.results_dir = abspath(results_root, ranking_subdir)
     create_dir(self.results_dir)
Exemplo n.º 6
0
 def save_index(self):
     """Hand ``self.index`` to ``dict_to_file`` and report the target path.

     ``self.index_dir`` is passed to ``create_dir`` first; afterwards a
     confirmation line naming ``self.index_file`` is printed to stdout.
     """
     create_dir(self.index_dir)
     dict_to_file(self.index, self.index_file)
     message = 'Index saved to ' + self.index_file
     print(message)