Пример #1
0
 def __init__(self):
     """Resolve the query file paths and create the data parser.

     Both paths are produced by ``abspath`` from the ``data_dir`` entry of
     the ``DIRS`` config section plus a file name taken from ``FILES``.
     """
     settings = load_config()

     # Location of the regular query file.
     query_root = settings.get('DIRS', 'data_dir')
     query_name = settings.get('FILES', 'query_file')
     self.query_file = abspath(query_root, query_name)

     # Location of the stemmed query file.
     stem_root = settings.get('DIRS', 'data_dir')
     stem_name = settings.get('FILES', 'stem_query_file')
     self.stem_query_file = abspath(stem_root, stem_name)

     self.data_parser = DataParser()
Пример #2
0
 def __init__(self, model):
     """Load the stopword list and record the parsed-corpus directory.

     Args:
         model: stored unchanged on ``self.model``.
     """
     settings = load_config()
     data_root = settings.get('DIRS', 'data_dir')
     common_words_name = settings.get('FILES', 'common_words')
     common_words_path = abspath(data_root, common_words_name)
     corpus_root = settings.get('DIRS', 'corpus_dir')

     # One list entry per '\n'-separated piece of the file content.
     stopword_text = read_file(common_words_path)
     self.stopwords = stopword_text.split('\n')

     parsed_name = settings.get('DIRS', 'parsed_dir')
     self.parsed_dir = abspath(corpus_root, parsed_name)
     self.model = model
Пример #3
0
 def __init__(self):
     """Resolve corpus paths, prepare output directories, list raw docs.

     Paths come from the ``DIRS`` and ``FILES`` config sections. The
     parsed and stem directories are passed to ``create_dir``, and the raw
     corpus listing is taken with ``os.listdir``.
     """
     settings = load_config()
     corpus_root = settings.get('DIRS', 'corpus_dir')

     # Raw input and parsed output locations under the corpus directory.
     raw_name = settings.get('DIRS', 'raw_docs')
     self.raw_docs = abspath(corpus_root, raw_name)
     parsed_name = settings.get('DIRS', 'parsed_dir')
     self.parsed_dir = abspath(corpus_root, parsed_name)

     self.data_parser = DataParser()
     create_dir(self.parsed_dir)
     self.raw_corpus = os.listdir(self.raw_docs)

     # Stemmed corpus: output directory plus the stemmed-docs source file.
     stem_name = settings.get('DIRS', 'stem_dir')
     self.stem_dir = abspath(corpus_root, stem_name)
     data_root = settings.get('DIRS', 'data_dir')
     stemmed_name = settings.get('FILES', 'stemmed_docs')
     self.stem_file = abspath(data_root, stemmed_name)
     create_dir(self.stem_dir)

     # Working state, initially empty.
     self.parsed_content = ""
     self.docs = []
Пример #4
0
 def __init__(self, file_name, run_name):
     """Set up paths, load the run data and reset all metric containers.

     Args:
         file_name: last path component of ``self.results_file_path``;
             also stored on ``self.file_name``.
         run_name: last path component of ``self.eval_dir_path``; also
             stored on ``self.run_name``.
     """
     settings = load_config()
     self.run_name = run_name

     # Ranking results file for this run.
     results_root = settings.get('DIRS', 'results')
     ranking_name = settings.get('DIRS', 'ranking')
     self.results_file_path = abspath(results_root, ranking_name, file_name)

     # Evaluation output location for this run.
     eval_root = settings.get('DIRS', 'results')
     eval_name = settings.get('DIRS', 'eval_dir')
     self.eval_dir_path = abspath(eval_root, eval_name, run_name)

     self.file_name = file_name
     self.rel_data = get_relevance_data()
     # get_run() is called only after the path attributes above are set.
     self.run = self.get_run()

     # Metric containers: each one is its own fresh dict.
     self.precision = dict()
     self.p_at_5 = dict()
     self.p_at_20 = dict()
     self.recall = dict()
     self.ap = dict()
     self.map = 0.0
     self.rr = dict()
     self.mrr = 0.0
Пример #5
0
    def __init__(self, query, scores):
        """Prepare paths, significant query terms and the top-scored docs.

        Args:
            query: string split on whitespace; terms not found in the
                stoplist become ``self.significant_words``.
            scores: mapping whose items are sorted by value, descending;
                the first 100 are kept as ``self.doc_scores``.
        """
        settings = load_config()

        raw_root = settings.get('DIRS', 'corpus_dir')
        self.raw_docs = abspath(raw_root, settings.get('DIRS', 'raw_docs'))
        parsed_root = settings.get('DIRS', 'corpus_dir')
        self.parsed_dir = abspath(parsed_root,
                                  settings.get('DIRS', 'parsed_dir'))

        self.stoplist = get_stoplist()
        self.significant_words = {
            word for word in query.split() if word not in self.stoplist
        }

        self.dataparser = DataParser()
        self.snippets = {}

        results_root = settings.get('DIRS', 'results')
        snippet_name = settings.get('DIRS', 'snippet_dir')
        self.snippet_dir = abspath(results_root, snippet_name)
        create_dir(self.snippet_dir)

        # Highest score first; only the first 100 (key, score) pairs are kept.
        ranked = sorted(scores.items(),
                        key=lambda pair: pair[1],
                        reverse=True)
        self.doc_scores = ranked[:100]
        self.titles = {}
Пример #6
0
 def __init__(self):
     """Resolve the ranking results directory and pass it to create_dir."""
     settings = load_config()
     results_root = settings.get('DIRS', 'results')
     ranking_name = settings.get('DIRS', 'ranking')
     self.results_dir = abspath(results_root, ranking_name)
     create_dir(self.results_dir)
Пример #7
0
 def load_defaults(self):
     """Return the result of ``helpers.load_config("dag_defaults")``."""
     return helpers.load_config("dag_defaults")
Пример #8
0
import time
from src.indexer import Indexer
from src.query_parser import QueryParser
from src.tfidf import TFIDF
from src.sqlm import SQLM
from src.helpers import load_config
from src.prf import PRF
from src.doc_parser import Parser
from src.result_writer import ResultWriter
from src.evaluator import Evaluator
from src.BM25 import BM25
from src.snippet_generator import SnippetGenerator

# Timestamp taken at the start of the run.
# NOTE(review): presumably used to report elapsed time later; that code is
# outside this chunk.
start_time = time.time()

# Project configuration.
# NOTE(review): not referenced in the visible part of the script.
config = load_config()

# Run both corpus-parsing passes (plain, then stemmed) before any indexing.
parser = Parser()
parser.parse_documents()
parser.stem_parse_documents()

# mode = 0 -> no stemming
# mode = 2 -> stemming
print('Creating index...')
# Build the index with mode=0, save it, and keep a reference to it in
# `index`.
indexer = Indexer(mode=0)
indexer.create()
indexer.save_index()
index = indexer.get_index()
# Blank line separating the progress messages.
print()

print('Creating stemmed index...')