def __init__(self):
    """Resolve the plain and stemmed query-file paths from config.

    Also constructs the DataParser used to read the query files.
    """
    cfg = load_config()
    data_dir = cfg.get('DIRS', 'data_dir')
    # Both query files live under the configured data directory.
    self.query_file = abspath(data_dir, cfg.get('FILES', 'query_file'))
    self.stem_query_file = abspath(data_dir, cfg.get('FILES', 'stem_query_file'))
    self.data_parser = DataParser()
def __init__(self, model):
    """Load the stopword list and parsed-corpus location for *model*.

    Args:
        model: retrieval model object used by this instance.
    """
    cfg = load_config()
    common_words_path = abspath(cfg.get('DIRS', 'data_dir'),
                                cfg.get('FILES', 'common_words'))
    # One stopword per line in the common-words file.
    self.stopwords = read_file(common_words_path).split('\n')
    self.parsed_dir = abspath(cfg.get('DIRS', 'corpus_dir'),
                              cfg.get('DIRS', 'parsed_dir'))
    self.model = model
def __init__(self):
    """Locate corpus directories, create output dirs, and reset parse state."""
    cfg = load_config()
    corpus = cfg.get('DIRS', 'corpus_dir')
    # Input and output locations under the corpus root.
    self.raw_docs = abspath(corpus, cfg.get('DIRS', 'raw_docs'))
    self.parsed_dir = abspath(corpus, cfg.get('DIRS', 'parsed_dir'))
    self.stem_dir = abspath(corpus, cfg.get('DIRS', 'stem_dir'))
    self.stem_file = abspath(cfg.get('DIRS', 'data_dir'),
                             cfg.get('FILES', 'stemmed_docs'))
    create_dir(self.parsed_dir)
    create_dir(self.stem_dir)
    self.data_parser = DataParser()
    # Filenames of every raw document to be parsed.
    self.raw_corpus = os.listdir(self.raw_docs)
    self.parsed_content = ""
    self.docs = []
def __init__(self, file_name, run_name):
    """Load a ranked run plus relevance judgments and zero all metrics.

    Args:
        file_name: name of the ranking-results file to evaluate.
        run_name: label for this evaluation run (names the eval output dir).
    """
    cfg = load_config()
    results_root = cfg.get('DIRS', 'results')
    self.run_name = run_name
    self.file_name = file_name
    self.results_file_path = abspath(results_root, cfg.get('DIRS', 'ranking'), file_name)
    self.eval_dir_path = abspath(results_root, cfg.get('DIRS', 'eval_dir'), run_name)
    self.rel_data = get_relevance_data()
    self.run = self.get_run()
    # Per-query metric tables, filled during evaluation.
    self.precision = {}
    self.p_at_5 = {}
    self.p_at_20 = {}
    self.recall = {}
    self.ap = {}
    self.rr = {}
    # Collection-level aggregates over the per-query values.
    self.map = 0.0
    self.mrr = 0.0
def __init__(self, query, scores):
    """Prepare snippet generation for *query* over the top-scored documents.

    Args:
        query: raw query string; its non-stopword terms are the ones
            highlighted in snippets.
        scores: mapping of document id -> retrieval score.
    """
    config = load_config()
    # Hoist the repeated corpus_dir lookup out of the two abspath calls.
    corpus_dir = config.get('DIRS', 'corpus_dir')
    self.raw_docs = abspath(corpus_dir, config.get('DIRS', 'raw_docs'))
    self.parsed_dir = abspath(corpus_dir, config.get('DIRS', 'parsed_dir'))
    self.stoplist = get_stoplist()
    # Set comprehension instead of set([...]) — no throwaway intermediate list.
    self.significant_words = {term for term in query.split()
                              if term not in self.stoplist}
    self.dataparser = DataParser()
    self.snippets = {}
    self.snippet_dir = abspath(config.get('DIRS', 'results'),
                               config.get('DIRS', 'snippet_dir'))
    create_dir(self.snippet_dir)
    # Keep only the 100 highest-scoring docs; sorted() is stable, so ties
    # preserve the original iteration order of *scores*.
    self.doc_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:100]
    self.titles = {}
def __init__(self):
    """Ensure the ranking-results directory exists and remember its path."""
    cfg = load_config()
    ranking_dir = abspath(cfg.get('DIRS', 'results'), cfg.get('DIRS', 'ranking'))
    create_dir(ranking_dir)
    self.results_dir = ranking_dir
def load_defaults(self):
    """Return the DAG default settings via the shared config loader."""
    return helpers.load_config("dag_defaults")
# Driver script: parse the corpus, then build the unstemmed (and, presumably,
# stemmed) indexes. NOTE(review): this chunk ends mid-script after the
# "Creating stemmed index..." banner — the continuation is not visible here.
import time
from src.indexer import Indexer
from src.query_parser import QueryParser
from src.tfidf import TFIDF
from src.sqlm import SQLM
from src.helpers import load_config
from src.prf import PRF
from src.doc_parser import Parser
from src.result_writer import ResultWriter
from src.evaluator import Evaluator
from src.BM25 import BM25
from src.snippet_generator import SnippetGenerator

# Wall-clock start time — presumably reported later; not visible in this chunk.
start_time = time.time()
config = load_config()

# Parse the raw corpus into both plain and stemmed document forms.
parser = Parser()
parser.parse_documents()
parser.stem_parse_documents()

# mode = 0 -> no stemming
# mode = 2 -> stemming
print('Creating index...')
indexer = Indexer(mode=0)
indexer.create()
indexer.save_index()
index = indexer.get_index()
print()
print('Creating stemmed index...')