@classmethod
def build(cls):
    ix = index.create_in(WHOOSH_WIKI_INDEX_PATH, cls.schema)
    writer = ix.writer()
    cw = CachedWikipedia(QB_WIKI_LOCATION, COUNTRY_LIST_PATH)
    qdb = QuestionDatabase(QB_QUESTION_DB)
    # Index every page in the question database, excluding test-fold pages
    pages = list(qdb.get_all_pages(exclude_test=True))
    print("Building whoosh wiki index from {0} pages".format(len(pages)))
    bar = progressbar.ProgressBar()
    for p in bar(pages):
        writer.add_document(page=p, content=cw[p].content)
    writer.commit()
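# Usage sketch (an assumption, not part of the original source): once built,
# the index can be queried with whoosh's standard search API, e.g.
#
#   from whoosh import index
#   from whoosh.qparser import QueryParser
#
#   ix = index.open_dir(WHOOSH_WIKI_INDEX_PATH)
#   with ix.searcher() as searcher:
#       query = QueryParser("content", schema=ix.schema).parse("cricket")
#       for hit in searcher.search(query, limit=10):
#           print(hit["page"])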
def instantiate_feature(feature_name: str, question_db: QuestionDatabase):
    """
    @param feature_name: The feature to instantiate
    @param question_db: question database
    """
    feature = None
    print("Loading feature %s ..." % feature_name)
    if feature_name == "lm":
        feature = LanguageModel(data_path(CLM_PATH))
    elif feature_name == "deep":
        page_dict = {}
        for page in question_db.get_all_pages():
            page_dict[page.lower().replace(' ', '_')] = page
        feature = DeepExtractor(
            C.DEEP_DAN_CLASSIFIER_TARGET,
            C.DEEP_DAN_PARAMS_TARGET,
            C.DEEP_VOCAB_TARGET,
            "data/internal/common/ners",
            page_dict
        )
    elif feature_name == "wikilinks":
        feature = WikiLinks()
    elif feature_name == "answer_present":
        feature = AnswerPresent()
    elif feature_name == "label":
        feature = Labeler(question_db)
    elif feature_name == "classifier":
        # TODO: Change this to depend on any given bigrams.pkl, which are atm all the same
        feature = Classifier(question_db)
    elif feature_name == "mentions":
        answers = set(x for x, y in text_iterator(
            False, "", False, question_db,
            False, "", limit=-1, min_pages=MIN_APPEARANCES))
        feature = Mentions(answers)
    else:
        log.info("Don't know what to do with %s" % feature_name)
    log.info("done")
    return feature
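# Usage sketch (an assumption, not in the original source): features are
# instantiated by name against a shared question database, e.g.
#
#   qdb = QuestionDatabase(QB_QUESTION_DB)
#   label_feature = instantiate_feature("label", qdb)
#   mentions_feature = instantiate_feature("mentions", qdb)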