Example #1
from whoosh import index
import progressbar

# CachedWikipedia, QuestionDatabase, and the *_PATH / QB_* constants are
# assumed to come from the surrounding project.
@classmethod
def build(cls):
    # Create a fresh whoosh index under WHOOSH_WIKI_INDEX_PATH using the
    # schema defined on the class.
    ix = index.create_in(WHOOSH_WIKI_INDEX_PATH, cls.schema)
    writer = ix.writer()
    cw = CachedWikipedia(QB_WIKI_LOCATION, COUNTRY_LIST_PATH)
    qdb = QuestionDatabase(QB_QUESTION_DB)
    # Index every page that is not held out for testing.
    pages = list(qdb.get_all_pages(exclude_test=True))
    print("Building whoosh wiki index from {0} pages".format(len(pages)))
    bar = progressbar.ProgressBar()
    for p in bar(pages):
        # Pull the cached article text for each page and add it to the index.
        writer.add_document(page=p, content=cw[p].content)
    writer.commit()
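For reference, querying the resulting index would go through whoosh's standard search API. The sketch below is illustrative only: it assumes WHOOSH_WIKI_INDEX_PATH is the same project constant used in build(), that the page field is stored in cls.schema, and that the query string is a made-up example.

from whoosh import index
from whoosh.qparser import QueryParser

ix = index.open_dir(WHOOSH_WIKI_INDEX_PATH)
with ix.searcher() as searcher:
    # Parse a free-text query against the indexed article content.
    query = QueryParser("content", ix.schema).parse("norman conquest")
    for hit in searcher.search(query, limit=5):
        print(hit["page"])  # assumes page is a stored field in the schema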
Example #2
def instantiate_feature(feature_name: str, question_db: QuestionDatabase):
    """
    Construct and return the feature extractor named by feature_name.

    @param feature_name: The feature to instantiate
    @param question_db: question database used by several of the extractors
    """

    feature = None
    print("Loading feature %s ..." % feature_name)
    if feature_name == "lm":
        feature = LanguageModel(data_path(CLM_PATH))
    elif feature_name == "deep":
        # Map normalized page names (lowercase, underscores for spaces) back
        # to the canonical page titles.
        page_dict = {}
        for page in question_db.get_all_pages():
            page_dict[page.lower().replace(' ', '_')] = page
        feature = DeepExtractor(C.DEEP_DAN_CLASSIFIER_TARGET,
                                C.DEEP_DAN_PARAMS_TARGET, C.DEEP_VOCAB_TARGET,
                                "data/internal/common/ners", page_dict)
    elif feature_name == "wikilinks":
        feature = WikiLinks()
    elif feature_name == "answer_present":
        feature = AnswerPresent()
    elif feature_name == "label":
        feature = Labeler(question_db)
    elif feature_name == "classifier":
        # TODO: Change this to depend on any given bigrams.pkl; at the moment
        # they are all the same.
        feature = Classifier(question_db)
    elif feature_name == "mentions":
        # Build the set of candidate answers from the question database.
        answers = set(x for x, y in text_iterator(
            False, "", False, question_db, False, "",
            limit=-1, min_pages=MIN_APPEARANCES))
        feature = Mentions(answers)
    else:
        log.info("Don't know what to do with %s" % feature_name)
    log.info("done")
    return feature
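A minimal usage sketch, assuming QB_QUESTION_DB is the project constant pointing at the question database (both names below are taken from the examples above, not verified against the full codebase):

qdb = QuestionDatabase(QB_QUESTION_DB)
labeler = instantiate_feature("label", qdb)  # returns a Labeler bound to qdb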