Example #1
def build_clm(lm_out=CLM_PATH, vocab_size=100000, global_lms=5, max_pages=-1):
    log.info("Training language model with pages that appear more than %i times" % MIN_APPEARANCES)

    lm = LanguageModelWriter(vocab_size, global_lms)
    num_docs = 0
    background = defaultdict(int)
    # Initialize language models
    for title, text in text_iterator(True, QB_WIKI_LOCATION,
                                     True, QB_QUESTION_DB,
                                     True, QB_SOURCE_LOCATION,
                                     max_pages,
                                     min_pages=MIN_APPEARANCES):
        num_docs += 1
        if num_docs % 500 == 0:
            log.info("{} {}".format(unidecode(title), num_docs))
            log.info(str(list(lm.tokenize_without_censor(text[100:200]))))

        for tt in lm.tokenize_without_censor(text):
            background[tt] += 1

    # Create the vocabulary
    for ii in background:
        lm.train_seen(ii, background[ii])
    vocab = lm.finalize()
    log.info(str(vocab)[:80])
    log.info("Vocab size is {} from {} docs".format(len(vocab), num_docs))
    del background

    # Train the language model
    doc_num = 0
    for corpus, qb, wiki, source in [("wiki", False, True, False),
                                     ("qb", True, False, False),
                                     ("source", False, False, True)
                                     ]:
        # Add training data
        start = time.time()
        for title, text in text_iterator(wiki, QB_WIKI_LOCATION,
                                         qb, QB_QUESTION_DB,
                                         source, QB_SOURCE_LOCATION,
                                         max_pages,
                                         min_pages=MIN_APPEARANCES):
            doc_num += 1
            if doc_num % 500 == 0 or time.time() - start > 10:
                log.info("Adding train doc %i, %s (%s)" % (doc_num, unidecode(title), corpus))
                start = time.time()
            lm.add_train(corpus, title, text)

    log.info("Done training")
    if lm_out:
        # Create the extractor object and write out the pickle
        with safe_open("%s.txt" % lm_out, 'w') as f:
            lm.write_vocab(f)

        for ii, cc in enumerate(lm.corpora()):
            with safe_open("%s/%i" % (lm_out, ii), 'w') as f:
                lm.write_corpus(cc, ii, f)
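
All arguments to build_clm have defaults, so a driver script can call it directly. A minimal sketch of such a call, assuming the project constants (QB_WIKI_LOCATION, QB_QUESTION_DB, QB_SOURCE_LOCATION, MIN_APPEARANCES) are configured; the output prefix is hypothetical, not taken from the source:

# Hypothetical driver; "output/clm" is an illustrative prefix.
if __name__ == "__main__":
    # Produces output/clm.txt (the vocabulary) and one file per corpus under output/clm/.
    build_clm(lm_out="output/clm", vocab_size=100000, global_lms=5, max_pages=-1)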
Example #2
def instantiate_feature(feature_name: str, question_db: QuestionDatabase):
    """
    @param feature_name: The feature to instantiate
    @param question_db: question database
    """

    feature = None
    print("Loading feature %s ..." % feature_name)
    if feature_name == "lm":
        feature = LanguageModel(data_path(CLM_PATH))
    elif feature_name == "deep":
        page_dict = {}
        for page in question_db.get_all_pages():
            page_dict[page.lower().replace(' ', '_')] = page
        feature = DeepExtractor(C.DEEP_DAN_CLASSIFIER_TARGET,
                                C.DEEP_DAN_PARAMS_TARGET, C.DEEP_VOCAB_TARGET,
                                "data/internal/common/ners", page_dict)
    elif feature_name == "wikilinks":
        feature = WikiLinks()
    elif feature_name == "answer_present":
        feature = AnswerPresent()
    elif feature_name == "label":
        feature = Labeler(question_db)
    elif feature_name == "classifier":
        # TODO: Change this to depend on any given bigrams.pkl, which are atm all the same
        feature = Classifier(question_db)
    elif feature_name == "mentions":
        answers = set(x for x, y in text_iterator(False,
                                                  "",
                                                  False,
                                                  question_db,
                                                  False,
                                                  "",
                                                  limit=-1,
                                                  min_pages=MIN_APPEARANCES))
        feature = Mentions(answers)
    else:
        log.info("Don't know what to do with %s" % feature_name)
    log.info("done")
    return feature
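
instantiate_feature returns None for names it does not recognize, so a caller can loop over the known feature names and keep only the ones that load. A minimal sketch, assuming QB_QUESTION_DB points at a valid question database; the feature list is copied from the branches above:

# Hypothetical driver loop over the feature names handled above.
question_db = QuestionDatabase(QB_QUESTION_DB)
features = {}
for name in ["lm", "deep", "wikilinks", "answer_present",
             "label", "classifier", "mentions"]:
    feature = instantiate_feature(name, question_db)
    if feature is not None:  # unknown names fall through as None
        features[name] = feature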
Example #3
def instantiate_feature(feature_name: str, question_db: QuestionDatabase):
    """
    @param feature_name: The feature to instantiate
    @param question_db: question database
    """

    feature = None
    print("Loading feature %s ..." % feature_name)
    if feature_name == "lm":
        feature = LanguageModel(data_path(CLM_PATH))
    elif feature_name == "deep":
        page_dict = {}
        for page in question_db.get_all_pages():
            page_dict[page.lower().replace(' ', '_')] = page
        feature = DeepExtractor(
            C.DEEP_DAN_CLASSIFIER_TARGET,
            C.DEEP_DAN_PARAMS_TARGET,
            C.DEEP_VOCAB_TARGET,
            "data/internal/common/ners",
            page_dict
        )
    elif feature_name == "wikilinks":
        feature = WikiLinks()
    elif feature_name == "answer_present":
        feature = AnswerPresent()
    elif feature_name == "label":
        feature = Labeler(question_db)
    elif feature_name == "classifier":
        # TODO: Change this to depend on any given bigrams.pkl, which are atm all the same
        feature = Classifier(question_db)
    elif feature_name == "mentions":
        answers = set(x for x, y in text_iterator(
            False, "", False, question_db, False, "", limit=-1, min_pages=MIN_APPEARANCES))
        feature = Mentions(answers)
    else:
        log.info("Don't know what to do with %s" % feature_name)
    log.info("done")
    return feature
Example #4
def __init__(self):
    super().__init__()
    question_db = QuestionDatabase(QB_QUESTION_DB)
    answers = set(x for x, y in text_iterator(False, "",
                                              False, question_db,
                                              False, "",
                                              limit=-1,
                                              min_pages=conf['mentions']['min_appearances']))
    self.answers = answers
    self.initialized = False
    self.refex_count = defaultdict(int)
    self.refex_lookup = defaultdict(set)
    self._lm = None
    self.generate_refexs(self.answers)
    self.pre = []
    self.ment = []
    self.suf = []
    self.text = ""
    self.kenlm_path = data_path(KEN_LM)
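
Every example drives text_iterator with the same positional pattern: three (enable, location) pairs for the wiki dump, the question database, and the source corpus, then a page limit, then a min_pages keyword, and it yields (title, text) pairs. A minimal standalone sketch of that call pattern, inferred from the call sites above rather than from the library's documentation:

# Count the pages the question database alone would contribute.
# The inline comments name the apparent positional parameters; they are inferred, not authoritative.
num_pages = 0
for title, text in text_iterator(False, "",             # wiki disabled, no wiki path
                                 True, QB_QUESTION_DB,   # questions enabled
                                 False, "",              # source corpus disabled
                                 -1,                     # no page limit
                                 min_pages=MIN_APPEARANCES):
    num_pages += 1
print("{} pages meet the min_pages threshold".format(num_pages))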
Example #5
                        default="/Volumes/Documents/research_data/wikisource/en/*/*",
                        help="Location of wiki cache")
    parser.add_argument("--plot_location", type=str,
                        default="/Volumes/Documents/research_data/plots/*",
                        help="Location of plot summaries")
    parser.add_argument("--min_answers", type=int, default=0,
                        help="How many times does an answer need to appear to be included")
    parser.add_argument("--output_path", type=str, default="data/source",
                        help="How many pages to add to the index")
    flags = parser.parse_args()

    # Get the pages that we want to use

    answers = set(title for title, text
                  in text_iterator(False, "",
                                   True, flags.question_db,
                                   False, "",
                                   -1, min_pages=flags.min_answers))

    pages = defaultdict(str)
    for ii in glob(flags.plot_location):
        text = unidecode(gzip.open(ii, 'rt').read())
        pages[ii.split("/")[-1].replace(".txt.gz", "")] = text

    page_names = list(pages.keys())
    print(page_names[:5], pages[page_names[0]][:60])

    for ii in glob(flags.wiki_location):
        for jj, tt, cc in read_wiki(ii):
            match = match_page(tt, answers)
            if match:
                pages[unidecode(match)] += "\n\n\n"
                pages[unidecode(match)] += unidecode(cc)
Example #6
        type=int,
        default=0,
        help="How many times does an answer need to appear to be included")
    parser.add_argument("--output_path",
                        type=str,
                        default="data/source",
                        help="How many pages to add to the index")
    flags = parser.parse_args()

    # Get the pages that we want to use

    answers = set(
        title for title, text in text_iterator(False,
                                               "",
                                               True,
                                               flags.question_db,
                                               False,
                                               "",
                                               -1,
                                               min_pages=flags.min_answers))

    pages = defaultdict(str)
    for ii in glob(flags.plot_location):
        text = unidecode(gzip.open(ii, 'rt').read())
        pages[ii.split("/")[-1].replace(".txt.gz", "")] = text

    page_names = list(pages.keys())
    print(page_names[:5], pages[page_names[0]][:60])

    for ii in glob(flags.wiki_location):
        for jj, tt, cc in read_wiki(ii):
            match = match_page(tt, answers)
            if match:
                pages[unidecode(match)] += "\n\n\n"
                pages[unidecode(match)] += unidecode(cc)
Example #7
def build_clm(lm_out=CLM_PATH, vocab_size=100000, global_lms=5, max_pages=-1):
    min_appearances = conf['clm']['min_appearances']
    log.info(
        "Training language model with pages that appear more than %i times" %
        min_appearances)

    lm = LanguageModelWriter(vocab_size, global_lms)
    num_docs = 0
    background = defaultdict(int)
    # Initialize language models
    for title, text in text_iterator(True,
                                     QB_WIKI_LOCATION,
                                     True,
                                     QB_QUESTION_DB,
                                     True,
                                     QB_SOURCE_LOCATION,
                                     max_pages,
                                     min_pages=min_appearances):
        num_docs += 1
        if num_docs % 500 == 0:
            log.info("{} {}".format(title, num_docs))
            log.info(str(list(lm.tokenize_without_censor(text[100:200]))))

        for tt in lm.tokenize_without_censor(text):
            background[tt] += 1

    # Create the vocabulary
    for ii in background:
        lm.train_seen(ii, background[ii])
    vocab = lm.finalize()
    log.info(str(vocab)[:80])
    log.info("Vocab size is {} from {} docs".format(len(vocab), num_docs))
    del background

    # Train the language model
    doc_num = 0
    for corpus, qb, wiki, source in [("wiki", False, True, False),
                                     ("qb", True, False, False),
                                     ("source", False, False, True)]:
        # Add training data
        start = time.time()
        for title, text in text_iterator(wiki,
                                         QB_WIKI_LOCATION,
                                         qb,
                                         QB_QUESTION_DB,
                                         source,
                                         QB_SOURCE_LOCATION,
                                         max_pages,
                                         min_pages=min_appearances):
            doc_num += 1
            if doc_num % 500 == 0 or time.time() - start > 10:
                log.info("Adding train doc %i, %s (%s)" %
                         (doc_num, title, corpus))
                start = time.time()
            lm.add_train(corpus, title, text)

    log.info("Done training")
    if lm_out:
        # Create the extractor object and write out the pickle
        with safe_open("%s.txt" % lm_out, 'w') as f:
            lm.write_vocab(f)

        for ii, cc in enumerate(lm.corpora()):
            with safe_open("%s/%i" % (lm_out, ii), 'w') as f:
                lm.write_corpus(cc, ii, f)