def build_clm(lm_out=CLM_PATH, vocab_size=100000, global_lms=5, max_pages=-1):
    log.info("Training language model with pages that appear more than %i times" % MIN_APPEARANCES)

    lm = LanguageModelWriter(vocab_size, global_lms)
    num_docs = 0
    background = defaultdict(int)
    # Initialize language models
    for title, text in text_iterator(True, QB_WIKI_LOCATION,
                                     True, QB_QUESTION_DB,
                                     True, QB_SOURCE_LOCATION,
                                     max_pages, min_pages=MIN_APPEARANCES):
        num_docs += 1

        if num_docs % 500 == 0:
            log.info("{} {}".format(unidecode(title), num_docs))
            log.info(str(list(lm.tokenize_without_censor(text[100:200]))))

        for tt in lm.tokenize_without_censor(text):
            background[tt] += 1

    # Create the vocabulary
    for ii in background:
        lm.train_seen(ii, background[ii])
    vocab = lm.finalize()
    log.info(str(vocab)[:80])
    log.info("Vocab size is {} from {} docs".format(len(vocab), num_docs))
    del background

    # Train the language model
    doc_num = 0
    for corpus, qb, wiki, source in [("wiki", False, True, False),
                                     ("qb", True, False, False),
                                     ("source", False, False, True)]:
        # Add training data
        start = time.time()
        for title, text in text_iterator(wiki, QB_WIKI_LOCATION,
                                         qb, QB_QUESTION_DB,
                                         source, QB_SOURCE_LOCATION,
                                         max_pages, min_pages=MIN_APPEARANCES):
            doc_num += 1
            if doc_num % 500 == 0 or time.time() - start > 10:
                log.info("Adding train doc %i, %s (%s)" % (doc_num, unidecode(title), corpus))
                start = time.time()
            lm.add_train(corpus, title, text)

    log.info("Done training")
    if lm_out:
        # Create the extractor object and write out the pickle
        with safe_open("%s.txt" % lm_out, 'w') as f:
            lm.write_vocab(f)

        for ii, cc in enumerate(lm.corpora()):
            with safe_open("%s/%i" % (lm_out, ii), 'w') as f:
                lm.write_corpus(cc, ii, f)
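
# Usage sketch (not part of the original module): build_clm can be smoke-tested by
# capping max_pages so a run finishes quickly. The wrapper and the small parameter
# values below are illustrative assumptions, not the project's actual entry point.
if __name__ == "__main__":
    build_clm(lm_out=CLM_PATH, vocab_size=50000, global_lms=5, max_pages=100)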
def instantiate_feature(feature_name: str, question_db: QuestionDatabase):
    """
    @param feature_name: The feature to instantiate
    @param question_db: question database
    """
    feature = None
    print("Loading feature %s ..." % feature_name)
    if feature_name == "lm":
        feature = LanguageModel(data_path(CLM_PATH))
    elif feature_name == "deep":
        page_dict = {}
        for page in question_db.get_all_pages():
            page_dict[page.lower().replace(' ', '_')] = page
        feature = DeepExtractor(
            C.DEEP_DAN_CLASSIFIER_TARGET,
            C.DEEP_DAN_PARAMS_TARGET,
            C.DEEP_VOCAB_TARGET,
            "data/internal/common/ners",
            page_dict
        )
    elif feature_name == "wikilinks":
        feature = WikiLinks()
    elif feature_name == "answer_present":
        feature = AnswerPresent()
    elif feature_name == "label":
        feature = Labeler(question_db)
    elif feature_name == "classifier":
        # TODO: Change this to depend on any given bigrams.pkl, which are atm all the same
        feature = Classifier(question_db)
    elif feature_name == "mentions":
        answers = set(x for x, y in text_iterator(
            False, "", False, question_db, False, "",
            limit=-1, min_pages=MIN_APPEARANCES))
        feature = Mentions(answers)
    else:
        log.info("Don't know what to do with %s" % feature_name)
    log.info("done")
    return feature
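
# Usage sketch (illustrative only): instantiate a few features against a question
# database. QuestionDatabase(QB_QUESTION_DB) mirrors how the database is constructed
# elsewhere in these snippets; the feature names are an arbitrary subset of the
# branches handled above.
question_db = QuestionDatabase(QB_QUESTION_DB)
features = {
    name: instantiate_feature(name, question_db)
    for name in ("lm", "answer_present", "label")
}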
def __init__(self):
    super().__init__()
    question_db = QuestionDatabase(QB_QUESTION_DB)
    answers = set(x for x, y in text_iterator(False, "", False, question_db,
                                              False, "", limit=-1,
                                              min_pages=conf['mentions']['min_appearances']))
    self.answers = answers
    self.initialized = False
    self.refex_count = defaultdict(int)
    self.refex_lookup = defaultdict(set)
    self._lm = None
    self.generate_refexs(self.answers)
    self.pre = []
    self.ment = []
    self.suf = []
    self.text = ""
    self.kenlm_path = data_path(KEN_LM)
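
# Hypothetical illustration (the real generate_refexs is not shown here): one
# plausible way to populate refex_lookup is to map each canonical answer to shorter
# referring expressions, e.g. the trailing token of a multi-word title. The name
# and behavior below are assumptions for illustration only.
def _toy_generate_refexs(answers):
    refex_lookup = defaultdict(set)
    for answer in answers:
        tokens = answer.split("_")
        if len(tokens) > 1:
            # e.g. "Albert_Einstein" -> {"einstein"}
            refex_lookup[answer].add(tokens[-1].lower())
    return refex_lookup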
default="/Volumes/Documents/research_data/wikisource/en/*/*", help="Location of wiki cache") parser.add_argument("--plot_location", type=str, default="/Volumes/Documents/research_data/plots/*", help="Location of plot summaries") parser.add_argument("--min_answers", type=int, default=0, help="How many times does an answer need to appear to be included") parser.add_argument("--output_path", type=str, default="data/source", help="How many pages to add to the index") flags = parser.parse_args() # Get the pages that we want to use answers = set(title for title, text in text_iterator(False, "", True, flags.question_db, False, "", -1, min_pages=flags.min_answers)) pages = defaultdict(str) for ii in glob(flags.plot_location): text = unidecode(gzip.open(ii, 'r').read()) pages[ii.split("/")[-1].replace(".txt.gz", "")] = text print(pages.keys()[:5], pages[pages.keys()[0]][:60]) for ii in glob(flags.wiki_location): for jj, tt, cc in read_wiki(ii): match = match_page(tt, answers) if match: pages[unidecode(match)] += "\n\n\n" pages[unidecode(match)] += unidecode(cc)
def build_clm(lm_out=CLM_PATH, vocab_size=100000, global_lms=5, max_pages=-1):
    min_appearances = conf['clm']['min_appearances']
    log.info(
        "Training language model with pages that appear more than %i times" % min_appearances)

    lm = LanguageModelWriter(vocab_size, global_lms)
    num_docs = 0
    background = defaultdict(int)
    # Initialize language models
    for title, text in text_iterator(True, QB_WIKI_LOCATION,
                                     True, QB_QUESTION_DB,
                                     True, QB_SOURCE_LOCATION,
                                     max_pages, min_pages=min_appearances):
        num_docs += 1

        if num_docs % 500 == 0:
            log.info("{} {}".format(title, num_docs))
            log.info(str(list(lm.tokenize_without_censor(text[100:200]))))

        for tt in lm.tokenize_without_censor(text):
            background[tt] += 1

    # Create the vocabulary
    for ii in background:
        lm.train_seen(ii, background[ii])
    vocab = lm.finalize()
    log.info(str(vocab)[:80])
    log.info("Vocab size is {} from {} docs".format(len(vocab), num_docs))
    del background

    # Train the language model
    doc_num = 0
    for corpus, qb, wiki, source in [("wiki", False, True, False),
                                     ("qb", True, False, False),
                                     ("source", False, False, True)]:
        # Add training data
        start = time.time()
        for title, text in text_iterator(wiki, QB_WIKI_LOCATION,
                                         qb, QB_QUESTION_DB,
                                         source, QB_SOURCE_LOCATION,
                                         max_pages, min_pages=min_appearances):
            doc_num += 1
            if doc_num % 500 == 0 or time.time() - start > 10:
                log.info("Adding train doc %i, %s (%s)" % (doc_num, title, corpus))
                start = time.time()
            lm.add_train(corpus, title, text)

    log.info("Done training")
    if lm_out:
        # Create the extractor object and write out the pickle
        with safe_open("%s.txt" % lm_out, 'w') as f:
            lm.write_vocab(f)

        for ii, cc in enumerate(lm.corpora()):
            with safe_open("%s/%i" % (lm_out, ii), 'w') as f:
                lm.write_corpus(cc, ii, f)
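
# Consumption sketch: the vocabulary and per-corpus files written above are
# presumably what the "lm" branch of instantiate_feature loads back in. The line
# below mirrors that branch and assumes data_path and LanguageModel behave as in
# the earlier snippet.
lm_feature = LanguageModel(data_path(CLM_PATH))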