def build(documents: Dict[str, str]):
    Answer.init()
    cw = CachedWikipedia()
    bar = progressbar.ProgressBar()
    for page in bar(documents):
        answer = Answer(
            page=page,
            wiki_content=cw[page].content,
            qb_content=documents[page]
        )
        answer.save()
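# Usage sketch, not part of the original source: it assumes the project package
# and a running Elasticsearch instance are available, and that `documents` maps
# Wikipedia page titles to the concatenated quiz bowl question text for each
# answer. The dict below is a made-up toy input.
sample_documents = {
    'Albert_Einstein': 'This physicist proposed the special theory of relativity.',
    'Japan': 'This island nation has its capital at Tokyo.',
}
build(sample_documents)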
def training_data(self) -> TrainingData:
    cw = CachedWikipedia()
    wiki_content = []
    wiki_answers = []
    for ans in self.answers:
        wiki_page = cw[ans]
        if len(wiki_page.content) != 0:
            # Take the first paragraphs, skipping the initial title and the empty line after it
            paragraphs = wiki_page.content.split('\n')
            if len(paragraphs) > 2:
                n_used = 0
                for par in paragraphs[2:]:
                    if len(par) != 0:
                        n_used += 1
                        content = unidecode(par).lower()
                        # Strip references to the title in a reasonable way
                        ans_words = unidecode(ans).lower().split('_')
                        for w in ans_words:
                            content = content.replace(w, ' ')
                        # Fix up whitespace
                        content = re.sub(r'\s+', ' ', content).strip()
                        for sent in nltk.sent_tokenize(content):
                            wiki_content.append([sent])
                            wiki_answers.append(ans)
                    if n_used == self.n_paragraphs:
                        break
    return wiki_content, wiki_answers, None
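# Illustration of the title-stripping step above, not part of the original
# source: words from the answer's page title are blanked out of a paragraph so
# the guesser cannot rely on the title itself; the sentence is a made-up example.
import re

paragraph = 'albert einstein developed the theory of relativity'
for w in 'Albert_Einstein'.lower().split('_'):
    paragraph = paragraph.replace(w, ' ')
paragraph = re.sub(r'\s+', ' ', paragraph).strip()
assert paragraph == 'developed the theory of relativity'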
def generate_domain_classifier_data(weight=150):
    """
    Reads all sentences from every Wikipedia page corresponding to a known answer and
    splits them into two Vowpal Wabbit files, interleaving true quiz bowl questions
    randomly and with the higher weight specified by the weight arg.
    """
    qb_data = QuizBowlDataset(guesser_train=True).training_data()
    real_questions = [
        ('1', str(weight), ans, clean_question(sent))
        for q, ans, _ in zip(*qb_data)
        for sent in q
    ]
    pages = set(a for _, _, a, _ in real_questions)
    cw = CachedWikipedia()

    # Split wikipedia questions into two sets
    wiki_questions = ([], [])
    use_second = False
    for page in pages:
        for sentence in sentences_from_page(cw[page]):
            q = clean_question(sentence)
            wiki_questions[use_second].append(('-1', '1', page, q))
            use_second = not use_second

    vw_line = '{} {} \'{}|text {}\n'
    for i, wiki_qs in enumerate(wiki_questions):
        # Create a list of True/False and shuffle it to define the ordering of the train data
        order = list(chain(repeat(False, len(real_questions)),
                           repeat(True, len(wiki_qs))))
        random.shuffle(order)
        iters = (iter(real_questions), iter(wiki_qs))
        with safe_open(DOMAIN_TARGET_PREFIX + str(i), 'w') as f:
            for choice in order:
                f.write(vw_line.format(*next(iters[choice])))
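# Illustration only, not part of the original source: the vw_line template above
# renders a label, an importance weight, a tag, and a single 'text' namespace in
# Vowpal Wabbit input format. With hypothetical values it produces:
#   1 150 'Albert_Einstein|text this physicist proposed special relativity
example_line = '{} {} \'{}|text {}\n'.format(
    '1', '150', 'Albert_Einstein', 'this physicist proposed special relativity')
print(example_line, end='')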
def build_many_docs(pages, documents, use_wiki=True, use_qb=True, use_source=False,
                    rebuild_index=False):
    if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
        log.info('Deleting index: {}'.format(INDEX_NAME))
        ElasticSearchIndex.delete()

    if ElasticSearchIndex.exists():
        log.info('Index {} exists'.format(INDEX_NAME))
    else:
        log.info('Index {} does not exist'.format(INDEX_NAME))
        Answer.init()
        log.info('Indexing questions and corresponding pages as many docs...')
        if use_qb:
            log.info('Indexing questions...')
            bar = progressbar.ProgressBar()
            for page, doc in bar(documents):
                Answer(page=page, qb_content=doc).save()

        if use_wiki:
            log.info('Indexing wikipedia...')
            cw = CachedWikipedia()
            bar = progressbar.ProgressBar()
            for page in bar(pages):
                content = word_tokenize(cw[page].content)
                for i in range(0, len(content), 200):
                    chunked_content = content[i:i + 200]
                    if len(chunked_content) > 0:
                        Answer(page=page, wiki_content=' '.join(chunked_content)).save()
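# Standalone sketch of the chunking step above, not part of the original source:
# a tokenized page is split into fixed-size windows of 200 tokens so that each
# window is indexed as its own document. The token list is a made-up example.
tokens = ['tok{}'.format(n) for n in range(450)]
chunks = [tokens[i:i + 200] for i in range(0, len(tokens), 200)]
assert [len(c) for c in chunks] == [200, 200, 50]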
def text_iterator(use_wiki, wiki_location, use_qb, qb_location,
                  use_source, source_location, limit=-1, min_pages=0,
                  country_list=COUNTRY_LIST_PATH):
    if isinstance(qb_location, str):
        qdb = QuestionDatabase(qb_location)
    else:
        qdb = qb_location
    doc_num = 0

    cw = CachedWikipedia(wiki_location, data_path(country_list))
    pages = qdb.questions_with_pages()

    for p in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
        # This bit of code needs to line up with the logic in qdb.py
        # to have the same logic as the page_by_count function
        if len(pages[p]) < min_pages:
            continue

        if use_qb:
            train_questions = [x for x in pages[p] if x.fold == "train"]
            question_text = "\n".join(" ".join(x.raw_words()) for x in train_questions)
        else:
            question_text = ''

        if use_source:
            filename = '%s/%s' % (source_location, p)
            if os.path.isfile(filename):
                try:
                    with gzip.open(filename, 'rb') as f:
                        source_text = f.read()
                except zlib.error:
                    log.info("Error reading %s" % filename)
                    source_text = ''
            else:
                source_text = ''
        else:
            source_text = u''

        if use_wiki:
            wikipedia_text = cw[p].content
        else:
            wikipedia_text = u""

        total_text = wikipedia_text
        total_text += "\n"
        total_text += question_text
        total_text += "\n"
        total_text += str(source_text)

        yield p, total_text

        doc_num += 1
        if 0 < limit < doc_num:
            break
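# Hypothetical usage sketch, not part of the original source: iterate over the
# combined Wikipedia and quiz bowl text for the five most frequently asked-about
# pages. The wiki and question database paths are assumptions.
for page, text in text_iterator(True, 'data/external/wikipedia',
                                True, 'data/internal/questions.db',
                                False, '', limit=5):
    print(page, len(text))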
def __init__(self, xml_location="data/external/wikifier/data/output",
             wikipedia="data/external/wikipedia",
             country_list=COUNTRY_LIST_PATH):
    super(WikiLinks, self).__init__()
    self.name = "wikilinks"
    self._location = xml_location
    self.links = defaultdict(dict)
    self._wiki = CachedWikipedia(wikipedia, country_list)
    self._cache = -1
    self._matches = None
def build(cls):
    ix = index.create_in(WHOOSH_WIKI_INDEX_PATH, cls.schema)
    writer = ix.writer()
    cw = CachedWikipedia(QB_WIKI_LOCATION, COUNTRY_LIST_PATH)
    qdb = QuestionDatabase(QB_QUESTION_DB)
    questions = qdb.questions_with_pages()
    # Note: this filtered list is immediately superseded by the full page list below
    pages = [page for page, page_questions in questions.items()
             if len(page_questions) < MAX_APPEARANCES]
    pages = list(qdb.get_all_pages(exclude_test=True))
    print("Building whoosh wiki index from {0} pages".format(len(pages)))
    bar = progressbar.ProgressBar()
    for p in bar(pages):
        writer.add_document(page=p, content=cw[p].content)
    writer.commit()
def build_lm_data(path, output):
    cw = CachedWikipedia(path, "")
    o = open(output, 'w')

    count = 0
    for i in [x.split("/")[-1] for x in glob("%s/*" % path)]:
        count += 1
        if count % 1000 == 0:
            print("%i\t%s" % (count, unidecode(i)))

        page = cw[i]
        for ss in nltk.sent_tokenize(page.content):
            o.write("%s\n" % " ".join(kTOKENIZER(unidecode(ss.lower()))))
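# Hypothetical invocation, not part of the original source: the first argument
# is the directory backing the CachedWikipedia page cache, the second the output
# file that receives one tokenized, lowercased sentence per line.
build_lm_data('data/external/wikipedia', 'output/lm_sentences.txt')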
def build(cls, documents: Dict[str, str], index_path=WHOOSH_WIKI_INDEX_PATH):
    ix = index.create_in(safe_path(index_path), cls.schema)
    writer = ix.writer()
    cw = CachedWikipedia()
    print("Building whoosh wiki index from {0} pages".format(len(documents)))
    bar = progressbar.ProgressBar()
    for p in bar(documents):
        writer.add_document(page=p, content=cw[p].content, quiz_bowl=documents[p])
    writer.commit()
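# Query sketch against the index built above, not part of the original source:
# it assumes the schema stores the 'page' field and indexes a 'content' field.
# The query string is a made-up example.
from whoosh import index
from whoosh.qparser import QueryParser

ix = index.open_dir(WHOOSH_WIKI_INDEX_PATH)
with ix.searcher() as searcher:
    query = QueryParser('content', ix.schema).parse('theory of relativity')
    for hit in searcher.search(query, limit=5):
        print(hit['page'], hit.score)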
def create_memory_index():
    dataset = QuizBowlDataset(guesser_train=True)
    training_data = dataset.training_data()
    answers = set(training_data[1])
    cw = CachedWikipedia()

    try:
        Index('mem').delete()
    except elasticsearch.exceptions.NotFoundError:
        # Only ignore a missing index rather than swallowing every exception
        pass

    Answer.init()
    all_wiki_pages = [cw[page] for page in answers]
    wiki_pages = [p for p in all_wiki_pages if p.content != '']
    sc = create_spark_context()
    sc.parallelize(wiki_pages, 1000).foreach(index_page)
def training_data(self):
    cw = CachedWikipedia(QB_WIKI_LOCATION)
    ds = QuizBowlDataset(2)
    train_data = ds.training_data()
    answer_classes = set(train_data[1])
    train_x = []
    train_y = []

    for page in answer_classes:
        sentences = list(wiki_sentences(cw[page].content))
        sampled_sentences = random.sample(sentences, min(len(sentences), self.max_sentences))
        training_examples = []
        for sentence in sampled_sentences:
            training_examples.append(sentence)
        train_x.append(training_examples)
        train_y.append(page)

    return train_x, train_y, None
def build(documents: Dict[str, str], is_human_map):
    try:
        Index('qb').delete()
    except elasticsearch.exceptions.NotFoundError:
        log.info('Could not delete non-existent index, creating new index...')

    Answer.init()
    cw = CachedWikipedia()
    bar = progressbar.ProgressBar()
    for page in bar(documents):
        if page in is_human_map:
            is_human = is_human_map[page]
        else:
            is_human = False
        answer = Answer(
            page=page,
            wiki_content=cw[page].content,
            qb_content=documents[page],
            is_human=is_human
        )
        answer.save()
def build_large_docs(documents: Dict[str, str], use_wiki=True, use_qb=True,
                     use_source=False, rebuild_index=False):
    if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
        log.info('Deleting index: {}'.format(INDEX_NAME))
        ElasticSearchIndex.delete()

    if ElasticSearchIndex.exists():
        log.info('Index {} exists'.format(INDEX_NAME))
    else:
        log.info('Index {} does not exist'.format(INDEX_NAME))
        Answer.init()
        cw = CachedWikipedia()
        source = Source()
        log.info('Indexing questions and corresponding wikipedia pages as large docs...')
        bar = progressbar.ProgressBar()
        for page in bar(documents):
            if use_wiki:
                wiki_content = cw[page].content
            else:
                wiki_content = ''

            if use_qb:
                qb_content = documents[page]
            else:
                qb_content = ''

            if use_source:
                source_content = source[page][:50000]
            else:
                source_content = ''

            answer = Answer(
                page=page,
                wiki_content=wiki_content,
                qb_content=qb_content,
                source_content=source_content
            )
            answer.save()
def build(documents: Dict[str, str], instance_of_map, rebuild_index=False):
    if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
        log.info('Deleting index: {}'.format(INDEX_NAME))
        ElasticSearchIndex.delete()

    if ElasticSearchIndex.exists():
        log.info('Index {} exists, skipping building index'.format(INDEX_NAME))
    else:
        log.info('Index {} does not exist, building index...'.format(INDEX_NAME))
        Answer.init()
        cw = CachedWikipedia()
        bar = progressbar.ProgressBar()
        for page in bar(documents):
            if page in instance_of_map:
                instance_of = instance_of_map[page]
            else:
                instance_of = NO_MATCH
            answer = Answer(
                page=page,
                wiki_content=cw[page].content,
                qb_content=documents[page],
                instance_of=instance_of
            )
            answer.save()
def init_wiki_cache(wiki_cache):
    CachedWikipedia.initialize_cache(wiki_cache)
for ii in progress:
    log.info("MAP %s: %s" % (ii, progress[ii].most_common(5)))
for ii in folds:
    log.info("PB FOLD %s: %i" % (ii, folds[ii]))
log.info("Added %i, skipped %i" % (last_id, num_skipped))

if flags.guess:
    if not os.path.exists(flags.wiki_title):
        import urllib.request
        urllib.request.urlretrieve(
            "http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-all-titles-in-ns0.gz",
            flags.wiki_title)
    tf = TitleFinder(flags.wiki_title, CachedWikipedia(), pa.known_pages(),
                     QuestionDatabase.normalize_answer)
    guesses = tf.best_guess(unmapped)
else:
    guesses = dict((x, "") for x in unmapped)

wiki_total = Counter()
wiki_answers = defaultdict(set)
for ii in guesses:
    page = guesses[ii]
    wiki_total[page] += unmapped[ii]
    wiki_answers[page].add(ii)

for ii in [x for x in unmapped if x not in guesses]:
    wiki_answers[''].add(ii)
if last_id % 1000 == 0:
    progress = pa.get_counts()
    for ii in progress:
        log.info("MAP %s: %s" % (ii, progress[ii].most_common(5)))
    for ii in folds:
        log.info("PB FOLD %s: %i" % (ii, folds[ii]))
    log.info("Added %i, skipped %i" % (last_id, num_skipped))

if flags.guess:
    if not os.path.exists(flags.wiki_title):
        import urllib.request
        urllib.request.urlretrieve(
            "http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-all-titles-in-ns0.gz",
            flags.wiki_title)
    tf = TitleFinder(flags.wiki_title, CachedWikipedia(), pa.known_pages(),
                     QuestionDatabase.normalize_answer)
    guesses = tf.best_guess(unmapped)
else:
    guesses = dict((x, "") for x in unmapped)

wiki_total = Counter()
wiki_answers = defaultdict(set)
for ii in guesses:
    page = guesses[ii]
    wiki_total[page] += unmapped[ii]
    wiki_answers[page].add(ii)

for ii in [x for x in unmapped if x not in guesses]:
    wiki_answers[''].add(ii)
                    type=str, default='data/internal/page_assignment/ambiguous/')
parser.add_argument('--unambiguous_path', type=str,
                    default='data/internal/page_assignment/unambiguous/')
flags = parser.parse_args()

pa = PageAssigner(QuestionDatabase.normalize_answer)
for ii in glob("%s/*" % flags.ambiguous_path):
    pa.load_ambiguous(ii)
for ii in glob("%s/*" % flags.unambiguous_path):
    pa.load_unambiguous(ii)
for ii in glob("%s/*" % flags.direct_path):
    pa.load_direct(ii)

cw = CachedWikipedia()
tf = TitleFinder("data/enwiki-latest-all-titles-in-ns0.gz", cw, pa.known_pages(),
                 normalize=QuestionDatabase.normalize_answer)

for ii in ['die leiden des jungen werthers', '99 Luftballons', 'saint nicholas of myra',
           'édouard roche', 'the mahdi or mohammad ahmed', 'the first vatican council',
           'antietam national battlefield', 'cia', 'samuel f b morse',
           'the passion according to st matthew or st matthew’s passion or matthäuspassion',
           'another world', 'rolling in the deep', 'tony gwynn', 'opal', 'tylenol',
           'queues', 'dachau', 'lipoproteins', 'haiku', 'japan', 'zoroastrianism']: