def process_file(filename):
    with open(filename, 'r') as f:
        questions = defaultdict(set)
        for line in f:
            tokens = line.split()
            offset = 1 if int(tokens[0]) == -1 else 0
            ident = tokens[1 + offset].replace("'", "").split('_')
            q = int(ident[0])
            s = int(ident[1])
            t = int(ident[2])
            guess = tokens[3 + offset]
            questions[(q, s, t)].add(guess)

    qdb = QuestionDatabase('data/questions.db')
    answers = qdb.all_answers()
    recall = 0
    warn = 0
    for ident, guesses in questions.items():
        if len(guesses) < N_GUESSES:
            print("WARNING LOW GUESSES")
            print('Question {0} is missing guesses, only has {1}'.format(ident, len(guesses)))
            warn += 1
        correct = answers[ident[0]].replace(' ', '_') in guesses
        recall += correct
    print('Recall: {0} Total: {1}'.format(recall / len(questions), len(questions)))
    print('Warned lines: {0}'.format(warn))
def generate(min_count, qdb, pred_file, meta_file, output):
    database = QuestionDatabase(qdb)
    data = load_data(pred_file, meta_file, database)
    dan_answers = set(database.page_by_count(min_count=min_count, exclude_test=True))
    answers = compute_answers(data, dan_answers)
    stats = compute_statistics(answers).cache()
    stats.to_json(output, root_array=False)
    pp = pprint.PrettyPrinter()
    pp.pprint(stats)
def initialize_cache(path):
    """
    Iterate over all pages and access each one in the cache.
    This forces a prefetch of all wiki pages.
    """
    db = QuestionDatabase(QB_QUESTION_DB)
    pages = db.questions_with_pages()
    cw = CachedWikipedia(path)
    for p in pages:
        cw[p].content
def build(cls):
    ix = index.create_in(WHOOSH_WIKI_INDEX_PATH, cls.schema)
    writer = ix.writer()
    cw = CachedWikipedia(QB_WIKI_LOCATION, COUNTRY_LIST_PATH)
    qdb = QuestionDatabase(QB_QUESTION_DB)
    pages = list(qdb.get_all_pages(exclude_test=True))
    print("Building whoosh wiki index from {0} pages".format(len(pages)))
    bar = progressbar.ProgressBar()
    for p in bar(pages):
        writer.add_document(page=p, content=cw[p].content)
    writer.commit()
def wikify(output_directory):
    database = QuestionDatabase(QB_QUESTION_DB)
    pages = database.questions_with_pages()
    total = 0
    for p in pages:
        if len(pages[p]) >= MIN_APPEARANCES:
            print(p, len(pages[p]))
            for q in pages[p]:
                total += 1
                for sentence, word, text in q.partials():
                    sentence -= 1
                    with open("%s/%i-%i.txt" % (output_directory, q.qnum, sentence), 'w') as output:
                        output.write("%s\n" % unidecode(text[sentence]))
    print(total)
def compute_stats():
    qdb = QuestionDatabase(QB_QUESTION_DB)
    ir = IrExtractor()
    questions = qdb.guess_questions()
    test_guesses = pseq(questions, partition_size=100)\
        .filter(lambda q: q.fold == 'test')\
        .map(lambda q: (q.page, ir.text_guess(q.flatten_text())))
    correct = 0
    close = 0
    total = 0
    for page, guesses in test_guesses:
        top_guess = max(guesses.items(), key=lambda x: x[1], default=None)
        if top_guess is not None and page == top_guess[0]:
            correct += 1
        elif page in guesses:
            close += 1
        total += 1
    print("Total Correct: {0}, Percent Correct: {1}".format(correct, correct / total))
    print("Total Close: {0}, Percent Close: {1}".format(close, close / total))
def build_classifier(class_type, output: str, bigram_thresh=1000):
    questions = QuestionDatabase(QB_QUESTION_DB)
    bigram_filename = "output/classifier/%s/bigrams.pkl" % class_type
    if os.path.exists(bigram_filename):
        with open(bigram_filename, 'rb') as f:
            bgset = pickle.load(f)
        print("Using previous bigrams")
    else:
        print("computing bigrams...")
        bgset = compute_frequent_bigrams(bigram_thresh, questions)
        write_bigrams(bgset, bigram_filename)
    train_classifier(output, bgset, questions, class_type)
    evaluate(output, bgset, questions, class_type)
def create_guesses(guess_db_path, processes=cpu_count()):
    q_db = QuestionDatabase(QB_QUESTION_DB)
    guess_list = GuessList(guess_db_path)
    deep_feature = instantiate_feature('deep', q_db)
    questions = q_db.guess_questions()
    tasks = []
    for q in questions:
        tasks.append((q, deep_feature))

    with Pool(processes=processes) as pool:
        question_guesses = pool.imap(parallel_generate_guesses, tasks)
        i, n = 0, len(tasks)
        log.info("Guess generation starting for {0} questions".format(n))
        for qnum, fold, guesses in question_guesses:
            guess_list.save_guesses('deep', qnum, fold, guesses)
            log.info("Progress: {0} / {1} questions completed".format(i, n))
            i += 1

    log.info("Guess generation completed, generating indices")
    guess_list.create_indexes()
    log.info("Guess generation done")
def spark_batch(sc: SparkContext, feature_names, question_db: str, guess_db: str,
                granularity='sentence'):
    sql_context = SQLContext(sc)
    question_db = QuestionDatabase(question_db)

    log.info("Loading Questions")
    questions = question_db.guess_questions()

    log.info("Loading Guesses")
    guess_list = GuessList(guess_db)
    guess_lookup = guess_list.all_guesses(allow_train=True)

    log.info("Loading tasks")
    tasks = [Task(q, guess_lookup[q.qnum]) for q in questions]
    shuffle(tasks)
    log.info("Number of tasks: {0}".format(len(tasks)))

    features = {name: instantiate_feature(name, question_db) for name in feature_names}
    b_features = sc.broadcast(features)

    def f_eval(x):
        return evaluate_feature_question(x, b_features, granularity)

    log.info("Beginning feature job")
    feature_rdd = sc.parallelize(tasks)\
        .repartition(150 * len(feature_names))\
        .flatMap(f_eval)

    feature_df = sql_context.createDataFrame(feature_rdd, SCHEMA).cache()
    feature_df.count()

    log.info("Beginning write job")
    for fold in FOLDS:
        feature_df_with_fold = feature_df.filter('fold = "{0}"'.format(fold)).cache()
        for name in feature_names:
            filename = 'output/features/{0}/sentence.{1}.parquet'.format(fold, name)
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            feature_df_with_fold.filter('feature_name = "{0}"'.format(name))\
                .write.save(filename, mode='overwrite')
        feature_df_with_fold.unpersist()

    log.info("Computation Completed, stopping Spark")
    sc.stop()
def start_spark_streaming():
    question_db = QuestionDatabase(QB_QUESTION_DB)
    features = {name: instantiate_feature(name, question_db) for name in FEATURE_NAMES}
    sc = create_sc()
    b_features = sc.broadcast(features)
    ssc = StreamingContext(sc, 5)

    ssc.socketTextStream('localhost', 9999) \
        .repartition(QB_STREAMING_CORES - 1) \
        .flatMap(lambda line: generate_guesses(line, b_features)) \
        .map(lambda sg: evaluate_features(sg, b_features)) \
        .foreachRDD(score_and_save)

    ssc.start()
    ssc.awaitTermination()
    sc.stop()
def instantiate_feature(feature_name: str, question_db: QuestionDatabase):
    """
    @param feature_name: The feature to instantiate
    @param question_db: question database
    """
    feature = None
    print("Loading feature %s ..." % feature_name)
    if feature_name == "lm":
        feature = LanguageModel(data_path(CLM_PATH))
    elif feature_name == "deep":
        page_dict = {}
        for page in question_db.get_all_pages():
            page_dict[page.lower().replace(' ', '_')] = page
        feature = DeepExtractor(
            C.DEEP_DAN_CLASSIFIER_TARGET,
            C.DEEP_DAN_PARAMS_TARGET,
            C.DEEP_VOCAB_TARGET,
            "data/internal/common/ners",
            page_dict
        )
    elif feature_name == "wikilinks":
        feature = WikiLinks()
    elif feature_name == "answer_present":
        feature = AnswerPresent()
    elif feature_name == "label":
        feature = Labeler(question_db)
    elif feature_name == "classifier":
        # TODO: Change this to depend on any given bigrams.pkl, which are atm all the same
        feature = Classifier(question_db)
    elif feature_name == "mentions":
        answers = set(x for x, y in text_iterator(
            False, "", False, question_db, False, "",
            limit=-1, min_pages=MIN_APPEARANCES))
        feature = Mentions(answers)
    else:
        log.info("Don't know what to do with %s" % feature_name)
    log.info("done")
    return feature
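# Usage sketch (not part of the original source): one way a caller could assemble
# the feature dictionary that spark_batch and start_spark_streaming broadcast.
# The function name load_all_features is hypothetical; FEATURE_NAMES and
# QB_QUESTION_DB are assumed to be the project constants referenced elsewhere here.
def load_all_features(feature_names=FEATURE_NAMES):
    question_db = QuestionDatabase(QB_QUESTION_DB)
    # instantiate_feature returns None for unrecognized names, so drop those
    features = {name: instantiate_feature(name, question_db) for name in feature_names}
    return {name: f for name, f in features.items() if f is not None}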
def text_iterator(use_wiki, wiki_location,
                  use_qb, qb_location,
                  use_source, source_location,
                  limit=-1, min_pages=0, country_list=COUNTRY_LIST_PATH):
    if isinstance(qb_location, str):
        qdb = QuestionDatabase(qb_location)
    else:
        qdb = qb_location

    doc_num = 0
    cw = CachedWikipedia(wiki_location, data_path(country_list))
    pages = qdb.questions_with_pages()

    for pp in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
        # This bit of code needs to line up with the logic in qdb.py
        # to have the same logic as the page_by_count function
        if len(pages[pp]) < min_pages:
            continue

        if use_qb:
            train_questions = [x for x in pages[pp] if x.fold == "train"]
            question_text = u"\n".join(u" ".join(x.raw_words()) for x in train_questions)
        else:
            question_text = u''

        if use_source:
            filename = '%s/%s' % (source_location, pp)
            if os.path.isfile(filename):
                try:
                    with gzip.open(filename, 'rb') as f:
                        source_text = f.read()
                except zlib.error:
                    print("Error reading %s" % filename)
                    source_text = ''
            else:
                source_text = ''
        else:
            source_text = u''

        if use_wiki:
            wikipedia_text = cw[pp].content
        else:
            wikipedia_text = u""

        total_text = wikipedia_text
        total_text += "\n"
        total_text += question_text
        total_text += "\n"
        total_text += unidecode(str(source_text))

        yield pp, total_text
        doc_num += 1

        if 0 < limit < doc_num:
            break
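# Usage sketch (hypothetical, not in the original source): consuming text_iterator
# to collect the set of answer pages, similar to the "mentions" branch of
# instantiate_feature. The function name answer_pages is illustrative only;
# QB_WIKI_LOCATION, QB_QUESTION_DB, and MIN_APPEARANCES are assumed to be the
# project constants used elsewhere in this file.
def answer_pages(min_pages=MIN_APPEARANCES):
    pages = set()
    for page, _text in text_iterator(True, QB_WIKI_LOCATION,
                                     True, QB_QUESTION_DB,
                                     False, "",
                                     limit=-1, min_pages=min_pages):
        pages.add(page)
    return pages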
import operator

kBAD_ANSWERS = ["", "red river", "the", "figaro", "normal", "s", "p"]

if __name__ == "__main__":
    args = argparse.ArgumentParser('Interactive assign pages to questions')
    args.add_argument('--database', type=str, default='data/questions.db',
                      help='sqlite3 database of questions')
    args.add_argument('--titles', type=str, default='data/wiki_index.pkl',
                      help='page title candidates')
    args.add_argument('--labels', type=str, default='data/map/ans_to_wiki',
                      help='write page assignment answers')
    args = args.parse_args()

    # Open up the database
    d = QuestionDatabase(args.database)
    page_diversity = d.answer_map(normalize)

    # Set up the active learner for writing assignments
    al = ActiveLearner(None, args.labels)
    existing_labels = set(x[0] for x in al.human_labeled())

    # Get the candidates we want to assign to pages
    answers = d.unmatched_answers(existing_labels)
    print(list(answers.keys())[:10])

    # Open up the title finder
    tf = TitleFinder(open(args.titles))

    for ans, count in sorted(answers.items(),
                             key=lambda x: sum(x[1].values()), reverse=True):
        first, rest = ii.split('ID="', 1)
        id, rest = rest.split('" TITLE="', 1)
        title, rest = rest.split('"', 1)
        self.topics[int(id)] = title

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description='Import questions')
    parser.add_argument('--naqt_path', type=str)
    parser.add_argument('--db', type=str, default='data/questions.db')
    flags = parser.parse_args()

    qdb = QuestionDatabase(flags.db)
    conn = qdb._conn
    answer_map = qdb.answer_map()

    # Find existing naqt questions
    c = conn.cursor()
    command = 'SELECT naqt FROM questions WHERE naqt >= 0;'
    c.execute(command)
    existing = set(int(x[0]) for x in c)

    num_skipped = 0
    last_id = kNAQT_START
    if flags.naqt_path:
        for qq in naqt_reader(flags.naqt_path):
            if qq.answer in answer_map and len(answer_map[qq.answer]) == 1:
                page = list(answer_map[qq.answer].keys())[0]
import time

from page_assignment.active_learning_for_matching import ActiveLearner
from qanta.util.qdb import QuestionDatabase

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="apply wikipedia pages")
    parser.add_argument("--db", default='data/questions.db', type=str,
                        help="The question database")
    parser.add_argument("--match_location", type=str,
                        default='data/map/ans_to_wiki_',
                        help="Where we read matches learned")
    flags = parser.parse_args()

    start = time.time()
    print("Loading db..")
    db = QuestionDatabase(flags.db)

    print("Loading classifier...")
    classifier = ActiveLearner(None, flags.match_location, [])

    for question, page in classifier.human_labeled():
        ans_type = ""
        db.set_answer_page(question, page, ans_type)
        print(question, page, "GIVEN", ans_type)
def main():
    import argparse
    parser = argparse.ArgumentParser(description='')
    default_path = 'data/'
    parser.add_argument('--question_db', type=str, default=default_path + 'questions.db')
    parser.add_argument('--guess_db', type=str, default=default_path + 'guesses.db',
                        help="Guess database")
    parser.add_argument("--num_choices", type=int, default=4,
                        help="How many choices do we write")
    parser.add_argument("--train_out", type=str, default="sci_train.csv")
    parser.add_argument("--test_out", type=str, default="sci_test.csv")
    parser.add_argument("--key_out", type=str, default="sci_key.csv")
    flags = parser.parse_args()

    # Create database connections
    print("Opening %s" % flags.question_db)
    question_database = sqlite3.connect(flags.question_db)
    guess_database = sqlite3.connect(flags.guess_db)

    # First get answers of interest and put them in a dictionary
    # where the value is their count
    query = 'select page from questions where page != "" and ('
    query += " or ".join("category='%s'" % x for x in CATEGORIES)
    query += ")"

    c = question_database.cursor()
    print(query)
    c.execute(query)

    answer_count = defaultdict(int)
    for pp, in c:
        answer_count[pp] += 1

    query = 'select page, id, naqt, fold from questions where page != ""'
    c = question_database.cursor()
    c.execute(query)

    print([x for x in answer_count if answer_count[x] >= COUNT_CUTOFF])
    print(len([x for x in answer_count if answer_count[x] >= COUNT_CUTOFF]))

    # Load the DAN to generate guesses if they're missing from the database
    deep = instantiate_feature("deep", QuestionDatabase(flags.question_db))

    questions = {}
    question_num = 0
    for pp, ii, nn, ff in c:
        if nn >= 0 or answer_count[pp] < COUNT_CUTOFF:
            continue
        question_num += 1
        question = McScience(pp, ii, ff)
        question.add_text(question_first_sentence(question_database, ii))
        choices = question_top_guesses(question.text, deep, guess_database,
                                       ii, pp, flags.num_choices)
        question.add_choices(choices)
        questions[ii] = question
        if question_num % 100 == 0:
            print(pp, ii, question_num)
            print(choices)

    answer_choices = ["answer%s" % CHOICEIDS[x] for x in range(flags.num_choices)]
    train_out = DictWriter(open(flags.train_out, 'w'),
                           ["id", "question", "correctAnswer"] + answer_choices)
    train_out.writeheader()
    test_out = DictWriter(open(flags.test_out, 'w'),
                          ["id", "question"] + answer_choices)
    test_out.writeheader()
    key_out = DictWriter(open(flags.key_out, 'w'), ["id", "correctAnswer"])
    key_out.writeheader()

    # Now write the questions out
    for qq in questions.values():
        print(qq.fold)
        if qq.fold == "devtest":
            test_out.writerow(qq.csv_line(CHOICEIDS, "test"))
            key_out.writerow(qq.csv_line(CHOICEIDS, "key"))
        else:
            train_out.writerow(qq.csv_line(CHOICEIDS, "train"))
    return seen


if __name__ == "__main__":
    from util import flags

    flags.define_string("title_index", None, "Pickle of all titles")
    flags.define_string("label_path", None, "Where we write page associations")
    flags.define_string("database", None, "Question database")
    flags.define_string("performance_output", None, "Where we write user performance")
    flags.define_string("user", None, "User identifier")
    flags.InitFlags()

    seen = already_answered(flags.performance_output, flags.user)
    al = ActiveLearner(None, flags.label_path)

    print("Loading question db %s" % flags.database)
    db = QuestionDatabase(flags.database)
    pw = PerformanceWriter(flags.performance_output, flags.user)
    tf = TitleFinder(open(flags.title_index))

    questions = db.questions_by_tournament("High School Championship")
    for qid in questions:
        question = questions[qid]
        if question.fold == "train" or qid in seen:
            continue
        choices = list(tf.query(question.answer))

        # Get what and when the human answered
        wp, idx, ans = get_answer([question.text[x] for x in sorted(question.text)],
                                  question.answer, question.page)

        print("\n".join(question.text.values()))