def create_wikipedia_cache(parsed_wiki_path='data/external/wikipedia/parsed-wiki',
                           output_path=WIKI_LOOKUP_PATH):
    from qanta.spark import create_spark_context

    sc = create_spark_context()
    db = QuestionDatabase()
    questions = list(db.all_questions().values())
    train_questions = [
        q for q in questions
        if q.fold == 'guesstrain' or q.fold == 'buzzertrain'
    ]
    answers = {q.page for q in train_questions}
    b_answers = sc.broadcast(answers)
    # Paths used in Spark need to be absolute and must exist
    page_path = os.path.abspath(parsed_wiki_path)
    page_pattern = os.path.join(page_path, '*', '*')

    def parse_page(json_text):
        page = json.loads(json_text)
        return {
            'id': int(page['id']),
            'title': page['title'].replace(' ', '_'),
            'text': page['text'],
            'url': page['url']
        }

    wiki_pages = sc.textFile(page_pattern)\
        .map(parse_page)\
        .filter(lambda p: p['title'] in b_answers.value)\
        .collect()
    wiki_lookup = {p['title']: p for p in wiki_pages}
    with open(output_path, 'w') as f:
        json.dump(wiki_lookup, f)

    return wiki_lookup

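# A minimal usage sketch, assuming the parsed wiki dump exists at the default
# path and WIKI_LOOKUP_PATH is writable; the printed fields follow parse_page.
def _example_create_wikipedia_cache():
    wiki_lookup = create_wikipedia_cache()
    # Titles are keyed with underscores in place of spaces
    for title in list(wiki_lookup)[:3]:
        print(title, wiki_lookup[title]['id'], wiki_lookup[title]['url'])
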
def requires(self):
    db = QuestionDatabase()
    questions = list(db.all_questions().values())
    n_batches = int(math.ceil(len(questions) / BATCH_SIZE))
    for i in range(n_batches):
        yield TaggedQuestionBatch(question_batch=i)

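# A quick illustration of the ceil-division batch count above (the sizes here
# are made up): a trailing partial batch still gets its own TaggedQuestionBatch.
#   int(math.ceil(2500 / 1000)) == 3   # two full batches plus one partial
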
def create_wikipedia_redirect_pickle(redirect_csv, output_pickle):
    countries = {}
    with open(COUNTRY_LIST_PATH) as f:
        for line in f:
            k, v = line.split('\t')
            countries[k] = v.strip()

    db = QuestionDatabase()
    pages = set(db.all_answers().values())

    with open(redirect_csv) as redirect_f:
        redirects = {}
        n_total = 0
        n_selected = 0
        for row in csv.reader(redirect_f, quotechar='"', escapechar='\\'):
            n_total += 1
            source = row[0]
            target = row[1]
            if (target not in pages
                    or source in countries
                    or target.startswith('WikiProject')
                    or target.endswith('_topics')
                    or target.endswith('_(overview)')):
                continue
            redirects[source] = target
            n_selected += 1

        log.info('Filtered {} raw wikipedia redirects to {} matching redirects'.format(
            n_total, n_selected))

    with open(output_pickle, 'wb') as output_f:
        pickle.dump(redirects, output_f)

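# A usage sketch under assumed paths (both file names and the sample title are
# illustrative): build the redirect map, then resolve a source title.
def _example_redirect_lookup():
    create_wikipedia_redirect_pickle('data/redirects.csv', 'output/redirects.pickle')
    with open('output/redirects.pickle', 'rb') as f:
        redirects = pickle.load(f)
    # Returns the canonical target page if the redirect survived filtering
    print(redirects.get('Einstein'))
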
def process_file(filename):
    with open(filename, 'r') as f:
        questions = defaultdict(set)
        for line in f:
            tokens = line.split()
            offset = 1 if int(tokens[0]) == -1 else 0
            ident = tokens[1 + offset].replace("'", "").split('_')
            q = int(ident[0])
            s = int(ident[1])
            t = int(ident[2])
            guess = tokens[3 + offset]
            questions[(q, s, t)].add(guess)

    qdb = QuestionDatabase('data/questions.db')
    answers = qdb.all_answers()
    recall = 0
    warn = 0
    for ident, guesses in questions.items():
        if len(guesses) < conf['n_guesses']:
            log.info('WARNING LOW GUESSES')
            log.info('Question {0} is missing guesses, only has {1}'.format(
                ident, len(guesses)))
            warn += 1
        correct = answers[ident[0]].replace(' ', '_') in guesses
        recall += correct
    log.info('Recall: {0} Total: {1}'.format(recall / len(questions), len(questions)))
    log.info('Warned lines: {0}'.format(warn))

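# A sketch of the guess-file line format process_file expects; this sample
# line is an assumption reconstructed from the token offsets above, not real
# model output. With a leading -1, every index shifts one token to the right.
def _example_guess_line():
    line = "1 '4242_3_12 0.5 Albert_Einstein"
    tokens = line.split()
    offset = 1 if int(tokens[0]) == -1 else 0
    q, s, t = (int(x) for x in tokens[1 + offset].replace("'", "").split('_'))
    print((q, s, t), tokens[3 + offset])  # (4242, 3, 12) Albert_Einstein
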
def generate(min_count, pred_file, meta_file, output):
    database = QuestionDatabase()
    data = load_data(pred_file, meta_file, database)
    dan_answers = set(database.page_by_count(min_count, True))
    answers = compute_answers(data, dan_answers)
    stats = compute_statistics(answers).cache()
    stats.to_json(safe_path(output), root_array=False)
    pprint.pprint(stats)

def main():
    questions = QuestionDatabase().all_questions()
    guessdev_questions = {
        k: v for k, v in questions.items() if v.fold == 'guessdev'
    }
    highlights = {}
    for k, v in tqdm(guessdev_questions.items()):
        highlights[k] = get_highlights(v.flatten_text())
    with open('guessdev_highlight.pkl', 'wb') as f:
        pickle.dump(highlights, f)

def test():
    questions = QuestionDatabase().all_questions()
    guessdev_questions = [x for x in questions.values() if x.fold == 'guessdev']
    # questions is a dict keyed by qnum, so questions[0] would likely KeyError;
    # highlight the first guessdev question instead
    highlights = get_highlights(guessdev_questions[0].flatten_text())
    print(highlights['guess'])
    for x in highlights['wiki']:
        print('WIKI|' + x.replace('<em>', color.RED).replace('</em>', color.END))
    for x in highlights['qb']:
        print('QUIZ|' + x.replace('<em>', color.RED).replace('</em>', color.END))

def text_iterator(use_wiki, wiki_location,
                  use_qb, qb_location,
                  use_source, source_location,
                  limit=-1, min_pages=0, country_list=COUNTRY_LIST_PATH):
    if isinstance(qb_location, str):
        qdb = QuestionDatabase(qb_location)
    else:
        qdb = qb_location

    doc_num = 0
    cw = CachedWikipedia(wiki_location, data_path(country_list))
    pages = qdb.questions_with_pages()

    for p in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
        # This filter must line up with the page_by_count logic in qdb.py
        if len(pages[p]) < min_pages:
            continue

        if use_qb:
            train_questions = [x for x in pages[p] if x.fold == 'train']
            question_text = '\n'.join(' '.join(x.raw_words()) for x in train_questions)
        else:
            question_text = ''

        if use_source:
            filename = '%s/%s' % (source_location, p)
            if os.path.isfile(filename):
                try:
                    with gzip.open(filename, 'rb') as f:
                        source_text = f.read()
                except zlib.error:
                    log.info('Error reading %s' % filename)
                    source_text = ''
            else:
                source_text = ''
        else:
            source_text = ''

        if use_wiki:
            wikipedia_text = cw[p].content
        else:
            wikipedia_text = ''

        total_text = wikipedia_text
        total_text += '\n'
        total_text += question_text
        total_text += '\n'
        total_text += str(source_text)

        yield p, total_text
        doc_num += 1

        if 0 < limit < doc_num:
            break

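# A usage sketch, assuming the repo's question database constant and a local
# Wikipedia cache (the wiki_location value here is an assumption): stream the
# five largest pages with question and Wikipedia text enabled, source disabled.
def _example_text_iterator():
    for page, text in text_iterator(True, 'data/external/wikipedia',
                                    True, QB_QUESTION_DB,
                                    False, '', limit=5):
        print(page, len(text))
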
def train_classifier(class_type, question_db=None):
    if question_db is None:
        question_db = QuestionDatabase(QB_QUESTION_DB)

    log.info('Training classifier: {}'.format(class_type))
    all_questions = question_db.questions_with_pages()
    train = compute_features(all_questions, 'train', class_type)
    train_x = train['text']
    train_y = train['label']
    classifier = pipeline_creators[class_type]().fit(train_x, train_y)
    return classifier

def initialize_cache(path):
    """
    Iterate over all pages and access each one through the cache.
    This forces a prefetch of all wiki pages.
    """
    db = QuestionDatabase(QB_QUESTION_DB)
    pages = db.questions_with_pages()
    cw = CachedWikipedia(path)
    pool = Pool()
    input_data = [(format_guess(title), cw) for title in pages.keys()]
    pool.starmap(access_page, input_data)

def load_quizbowl(folds=c.BUZZER_INPUT_FOLDS) \
        -> Tuple[Dict[str, int], Dict[str, list]]:
    # merge_dfs()
    log.info('Loading data')
    question_db = QuestionDatabase()
    quizbowl_db = QuizBowlDataset(bc.MIN_ANSWERS, guesser_train=True,
                                  buzzer_train=True)
    all_questions = question_db.all_questions()
    if not os.path.isfile(bc.OPTIONS_DIR):
        log.info('Loading the set of options')
        all_options = set(quizbowl_db.training_data()[1])
        id2option = list(all_options)
        with open(safe_path(bc.OPTIONS_DIR), 'wb') as outfile:
            pickle.dump(id2option, outfile)
    else:
        with open(safe_path(bc.OPTIONS_DIR), 'rb') as infile:
            id2option = pickle.load(infile)
    option2id = {o: i for i, o in enumerate(id2option)}
    num_options = len(id2option)
    log.info('Number of options {0}'.format(num_options))

    guesses_by_fold = dict()
    for fold in folds:
        save_dir = '%s_processed.pickle' % (os.path.join(bc.GUESSES_DIR, fold))
        if os.path.isfile(save_dir):
            with open(safe_path(save_dir), 'rb') as infile:
                guesses_by_fold[fold] = pickle.load(infile)
            log.info('Loading {0} guesses'.format(fold))
            continue

        log.info('Processing {0} guesses'.format(fold))
        guesses = AbstractGuesser.load_guesses(bc.GUESSES_DIR, folds=[fold])

        worker = partial(_process_question, option2id, all_questions)
        inputs = guesses.groupby('qnum')
        guesses_by_fold[fold] = _multiprocess(worker, inputs,
                                              info='df data', multi=True)
        guesses_by_fold[fold] = [x for x in guesses_by_fold[fold]
                                 if x is not None]
        print(len(guesses_by_fold[fold]))

        with open(safe_path(save_dir), 'wb') as outfile:
            pickle.dump(guesses_by_fold[fold], outfile)
        log.info('Processed {0} guesses saved to {1}'.format(fold, save_dir))

    return option2id, guesses_by_fold

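# A minimal sketch, assuming guesses have already been generated into
# bc.GUESSES_DIR: load the buzzer inputs and inspect the option vocabulary.
def _example_load_buzzer_inputs():
    option2id, guesses_by_fold = load_quizbowl()
    print(len(option2id), sorted(guesses_by_fold.keys()))
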
def run(self):
    db = QuestionDatabase(QB_QUESTION_DB)
    questions = db.all_questions()
    with open(safe_path(EXPO_QUESTIONS), 'w', newline='') as f:
        f.write('id,answer,sent,text\n')
        writer = csv.writer(f, delimiter=',')
        for q in questions.values():
            if q.fold != 'test':
                continue
            max_sent = max(q.text.keys())
            for i in range(max_sent + 1):
                writer.writerow([q.qnum, format_guess(q.page), i, q.text[i]])

def web_initialize_file_cache(path, remote_delay=1):
    """
    Initialize the cache by requesting each page with the wikipedia package.
    This iterates over all pages and accesses each one through the cache,
    forcing a prefetch of all wiki pages.
    """
    db = QuestionDatabase()
    pages = db.questions_with_pages()
    cw = CachedWikipedia(path, remote_delay=remote_delay)
    pool = Pool()
    input_data = [(title, cw) for title in pages.keys()]
    pool.starmap(access_page, input_data)

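# access_page is not shown in this section; given the (title, cw) starmap
# tuples above, it is assumed to simply index the cache so each worker
# process triggers one fetch, roughly:
def _assumed_access_page(title, cw):
    cw[title]  # indexing CachedWikipedia populates the on-disk cache
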
def create_output(path: str):
    df = read_dfs(path).cache()
    question_db = QuestionDatabase()
    answers = question_db.all_answers()
    for qnum in answers:
        answers[qnum] = format_guess(answers[qnum])
    sc = SparkContext.getOrCreate()  # type: SparkContext
    b_answers = sc.broadcast(answers)

    def generate_string(group):
        rows = group[1]
        feature_values = []
        meta = None
        qnum = None
        sentence = None
        token = None
        guess = None
        for name in FEATURE_NAMES:
            named_feature_list = list(filter(lambda r: r.feature_name == name, rows))
            if len(named_feature_list) != 1:
                raise ValueError('Expected exactly one row per feature name, found {}'.format(
                    len(named_feature_list)))
            named_feature = named_feature_list[0]
            if meta is None:
                qnum = named_feature.qnum
                sentence = named_feature.sentence
                token = named_feature.token
                guess = named_feature.guess
                meta = '{} {} {} {}'.format(qnum, named_feature.sentence,
                                            named_feature.token, guess)
            feature_values.append(named_feature.feature_value)
        vw_features = ' '.join(feature_values)
        # The original assert checked an always-empty string; validate the
        # joined feature line instead, since '@' is the meta separator
        assert '@' not in vw_features, \
            '@ is a special character that is split on and not allowed in the feature line'
        if guess == b_answers.value[qnum]:
            vw_label = "1 '{}_{}_{} ".format(qnum, sentence, token)
        else:
            vw_label = "-1 '{}_{}_{} ".format(qnum, sentence, token)
        return vw_label + vw_features + '@' + meta

    for fold in VW_FOLDS:
        group_features(df.filter(df.fold == fold))\
            .map(generate_string)\
            .saveAsTextFile('output/vw_input/{0}.vw'.format(fold))
    sc.stop()

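# A sketch of the Vowpal Wabbit line generate_string emits: a +/-1 label with
# a 'qnum_sentence_token tag, the space-joined feature strings, then '@'
# separating the "qnum sentence token guess" meta. The feature text below is
# invented for illustration:
#   1 '4242_3_12 |guessers score:0.84 |stats frequency:1.2@4242 3 12 Albert_Einstein
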
def task_list():
    guess_df = AbstractGuesser.load_all_guesses()
    question_db = QuestionDatabase()
    question_map = question_db.all_questions()
    tasks = []
    guess_df = guess_df[['qnum', 'sentence', 'token', 'guess', 'fold']]\
        .drop_duplicates(['qnum', 'sentence', 'token', 'guess'])
    for name, guesses in guess_df.groupby(['qnum', 'sentence', 'token']):
        qnum = name[0]
        question = question_map[qnum]
        tasks.append(Task(question, guesses))
    return tasks

def generate_questions():
    with open('data/100_possible_questions.pickle', 'rb') as f:
        qs = pickle.load(f)

    with open('data/qb_questions.txt', 'w') as f:
        for q in qs:
            f.write(q.flatten_text())
            f.write('\n')

    db = QuestionDatabase()
    answers = db.all_answers().values()
    with open('data/answers.txt', 'w') as f:
        for a in answers:
            f.write(a.lower().replace(' ', '_'))
            f.write('\n')

def evaluate():
    wiki = WikiNetworkGuesser()
    db = QuestionDatabase(QB_QUESTION_DB)
    questions = [q for q in db.all_questions().values() if q.fold == 'train']
    random.shuffle(questions)
    subset = questions[0:10]
    df = None
    for q in subset:
        tmp_df = wiki.generate_guesses(q.flatten_text(),
                                       q.page.lower().replace(' ', '_'),
                                       q.qnum)
        if df is None:
            df = tmp_df
        else:
            df = pd.concat([df, tmp_df])
    return df

def load_quizbowl(split_sentences=True, num_answers=-1, min_answer_freq=-1):
    nlp = spacy.load('en')
    questions = QuestionDatabase().all_questions().values()
    answers = [x.page for x in questions]
    answer_counter = collections.Counter(answers)
    if num_answers != -1:
        answer_counter = sorted(answer_counter.items(), key=lambda x: x[1])[::-1]
        answers = [x for x, y in answer_counter[:num_answers]]
    else:
        answers = [x for x, y in answer_counter.items() if y >= min_answer_freq]
    answer_to_id = {x: i for i, x in enumerate(answers)}
    print('# class: {}'.format(len(answers)))

    folds = [GUESSER_TRAIN_FOLD, GUESSER_DEV_FOLD]
    questions = [x for x in questions if x.fold in folds and x.page in answers]
    train, dev = [], []
    for q in tqdm(questions):
        text = nlp(clean_question(q.flatten_text()))
        answer = answer_to_id[q.page]
        if split_sentences:
            for sent in text.sents:
                sent = [w.lower_ for w in sent if w.is_alpha or w.is_digit]
                if q.fold == GUESSER_TRAIN_FOLD:
                    train.append((sent, answer))
                else:
                    dev.append((sent, answer))
        else:
            sent = [w.lower_ for w in text if w.is_alpha or w.is_digit]
            if q.fold == GUESSER_TRAIN_FOLD:
                train.append((sent, answer))
            else:
                dev.append((sent, answer))
    return train, dev, answers

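# A usage sketch, assuming the spaCy 'en' model is installed: keep the 500
# most frequent answers and peek at one sentence-level training example.
def _example_load_guesser_data():
    train, dev, answers = load_quizbowl(split_sentences=True, num_answers=500)
    sent, answer_id = train[0]
    print(len(train), len(dev), sent[:10], answers[answer_id])
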
def preprocess_titles():
    # stop_words = set(stopwords.words('english'))
    titles_file = open('data/titles-sorted.txt')
    db = QuestionDatabase()
    pages = {format_guess(page) for page in db.questions_with_pages().keys()}
    with open('data/processed-titles-sorted.txt', 'w') as f:
        for line in titles_file:
            page = format_guess(line.strip().lower())
            # if len(page) > 2 and re.match(r"^[a-zA-Z0-9_()']+$", page)\
            #         and page not in stop_words and page[0].isalnum():
            if page in pages:
                f.write(line.strip().lower())
            else:
                # '@' is written as a placeholder, presumably so output line
                # numbers stay aligned with titles-sorted.txt
                f.write('@')
            f.write('\n')
    titles_file.close()

def hyper_search(fold):
    option2id, all_guesses = load_quizbowl()
    all_questions = QuestionDatabase().all_questions()
    answers = {k: v.page for k, v in all_questions.items()}
    guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR, folds=[fold])

    cfgs = get_cfgs()
    cfg_buzzes = []
    for i, cfg in enumerate(cfgs):
        print('**********{}**********'.format(i))
        buzzes = run(cfg, fold, all_guesses, option2id)
        cfg_buzzes.append((cfg, buzzes))

    with open('output/buzzer/cfg_buzzes_{}.pkl'.format(fold), 'wb') as outfile:
        pickle.dump(cfg_buzzes, outfile)

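# A sketch for reading hyper_search output back: the pickle holds a list of
# (config, buzzes) pairs for the given fold. The fold name is illustrative.
def _example_read_cfg_buzzes(fold='dev'):
    with open('output/buzzer/cfg_buzzes_{}.pkl'.format(fold), 'rb') as f:
        for cfg, buzzes in pickle.load(f):
            print(cfg)
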
def main():
    buzzer = RNNBuzzer()

    # setup questions
    questions = list(QuestionDatabase().all_questions().values())
    dev_questions = [x for x in questions if x.fold == 'dev']

    # setup machine agent
    gspec = AbstractGuesser.list_enabled_guessers()[0]
    guesser_dir = AbstractGuesser.output_path(gspec.guesser_module,
                                              gspec.guesser_class, '')
    guesser = ElasticSearchWikidataGuesser.load(guesser_dir)
    guesser = ESGuesserWrapper(guesser)
    machine_agent = GuesserBuzzerAgent(guesser, buzzer)

    # setup human agent
    human_agent = HumanAgent()

    # setup hooks
    hooks = [
        hook.NotifyBuzzingHook,
        hook.GameInterfaceHook,
        hook.VisualizeGuesserBuzzerHook(machine_agent),
        hook.HighlightHook,
    ]

    # setup game
    game = Game(dev_questions, [human_agent, machine_agent], hooks)
    game.run(10)

def main(folds, model_name):
    all_questions = QuestionDatabase().all_questions()
    answers = {k: v.page for k, v in all_questions.items()}
    question_texts = {k: v.text for k, v in all_questions.items()}
    protobowl_ids = {
        k: all_questions[k].protobowl
        for k in all_questions if all_questions[k].protobowl != ''
    }
    protobowl_df = load_protobowl().groupby('qid')

    save_dir = 'output/summary/new_performance/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # feature -> fold -> value
    variables = defaultdict(lambda: defaultdict())
    for fold in folds:
        guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR, folds=[fold])
        questions = guesses_df.groupby('qnum')

        buzzes_dir = bc.BUZZES_DIR.format(fold, model_name)
        with open(buzzes_dir, 'rb') as infile:
            buzzes = pickle.load(infile)
        log.info('Buzzes loaded from {}.'.format(buzzes_dir))

        # qnum -> n_guessers * length
        top_guesses = _multiprocess(_get_top_guesses, questions,
                                    info='Top guesses', multi=True)
        top_guesses = {k: v for k, v in top_guesses}
        inputs = [top_guesses, buzzes, answers, variables, fold, save_dir]

        # get_eop_stats(*inputs)
        get_his_stats(*inputs)
        # get_hyper_search(*inputs)

        p_inputs = [question_texts, protobowl_ids, protobowl_df,
                    questions] + inputs
        get_protobowl(p_inputs)

    for key, value in variables.items():
        variables[key] = dict(value)
    variables = dict(variables)
    report(variables, save_dir, folds)

def load_data(pred_file: str, meta_file: str, q_db: QuestionDatabase) -> Sequence:
    preds = load_predictions(pred_file)
    metas = load_meta(meta_file)
    answers = q_db.all_answers()

    def create_line(group):
        question = group[0]
        elements = group[1]
        st_groups = (
            seq(elements)
            .group_by(lambda x: (x[0].sentence, x[0].token))
            .sorted()
        )
        st_lines = []
        for st, v in st_groups:
            scored_guesses = (
                seq(v)
                .map(lambda x: ScoredGuess(x[0].score, x[1].guess))
                .sorted(reverse=True)
                .list()
            )
            st_lines.append(Line(
                question, st[0], st[1],
                scored_guesses[0].score > 0,
                scored_guesses[0].guess,
                answers[question],
                scored_guesses,
            ))
        return question, st_lines

    def fix_missing_label(pm):
        prediction = pm[0]
        meta = pm[1]
        if (prediction.question is None
                or prediction.token is None
                or prediction.sentence is None):
            log.info('WARNING: Prediction malformed, fixing with meta line: {0}'.format(
                prediction))
            prediction = Prediction(prediction.score, meta.question,
                                    meta.sentence, meta.token)
        assert meta.question == prediction.question
        assert meta.sentence == prediction.sentence
        assert meta.token == prediction.token
        return prediction, meta

    return (
        preds.zip(metas)
        .map(fix_missing_label)
        .group_by(lambda x: x[0].question)
        .map(create_line)
    )

def wikify(output_directory):
    database = QuestionDatabase(QB_QUESTION_DB)
    pages = database.questions_with_pages()

    total = 0
    for p in pages:
        if len(pages[p]) >= conf['wikifier']['min_appearances']:
            log.info('{} {}'.format(p, len(pages[p])))
            for q in pages[p]:
                total += 1
                for sentence, word, text in q.partials():
                    sentence -= 1
                    output_path = '%s/%i-%i.txt' % (output_directory, q.qnum, sentence)
                    with open(output_path, 'w') as output:
                        output.write('%s\n' % text[sentence])
    log.info(str(total))

def create_wikipedia_cache(dump_path):
    from qanta.spark import create_spark_session

    spark = create_spark_session()
    db = QuestionDatabase()
    answers = set(db.all_answers().values())
    b_answers = spark.sparkContext.broadcast(answers)
    # Paths used in Spark need to be absolute and must exist
    page_path = os.path.abspath(safe_path(WIKI_PAGE_PATH))

    def create_page(row):
        title = normalize_wikipedia_title(row.title)
        filter_answers = b_answers.value
        if title in filter_answers:
            page = WikipediaPage(title, row.text, None, None, row.id, row.url)
            write_page(page, page_path=page_path)

    spark.read.json(dump_path).rdd.foreach(create_page)

def test_buzzer():
    questions = QuestionDatabase().all_questions()
    buzzer = RNNBuzzer(word_skip=conf['buzzer_word_skip'])

    # setup machine agent
    gspec = AbstractGuesser.list_enabled_guessers()[0]
    guesser_dir = AbstractGuesser.output_path(gspec.guesser_module,
                                              gspec.guesser_class, '')
    guesser = ElasticSearchWikidataGuesser.load(guesser_dir)
    guesser = ESGuesserWrapper(guesser)

    key = list(questions.keys())[4]
    question = questions[key].flatten_text().split()
    for i, word in enumerate(question):
        clue = ' '.join(question[:i])
        guesses = guesser.guess(clue)
        buzz = buzzer.buzz(guesses)
        print(buzz)

def run(self):
    make_dirs('output/tagme/')
    db = QuestionDatabase()
    questions = list(db.all_questions().values())
    batch = 0
    batch_lookup = {}
    while batch * BATCH_SIZE < len(questions):
        batch_questions = questions[batch * BATCH_SIZE:(batch + 1) * BATCH_SIZE]
        batch_lookup[batch] = batch_questions
        batch += 1

    with open('output/tagme/batches.pickle', 'wb') as f:
        pickle.dump(batch_lookup, f)

    with open('output/tagme/meta.pickle', 'wb') as f:
        pickle.dump(batch, f)

def __init__(self):
    super(StatsExtractor, self).__init__()
    with open(SENTENCE_STATS, 'rb') as f:
        self.word_count_mean, self.word_count_std = pickle.load(f)

    self.guess_frequencies = {}
    question_db = QuestionDatabase(QB_QUESTION_DB)
    all_questions = question_db.questions_with_pages()
    for page in all_questions:
        self.guess_frequencies[page] = sum(1 for x in all_questions[page]
                                           if x.fold == 'train')

    self.frequency_mean = np.mean(list(self.guess_frequencies.values()))
    self.frequency_std = np.std(list(self.guess_frequencies.values()))
    for page in all_questions:
        normalized_frequency = normalize(self.guess_frequencies[page],
                                         self.frequency_mean,
                                         self.frequency_std)
        self.guess_frequencies[page] = normalized_frequency
    self.normed_missing_guess = normalize(0, self.frequency_mean,
                                          self.frequency_std)

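# The normalize helper used above is not shown in this section; a z-score
# style implementation along these lines is assumed (this sketch is not the
# repo's definition):
def _assumed_normalize(value, mean, std):
    # Guard against zero variance to avoid dividing by zero
    return (value - mean) / std if std > 0 else 0.0
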
def report_ultimate():
    all_questions = QuestionDatabase().all_questions()
    answers = {k: v.page for k, v in all_questions.items()}
    question_texts = {k: v.text for k, v in all_questions.items()}
    protobowl_ids = {
        k: all_questions[k].protobowl
        for k in all_questions if all_questions[k].protobowl != ''
    }
    protobowl_df, user_count = load_protobowl()

    guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR,
                                              folds=[c.BUZZER_DEV_FOLD])
    questions = guesses_df.groupby('qnum')
    top_guesses = _multiprocess(_get_top_guesses, questions,
                                info='Top guesses', multi=True)
    top_guesses = {k: v for k, v in top_guesses}

    option2id, all_guesses = load_quizbowl()
    test_iter = QuestionIterator(all_guesses[c.BUZZER_DEV_FOLD], option2id,
                                 batch_size=128)
    buzzes = ultimate_buzzer(test_iter)

    save_dir = 'output/summary/new_performance/'
    inputs = [top_guesses, buzzes, answers, None, c.BUZZER_DEV_FOLD, save_dir]
    user_answers_thresholds = [1, 10, 50, 100, 500, 1000, 2000]
    threshold_stats = []
    for threshold in user_answers_thresholds:
        pdf1 = protobowl_df[protobowl_df.user_answers > threshold]
        p_inputs = [question_texts, protobowl_ids, pdf1.groupby('qid'),
                    questions] + inputs
        pstats = get_protobowl(p_inputs)
        threshold_stats.append(pstats)
        print('ultimate', threshold, pstats)
    print('ultimate', [x['reward'] for x in threshold_stats])

def report(buzzes_dir):
    all_questions = QuestionDatabase().all_questions()
    answers = {k: v.page for k, v in all_questions.items()}
    question_texts = {k: v.text for k, v in all_questions.items()}
    protobowl_ids = {
        k: all_questions[k].protobowl
        for k in all_questions if all_questions[k].protobowl != ''
    }
    protobowl_df, user_count = load_protobowl()

    guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR,
                                              folds=[c.BUZZER_DEV_FOLD])
    questions = guesses_df.groupby('qnum')
    top_guesses = _multiprocess(_get_top_guesses, questions,
                                info='Top guesses', multi=True)
    top_guesses = {k: v for k, v in top_guesses}

    with open(buzzes_dir, 'rb') as infile:
        buzzes = pickle.load(infile)

    save_dir = 'output/summary/new_performance/'
    inputs = [top_guesses, buzzes, answers, None, c.BUZZER_DEV_FOLD, save_dir]
    user_answers_thresholds = [1, 10, 50, 100, 500, 1000, 2000]
    threshold_stats = []
    for threshold in user_answers_thresholds:
        pdf1 = protobowl_df[protobowl_df.user_answers > threshold]
        p_inputs = [question_texts, protobowl_ids, pdf1.groupby('qid'),
                    questions] + inputs
        pstats = get_protobowl(p_inputs)
        threshold_stats.append(pstats)
        print(threshold, pstats)

    with open(buzzes_dir + '.pstats', 'wb') as f:
        pickle.dump(threshold_stats, f)
    print([x['reward'] for x in threshold_stats])

def main():
    db = QuestionDatabase()
    question_lookup = db.all_questions()
    questions = list(question_lookup.values())
    guesser_train_questions = [q for q in questions if q.fold == 'guesstrain']
    guesser_train_answers = [q.page for q in guesser_train_questions]
    answer_counts = Counter(guesser_train_answers)
    answer_set = set(answer_counts.keys())

    app = dash.Dash()
    app.layout = html.Div(children=[
        html.H1(children='Quiz Bowl Question Explorer'),
        compute_stats(questions, db.location),
        html.H2('Question Inspector'),
        dcc.Dropdown(
            options=[{'label': q.qnum, 'value': q.qnum} for q in questions],
            value=questions[0].qnum,
            id='question-selector'
        ),
        html.Div([html.Div(id='question-display')]),
        dcc.Graph(
            id='answer-count-plot',
            figure=go.Figure(
                data=[go.Histogram(x=list(answer_counts.values()),
                                   name='Answer Counts')],
                layout=go.Layout(title='Answer Count Distribution',
                                 showlegend=True)
            )
        ),
        dcc.Graph(
            id='answer-count-cum-plot',
            figure=go.Figure(
                data=[go.Histogram(
                    x=list(answer_counts.values()),
                    name='Answer Counts Cumulative',
                    cumulative=dict(enabled=True, direction='decreasing'),
                    histnorm='percent'
                )],
                layout=go.Layout(title='Answer Count Cumulative Distribution',
                                 showlegend=True)
            )
        ),
        html.Label('Answer Selection'),
        dcc.Dropdown(
            options=sorted([{'label': a, 'value': a} for a in answer_set],
                           key=lambda k: k['label']),
            id='answer-list'
        ),
        html.Div(id='answer-count')
    ])

    @app.callback(
        Output(component_id='answer-count', component_property='children'),
        [Input(component_id='answer-list', component_property='value')]
    )
    def update_answer_count(answer):
        return f'Answer: {answer} Question Count: {answer_counts[answer]}'

    @app.callback(
        Output(component_id='question-display', component_property='children'),
        [Input(component_id='question-selector', component_property='value')]
    )
    def update_question(qb_id):
        qb_id = int(qb_id)
        question = question_lookup[qb_id]
        sentences, answer, _ = question.to_example()
        return [
            html.P(f'ID: {qb_id} Fold: {question.fold}'),
            html.H3('Sentences')
        ] + [html.P(f'{i}: {sent}') for i, sent in enumerate(sentences)] + [
            html.H3('Answer'),
            html.P(answer)
        ]

    app.css.append_css({'external_url': 'https://codepen.io/chriddyp/pen/bWLwgP.css'})
    app.run_server(debug=True)