Exemplo n.º 1
0
def main(folds, model_name):
    """Gather buzzer statistics for each fold and produce a summary report.

    For every fold, the guesses and the pickled buzzes are loaded, the top
    guesses are computed with ``_get_top_guesses``, and the results are handed
    to ``get_his_stats`` and ``get_protobowl``. Both receive the shared
    ``variables`` mapping (feature -> fold -> value), which is finally
    converted to plain dicts and passed to ``report``.

    Args:
        folds: Iterable of fold names to process.
        model_name: Value substituted, together with the fold, into
            ``bc.BUZZES_DIR`` to locate the pickled buzzes.
    """
    all_questions = QuestionDatabase().all_questions()
    answers = {k: v.page for k, v in all_questions.items()}
    question_texts = {k: v.text for k, v in all_questions.items()}
    # Only questions with a non-empty protobowl id are kept.
    protobowl_ids = {
        k: v.protobowl
        for k, v in all_questions.items() if v.protobowl != ""
    }
    protobowl_df = load_protobowl().groupby("qid")

    save_dir = "output/summary/new_performance/"
    # exist_ok=True replaces the former exists()-then-makedirs() pair, which
    # could fail if the directory appeared between the check and the create.
    os.makedirs(save_dir, exist_ok=True)

    # feature -> fold -> value
    # The inner defaultdict() has no default factory, so it behaves like a
    # plain dict (missing keys still raise KeyError).
    variables = defaultdict(lambda: defaultdict())
    for fold in folds:
        guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR, folds=[fold])
        questions = guesses_df.groupby("qnum")

        buzzes_dir = bc.BUZZES_DIR.format(fold, model_name)
        # NOTE: pickle.load can execute arbitrary code; only load buzz files
        # produced by a trusted run of this project.
        with open(buzzes_dir, "rb") as infile:
            buzzes = pickle.load(infile)
        log.info("Buzzes loaded from {}.".format(buzzes_dir))

        # qnum -> n_guessers * length
        top_guesses = _multiprocess(_get_top_guesses,
                                    questions,
                                    info="Top guesses",
                                    multi=True)
        top_guesses = {k: v for k, v in top_guesses}
        inputs = [top_guesses, buzzes, answers, variables, fold, save_dir]

        # get_eop_stats(*inputs)
        get_his_stats(*inputs)
        # get_hyper_search(*inputs)

        # get_protobowl takes its arguments as a single list.
        p_inputs = [question_texts, protobowl_ids, protobowl_df, questions
                    ] + inputs
        get_protobowl(p_inputs)

    # Convert the nested defaultdicts into plain dicts before reporting.
    variables = {key: dict(value) for key, value in variables.items()}

    report(variables, save_dir, folds)
Exemplo n.º 2
0
def main():
    """Compute highlights for every 'guessdev' question and pickle them.

    The result maps each question key to ``get_highlights`` applied to the
    question's flattened text, and is written to 'guessdev_highlight.pkl'.
    """
    # Keep only the questions whose fold is 'guessdev'.
    guessdev_questions = {}
    for key, question in QuestionDatabase().all_questions().items():
        if question.fold == 'guessdev':
            guessdev_questions[key] = question

    # tqdm wraps the iteration to show progress.
    highlights = {
        key: get_highlights(question.flatten_text())
        for key, question in tqdm(guessdev_questions.items())
    }

    with open('guessdev_highlight.pkl', 'wb') as f:
        pickle.dump(highlights, f)
Exemplo n.º 3
0
def main(folds, model_name):
    """Gather buzzer statistics for each fold and produce a summary report.

    For every fold, the guesses and the pickled buzzes are loaded, the top
    guesses are computed with ``_get_top_guesses``, and the results are handed
    to ``get_his_stats`` and ``get_protobowl``. Both receive the shared
    ``variables`` mapping (feature -> fold -> value), which is finally
    converted to plain dicts and passed to ``report``.

    Args:
        folds: Iterable of fold names to process.
        model_name: Value substituted, together with the fold, into
            ``bc.BUZZES_DIR`` to locate the pickled buzzes.
    """
    all_questions = QuestionDatabase().all_questions()
    answers = {k: v.page for k, v in all_questions.items()}
    question_texts = {k: v.text for k, v in all_questions.items()}
    # Only questions with a non-empty protobowl id are kept.
    protobowl_ids = {
        k: v.protobowl
        for k, v in all_questions.items() if v.protobowl != ''
    }
    protobowl_df = load_protobowl().groupby('qid')

    save_dir = 'output/summary/new_performance/'
    # exist_ok=True replaces the former exists()-then-makedirs() pair, which
    # could fail if the directory appeared between the check and the create.
    os.makedirs(save_dir, exist_ok=True)

    # feature -> fold -> value
    # The inner defaultdict() has no default factory, so it behaves like a
    # plain dict (missing keys still raise KeyError).
    variables = defaultdict(lambda: defaultdict())
    for fold in folds:
        guesses_df = AbstractGuesser.load_guesses(
            bc.GUESSES_DIR, folds=[fold])
        questions = guesses_df.groupby('qnum')

        buzzes_dir = bc.BUZZES_DIR.format(fold, model_name)
        # NOTE: pickle.load can execute arbitrary code; only load buzz files
        # produced by a trusted run of this project.
        with open(buzzes_dir, 'rb') as infile:
            buzzes = pickle.load(infile)
        log.info('Buzzes loaded from {}.'.format(buzzes_dir))

        # qnum -> n_guessers * length
        top_guesses = _multiprocess(
            _get_top_guesses, questions, info='Top guesses', multi=True)
        top_guesses = {k: v for k, v in top_guesses}
        inputs = [top_guesses, buzzes, answers, variables, fold, save_dir]

        # get_eop_stats(*inputs)
        get_his_stats(*inputs)
        # get_hyper_search(*inputs)

        # get_protobowl takes its arguments as a single list.
        p_inputs = [
            question_texts, protobowl_ids, protobowl_df, questions
        ] + inputs
        get_protobowl(p_inputs)

    # Convert the nested defaultdicts into plain dicts before reporting.
    variables = {key: dict(value) for key, value in variables.items()}

    report(variables, save_dir, folds)
Exemplo n.º 4
0
def main():
    """Compute highlights for every 'guessdev' question and pickle them.

    The result maps each question key to ``get_highlights`` applied to the
    question's flattened text, and is written to 'guessdev_highlight.pkl'.
    """
    # Keep only the questions whose fold is 'guessdev'.
    guessdev_questions = {}
    for key, question in QuestionDatabase().all_questions().items():
        if question.fold == 'guessdev':
            guessdev_questions[key] = question

    # tqdm wraps the iteration to show progress.
    highlights = {
        key: get_highlights(question.flatten_text())
        for key, question in tqdm(guessdev_questions.items())
    }

    with open('guessdev_highlight.pkl', 'wb') as f:
        pickle.dump(highlights, f)
Exemplo n.º 5
0
def report_ultimate():
    """Print protobowl statistics for ``ultimate_buzzer`` on the buzzer dev fold.

    The buzzes come from ``ultimate_buzzer`` run over a ``QuestionIterator``
    for ``c.BUZZER_DEV_FOLD``. ``get_protobowl`` is then called once per
    ``user_answers`` threshold, each time on the protobowl rows whose
    ``user_answers`` exceeds that threshold. Each result is printed, followed
    by the list of all 'reward' entries.
    """
    all_questions = QuestionDatabase().all_questions()

    # Build the three per-question lookups in a single pass.
    answers, question_texts, protobowl_ids = {}, {}, {}
    for key, question in all_questions.items():
        answers[key] = question.page
        question_texts[key] = question.text
        if question.protobowl != '':
            protobowl_ids[key] = question.protobowl

    # user_count is returned by load_protobowl but is not used here.
    protobowl_df, user_count = load_protobowl()

    dev_fold = c.BUZZER_DEV_FOLD
    guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR, folds=[dev_fold])
    questions = guesses_df.groupby('qnum')
    top_guesses = {
        key: value
        for key, value in _multiprocess(
            _get_top_guesses, questions, info='Top guesses', multi=True)
    }

    option2id, all_guesses = load_quizbowl()
    test_iter = QuestionIterator(
        all_guesses[dev_fold], option2id, batch_size=128)
    buzzes = ultimate_buzzer(test_iter)

    save_dir = 'output/summary/new_performance/'
    # The fourth slot (the variables mapping) is deliberately None here.
    shared_inputs = [top_guesses, buzzes, answers, None, dev_fold, save_dir]

    threshold_stats = []
    for threshold in (1, 10, 50, 100, 500, 1000, 2000):
        above_threshold = protobowl_df[protobowl_df.user_answers > threshold]
        # get_protobowl takes its arguments as a single list.
        p_inputs = [
            question_texts,
            protobowl_ids,
            above_threshold.groupby('qid'),
            questions,
        ]
        p_inputs += shared_inputs
        pstats = get_protobowl(p_inputs)
        threshold_stats.append(pstats)
        print('ultimate', threshold, pstats)

    rewards = [stats['reward'] for stats in threshold_stats]
    print('ultimate', rewards)
Exemplo n.º 6
0
def report(buzzes_dir):
    """Compute protobowl statistics for pickled buzzes on the buzzer dev fold.

    ``get_protobowl`` is called once per ``user_answers`` threshold, each time
    on the protobowl rows whose ``user_answers`` exceeds that threshold. Each
    result is printed, the list of results is pickled to
    ``buzzes_dir + '.pstats'``, and the list of 'reward' entries is printed.

    Args:
        buzzes_dir: Path of the pickle file holding the buzzes.
    """
    all_questions = QuestionDatabase().all_questions()

    # Build the three per-question lookups in a single pass.
    answers, question_texts, protobowl_ids = {}, {}, {}
    for key, question in all_questions.items():
        answers[key] = question.page
        question_texts[key] = question.text
        if question.protobowl != '':
            protobowl_ids[key] = question.protobowl

    # user_count is returned by load_protobowl but is not used here.
    protobowl_df, user_count = load_protobowl()

    dev_fold = c.BUZZER_DEV_FOLD
    guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR, folds=[dev_fold])
    questions = guesses_df.groupby('qnum')
    top_guesses = {
        key: value
        for key, value in _multiprocess(
            _get_top_guesses, questions, info='Top guesses', multi=True)
    }

    # NOTE: pickle.load can execute arbitrary code; only pass trusted files.
    with open(buzzes_dir, 'rb') as infile:
        buzzes = pickle.load(infile)

    save_dir = 'output/summary/new_performance/'
    # The fourth slot (the variables mapping) is deliberately None here.
    shared_inputs = [top_guesses, buzzes, answers, None, dev_fold, save_dir]

    threshold_stats = []
    for threshold in (1, 10, 50, 100, 500, 1000, 2000):
        above_threshold = protobowl_df[protobowl_df.user_answers > threshold]
        # get_protobowl takes its arguments as a single list.
        p_inputs = [
            question_texts,
            protobowl_ids,
            above_threshold.groupby('qid'),
            questions,
        ]
        p_inputs += shared_inputs
        pstats = get_protobowl(p_inputs)
        threshold_stats.append(pstats)
        print(threshold, pstats)

    stats_path = buzzes_dir + '.pstats'
    with open(stats_path, 'wb') as f:
        pickle.dump(threshold_stats, f)

    rewards = [stats['reward'] for stats in threshold_stats]
    print(rewards)
Exemplo n.º 7
0
def hyper_search(fold):
    """Run the buzzer once per configuration and pickle all the results.

    Every configuration from ``get_cfgs`` is passed to ``run`` together with
    the fold and the quizbowl data. The resulting ``(cfg, buzzes)`` pairs are
    written to 'output/buzzer/cfg_buzzes_<fold>.pkl'.

    Args:
        fold: Fold name; used to load guesses, passed to ``run``, and
            substituted into the output file name.
    """
    option2id, all_guesses = load_quizbowl()

    # NOTE(review): the three values below are computed but never used in
    # this function. They are kept because the loaders may have side effects;
    # confirm before removing.
    all_questions = QuestionDatabase().all_questions()
    answers = {key: question.page for key, question in all_questions.items()}
    guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR, folds=[fold])

    cfg_buzzes = []
    for index, cfg in enumerate(get_cfgs()):
        banner = '**********{}**********'.format(index)
        print(banner)
        cfg_buzzes.append((cfg, run(cfg, fold, all_guesses, option2id)))

    out_path = 'output/buzzer/cfg_buzzes_{}.pkl'.format(fold)
    with open(out_path, 'wb') as outfile:
        pickle.dump(cfg_buzzes, outfile)
Exemplo n.º 8
0
def test():
    """Print the top guess for one guessdev question before and after
    appending its "second best" words to the question text.

    The question used is the second entry of the guessdev questions. The
    "wiki" and "qb" word lists returned by ``get_second_best_wiki_words`` are
    appended to the flattened question text when they are not None.
    """
    questions = QuestionDatabase().all_questions()
    guessdev_questions = [
        v for v in questions.values() if v.fold == "guessdev"
    ]
    q = guessdev_questions[1]
    second_best_words = get_second_best_wiki_words(q)
    original_text = q.flatten_text()

    guesses = guesser.guess_single(original_text)
    # Sort ascending by value; the last item has the largest value.
    guesses = sorted(guesses.items(), key=lambda x: x[1])
    guess_before = guesses[-1]
    print(original_text)
    print(guess_before)
    print()

    # Collect the extra words, then join everything with single spaces.
    # Previously the words were appended with no separator, which fused the
    # last word of the text with the first added word (and the last "wiki"
    # word with the first "qb" word).
    extra_words = []
    for source in ("wiki", "qb"):
        words = second_best_words[source]
        if words is not None:
            extra_words.extend(words)
    text_after = " ".join([original_text] + extra_words)

    guesses = guesser.guess_single(text_after)
    guesses = sorted(guesses.items(), key=lambda x: x[1])
    guess_after = guesses[-1]
    print(text_after)
    print(guess_after)
Exemplo n.º 9
0
def main():
    """Load all questions and keep those whose fold is "guessdev"."""
    questions = QuestionDatabase().all_questions()
    guessdev_questions = {}
    for key, question in questions.items():
        if question.fold == "guessdev":
            guessdev_questions[key] = question
Exemplo n.º 10
0
def parse_args():
    """Parse the command-line arguments.

    Returns:
        The argparse namespace; its ``fold`` attribute holds the value of
        ``-f``/``--fold``, or None when the option is not given.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-f', '--fold', default=None)
    namespace = arg_parser.parse_args()
    return namespace


if __name__ == '__main__':
    args = parse_args()
    if args.fold != None:
        folds = [args.fold]
    else:
        folds = c.BUZZ_FOLDS

    all_questions = QuestionDatabase().all_questions()
    answers = {k: v.page for k, v in all_questions.items()}

    variables = dict()
    for fold in folds:
        guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR, folds=[fold])

        buzzes_dir = bc.BUZZES_DIR.format(fold)
        with open(buzzes_dir, 'rb') as infile:
            buzzes = pickle.load(infile)
        log.info('Buzzes loaded from {}.'.format(buzzes_dir))

        checkpoint_dir = "output/summary/performance_{}.pkl".format(fold)
        plot_dir = "output/summary/performance_{}_his.png".format(fold)
        eop_output, his_output = generate(buzzes, answers, guesses_df, fold,
                                          checkpoint_dir, plot_dir)
        variables['eop_{}_output'.format(fold)] = eop_output