def merge_dfs():
    """Merge every enabled guesser's guess DataFrame into one file per fold.

    For each fold in ``c.BUZZER_INPUT_FOLDS``, loads the guesses produced by
    each enabled guesser, concatenates them into a single DataFrame, downcasts
    the numeric columns, and saves the result under ``<prefix>/merged``.
    Folds whose merged output already exists are skipped.
    """
    GUESSERS = ["{0}.{1}".format(x.guesser_module, x.guesser_class)
                for x in AbstractGuesser.list_enabled_guessers()]
    log.info("Merging guesser DataFrames.")
    merged_dir = os.path.join(c.GUESSER_TARGET_PREFIX, 'merged')
    if not os.path.exists(merged_dir):
        os.makedirs(merged_dir)
    for fold in c.BUZZER_INPUT_FOLDS:
        if os.path.exists(AbstractGuesser.guess_path(merged_dir, fold)):
            log.info("Merged {0} exists, skipping.".format(fold))
            continue
        # Collect one frame per guesser and concatenate once at the end.
        # The previous DataFrame.append-in-a-loop was quadratic and
        # DataFrame.append was removed in pandas 2.0.
        frames = []
        for guesser in GUESSERS:
            guesser_dir = os.path.join(c.GUESSER_TARGET_PREFIX, guesser)
            frames.append(AbstractGuesser.load_guesses(guesser_dir,
                                                       folds=[fold]))
        if frames:
            new_guesses = pd.concat(frames)
        else:
            # No guessers enabled: keep the original empty-frame schema.
            new_guesses = pd.DataFrame(columns=[
                'fold', 'guess', 'guesser', 'qnum', 'score',
                'sentence', 'token'
            ], dtype='object')
        for col in ['qnum', 'sentence', 'token', 'score']:
            # Shrink numeric columns to the smallest integer dtype that fits.
            new_guesses[col] = pd.to_numeric(new_guesses[col],
                                             downcast='integer')
        AbstractGuesser.save_guesses(new_guesses, merged_dir, folds=[fold])
        log.info("Merging: {0} finished.".format(fold))
def load_quizbowl(folds=c.BUZZER_INPUT_FOLDS) \
        -> Tuple[Dict[str, int], Dict[str, list]]:
    """Load the answer-option vocabulary and the processed guesses per fold.

    Returns ``(option2id, guesses_by_fold)``: a mapping from answer string to
    integer id, and a mapping from fold name to that fold's processed guess
    data.  Both the option list and each fold's processed guesses are cached
    on disk as pickles and reloaded when present.
    """
    # merge_dfs()
    log.info('Loading data')
    question_db = QuestionDatabase()
    quizbowl_db = QuizBowlDataset(bc.MIN_ANSWERS,
                                  guesser_train=True,
                                  buzzer_train=True)
    all_questions = question_db.all_questions()

    # Reload the answer-option list if cached, otherwise build and cache it.
    if os.path.isfile(bc.OPTIONS_DIR):
        with open(safe_path(bc.OPTIONS_DIR), 'rb') as infile:
            id2option = pickle.load(infile)
    else:
        log.info('Loading the set of options')
        all_options = set(quizbowl_db.training_data()[1])
        id2option = list(all_options)
        with open(safe_path(bc.OPTIONS_DIR), 'wb') as outfile:
            pickle.dump(id2option, outfile)
    option2id = {option: index for index, option in enumerate(id2option)}
    num_options = len(id2option)
    log.info('Number of options {0}'.format(len(id2option)))

    guesses_by_fold = dict()
    for fold in folds:
        save_dir = '%s_processed.pickle' % (os.path.join(bc.GUESSES_DIR, fold))
        if os.path.isfile(save_dir):
            # Cached fold: reuse the previously processed guesses.
            with open(safe_path(save_dir), 'rb') as infile:
                guesses_by_fold[fold] = pickle.load(infile)
            log.info('Loading {0} guesses'.format(fold))
            continue

        log.info('Processing {0} guesses'.format(fold))
        guesses = AbstractGuesser.load_guesses(bc.GUESSES_DIR, folds=[fold])
        worker = partial(_process_question, option2id, all_questions)
        processed = _multiprocess(worker, guesses.groupby('qnum'),
                                  info='df data', multi=True)
        # Drop questions the worker could not process.
        guesses_by_fold[fold] = [entry for entry in processed
                                 if entry is not None]
        print(len(guesses_by_fold[fold]))
        with open(safe_path(save_dir), 'wb') as outfile:
            pickle.dump(guesses_by_fold[fold], outfile)
        log.info('Processed {0} guesses saved to {1}'.format(fold, save_dir))
    return option2id, guesses_by_fold
def hyper_search(fold):
    """Run the buzzer over every candidate configuration on one fold.

    Each configuration from ``get_cfgs()`` is evaluated with ``run`` and the
    resulting ``(cfg, buzzes)`` pairs are pickled to
    ``output/buzzer/cfg_buzzes_<fold>.pkl``.
    """
    option2id, all_guesses = load_quizbowl()
    # NOTE: the previous version also loaded QuestionDatabase answers and the
    # raw guesses DataFrame here, but never used either — both were expensive
    # dead loads and have been removed.
    cfgs = get_cfgs()
    cfg_buzzes = []
    for i, cfg in enumerate(cfgs):
        print('**********{}**********'.format(i))
        buzzes = run(cfg, fold, all_guesses, option2id)
        cfg_buzzes.append((cfg, buzzes))
    # Make sure the output directory exists before writing the pickle.
    out_dir = 'output/buzzer'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    with open('output/buzzer/cfg_buzzes_{}.pkl'.format(fold),
              'wb') as outfile:
        pickle.dump(cfg_buzzes, outfile)
def main(folds, model_name):
    """Compute buzzer performance statistics for each fold and report them.

    Loads the model's buzzes per fold, gathers per-question top guesses,
    accumulates statistics into ``variables`` (feature -> fold -> value), and
    hands everything to ``report``.
    """
    all_questions = QuestionDatabase().all_questions()
    answers = {qnum: question.page
               for qnum, question in all_questions.items()}
    question_texts = {qnum: question.text
                      for qnum, question in all_questions.items()}
    protobowl_ids = {
        qnum: question.protobowl
        for qnum, question in all_questions.items()
        if question.protobowl != ""
    }
    protobowl_df = load_protobowl().groupby("qid")

    save_dir = "output/summary/new_performance/"
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # feature -> fold -> value
    variables = defaultdict(lambda: defaultdict())
    for fold in folds:
        guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR,
                                                  folds=[fold])
        questions = guesses_df.groupby("qnum")

        buzzes_dir = bc.BUZZES_DIR.format(fold, model_name)
        with open(buzzes_dir, "rb") as infile:
            buzzes = pickle.load(infile)
        log.info("Buzzes loaded from {}.".format(buzzes_dir))

        # qnum -> n_guessers * length
        top_guesses = dict(_multiprocess(_get_top_guesses, questions,
                                         info="Top guesses", multi=True))

        inputs = [top_guesses, buzzes, answers, variables, fold, save_dir]
        # get_eop_stats(*inputs)
        get_his_stats(*inputs)
        # get_hyper_search(*inputs)
        p_inputs = [question_texts, protobowl_ids, protobowl_df,
                    questions] + inputs
        get_protobowl(p_inputs)

    # Freeze the nested defaultdicts into plain dicts before reporting.
    variables = {feature: dict(per_fold)
                 for feature, per_fold in variables.items()}
    report(variables, save_dir, folds)
def main(folds, model_name):
    """Compute buzzer performance statistics for each fold and report them.

    NOTE(review): this appears to be a near-identical duplicate of another
    ``main(folds, model_name)`` in this codebase — confirm which copy is live.
    """
    all_questions = QuestionDatabase().all_questions()
    # qnum -> answer page, and qnum -> raw question text.
    answers = {k: v.page for k, v in all_questions.items()}
    question_texts = {k: v.text for k, v in all_questions.items()}
    # Only questions that actually have a protobowl id.
    protobowl_ids = {k: all_questions[k].protobowl
                     for k in all_questions
                     if all_questions[k].protobowl != ''}
    protobowl_df = load_protobowl().groupby('qid')

    save_dir = 'output/summary/new_performance/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # feature -> fold -> value
    variables = defaultdict(lambda: defaultdict())
    for fold in folds:
        guesses_df = AbstractGuesser.load_guesses(
            bc.GUESSES_DIR, folds=[fold])
        questions = guesses_df.groupby('qnum')

        # Buzzes were pickled by the buzz-generation step for this model.
        buzzes_dir = bc.BUZZES_DIR.format(fold, model_name)
        with open(buzzes_dir, 'rb') as infile:
            buzzes = pickle.load(infile)
        log.info('Buzzes loaded from {}.'.format(buzzes_dir))

        # qnum -> n_guessers * length
        top_guesses = _multiprocess(_get_top_guesses, questions,
                                    info='Top guesses', multi=True)
        top_guesses = {k: v for k, v in top_guesses}

        inputs = [top_guesses, buzzes, answers, variables, fold, save_dir]
        # get_eop_stats(*inputs)
        get_his_stats(*inputs)
        # get_hyper_search(*inputs)
        p_inputs = [question_texts, protobowl_ids, protobowl_df,
                    questions] + inputs
        get_protobowl(p_inputs)

    # Freeze the nested defaultdicts into plain dicts before reporting.
    for key, value in variables.items():
        variables[key] = dict(value)
    variables = dict(variables)
    report(variables, save_dir, folds)
def generate(config, folds):
    """Run a trained RNN buzzer over each fold and pickle the buzzes.

    Loads the configuration named ``config`` from ``configs``, restores the
    trained model, optionally moves it to GPU, writes each fold's buzzes to
    ``bc.BUZZES_DIR``, and for the ``'expo'`` fold also emits vw-expo files.
    """
    n_total_guessers = len(GUESSERS)
    option2id, all_guesses = load_quizbowl(folds)
    cfg = getattr(configs, config)()
    # cfg = pickle.load(open(cfg.ckp_dir, 'rb'))
    vectorizer = getattr(iterator, cfg.make_vector)

    # One question iterator per requested fold.
    fold_iterators = {
        fold: QuestionIterator(all_guesses[fold], option2id,
                               batch_size=cfg.batch_size,
                               make_vector=vectorizer)
        for fold in folds
    }

    if not os.path.exists(cfg.model_dir):
        log.info('Model {0} not available'.format(cfg.model_dir))
        exit(0)

    model = RNN(fold_iterators[folds[0]].n_input, cfg.n_hidden,
                n_total_guessers + 1)
    log.info('Loading model {0}'.format(cfg.model_dir))
    chainer.serializers.load_npz(cfg.model_dir, model)

    gpu = conf['buzzer']['gpu']
    if gpu != -1 and chainer.cuda.available:
        log.info('Using gpu {0}'.format(gpu))
        chainer.cuda.get_device(gpu).use()
        model.to_gpu(gpu)

    trainer = Trainer(model, cfg.model_dir)
    for fold in folds:
        buzzes = trainer.test(fold_iterators[fold])
        log.info('{0} buzzes generated. Size {1}.'.format(fold, len(buzzes)))
        buzzes_dir = bc.BUZZES_DIR.format(fold, cfg.model_name)
        with open(buzzes_dir, 'wb') as f:
            pickle.dump(buzzes, f)
        log.info('Buzzes saved to {0}.'.format(buzzes_dir))

        if fold == 'expo':
            guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR,
                                                      folds=[fold])
            buzzer2vwexpo(guesses_df, buzzes, fold)
def report_ultimate():
    """Evaluate the 'ultimate' buzzer against protobowl users.

    Runs ``ultimate_buzzer`` on the dev fold, then scores it with
    ``get_protobowl`` at several user-activity thresholds, printing the stats
    for each threshold and the final reward list.
    """
    all_questions = QuestionDatabase().all_questions()
    answers = {qnum: question.page
               for qnum, question in all_questions.items()}
    question_texts = {qnum: question.text
                      for qnum, question in all_questions.items()}
    protobowl_ids = {qnum: question.protobowl
                     for qnum, question in all_questions.items()
                     if question.protobowl != ''}
    protobowl_df, user_count = load_protobowl()

    guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR,
                                              folds=[c.BUZZER_DEV_FOLD])
    questions = guesses_df.groupby('qnum')
    top_guesses = dict(_multiprocess(_get_top_guesses, questions,
                                     info='Top guesses', multi=True))

    option2id, all_guesses = load_quizbowl()
    test_iter = QuestionIterator(all_guesses[c.BUZZER_DEV_FOLD], option2id,
                                 batch_size=128)
    buzzes = ultimate_buzzer(test_iter)

    save_dir = 'output/summary/new_performance/'
    inputs = [top_guesses, buzzes, answers, None,
              c.BUZZER_DEV_FOLD, save_dir]

    # Score against progressively more active protobowl users.
    user_answers_thresholds = [1, 10, 50, 100, 500, 1000, 2000]
    threshold_stats = []
    for threshold in user_answers_thresholds:
        active_users_df = protobowl_df[protobowl_df.user_answers > threshold]
        p_inputs = [question_texts, protobowl_ids,
                    active_users_df.groupby('qid'), questions] + inputs
        pstats = get_protobowl(p_inputs)
        threshold_stats.append(pstats)
        print('ultimate', threshold, pstats)
    print('ultimate', [x['reward'] for x in threshold_stats])
def report(buzzes_dir):
    """Score a pickled set of buzzes against protobowl users.

    Loads buzzes from ``buzzes_dir``, evaluates them with ``get_protobowl``
    at several user-activity thresholds, prints each threshold's stats, and
    pickles the list of stats next to the input as ``<buzzes_dir>.pstats``.
    """
    all_questions = QuestionDatabase().all_questions()
    answers = {qnum: question.page
               for qnum, question in all_questions.items()}
    question_texts = {qnum: question.text
                      for qnum, question in all_questions.items()}
    protobowl_ids = {qnum: question.protobowl
                     for qnum, question in all_questions.items()
                     if question.protobowl != ''}
    protobowl_df, user_count = load_protobowl()

    guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR,
                                              folds=[c.BUZZER_DEV_FOLD])
    questions = guesses_df.groupby('qnum')
    top_guesses = dict(_multiprocess(_get_top_guesses, questions,
                                     info='Top guesses', multi=True))

    with open(buzzes_dir, 'rb') as infile:
        buzzes = pickle.load(infile)

    save_dir = 'output/summary/new_performance/'
    inputs = [top_guesses, buzzes, answers, None,
              c.BUZZER_DEV_FOLD, save_dir]

    # Score against progressively more active protobowl users.
    user_answers_thresholds = [1, 10, 50, 100, 500, 1000, 2000]
    threshold_stats = []
    for threshold in user_answers_thresholds:
        active_users_df = protobowl_df[protobowl_df.user_answers > threshold]
        p_inputs = [question_texts, protobowl_ids,
                    active_users_df.groupby('qid'), questions] + inputs
        pstats = get_protobowl(p_inputs)
        threshold_stats.append(pstats)
        print(threshold, pstats)

    with open(buzzes_dir + '.pstats', 'wb') as f:
        pickle.dump(threshold_stats, f)
    print([x['reward'] for x in threshold_stats])
    # NOTE(review): tail of buzzer2vwexpo — the enclosing `def` is outside
    # this chunk, so the indent level below is reconstructed; confirm against
    # the full file.  `buzzf`/`finalf`/`predf`/`metaf` and the *_file handles
    # are presumably created earlier in the same function.
    log.info('\n\n[buzzer2vwexpo] writing to files')
    # 7 pipe-separated fields per buzz record.
    buzz_template = '|'.join(['{}' for _ in range(7)])
    buzz_out = '\n'.join(buzz_template.format(*r)
                         for r in itertools.chain(*buzzf))
    buzz_file.write(buzz_out)
    log.info('buzz file written')
    final_out = '\n'.join('{0},{1}'.format(*r)
                          for r in itertools.chain(*finalf))
    final_file.write(final_out)
    log.info('final file written')
    pred_out = '\n'.join('{0} {1}_{2}_{3}'.format(*r)
                         for r in itertools.chain(*predf))
    pred_file.write(pred_out)
    log.info('vw_pred file written')
    meta_out = '\n'.join('{0} {1} {2} {3}'.format(*r)
                         for r in itertools.chain(*metaf))
    meta_file.write(meta_out)
    log.info('vw_meta file written')


if __name__ == '__main__':
    # Regenerate vw-expo output files from previously pickled expo buzzes.
    model_name = 'neo_0'
    guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR, folds=['expo'])
    expo_buzzes_dir = 'output/buzzer/neo/expo_buzzes.{}.pkl'.format(model_name)
    with open(expo_buzzes_dir, 'rb') as f:
        expo_buzzes = pickle.load(f)
    buzzer2vwexpo(guesses_df, expo_buzzes, 'expo')