def stack(model_dir, model_name, fold=BUZZER_DEV_FOLD):
    """Tabulate how buzzer and oracle decisions agree at each relative position.

    Reads the RnnGuesser 'char' guesses for ``fold`` and the pickled buzzes in
    ``<model_dir>/<fold>_buzzes.pkl``. For every scored character index the
    top guess is compared with ``question.page`` (the "oracle") and the
    buzzer is taken to buzz when ``scores[i][1] > scores[i][0]``.

    Returns:
        A DataFrame with columns Position, Buzzing, Frequency and Model, where
        Frequency is normalised by the number of samples at that position.
        The same frame is pickled to ``<model_dir>/<fold>_stack.pkl``.
    """
    guess_file = AbstractGuesser.guess_path(
        AbstractGuesser.output_path('qanta.guesser.rnn', 'RnnGuesser', 0, ''),
        fold, 'char')
    with open(guess_file, 'rb') as fh:
        guesses_by_question = pickle.load(fh).groupby('qanta_id')

    with open(os.path.join(model_dir, '{}_buzzes.pkl'.format(fold)), 'rb') as fh:
        buzzes = pickle.load(fh)

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    questions = {q.qanta_id: q for q in fold_questions}

    # (oracle is correct, buzzer buzzes) -> category label
    labels = {
        (True, False): 'Only optimal',
        (False, True): 'Only buzzer',
        (True, True): 'Both',
        (False, False): 'Neither',
    }

    positions = []
    categories = []
    totals = defaultdict(lambda: 0)
    for qid, (char_indices, scores) in buzzes.items():
        per_index = guesses_by_question.get_group(qid).groupby('char_index')
        # Keep only the first (top) guess recorded at every character index.
        top_guess = per_index.aggregate(lambda x: x.head(1)).to_dict()['guess']
        question = questions[qid]
        text_length = len(question.text)
        for i, char_index in enumerate(char_indices):
            oracle_correct = bool(top_guess[char_index] == question.page)
            buzzer_buzzes = bool(scores[i][1] > scores[i][0])
            # Bucket the position into tenths of the question length.
            rel_position = np.round(char_index / text_length, decimals=1)
            totals[rel_position] += 1
            positions.append(rel_position)
            categories.append(labels[(oracle_correct, buzzer_buzzes)])

    freq = pd.DataFrame({'Position': positions, 'Buzzing': categories})
    freq = freq.groupby(['Position', 'Buzzing']).size().reset_index()
    freq = freq.rename(columns={0: 'Frequency'})
    freq['Frequency'] = freq.apply(
        lambda row: row['Frequency'] / totals[row['Position']], axis=1)
    freq['Model'] = pd.Series([model_name] * len(freq))

    with open(os.path.join(model_dir, '{}_stack.pkl'.format(fold)), 'wb') as fh:
        pickle.dump(freq, fh)
    return freq
import pickle

from qanta.guesser.abstract import AbstractGuesser
from qanta.guesser.dan import DanGuesser
from qanta.util.constants import BUZZER_TRAIN_FOLD, BUZZER_DEV_FOLD

# Script: load the saved DanGuesser (config 0) and pickle its guesses for the
# buzzer train and dev folds.
guesser_directory = AbstractGuesser.output_path(
    'qanta.guesser.dan', 'DanGuesser', 0, '')
guesser = DanGuesser.load(guesser_directory)  # type: AbstractGuesser

# Use an eighth of the saved batch size. Floor division keeps the value an
# integer (the previous `/=` silently turned it into a float), and the clamp
# prevents a zero batch size when the saved value is below 8.
guesser.batch_size = max(1, guesser.batch_size // 8)

word_skip = 2
folds = [BUZZER_TRAIN_FOLD, BUZZER_DEV_FOLD]
for fold in folds:
    df = guesser.generate_guesses(1, [fold], word_skip=word_skip)
    output_path = AbstractGuesser.guess_path(guesser_directory, fold)
    with open(output_path, 'wb') as f:
        pickle.dump(df, f)
def output(self):
    """Return LocalTargets for the guesser report PDF and its pickle, in that order."""
    report_files = ('guesser_report.pdf', 'guesser_report.pickle')
    return [
        LocalTarget(
            AbstractGuesser.output_path(
                self.guesser_module, self.guesser_class, report_name))
        for report_name in report_files
    ]
def run(self):
    """Copy params and guesses to the reporting directory, build reports, then delete the model.

    The parameter pickle and every per-fold guesses pickle are copied with
    ``cp`` into the reporting directory, a report is created for each fold,
    and finally the guesser output directory and the guesses files are
    removed with ``rm``.
    """
    guesser_class = get_class(self.guesser_module, self.guesser_class)

    def artifact(name):
        # Path of ``name`` inside this guesser configuration's output directory.
        return AbstractGuesser.output_path(
            self.guesser_module, self.guesser_class, self.config_num, name)

    reporting_directory = AbstractGuesser.reporting_path(
        self.guesser_module, self.guesser_class, self.config_num, "")

    # In the cases of huge parameter sweeps on SLURM it's easy to accidentally
    # run out of /fs/ storage. Since we only care about the results we can get
    # them, then delete the models. The regular GuesserReport can be used to
    # preserve the model.
    guesser_directory = artifact("")
    param_path = artifact("guesser_params.pickle")

    folds = [c.GUESSER_DEV_FOLD, c.GUESSER_TEST_FOLD]
    if os.path.exists(c.QANTA_EXPO_DATASET_PATH):
        folds.append(c.EXPO_FOLD)

    guesses_paths = [
        artifact(f"guesses_{kind}_{fold}.pickle")
        for fold in folds
        for kind in ("char", "full", "first")
    ]

    for source in [param_path, *guesses_paths]:
        log.info(f'Running: "cp {source} {reporting_directory}"')
        shell(f"cp {source} {reporting_directory}")

    guesser_instance = guesser_class(self.config_num)
    for fold in folds:
        guesser_instance.create_report(reporting_directory, fold)

    log.info(f'Running: "rm -rf {guesser_directory}"')
    shell(f"rm -rf {guesser_directory}")
    for stale_path in guesses_paths:
        shell(f"rm -f {stale_path}")
def read_data(fold,
              output_type='char',
              guesser_module='qanta.guesser.dan',
              guesser_class='DanGuesser',
              guesser_config_num=0,
              vector_converter=vector_converter_0):
    """Load the buzzer dataset for ``fold``, building and caching it if needed.

    If the cache file ``dataset_dir.format(fold)`` exists it is unpickled and
    returned. Otherwise the guesser's pickled guesses are grouped by
    ``qanta_id``, each group is passed to ``process_question`` (together with
    the fold's questions and ``vector_converter``) in a pool of 8 worker
    processes, and the resulting list is pickled to the cache file.

    Returns:
        The dataset: the cached object, or the list produced by the pool.
    """
    cache_path = dataset_dir.format(fold)
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)

    g_dir = AbstractGuesser.output_path(
        guesser_module, guesser_class, guesser_config_num, '')
    g_path = AbstractGuesser.guess_path(g_dir, fold, output_type)
    with open(g_path, 'rb') as f:
        df = pickle.load(f)
    df_groups = df.groupby('qanta_id')

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {q.qanta_id: q for q in questions[fold]}

    worker = partial(process_question, questions, vector_converter)
    # The context manager releases the worker processes once map() is done.
    with Pool(8) as pool:
        dataset = pool.map(worker, df_groups)

    with open(cache_path, 'wb') as f:
        # pickle.dump() returns None, so it must not be the return value.
        pickle.dump(dataset, f)
    return dataset
def main():
    """Run 10 rounds of a game between a human agent and the guesser/buzzer agent on 'dev' questions."""
    buzzer = RNNBuzzer()

    # Questions: only those whose fold is 'dev'.
    dev_questions = [
        q for q in list(QuestionDatabase().all_questions().values())
        if q.fold == 'dev'
    ]

    # Machine agent: the first enabled guesser, wrapped, paired with the buzzer.
    gspec = AbstractGuesser.list_enabled_guessers()[0]
    guesser_dir = AbstractGuesser.output_path(
        gspec.guesser_module, gspec.guesser_class, '')
    wrapped_guesser = ESGuesserWrapper(
        ElasticSearchWikidataGuesser.load(guesser_dir))
    machine_agent = GuesserBuzzerAgent(wrapped_guesser, buzzer)

    human_agent = HumanAgent()

    hooks = [
        hook.NotifyBuzzingHook,
        hook.GameInterfaceHook,
        hook.VisualizeGuesserBuzzerHook(machine_agent),
        hook.HighlightHook,
    ]

    Game(dev_questions, [human_agent, machine_agent], hooks).run(10)
def protobowl(model, fold=BUZZER_DEV_FOLD):
    """Simulate games for every question in ``fold`` and pickle the results.

    ``simulate_game`` is called once per question with the grouped guesses,
    the model's buzzes and the protobowl records grouped by 'qid'. The
    collected possibilities and outcomes are written as a DataFrame to
    ``<model.model_dir>/<fold>_protobowl.pkl``. Nothing is returned.
    """
    buzzes = get_buzzes(model, fold)

    # eval
    guess_file = AbstractGuesser.guess_path(
        AbstractGuesser.output_path('qanta.guesser.rnn', 'RnnGuesser', 0, ''),
        fold, 'char')
    with open(guess_file, 'rb') as fh:
        guesses = pickle.load(fh).groupby('qanta_id')

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    records_by_qid = load_protobowl().groupby('qid')

    possibility, outcome = [], []
    for question in tqdm(fold_questions):
        pos, out = simulate_game(guesses, buzzes, records_by_qid, question)
        possibility.extend(pos)
        outcome.extend(out)

    result_df = pd.DataFrame({'Possibility': possibility, 'Outcome': outcome})
    result_path = os.path.join(model.model_dir, '{}_protobowl.pkl'.format(fold))
    with open(result_path, 'wb') as fh:
        pickle.dump(result_df, fh)
def protobowl(model, fold=BUZZER_DEV_FOLD):
    """Simulate games for every question in ``fold`` and pickle the results.

    ``simulate_game`` is called once per question with the grouped guesses,
    the model's buzzes and the protobowl records grouped by "qid". The
    collected possibilities and outcomes are written as a DataFrame to
    ``<model.model_dir>/<fold>_protobowl.pkl``. Nothing is returned.
    """
    buzzes = get_buzzes(model, fold)

    # eval
    guess_file = AbstractGuesser.guess_path(
        AbstractGuesser.output_path("qanta.guesser.rnn", "RnnGuesser", 0, ""),
        fold,
        "char",
    )
    with open(guess_file, "rb") as fh:
        guesses = pickle.load(fh).groupby("qanta_id")

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    records_by_qid = load_protobowl().groupby("qid")

    possibility, outcome = [], []
    for question in tqdm(fold_questions):
        pos, out = simulate_game(guesses, buzzes, records_by_qid, question)
        possibility.extend(pos)
        outcome.extend(out)

    result_df = pd.DataFrame({"Possibility": possibility, "Outcome": outcome})
    result_path = os.path.join(model.model_dir, "{}_protobowl.pkl".format(fold))
    with open(result_path, "wb") as fh:
        pickle.dump(result_df, fh)
def test():
    """Collect (predictor answer, top guesser answer, gold answer) triples into 'results.pkl'.

    Only examples whose 'start' is not -1 are used. The predictor runs on
    CUDA device 0; the guesser's highest-scoring guess has underscores
    replaced by spaces before being stored.
    """
    gspec = AbstractGuesser.list_enabled_guessers()[0]
    guesser = ElasticSearchWikidataGuesser.load(
        AbstractGuesser.output_path(gspec.guesser_module, gspec.guesser_class, ''))

    torch.cuda.set_device(0)
    predictor = Predictor()
    predictor.cuda()

    usable_examples = [
        ex for ex in BonusPairsDataset().examples if ex['start'] != -1
    ]

    results = []
    for example in tqdm(usable_examples):
        content = example['content']
        query = example['query']
        answer = example['answer']

        prediction = predictor.predict(content, query, top_n=1)[0][0]

        scored = guesser.guess_single(query)
        # Ascending sort then reverse, so the highest score comes first.
        ranked = sorted(scored.items(), key=lambda item: item[1])[::-1]
        best_guess = ranked[0][0].replace('_', ' ')

        results.append((prediction, best_guess, answer))

    with open('results.pkl', 'wb') as fh:
        pickle.dump(results, fh)
def output(self):
    """Return LocalTargets for the output directory, the params pickle and the guesser's own files."""
    def in_output_dir(name):
        # Path of ``name`` under this guesser's output directory.
        return AbstractGuesser.output_path(
            self.guesser_module, self.guesser_class, name)

    guesser_class = get_class(self.guesser_module, self.guesser_class)
    model_file_targets = [
        LocalTarget(path) for path in guesser_class.files(in_output_dir(''))
    ]
    fixed_targets = [
        LocalTarget(in_output_dir('')),
        LocalTarget(in_output_dir('guesser_params.pickle')),
    ]
    return fixed_targets + model_file_targets
def run(self):
    """Train the guesser, save it, and pickle its parameters plus the training time."""
    guesser = get_class(self.guesser_module, self.guesser_class)()  # type: AbstractGuesser
    dataset = guesser.qb_dataset()

    # The timed span covers fetching the training data as well as training.
    started = time.time()
    guesser.train(dataset.training_data())
    training_time = time.time() - started

    guesser.save(
        AbstractGuesser.output_path(self.guesser_module, self.guesser_class, ''))

    params = guesser.parameters()
    params['training_time'] = training_time
    params_path = AbstractGuesser.output_path(
        self.guesser_module, self.guesser_class, 'guesser_params.pickle')
    with open(params_path, 'wb') as fh:
        pickle.dump(params, fh)
def output(self):
    """Return the LocalTarget for this fold's guesser report pickle."""
    report_name = f"guesser_report_{self.fold}.pickle"
    report_path = AbstractGuesser.output_path(
        self.guesser_module, self.guesser_class, self.config_num, report_name)
    return LocalTarget(report_path)
def run(self):
    """Generate and save guesses for ``self.fold`` in three passes.

    The passes are: every ``char_skip`` characters (saved as "char"), the
    full question ("full") and the first sentence ("first"). ``char_skip``
    comes from ``conf`` and depends on which fold is being processed.
    """
    guesser_class = get_class(self.guesser_module, self.guesser_class)
    guesser_directory = AbstractGuesser.output_path(
        self.guesser_module, self.guesser_class, self.config_num, "")
    guesser_instance = guesser_class.load(guesser_directory)  # type: AbstractGuesser

    if self.fold in {c.GUESSER_TRAIN_FOLD, c.GUESSER_DEV_FOLD}:
        skip_key = "guesser_char_skip"
    elif self.fold == c.EXPO_FOLD:
        skip_key = "expo_char_skip"
    else:
        skip_key = "buzzer_char_skip"
    char_skip = conf[skip_key]

    # (announcement, extra generate_guesses kwargs, output type passed to save_guesses)
    passes = [
        (
            f"Generating and saving guesses for {self.fold} fold with char_skip={char_skip}...",
            {"char_skip": char_skip},
            "char",
        ),
        (
            f"Generating and saving guesses for {self.fold} fold with full question...",
            {"full_question": True},
            "full",
        ),
        (
            f"Generating and saving guesses for {self.fold} fold with first sentence",
            {"first_sentence": True},
            "first",
        ),
    ]

    for announcement, guess_kwargs, output_type in passes:
        log.info(announcement)
        started = time.time()
        guess_df = guesser_instance.generate_guesses(
            self.n_guesses, [self.fold], **guess_kwargs)
        elapsed = time.time() - started
        log.info(
            f"Guessing on {self.fold} fold took {elapsed}s, saving guesses...")
        guesser_class.save_guesses(
            guess_df, guesser_directory, [self.fold], output_type)
        log.info("Done saving guesses")
def run(self):
    """Generate and save guesses for ``self.fold`` in three passes.

    The passes are: every ``char_skip`` characters (saved as 'char'), the
    full question ('full') and the first sentence ('first'). ``char_skip``
    comes from ``conf`` and depends on whether the fold is a guesser fold.
    """
    guesser_class = get_class(self.guesser_module, self.guesser_class)
    guesser_directory = AbstractGuesser.output_path(
        self.guesser_module, self.guesser_class, self.config_num, '')
    guesser_instance = guesser_class.load(guesser_directory)  # type: AbstractGuesser

    is_guesser_fold = self.fold in {c.GUESSER_TRAIN_FOLD, c.GUESSER_DEV_FOLD}
    char_skip = conf['guesser_char_skip' if is_guesser_fold else 'buzzer_char_skip']

    # (announcement, extra generate_guesses kwargs, output type passed to save_guesses)
    passes = [
        (
            f'Generating and saving guesses for {self.fold} fold with char_skip={char_skip}...',
            {'char_skip': char_skip},
            'char',
        ),
        (
            f'Generating and saving guesses for {self.fold} fold with full question...',
            {'full_question': True},
            'full',
        ),
        (
            f'Generating and saving guesses for {self.fold} fold with first sentence',
            {'first_sentence': True},
            'first',
        ),
    ]

    for announcement, guess_kwargs, output_type in passes:
        log.info(announcement)
        started = time.time()
        guess_df = guesser_instance.generate_guesses(
            self.n_guesses, [self.fold], **guess_kwargs)
        elapsed = time.time() - started
        log.info(
            f'Guessing on {self.fold} fold took {elapsed}s, saving guesses...')
        guesser_class.save_guesses(
            guess_df, guesser_directory, [self.fold], output_type)
        log.info('Done saving guesses')
def output(self):
    """Return LocalTargets for this fold's char, full and first guesses pickles, in that order."""
    targets = []
    for kind in ('char', 'full', 'first'):
        guesses_path = AbstractGuesser.output_path(
            self.guesser_module, self.guesser_class, self.config_num,
            f'guesses_{kind}_{self.fold}.pickle')
        targets.append(LocalTarget(guesses_path))
    return targets
def run(self):
    """Train the configured guesser, save it, and pickle its parameters plus the training time."""
    def in_output_dir(name):
        # Path of ``name`` under this guesser configuration's output directory.
        return AbstractGuesser.output_path(
            self.guesser_module, self.guesser_class, self.config_num, name)

    guesser = get_class(self.guesser_module, self.guesser_class)(
        self.config_num)  # type: AbstractGuesser
    dataset = guesser.qb_dataset()

    # The timed span covers fetching the training data as well as training.
    started = time.time()
    guesser.train(dataset.training_data())
    training_time = time.time() - started

    guesser.save(in_output_dir(""))

    params = guesser.parameters()
    params["training_time"] = training_time
    with open(in_output_dir("guesser_params.pickle"), "wb") as fh:
        pickle.dump(params, fh)
def test_buzzer():
    """Print the buzzer's decision after each growing word prefix of one question.

    The question is the fifth key of the question database; the first clue
    is the empty string and the full text is never passed (prefixes stop one
    word short).
    """
    questions = QuestionDatabase().all_questions()
    buzzer = RNNBuzzer(word_skip=conf['buzzer_word_skip'])

    # Machine agent: the first enabled guesser, wrapped.
    gspec = AbstractGuesser.list_enabled_guessers()[0]
    guesser = ESGuesserWrapper(
        ElasticSearchWikidataGuesser.load(
            AbstractGuesser.output_path(
                gspec.guesser_module, gspec.guesser_class, '')))

    chosen_key = list(questions.keys())[4]
    words = questions[chosen_key].flatten_text().split()
    for prefix_len, _ in enumerate(words):
        clue = ' '.join(words[:prefix_len])
        print(buzzer.buzz(guesser.guess(clue)))
def ew(model, fold=BUZZER_DEV_FOLD):
    """Score the model's buzzes with CurveScore and print/return the totals.

    For every buzzed question a list of ``{'char_index', 'guess', 'buzz'}``
    dicts is built (top guess at that index; buzz when
    ``scores[0] < scores[1]``) and scored with ``CurveScore.score`` and
    ``CurveScore.score_optimal``.

    Returns:
        dict with 'expected_wins', 'n_examples' and 'expected_wins_optimal';
        the same dict is printed as JSON.
    """
    buzzes = get_buzzes(model, fold)

    guess_file = AbstractGuesser.guess_path(
        AbstractGuesser.output_path('qanta.guesser.rnn', 'RnnGuesser', 0, ''),
        fold, 'char')
    with open(guess_file, 'rb') as fh:
        guesses = pickle.load(fh).groupby('qanta_id')

    answers = {}
    for qid, buzz_record in buzzes.items():
        by_char = guesses.get_group(qid).groupby('char_index')
        answers[qid] = [
            {
                'char_index': char_index,
                'guess': by_char.get_group(char_index).head(1)['guess'].values[0],
                'buzz': scores[0] < scores[1],
            }
            for char_index, scores in zip(*buzz_record)
        ]

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    questions = {q.qanta_id: q for q in fold_questions}

    curve_score = CurveScore()
    wins = []
    optimal_wins = []
    for qid, answer in answers.items():
        question = questions[qid]
        q = {'text': question.text, 'page': question.page}
        wins.append(curve_score.score(answer, q))
        optimal_wins.append(curve_score.score_optimal(answer, q))

    eval_out = {
        'expected_wins': sum(wins),
        'n_examples': len(wins),
        'expected_wins_optimal': sum(optimal_wins),
    }
    print(json.dumps(eval_out))
    return eval_out
def ew(model, fold=BUZZER_DEV_FOLD):
    """Score the model's buzzes with CurveScore and print/return the totals.

    For every buzzed question a list of ``{'char_index', 'guess', 'buzz'}``
    dicts is built (top guess at that index; buzz when
    ``scores[0] < scores[1]``) and scored with ``CurveScore.score`` and
    ``CurveScore.score_optimal``.

    Returns:
        dict with 'expected_wins', 'n_examples' and 'expected_wins_optimal';
        the same dict is printed as JSON.
    """
    buzzes = get_buzzes(model, fold)

    guess_file = AbstractGuesser.guess_path(
        AbstractGuesser.output_path(
            'qanta.guesser.rnn', 'RnnGuesser', 0, ''),
        fold, 'char')
    with open(guess_file, 'rb') as fh:
        guesses = pickle.load(fh).groupby('qanta_id')

    answers = {}
    for qid, buzz_record in buzzes.items():
        by_char = guesses.get_group(qid).groupby('char_index')
        answers[qid] = [
            {
                'char_index': char_index,
                'guess': by_char.get_group(char_index).head(1)['guess'].values[0],
                'buzz': scores[0] < scores[1],
            }
            for char_index, scores in zip(*buzz_record)
        ]

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    questions = {q.qanta_id: q for q in fold_questions}

    curve_score = CurveScore()
    wins = []
    optimal_wins = []
    for qid, answer in answers.items():
        question = questions[qid]
        q = {'text': question.text, 'page': question.page}
        wins.append(curve_score.score(answer, q))
        optimal_wins.append(curve_score.score_optimal(answer, q))

    eval_out = {
        'expected_wins': sum(wins),
        'n_examples': len(wins),
        'expected_wins_optimal': sum(optimal_wins),
    }
    print(json.dumps(eval_out))
    return eval_out
def ew(model, fold=BUZZER_DEV_FOLD):
    """Score the model's buzzes with CurveScore and print/return the totals.

    For every buzzed question a list of ``{"char_index", "guess", "buzz"}``
    dicts is built (top guess at that index; buzz when
    ``scores[0] < scores[1]``) and scored with ``CurveScore.score`` and
    ``CurveScore.score_optimal``.

    Returns:
        dict with "expected_wins", "n_examples" and "expected_wins_optimal";
        the same dict is printed as JSON.
    """
    buzzes = get_buzzes(model, fold)

    guess_file = AbstractGuesser.guess_path(
        AbstractGuesser.output_path("qanta.guesser.rnn", "RnnGuesser", 0, ""),
        fold,
        "char",
    )
    with open(guess_file, "rb") as fh:
        guesses = pickle.load(fh).groupby("qanta_id")

    answers = {}
    for qid, buzz_record in buzzes.items():
        by_char = guesses.get_group(qid).groupby("char_index")
        answers[qid] = [
            {
                "char_index": char_index,
                "guess": by_char.get_group(char_index).head(1)["guess"].values[0],
                "buzz": scores[0] < scores[1],
            }
            for char_index, scores in zip(*buzz_record)
        ]

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    questions = {q.qanta_id: q for q in fold_questions}

    curve_score = CurveScore()
    wins = []
    optimal_wins = []
    for qid, answer in answers.items():
        question = questions[qid]
        q = {"text": question.text, "page": question.page}
        wins.append(curve_score.score(answer, q))
        optimal_wins.append(curve_score.score_optimal(answer, q))

    eval_out = {
        "expected_wins": sum(wins),
        "n_examples": len(wins),
        "expected_wins_optimal": sum(optimal_wins),
    }
    print(json.dumps(eval_out))
    return eval_out
def run(self):
    """Generate guesses for ``self.fold`` every ``word_skip`` words and save them.

    ``word_skip`` is read from ``conf``: the guesser setting for the guesser
    train/dev folds, the buzzer setting for every other fold.
    """
    guesser_class = get_class(self.guesser_module, self.guesser_class)
    guesser_directory = AbstractGuesser.output_path(
        self.guesser_module, self.guesser_class, '')
    guesser_instance = guesser_class.load(guesser_directory)  # type: AbstractGuesser

    is_guesser_fold = self.fold in {c.GUESSER_TRAIN_FOLD, c.GUESSER_DEV_FOLD}
    word_skip = conf['guesser_word_skip' if is_guesser_fold else 'buzzer_word_skip']

    announcement = 'Generating and saving guesses for {} fold with word_skip={}...'
    log.info(announcement.format(self.fold, word_skip))

    started = time.time()
    guess_df = guesser_instance.generate_guesses(
        self.n_guesses, [self.fold], word_skip=word_skip)
    finished = time.time()

    timing = 'Guessing on {} fold took {}s, saving guesses...'
    log.info(timing.format(self.fold, finished - started))
    guesser_class.save_guesses(guess_df, guesser_directory, [self.fold])
    log.info('Done saving guesses')
import pickle

from qanta.guesser.abstract import AbstractGuesser
from qanta.guesser.dan import DanGuesser
from qanta.util.constants import BUZZER_TRAIN_FOLD, BUZZER_DEV_FOLD

# Script: load the saved DanGuesser (config 0) and pickle its guesses for the
# buzzer train and dev folds.
guesser_directory = AbstractGuesser.output_path(
    "qanta.guesser.dan", "DanGuesser", 0, ""
)
guesser = DanGuesser.load(guesser_directory)  # type: AbstractGuesser

# Use an eighth of the saved batch size. Floor division keeps the value an
# integer (the previous `/=` silently turned it into a float), and the clamp
# prevents a zero batch size when the saved value is below 8.
guesser.batch_size = max(1, guesser.batch_size // 8)

word_skip = 2
folds = [BUZZER_TRAIN_FOLD, BUZZER_DEV_FOLD]
for fold in folds:
    df = guesser.generate_guesses(1, [fold], word_skip=word_skip)
    output_path = AbstractGuesser.guess_path(guesser_directory, fold)
    with open(output_path, "wb") as f:
        pickle.dump(df, f)
def output(self):
    """Return the LocalTarget for this fold's guesses pickle."""
    guesses_file = 'guesses_{}.pickle'.format(self.fold)
    guesses_path = AbstractGuesser.output_path(
        self.guesser_module, self.guesser_class, guesses_file)
    return LocalTarget(guesses_path)
def stack(model_dir, model_name, fold=BUZZER_DEV_FOLD):
    """Tabulate how buzzer and oracle decisions agree at each relative position.

    Reads the RnnGuesser "char" guesses for ``fold`` and the pickled buzzes in
    ``<model_dir>/<fold>_buzzes.pkl``. For every scored character index the
    top guess is compared with ``question.page`` (the "oracle") and the
    buzzer is taken to buzz when ``scores[i][1] > scores[i][0]``.

    Returns:
        A DataFrame with columns Position, Buzzing, Frequency and Model, where
        Frequency is normalised by the number of samples at that position.
        The same frame is pickled to ``<model_dir>/<fold>_stack.pkl``.
    """
    guess_file = AbstractGuesser.guess_path(
        AbstractGuesser.output_path("qanta.guesser.rnn", "RnnGuesser", 0, ""),
        fold,
        "char",
    )
    with open(guess_file, "rb") as fh:
        guesses_by_question = pickle.load(fh).groupby("qanta_id")

    with open(os.path.join(model_dir, "{}_buzzes.pkl".format(fold)), "rb") as fh:
        buzzes = pickle.load(fh)

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    questions = {q.qanta_id: q for q in fold_questions}

    # (oracle is correct, buzzer buzzes) -> category label
    labels = {
        (True, False): "Only optimal",
        (False, True): "Only buzzer",
        (True, True): "Both",
        (False, False): "Neither",
    }

    positions = []
    categories = []
    totals = defaultdict(lambda: 0)
    for qid, (char_indices, scores) in buzzes.items():
        per_index = guesses_by_question.get_group(qid).groupby("char_index")
        # Keep only the first (top) guess recorded at every character index.
        top_guess = per_index.aggregate(lambda x: x.head(1)).to_dict()["guess"]
        question = questions[qid]
        text_length = len(question.text)
        for i, char_index in enumerate(char_indices):
            oracle_correct = bool(top_guess[char_index] == question.page)
            buzzer_buzzes = bool(scores[i][1] > scores[i][0])
            # Bucket the position into tenths of the question length.
            rel_position = np.round(char_index / text_length, decimals=1)
            totals[rel_position] += 1
            positions.append(rel_position)
            categories.append(labels[(oracle_correct, buzzer_buzzes)])

    freq = pd.DataFrame({"Position": positions, "Buzzing": categories})
    freq = freq.groupby(["Position", "Buzzing"]).size().reset_index()
    freq = freq.rename(columns={0: "Frequency"})
    freq["Frequency"] = freq.apply(
        lambda row: row["Frequency"] / totals[row["Position"]], axis=1
    )
    freq["Model"] = pd.Series([model_name] * len(freq))

    with open(os.path.join(model_dir, "{}_stack.pkl".format(fold)), "wb") as fh:
        pickle.dump(freq, fh)
    return freq
from tqdm import tqdm
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl import DocType, Text, Keyword, Search, Index
from qanta.util.constants import GUESSER_DEV_FOLD
from qanta.guesser.abstract import AbstractGuesser
from qanta.datasets.quiz_bowl import QuizBowlDataset
from qanta.guesser.experimental.elasticsearch_instance_of import ElasticSearchWikidataGuesser
from qanta.guesser.experimental.elasticsearch_instance_of import ElasticSearchIndex

INDEX_NAME = 'qb_ir_instance_of'

# Module-level setup: load the first enabled guesser from its output
# directory and create an ElasticSearchIndex handle. Runs at import time.
gspec = AbstractGuesser.list_enabled_guessers()[0]
guesser_dir = AbstractGuesser.output_path(gspec.guesser_module, gspec.guesser_class, '')
guesser = ElasticSearchWikidataGuesser.load(guesser_dir)
es_index = ElasticSearchIndex()


# NOTE(review): the source this was recovered from had no indentation, so the
# loop nesting below is reconstructed - confirm against the original file.
# NOTE(review): `search_not` is not defined in the visible code, `new_guesses`
# is never used and nothing is returned, so this function may be truncated.
def recursive_guess(question, k=0):
    # Predicted class and its probability for the question text.
    p_class, p_prob = guesser.test_instance_of([question])[0]
    first_guesses = search_not(question, p_class)
    print('First round')
    for x in first_guesses:
        print(x)
    print()
    print('Second round')
    new_guesses = []
    for i in range(k):
        # Append the i-th first-round guess (underscores turned into spaces)
        # to the question text, then search again with the longer text.
        guess = first_guesses[i][0]
        question += ' ' + ' '.join(guess.split('_'))
        guesses = es_index.search(question, p_class, p_prob, 0.6)
def export(output_file: str, fold: str = "buzztest"):
    """Export per-question oracle and buzzer positions for three buzzers as JSON.

    For each of RNNBuzzer, ThresholdBuzzer and MLPBuzzer the pickled buzzes
    in ``output/buzzer/<name>/<fold>_buzzes.pkl`` are compared against the
    RnnGuesser "char" guesses. Per question and buzzer the export records the
    character index where the top guess matches ``question.page`` ("oracle")
    and where the buzzer buzzes (``scores[i][1] > scores[i][0]``), the same
    positions as fractions of the question length, the guesses at those
    positions, and whether the question is "impossible" (the top guess never
    matches the page).

    Args:
        output_file: path handed to ``write_json``.
        fold: fold to export. Previously this argument was overwritten with
            "buzztest" on the first line and therefore ignored; it is now
            honoured, with the same default.
    """
    guesses_dir = AbstractGuesser.output_path("qanta.guesser.rnn", "RnnGuesser", 0, "")
    guesses_dir = AbstractGuesser.guess_path(guesses_dir, fold, "char")
    with open(guesses_dir, "rb") as f:
        guesses = pickle.load(f)
    guesses = guesses.groupby("qanta_id")

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {q.qanta_id: q for q in questions[fold]}

    buzzers = {}
    for name in ["RNNBuzzer", "ThresholdBuzzer", "MLPBuzzer"]:
        model_dir = f"output/buzzer/{name}"
        buzzes_dir = os.path.join(model_dir, "{}_buzzes.pkl".format(fold))
        with open(buzzes_dir, "rb") as f:
            buzzers[name] = pickle.load(f)

    qid_to_buzzes = defaultdict(dict)
    for name, buzzes in track(buzzers.items()):
        for qid, (char_indices, scores) in buzzes.items():
            # Top (first) guess recorded at every character index.
            gs = (guesses.get_group(qid).groupby("char_index").aggregate(
                lambda x: x.head(1)).to_dict()["guess"])
            question = questions[qid]
            q_len = len(question.text)

            # -1 means "never happened" for both positions.
            buzz_oracle_position = -1
            buzz_model_position = -1
            oracle_guess = None
            buzz_guess = None
            for i, char_index in enumerate(char_indices):
                buzz_oracle = gs[char_index] == question.page
                if buzz_oracle:
                    if buzz_oracle_position == -1 or char_index <= buzz_oracle_position:
                        oracle_guess = question.page
                        buzz_oracle_position = char_index
                if scores[i][1] > scores[i][0]:
                    if buzz_model_position == -1 or char_index < buzz_model_position:
                        buzz_guess = gs[char_index]
                        buzz_model_position = char_index

            qid_to_buzzes[qid][name] = {
                "oracle": buzz_oracle_position,
                "oracle_fraction": buzz_oracle_position / q_len
                if buzz_oracle_position != -1 else -1,
                "position": buzz_model_position,
                "position_fraction": buzz_model_position / q_len
                if buzz_model_position != -1 else -1,
                "q_len": q_len,
                "oracle_guess": oracle_guess,
                "buzz_guess": buzz_guess,
                "answer": question.page,
                "impossible": oracle_guess is None,
            }

    write_json(output_file, qid_to_buzzes)
import random import pickle from qanta.config import conf from qanta.util.io import safe_path from qanta.util.multiprocess import _multiprocess from qanta.guesser.abstract import AbstractGuesser from qanta.datasets.quiz_bowl import QuizBowlDataset, Question from qanta.guesser.experimental.elasticsearch_instance_of import ElasticSearchWikidataGuesser '''Randomly shuffle the word order and see if it changes the guesses. ''' gspec = AbstractGuesser.list_enabled_guessers()[0] guesser_dir = AbstractGuesser.output_path(gspec.guesser_module, gspec.guesser_class, '') guesser = ElasticSearchWikidataGuesser.load(guesser_dir) def main(): fold = 'guessdev' db = QuizBowlDataset(1, guesser_train=True, buzzer_train=True) questions = db.questions_in_folds([fold]) first_n = lambda x: len(x) print(guesser.guess_single(' '.join(questions[0].text.values()))) ''' s = [0, 0, 0, 0, 0] for q in questions: sents = list(q.text.values()) text_before = ' '.join(sents[:first_n(sents)]) words = text.split()
def run(self):
    """Create the guesser report for ``self.fold`` inside the guesser's output directory."""
    guesser_cls = get_class(self.guesser_module, self.guesser_class)
    report_dir = AbstractGuesser.output_path(
        self.guesser_module, self.guesser_class, self.config_num, '')
    guesser_cls(self.config_num).create_report(report_dir, self.fold)
def stack(model_dir, model_name, fold=BUZZER_DEV_FOLD):
    """Tabulate how buzzer and oracle decisions agree at each relative position.

    Reads the RnnGuesser 'char' guesses for ``fold`` and the pickled buzzes in
    ``<model_dir>/<fold>_buzzes.pkl``. For every scored character index the
    top guess is compared with ``question.page`` (the "oracle") and the
    buzzer is taken to buzz when ``scores[i][1] > scores[i][0]``.

    Returns:
        A DataFrame with columns Position, Buzzing, Frequency and Model, where
        Frequency is normalised by the number of samples at that position.
        The same frame is pickled to ``<model_dir>/<fold>_stack.pkl``.
    """
    guess_file = AbstractGuesser.guess_path(
        AbstractGuesser.output_path('qanta.guesser.rnn', 'RnnGuesser', 0, ''),
        fold, 'char')
    with open(guess_file, 'rb') as fh:
        guesses_by_question = pickle.load(fh).groupby('qanta_id')

    with open(os.path.join(model_dir, '{}_buzzes.pkl'.format(fold)), 'rb') as fh:
        buzzes = pickle.load(fh)

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    questions = {q.qanta_id: q for q in fold_questions}

    # (oracle is correct, buzzer buzzes) -> category label
    labels = {
        (True, False): 'Only optimal',
        (False, True): 'Only buzzer',
        (True, True): 'Both',
        (False, False): 'Neither',
    }

    positions = []
    categories = []
    totals = defaultdict(lambda: 0)
    for qid, (char_indices, scores) in buzzes.items():
        per_index = guesses_by_question.get_group(qid).groupby('char_index')
        # Keep only the first (top) guess recorded at every character index.
        top_guess = per_index.aggregate(lambda x: x.head(1)).to_dict()['guess']
        question = questions[qid]
        text_length = len(question.text)
        for i, char_index in enumerate(char_indices):
            oracle_correct = bool(top_guess[char_index] == question.page)
            buzzer_buzzes = bool(scores[i][1] > scores[i][0])
            # Bucket the position into tenths of the question length.
            rel_position = np.round(char_index / text_length, decimals=1)
            totals[rel_position] += 1
            positions.append(rel_position)
            categories.append(labels[(oracle_correct, buzzer_buzzes)])

    freq = pd.DataFrame({'Position': positions, 'Buzzing': categories})
    freq = freq.groupby(['Position', 'Buzzing']).size().reset_index()
    freq = freq.rename(columns={0: 'Frequency'})
    freq['Frequency'] = freq.apply(
        lambda row: row['Frequency'] / totals[row['Position']], axis=1)
    freq['Model'] = pd.Series([model_name] * len(freq))

    with open(os.path.join(model_dir, '{}_stack.pkl'.format(fold)), 'wb') as fh:
        pickle.dump(freq, fh)
    return freq
def read_data(
        fold,
        output_type='char',
        guesser_module='qanta.guesser.rnn',
        guesser_class='RnnGuesser',
        guesser_config_num=0,
        vector_converter=vector_converter_0):
    """Load the buzzer dataset for ``fold``, building and caching it if needed.

    If the cache file ``dataset_dir.format(fold)`` exists it is unpickled and
    returned. Otherwise the guesser's pickled guesses are grouped by
    ``qanta_id``, each group is passed to ``process_question`` (together with
    the fold's questions and ``vector_converter``) in a pool of 8 worker
    processes, and the resulting list is pickled to the cache file.

    Returns:
        The dataset: the cached object, or the list produced by the pool.
    """
    cache_path = dataset_dir.format(fold)
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)

    g_dir = AbstractGuesser.output_path(
        guesser_module, guesser_class, guesser_config_num, '')
    g_path = AbstractGuesser.guess_path(g_dir, fold, output_type)
    with open(g_path, 'rb') as f:
        df = pickle.load(f)
    df_groups = df.groupby('qanta_id')

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {q.qanta_id: q for q in questions[fold]}

    worker = partial(process_question, questions, vector_converter)
    # Previously the pool was never closed, leaving its worker processes
    # behind; the context manager releases them once map() is done.
    with Pool(8) as pool:
        dataset = pool.map(worker, df_groups)

    with open(cache_path, 'wb') as f:
        pickle.dump(dataset, f)
    return dataset
def protobowl(model, fold=BUZZER_DEV_FOLD):
    """Simulate games for every question in ``fold`` and pickle the results.

    ``simulate_game`` is called once per question with the grouped guesses,
    the model's buzzes and the protobowl records grouped by 'qid'. The
    collected possibilities and outcomes are written as a DataFrame to
    ``<model.model_dir>/<fold>_protobowl.pkl``. Nothing is returned.
    """
    buzzes = get_buzzes(model, fold)

    # eval
    guess_file = AbstractGuesser.guess_path(
        AbstractGuesser.output_path(
            'qanta.guesser.rnn', 'RnnGuesser', 0, ''),
        fold, 'char')
    with open(guess_file, 'rb') as fh:
        guesses = pickle.load(fh).groupby('qanta_id')

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    records_by_qid = load_protobowl().groupby('qid')

    possibility, outcome = [], []
    for question in tqdm(fold_questions):
        pos, out = simulate_game(guesses, buzzes, records_by_qid, question)
        possibility.extend(pos)
        outcome.extend(out)

    result_df = pd.DataFrame({'Possibility': possibility, 'Outcome': outcome})
    result_path = os.path.join(
        model.model_dir, '{}_protobowl.pkl'.format(fold))
    with open(result_path, 'wb') as fh:
        pickle.dump(result_df, fh)