def run(self):
    safe_path(WIKI_DISAMBIGUATION_PAGES)
    if is_aws_authenticated():
        s3_location = 's3://pinafore-us-west-2/public/disambiguation_pages.json'
        shell('aws s3 cp {} {}'.format(s3_location, WIKI_DISAMBIGUATION_PAGES))
    else:
        https_location = 'https://s3-us-west-2.amazonaws.com/pinafore-us-west-2/public/disambiguation_pages.json'
        shell('wget -O {} {}'.format(WIKI_DISAMBIGUATION_PAGES, https_location))
def run(self):
    safe_path(WIKI_DISAMBIGUATION_PAGES)
    if is_aws_authenticated():
        s3_location = "s3://pinafore-us-west-2/public/disambiguation_pages.json"
        shell("aws s3 cp {} {}".format(s3_location, WIKI_DISAMBIGUATION_PAGES))
    else:
        https_location = "https://s3-us-west-2.amazonaws.com/pinafore-us-west-2/public/disambiguation_pages.json"
        shell("wget -O {} {}".format(WIKI_DISAMBIGUATION_PAGES, https_location))
def run(self):
    safe_path(ALL_WIKI_REDIRECTS)
    if is_aws_authenticated():
        s3_location = 's3://pinafore-us-west-2/public/wiki_redirects.csv'
        shell('aws s3 cp {} {}'.format(s3_location, ALL_WIKI_REDIRECTS))
    else:
        https_location = 'https://s3-us-west-2.amazonaws.com/pinafore-us-west-2/public/wiki_redirects.csv'
        shell('wget -O {} {}'.format(ALL_WIKI_REDIRECTS, https_location))
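The download tasks above repeat the same S3-or-HTTPS fallback. Below is a minimal sketch of that pattern factored into a helper; the helper name download_from_s3 is an invention for illustration, and it assumes safe_path, is_aws_authenticated, and shell behave as they do in the tasks above.

# Hypothetical helper, not part of the original codebase.
S3_BUCKET = 's3://pinafore-us-west-2/public'
HTTPS_BUCKET = 'https://s3-us-west-2.amazonaws.com/pinafore-us-west-2/public'


def download_from_s3(filename, local_path):
    # safe_path is called for its side effect, as in the tasks above
    safe_path(local_path)
    if is_aws_authenticated():
        shell('aws s3 cp {}/{} {}'.format(S3_BUCKET, filename, local_path))
    else:
        shell('wget -O {} {}/{}'.format(local_path, HTTPS_BUCKET, filename))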
def requires(self):
    yield Download(
        url=path.join(S3_HTTP_PREFIX, PROTOBOWL_TOSSUPS),
        path=safe_path(PROTOBOWL_TOSSUPS_PATH),
    )
    yield Download(
        url=path.join(S3_HTTP_PREFIX, PROTOBOWL_LOGS),
        path=safe_path(PROTOBOWL_LOGS_PATH),
    )
def requires(self):
    yield Download(
        url=path.join(S3_HTTP_PREFIX, PROTOBOWL_TOSSUPS),
        path=safe_path(PROTOBOWL_TOSSUPS_PATH)
    )
    yield Download(
        url=path.join(S3_HTTP_PREFIX, PROTOBOWL_LOGS),
        path=safe_path(PROTOBOWL_LOGS_PATH)
    )
def requires(self):
    yield Download(url=path.join(S3_HTTP_PREFIX, QDB_CATEGORIES),
                   path=safe_path(QDB_CATEGORIES_PATH))
    yield Download(url=path.join(S3_HTTP_PREFIX, QDB_SUBCATEGORIES),
                   path=safe_path(QDB_SUBCATEGORIES_PATH))
    yield Download(url=path.join(S3_HTTP_PREFIX, QDB_TOURNAMENTS),
                   path=safe_path(QDB_TOURNAMENTS_PATH))
    yield Download(url=path.join(S3_HTTP_PREFIX, QDB_TOSSUPS),
                   path=safe_path(QDB_TOSSUPS_PATH))
def load_quizbowl(folds=c.BUZZER_INPUT_FOLDS) \
        -> Tuple[Dict[str, int], Dict[str, list]]:
    # merge_dfs()
    log.info('Loading data')
    question_db = QuestionDatabase()
    quizbowl_db = QuizBowlDataset(bc.MIN_ANSWERS, guesser_train=True, buzzer_train=True)
    all_questions = question_db.all_questions()
    # Build the answer option vocabulary once and cache it to disk
    if not os.path.isfile(bc.OPTIONS_DIR):
        log.info('Loading the set of options')
        all_options = set(quizbowl_db.training_data()[1])
        id2option = list(all_options)
        with open(safe_path(bc.OPTIONS_DIR), 'wb') as outfile:
            pickle.dump(id2option, outfile)
    else:
        with open(safe_path(bc.OPTIONS_DIR), 'rb') as infile:
            id2option = pickle.load(infile)
    option2id = {o: i for i, o in enumerate(id2option)}
    num_options = len(id2option)
    log.info('Number of options {0}'.format(len(id2option)))

    guesses_by_fold = dict()
    for fold in folds:
        # Reuse processed guesses for this fold if a cached pickle exists
        save_dir = '%s_processed.pickle' % (os.path.join(bc.GUESSES_DIR, fold))
        if os.path.isfile(save_dir):
            with open(safe_path(save_dir), 'rb') as infile:
                guesses_by_fold[fold] = pickle.load(infile)
            log.info('Loading {0} guesses'.format(fold))
            continue

        log.info('Processing {0} guesses'.format(fold))
        guesses = AbstractGuesser.load_guesses(bc.GUESSES_DIR, folds=[fold])

        worker = partial(_process_question, option2id, all_questions)
        inputs = guesses.groupby('qnum')
        guesses_by_fold[fold] = _multiprocess(worker, inputs, info='df data', multi=True)
        guesses_by_fold[fold] = [x for x in guesses_by_fold[fold] if x is not None]
        print(len(guesses_by_fold[fold]))

        with open(safe_path(save_dir), 'wb') as outfile:
            pickle.dump(guesses_by_fold[fold], outfile)
        log.info('Processed {0} guesses saved to {1}'.format(fold, save_dir))

    return option2id, guesses_by_fold
def create_es_config(output_path, host="localhost", port=9200, tmp_dir=None): if tmp_dir is None: tmp_dir = get_tmp_dir() data_dir = safe_path(os.path.join(tmp_dir, "elasticsearch/data/")) log_dir = safe_path(os.path.join(tmp_dir, "elasticsearch/log/")) env = Environment(loader=PackageLoader("qanta", "templates")) template = env.get_template("elasticsearch.yml") config_content = template.render({ "host": host, "port": port, "log_dir": log_dir, "data_dir": data_dir }) with open(output_path, "w") as f: f.write(config_content)
def create_es_config(output_path, host='localhost', port=9200, tmp_dir=None):
    if tmp_dir is None:
        tmp_dir = get_tmp_dir()
    data_dir = safe_path(os.path.join(tmp_dir, 'elasticsearch/data/'))
    log_dir = safe_path(os.path.join(tmp_dir, 'elasticsearch/log/'))
    env = Environment(loader=PackageLoader('qanta', 'templates'))
    template = env.get_template('elasticsearch.yml')
    config_content = template.render({
        'host': host,
        'port': port,
        'log_dir': log_dir,
        'data_dir': data_dir
    })
    with open(output_path, 'w') as f:
        f.write(config_content)
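A hedged usage sketch for create_es_config: the output path below is invented for the example, and it assumes the qanta package and its templates/elasticsearch.yml template are importable.

# Illustrative only: renders an Elasticsearch config to a made-up location.
create_es_config('/tmp/qanta/elasticsearch.yml', host='localhost', port=9200)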
def buzzer2vwexpo(guesses_df: pd.DataFrame,
                  buzzes: Dict[int, List[List[float]]],
                  fold: str) -> None:
    # TODO: Will be deprecated after VW stuff is removed from the pipeline
    '''Given buzzing positions, generate vw_pred, vw_meta, buzz and final files
    guesses_df: pd.DataFrame of guesses
    buzzes: dictionary of qnum -> buzzing position
    fold: string indicating the data fold
    '''
    warnings.warn(
        "buzzer2vwexpo will be deprecated after VW stuff is completely removed from the pipeline",
        DeprecationWarning)
    inputs = guesses_df.groupby('qnum')
    worker = partial(_buzzer2vwexpo, buzzes)
    result = _multiprocess(worker, inputs, info='buzzer2vwexpo')
    result = [x for x in result if x is not None]
    buzzf, predf, metaf, finalf = list(map(list, zip(*result)))

    with codecs.open(safe_path(c.PRED_TARGET.format(fold)), 'w', 'utf-8') as pred_file, \
         codecs.open(safe_path(c.META_TARGET.format(fold)), 'w', 'utf-8') as meta_file, \
         codecs.open(safe_path(c.EXPO_BUZZ.format(fold)), 'w', 'utf-8') as buzz_file, \
         codecs.open(safe_path(c.EXPO_FINAL.format(fold)), 'w', 'utf-8') as final_file:

        buzz_file.write('question|sentence|word|page|evidence|final|weight\n')
        final_file.write('question,answer\n')

        log.info('\n\n[buzzer2vwexpo] writing to files')

        buzz_template = '|'.join(['{}' for _ in range(7)])
        buzz_out = '\n'.join(
            buzz_template.format(*r) for r in itertools.chain(*buzzf))
        buzz_file.write(buzz_out)
        log.info('buzz file written')

        final_out = '\n'.join(
            '{0},{1}'.format(*r) for r in itertools.chain(*finalf))
        final_file.write(final_out)
        log.info('final file written')

        pred_out = '\n'.join(
            '{0} {1}_{2}_{3}'.format(*r) for r in itertools.chain(*predf))
        pred_file.write(pred_out)
        log.info('vw_pred file written')

        meta_out = '\n'.join(
            '{0} {1} {2} {3}'.format(*r) for r in itertools.chain(*metaf))
        meta_file.write(meta_out)
        log.info('vw_meta file written')
def audit_report(df, output):
    df.to_csv(output)
    df.head(25).plot.bar('feature', 'value')
    plt.title('Feature Magnitudes')
    plt.xlabel('Magnitude')
    plt.savefig('/tmp/feature_importance.png', dpi=200, format='png')
    pd.set_option('display.width', 1000)
    pd.set_option('display.max_rows', 100)
    pd.set_option('display.max_colwidth', 30)
    top_features = str(df.head(100))
    report = ReportGenerator({
        'feature_importance_plot': '/tmp/feature_importance.png',
        'top_features': top_features
    }, 'audit_regressor.md')
    output = safe_path(VW_AUDIT_REGRESSOR_REPORT)
    report.create(output)
    plt.clf()
    plt.cla()
    plt.close()
def load(cls, directory: str) -> AbstractGuesser:
    guesser = DANGuesser()
    embeddings, embedding_lookup = _load_embeddings(root_directory=directory)
    guesser.embeddings = embeddings
    guesser.embedding_lookup = embedding_lookup
    params_path = os.path.join(directory, DEEP_DAN_PARAMS_TARGET)
    with open(params_path, 'rb') as f:
        params = pickle.load(f)
        guesser.max_len = params['max_len']
        guesser.class_to_i = params['class_to_i']
        guesser.i_to_class = params['i_to_class']
        guesser.vocab = params['vocab']
        guesser.n_classes = params['n_classes']
        if (guesser.max_len is None
                or guesser.class_to_i is None
                or guesser.i_to_class is None
                or guesser.vocab is None
                or guesser.n_classes is None):
            raise ValueError('Attempting to load uninitialized model parameters')
    model_path = os.path.join(directory, DEEP_DAN_MODEL_TARGET)
    shell('cp -r {} {}'.format(model_path, safe_path(DEEP_DAN_MODEL_TMP_DIR)))
    we_path = os.path.join(directory, TF_DAN_WE)
    shutil.copyfile(TF_DAN_WE_TMP, we_path)
    return guesser
def generate(min_count, pred_file, meta_file, output):
    database = QuestionDatabase()
    data = load_data(pred_file, meta_file, database)
    dan_answers = set(database.page_by_count(min_count, True))
    answers = compute_answers(data, dan_answers)
    stats = compute_statistics(answers).cache()
    stats.to_json(safe_path(output), root_array=False)
    pprint.pprint(stats)
def train(self, training_data: TrainingData) -> None:
    log.info('Preprocessing training data...')
    x_train, y_train, x_test, y_test, vocab, class_to_i, i_to_class = preprocess_dataset(
        training_data,
        create_runs=self.train_on_q_runs,
        full_question=self.train_on_full_q)
    # Optionally mix in Wikipedia pages as additional training examples
    if self.wiki_data_frac > 0:
        log.info('Using wikipedia with fraction: {}'.format(self.wiki_data_frac))
        wiki_data = FilteredWikipediaDataset().training_data()
        results = preprocess_dataset(
            wiki_data,
            train_size=1,
            vocab=vocab,
            class_to_i=class_to_i,
            i_to_class=i_to_class)
        x_train.extend(results[0])
        y_train.extend(results[1])

    self.class_to_i = class_to_i
    self.i_to_class = i_to_class
    self.vocab = vocab

    log.info('Creating embeddings...')
    embeddings, embedding_lookup = load_embeddings(
        vocab=vocab, expand_glove=self.expand_we, mask_zero=True)
    self.embeddings = embeddings
    self.embedding_lookup = embedding_lookup

    log.info('Converting dataset to embeddings...')
    # Map question text to padded sequences of embedding indices
    x_train = [nn.convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_train]
    x_test = [nn.convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_test]
    self.n_classes = nn.compute_n_classes(training_data[1])
    self.max_len = nn.compute_max_len(training_data)
    x_train = np.array(nn.tf_format(x_train, self.max_len, 0))
    x_test = np.array(nn.tf_format(x_test, self.max_len, 0))

    log.info('Building keras model...')
    self.model = self.build_model()

    log.info('Training model...')
    callbacks = [
        TensorBoard(),
        EarlyStopping(patience=self.max_patience, monitor='val_sparse_categorical_accuracy'),
        ModelCheckpoint(
            safe_path(CNN_MODEL_TMP_TARGET),
            save_best_only=True,
            monitor='val_sparse_categorical_accuracy'
        )
    ]
    if self.decay_lr_on_plateau:
        callbacks.append(
            ReduceLROnPlateau(monitor='val_sparse_categorical_accuracy', factor=.5, patience=5))
    history = self.model.fit(
        x_train, y_train,
        validation_data=(x_test, y_test),
        batch_size=self.batch_size,
        epochs=self.max_n_epochs,
        callbacks=callbacks,
        verbose=2
    )
    self.history = history.history
    log.info('Done training')
def run(self):
    db = QuestionDatabase(QB_QUESTION_DB)
    data = load_data(PRED_TARGET.format(self.fold), META_TARGET.format(self.fold), db)
    audit_data = load_audit(VW_AUDIT.format(self.fold), META_TARGET.format(self.fold))

    buzz_file = open(safe_path(EXPO_BUZZ.format(self.fold)), 'w', newline='')
    buzz_file.write('question,sentence,word,page,evidence,final,weight\n')
    buzz_writer = csv.writer(buzz_file, delimiter=',')

    final_file = open(safe_path(EXPO_FINAL.format(self.fold)), 'w', newline='')
    final_file.write('question,answer\n')
    final_writer = csv.writer(final_file, delimiter=',')

    for qnum, lines in data:
        final_sentence, final_token, final_guess = find_final(lines)
        if final_sentence == -1 and final_token == -1:
            final_writer.writerow([qnum, final_guess])
        for l in lines:
            i = 0
            is_final = False
            if l.sentence == final_sentence and l.token == final_token:
                final_writer.writerow([qnum, l.guess])
                is_final = True
            for g in l.all_guesses:
                evidence = audit_data[(l.question, l.sentence, l.token, g.guess)]
                buzz_writer.writerow([
                    l.question, l.sentence, l.token, g.guess, evidence,
                    int(is_final and g.guess == l.guess), g.score
                ])
                i += 1
                if i > 4:
                    break
    buzz_file.close()
    final_file.close()
def save(self, directory: str) -> None:
    params_path = os.path.join(directory, DEEP_DAN_PARAMS_TARGET)
    with safe_open(params_path, 'wb') as f:
        if (self.max_len is None
                or self.class_to_i is None
                or self.i_to_class is None
                or self.vocab is None
                or self.n_classes is None):
            raise ValueError('Attempting to save uninitialized model parameters')
        pickle.dump({
            'max_len': self.max_len,
            'class_to_i': self.class_to_i,
            'i_to_class': self.i_to_class,
            'vocab': self.vocab,
            'n_classes': self.n_classes
        }, f)
    model_path = os.path.join(directory, DEEP_DAN_MODEL_TARGET)
    shell('cp -r {} {}'.format(DEEP_DAN_MODEL_TMP_DIR, safe_path(model_path)))
    we_path = os.path.join(directory, TF_DAN_WE)
    shutil.copyfile(TF_DAN_WE_TMP, safe_path(we_path))
def run(self):
    protobowl_questions = Protobowl.parse_tossups(PROTOBOWL_TOSSUPS_PATH)
    quizdb_tournaments = QuizdbOrg.parse_tournaments(QDB_TOURNAMENTS_PATH)
    quizdb_categories = QuizdbOrg.parse_categories(QDB_CATEGORIES_PATH)
    quizdb_subcategories = QuizdbOrg.parse_subcategories(QDB_SUBCATEGORIES_PATH)
    quizdb_questions = QuizdbOrg.parse_tossups(
        quizdb_tournaments, quizdb_categories, quizdb_subcategories, QDB_TOSSUPS_PATH
    )
    qanta_questions = merge_datasets(protobowl_questions, quizdb_questions)
    with open(safe_path(QANTA_UNMAPPED_DATASET_PATH), 'w') as f:
        json.dump(format_qanta_json(qanta_questions, DS_VERSION), f)
def run(self): archive = safe_path("data/external/wikipedia/parsed-wiki.tar.lz4") if is_aws_authenticated(): s3_location = f"s3://pinafore-us-west-2/public/parsed-wiki.tar.lz4" shell(f"aws s3 cp {s3_location} {archive}") else: https_location = "https://s3-us-west-2.amazonaws.com/pinafore-us-west-2/public/parsed-wiki.tar.lz4" shell(f"wget -O {archive} {https_location}") shell(f"lz4 -d {archive} | tar -x -C data/external/wikipedia/") shell(f"rm {archive}") shell("touch data/external/wikipedia/parsed-wiki_SUCCESS")
def run(self):
    archive = safe_path('data/external/wikipedia/parsed-wiki.tar.lz4')
    if is_aws_authenticated():
        s3_location = f's3://pinafore-us-west-2/public/parsed-wiki.tar.lz4'
        shell(f'aws s3 cp {s3_location} {archive}')
    else:
        https_location = 'https://s3-us-west-2.amazonaws.com/pinafore-us-west-2/public/parsed-wiki.tar.lz4'
        shell(f'wget -O {archive} {https_location}')

    shell(f'lz4 -d {archive} | tar -x -C data/external/wikipedia/')
    shell(f'rm {archive}')
    shell('touch data/external/wikipedia/parsed-wiki_SUCCESS')
def run(self):
    protobowl_questions = Protobowl.parse_tossups(PROTOBOWL_TOSSUPS_PATH)
    quizdb_tournaments = QuizdbOrg.parse_tournaments(QDB_TOURNAMENTS_PATH)
    quizdb_categories = QuizdbOrg.parse_categories(QDB_CATEGORIES_PATH)
    quizdb_subcategories = QuizdbOrg.parse_subcategories(QDB_SUBCATEGORIES_PATH)
    quizdb_questions = QuizdbOrg.parse_tossups(
        quizdb_tournaments, quizdb_categories, quizdb_subcategories, QDB_TOSSUPS_PATH)
    qanta_questions = merge_datasets(protobowl_questions, quizdb_questions)
    with open(safe_path(QANTA_UNMAPPED_DATASET_PATH), 'w') as f:
        json.dump(format_qanta_json(qanta_questions, DS_VERSION), f)
def run(self):
    db = QuestionDatabase(QB_QUESTION_DB)
    questions = db.all_questions()
    with open(safe_path(EXPO_QUESTIONS), 'w', newline='') as f:
        f.write('id,answer,sent,text\n')
        writer = csv.writer(f, delimiter=',')
        for q in questions.values():
            if q.fold != 'test':
                continue
            max_sent = max(q.text.keys())
            for i in range(max_sent + 1):
                writer.writerow([q.qnum, format_guess(q.page), i, q.text[i]])
def build(cls, documents: Dict[str, str], index_path=WHOOSH_WIKI_INDEX_PATH):
    ix = index.create_in(safe_path(index_path), cls.schema)
    writer = ix.writer()
    cw = CachedWikipedia()
    print("Building whoosh wiki index from {0} pages".format(len(documents)))
    bar = progressbar.ProgressBar()
    for p in bar(documents):
        writer.add_document(page=p, content=cw[p].content, quiz_bowl=documents[p])
    writer.commit()
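For context, a minimal sketch of how an index written by build() might be searched afterwards using Whoosh's standard query API. The query string is illustrative, and it assumes the page field is stored in cls.schema.

from whoosh import index
from whoosh.qparser import QueryParser

# Open the index built above and run a simple full-text query over 'content'.
ix = index.open_dir(WHOOSH_WIKI_INDEX_PATH)
with ix.searcher() as searcher:
    parser = QueryParser('content', schema=ix.schema)
    query = parser.parse('quantum mechanics')  # illustrative query
    for hit in searcher.search(query, limit=10):
        print(hit['page'])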
def write_feature_df(feature_df, feature_names: list):
    log.info('Beginning write job')
    for fold in c.VW_FOLDS:
        feature_df_with_fold = feature_df.filter(feature_df.fold == fold).cache()
        for name in feature_names:
            filename = safe_path('output/features/{}/{}.parquet'.format(fold, name))
            feature_df_with_fold\
                .filter('feature_name = "{}"'.format(name))\
                .write\
                .partitionBy('qnum')\
                .parquet(filename, mode='overwrite')
        feature_df_with_fold.unpersist()
def create_wikipedia_cache(dump_path):
    from qanta.spark import create_spark_session

    spark = create_spark_session()
    db = QuestionDatabase()
    answers = set(db.all_answers().values())
    b_answers = spark.sparkContext.broadcast(answers)
    # Paths used in Spark need to be absolute, and the directory needs to exist
    page_path = os.path.abspath(safe_path(WIKI_PAGE_PATH))

    def create_page(row):
        title = normalize_wikipedia_title(row.title)
        filter_answers = b_answers.value
        if title in filter_answers:
            page = WikipediaPage(title, row.text, None, None, row.id, row.url)
            write_page(page, page_path=page_path)

    spark.read.json(dump_path).rdd.foreach(create_page)
def main(questions, n_keep, ckp_dir):
    db = QuizBowlDataset(guesser_train=True, buzzer_train=True)
    questions = db.questions_in_folds(['guessdev'])
    questions = {x.qnum: x for x in questions}
    checkpoint = defaultdict(dict)
    for qnum, question in questions.items():
        text_before = question.flatten_text()
        guesses_before = guesser.guess_single(text_before)
        text_after, guesses_after, removed = greedy_remove(
            text_before, guesses_before, n_keep)
        checkpoint[qnum]['text_before'] = text_before
        checkpoint[qnum]['text_after'] = text_after
        checkpoint[qnum]['guesses_before'] = guesses_before
        checkpoint[qnum]['guesses_after'] = guesses_after
        checkpoint[qnum]['removed'] = removed
    checkpoint = dict(checkpoint)
    with open(safe_path(ckp_dir), 'wb') as f:
        pickle.dump(checkpoint, f)
    evaluate(ckp_dir)
def main(questions, n_keep, ckp_dir): db = QuizBowlDataset(guesser_train=True, buzzer_train=True) questions = db.questions_in_folds(["guessdev"]) questions = {x.qnum: x for x in questions} checkpoint = defaultdict(dict) for qnum, question in questions.items(): text_before = question.flatten_text() guesses_before = guesser.guess_single(text_before) text_after, guesses_after, removed = greedy_remove( text_before, guesses_before, n_keep ) checkpoint[qnum]["text_before"] = text_before checkpoint[qnum]["text_after"] = text_after checkpoint[qnum]["guesses_before"] = guesses_before checkpoint[qnum]["guesses_after"] = guesses_after checkpoint[qnum]["removed"] = removed checkpoint = dict(checkpoint) with open(safe_path(ckp_dir), "wb") as f: pickle.dump(checkpoint, f) evaluate(ckp_dir)
def save(self, directory: str) -> None:
    model_path = safe_path(os.path.join(directory, 'vw_guesser.vw'))
    shell(f'mv {self.model_file}.vw {model_path}')
    self.model_file = model_path
    data = {
        'label_to_i': self.label_to_i,
        'i_to_label': self.i_to_label,
        'max_label': self.max_label,
        'multiclass_one_against_all': self.multiclass_one_against_all,
        'multiclass_online_trees': self.multiclass_online_trees,
        'l1': self.l1,
        'l2': self.l2,
        'passes': self.passes,
        'learning_rate': self.learning_rate,
        'decay_learning_rate': self.decay_learning_rate,
        'bits': self.bits,
        'ngrams': self.ngrams,
        'skips': self.skips,
        'config_num': self.config_num,
        'random_seed': self.random_seed
    }
    data_pickle_path = os.path.join(directory, 'vw_guesser.pickle')
    with open(data_pickle_path, 'wb') as f:
        pickle.dump(data, f)
def save(self, directory: str) -> None: model_path = safe_path(os.path.join(directory, "vw_guesser.vw")) shell(f"mv {self.model_file}.vw {model_path}") self.model_file = model_path data = { "label_to_i": self.label_to_i, "i_to_label": self.i_to_label, "max_label": self.max_label, "multiclass_one_against_all": self.multiclass_one_against_all, "multiclass_online_trees": self.multiclass_online_trees, "l1": self.l1, "l2": self.l2, "passes": self.passes, "learning_rate": self.learning_rate, "decay_learning_rate": self.decay_learning_rate, "bits": self.bits, "ngrams": self.ngrams, "skips": self.skips, "config_num": self.config_num, "random_seed": self.random_seed, } data_pickle_path = os.path.join(directory, "vw_guesser.pickle") with open(data_pickle_path, "wb") as f: pickle.dump(data, f)
def reporting_path(guesser_module: str, guesser_class: str, config_num: int, file: str): guesser_path = "{}.{}".format(guesser_module, guesser_class) return safe_path( os.path.join(c.GUESSER_REPORTING_PREFIX, guesser_path, str(config_num), file))
def output_path(guesser_module: str, guesser_class: str, config_num: int, file: str): guesser_path = "{}.{}".format(guesser_module, guesser_class) return safe_path( os.path.join(c.GUESSER_TARGET_PREFIX, guesser_path, str(config_num), file))
def reporting_path(guesser_module: str, guesser_class: str, config_num: int, file: str):
    guesser_path = '{}.{}'.format(guesser_module, guesser_class)
    return safe_path(os.path.join(
        c.GUESSER_REPORTING_PREFIX, guesser_path, str(config_num), file
    ))
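To make the directory layout concrete, a hedged example of the kind of path these helpers return; the module, class, config number, and file name below are invented, and the actual prefix comes from c.GUESSER_REPORTING_PREFIX.

# Illustrative call; arguments are assumptions, not values from the project.
path = reporting_path('qanta.guesser.dan', 'DANGuesser', 0, 'report.pdf')
# yields something like:
#   <GUESSER_REPORTING_PREFIX>/qanta.guesser.dan.DANGuesser/0/report.pdf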
def run(self): make_dirs(safe_path("output/predictions/")) make_dirs(safe_path("output/expo/")) make_dirs(safe_path("output/vw_input/")) config = conf["buzzer"]["config"] buzzer_test.generate(config, [self.fold])
def run(self): make_dirs(safe_path("output/buzzers/")) train_cost_sensitive(conf["buzzer"]["config"], c.BUZZER_GENERATION_FOLDS)
def n_guesser_report(report_path, fold, n_samples=10):
    qdb = QuestionDatabase()
    question_lookup = qdb.all_questions()
    questions = [q for q in question_lookup.values() if q.fold == fold]
    guess_dataframes = []
    folds = [fold]
    for g_spec in AbstractGuesser.list_enabled_guessers():
        path = AbstractGuesser.output_path(g_spec.guesser_module, g_spec.guesser_class, '')
        guess_dataframes.append(AbstractGuesser.load_guesses(path, folds=folds))
    df = pd.concat(guess_dataframes)  # type: pd.DataFrame
    guessers = set(df['guesser'].unique())
    n_guessers = len(guessers)

    guesses = []
    for name, group in df.groupby(['guesser', 'qnum', 'sentence', 'token']):
        top_guess = group.sort_values('score', ascending=False).iloc[0]
        guesses.append(top_guess)
    top_df = pd.DataFrame.from_records(guesses)

    guess_lookup = {}
    for name, group in top_df.groupby(['qnum', 'sentence', 'token']):
        guess_lookup[name] = group

    performance = {}
    question_positions = {}
    n_correct_samples = defaultdict(list)
    for q in questions:
        page = q.page
        positions = [(sent, token) for sent, token, _ in q.partials()]
        # Since partials() passes word_skip=-1 each entry is guaranteed to be a sentence
        n_sentences = len(positions)
        q_positions = {
            'start': 1,
            'p_25': max(1, round(n_sentences * .25)),
            'p_50': max(1, round(n_sentences * .5)),
            'p_75': max(1, round(n_sentences * .75)),
            'end': len(positions)
        }
        question_positions[q.qnum] = q_positions

        for sent, token in positions:
            key = (q.qnum, sent, token)
            if key in guess_lookup:
                guesses = guess_lookup[key]
                n_correct = (guesses.guess == page).sum()
                n_correct_samples[n_correct].append(key)
                if n_correct == 0:
                    correct_guessers = 'None'
                elif n_correct == n_guessers:
                    correct_guessers = 'All'
                else:
                    correct_guessers = '/'.join(
                        sorted(guesses[guesses.guess == page].guesser.values))
            else:
                n_correct = 0
                correct_guessers = 'None'
            performance[key] = (n_correct, correct_guessers)

    start_accuracies = []
    p_25_accuracies = []
    p_50_accuracies = []
    p_75_accuracies = []
    end_accuracies = []

    for q in questions:
        qnum = q.qnum
        start_pos = question_positions[qnum]['start']
        p_25_pos = question_positions[qnum]['p_25']
        p_50_pos = question_positions[qnum]['p_50']
        p_75_pos = question_positions[qnum]['p_75']
        end_pos = question_positions[qnum]['end']
        start_accuracies.append((*performance[(qnum, start_pos, 0)], 'start'))
        p_25_accuracies.append((*performance[(qnum, p_25_pos, 0)], 'p_25'))
        p_50_accuracies.append((*performance[(qnum, p_50_pos, 0)], 'p_50'))
        p_75_accuracies.append((*performance[(qnum, p_75_pos, 0)], 'p_75'))
        end_accuracies.append((*performance[(qnum, end_pos, 0)], 'end'))

    all_accuracies = start_accuracies + p_25_accuracies + p_50_accuracies + p_75_accuracies + end_accuracies

    perf_df = pd.DataFrame.from_records(
        all_accuracies,
        columns=['n_guessers_correct', 'correct_guessers', 'position'])
    perf_df['count'] = 1
    n_questions = len(questions)
    aggregate_df = (perf_df
                    .groupby(['position', 'n_guessers_correct', 'correct_guessers'])
                    .count() / n_questions).reset_index()

    fig, ax = plt.subplots(figsize=(12, 8), nrows=2, ncols=3, sharey=True, sharex=True)
    positions = {
        'start': (0, 0),
        'p_25': (0, 1),
        'p_50': (1, 0),
        'p_75': (1, 1),
        'end': (1, 2)
    }
    position_labels = {
        'start': 'Start',
        'p_25': '25%',
        'p_50': '50%',
        'p_75': '75%',
        'end': '100%'
    }
    ax[(0, 2)].axis('off')
    for p, key in positions.items():
        data = aggregate_df[aggregate_df.position == p].pivot(
            index='n_guessers_correct', columns='correct_guessers').fillna(0)['count']
        plot_ax = ax[key]
        data.plot.bar(
            stacked=True, ax=plot_ax,
            title='Question Position: {}'.format(position_labels[p]))
        handles, labels = plot_ax.get_legend_handles_labels()
        ax_legend = plot_ax.legend()
        ax_legend.set_visible(False)
        plot_ax.set(xlabel='Number of Correct Guessers', ylabel='Accuracy')

    for plot_ax in list(ax.flatten()):
        for tk in plot_ax.get_yticklabels():
            tk.set_visible(True)
        for tk in plot_ax.get_xticklabels():
            tk.set_rotation('horizontal')

    fig.legend(handles, labels, bbox_to_anchor=(.8, .75))
    fig.suptitle('Accuracy Breakdown by Guesser')
    accuracy_by_n_correct_plot_path = '/tmp/accuracy_by_n_correct_{}.png'.format(fold)
    fig.savefig(accuracy_by_n_correct_plot_path, dpi=200)

    sampled_questions_by_correct = sample_n_guesser_correct_questions(
        question_lookup, guess_lookup, n_correct_samples, n_samples=n_samples)

    report = ReportGenerator('compare_guessers.md')
    report.create({
        'dev_accuracy_by_n_correct_plot': accuracy_by_n_correct_plot_path,
        'sampled_questions_by_correct': sampled_questions_by_correct
    }, safe_path(report_path))
def run(self):
    make_dirs(safe_path('output/buzzers/'))
    train_cost_sensitive(conf['buzzer']['config'], c.BUZZER_GENERATION_FOLDS)
def output_path(guesser_module: str, guesser_class: str, config_num: int, file: str):
    guesser_path = '{}.{}'.format(guesser_module, guesser_class)
    return safe_path(os.path.join(
        c.GUESSER_TARGET_PREFIX, guesser_path, str(config_num), file
    ))
def output_path(guesser_module: str, guesser_class: str, file: str):
    guesser_path = '{}.{}'.format(guesser_module, guesser_class)
    return safe_path(
        os.path.join(c.GUESSER_TARGET_PREFIX, guesser_path, file))
def run(self):
    make_dirs(safe_path('output/predictions/'))
    make_dirs(safe_path('output/expo/'))
    make_dirs(safe_path('output/vw_input/'))
    config = conf['buzzer']['config']
    buzzer_test.generate(config, [self.fold])