import csv
import json
import logging
import os
import pickle
from collections import Counter
from os import path
from typing import Dict

import pandas as pd

# Module logger; the repo wires up its own logging helper, stdlib logging is a
# stand-in here.
log = logging.getLogger(__name__)

# QantaDatabase, Question, AbstractGuesser, conf, the unmapped_rows helper, and
# the constants referenced below (COUNTRY_LIST_PATH, WIKI_LOOKUP_PATH,
# QANTA_MAP_REPORT_PATH, UNMAPPED_COLUMNS, DISAGREE_COLUMNS,
# GUESSER_TRAIN_FOLD, BUZZER_TRAIN_FOLD) are assumed to be imported at module
# level elsewhere in the source.


def adversarial_to_json(adversarial_json, json_dir):
    """Convert a JSON file of adversarial questions into qanta dataset format."""
    from qanta.datasets.quiz_bowl import QantaDatabase

    db = QantaDatabase()
    # Map lowercased Wikipedia page titles to their canonical casing
    lookup = {q.page.lower(): q.page for q in db.mapped_questions}
    with open(adversarial_json) as f:
        questions = json.load(f)

    rows = []
    for i, q in enumerate(questions):
        answer = q['answer'].strip().replace(' ', '_')
        # Match case-insensitively: the lookup is keyed by lowercased page, so
        # the raw answer must be lowercased before the membership test
        if answer.lower() in lookup:
            answer = lookup[answer.lower()]
        else:
            log.warning(f'Could not find: {answer}')
        rows.append({
            'text': q['question'].strip(),
            'page': answer,
            'answer': '',
            'qanta_id': 1000000 + i,  # offset to avoid colliding with real ids
            'proto_id': None,
            'qdb_id': None,
            'category': '',
            'subcategory': '',
            'tournament': '',
            'difficulty': '',
            'dataset': 'adversarial',
            'year': -1,
            'fold': 'expo',
            'gameplay': False
        })

    from qanta.ingestion.preprocess import add_sentences_, format_qanta_json
    from qanta.util.constants import DS_VERSION

    add_sentences_(rows, parallel=False)
    with open(path.join(json_dir, f'qanta.expo.{DS_VERSION}.json'), 'w') as f:
        json.dump(format_qanta_json(rows, DS_VERSION), f)
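# Usage sketch (the input path is hypothetical; the adversarial JSON is
# assumed to be a list of {'question': ..., 'answer': ...} records, which is
# the shape the loop above reads):
#
#   adversarial_to_json('data/external/adversarial.json', 'data/external/datasets')
#
# This writes qanta.expo.<DS_VERSION>.json into the given directory with every
# question assigned to the 'expo' fold.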
def create_wikipedia_redirect_pickle(redirect_csv, output_pickle):
    """Filter raw Wikipedia redirects down to those targeting known answer
    pages and pickle the resulting source -> target mapping."""
    countries = {}
    with open(COUNTRY_LIST_PATH) as f:
        for line in f:
            k, v = line.split('\t')
            countries[k] = v.strip()

    db = QantaDatabase()
    pages = {q.page for q in db.train_questions}

    with open(redirect_csv) as redirect_f:
        redirects = {}
        n_total = 0
        n_selected = 0
        for row in csv.reader(redirect_f, quotechar='"', escapechar='\\'):
            n_total += 1
            source = row[0]
            target = row[1]
            # Keep only redirects to pages we can answer with, skipping
            # country aliases and Wikipedia housekeeping pages
            if (target not in pages
                    or source in countries
                    or target.startswith('WikiProject')
                    or target.endswith('_topics')
                    or target.endswith('_(overview)')):
                continue
            redirects[source] = target
            n_selected += 1

    log.info('Filtered {} raw wikipedia redirects to {} matching redirects'.format(n_total, n_selected))

    with open(output_pickle, 'wb') as output_f:
        pickle.dump(redirects, output_f)
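# A minimal sketch of consuming the pickle written above (the helper name is
# illustrative, not part of the repo):
def load_redirect_lookup(redirect_pickle):
    """Load the source -> target redirect mapping, e.g. to canonicalize an
    answer string before looking it up as a Wikipedia page."""
    with open(redirect_pickle, 'rb') as f:
        return pickle.load(f)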
def create_wikipedia_cache(parsed_wiki_path='data/external/wikipedia/parsed-wiki',
                           output_path=WIKI_LOOKUP_PATH):
    """Build a title -> page lookup for every Wikipedia page that answers a
    training question, using Spark to scan the parsed dump."""
    from qanta.spark import create_spark_context

    sc = create_spark_context()
    db = QantaDatabase()
    train_questions = db.train_questions
    answers = {q.page for q in train_questions}
    # Broadcast the answer set so every Spark worker can filter locally
    b_answers = sc.broadcast(answers)
    # Paths used in Spark need to be absolute, and the directory must exist
    page_path = os.path.abspath(parsed_wiki_path)
    page_pattern = os.path.join(page_path, '*', '*')

    def parse_page(json_text):
        page = json.loads(json_text)
        return {
            'id': int(page['id']),
            'title': page['title'].replace(' ', '_'),
            'text': page['text'],
            'url': page['url']
        }

    wiki_pages = (
        sc.textFile(page_pattern)
        .map(parse_page)
        .filter(lambda p: p['title'] in b_answers.value)
        .collect()
    )
    wiki_lookup = {p['title']: p for p in wiki_pages}
    with open(output_path, 'w') as f:
        json.dump(wiki_lookup, f)

    return wiki_lookup
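# Illustrative reader for the cache written above (hypothetical helper); each
# value is a dict with the 'id', 'title', 'text', and 'url' keys produced by
# parse_page:
def load_wiki_lookup(lookup_path=WIKI_LOOKUP_PATH):
    with open(lookup_path) as f:
        return json.load(f)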
def create_answer_mapping_csvs(output_dir='data/external/answer_mapping'):
    """Write CSVs of questions whose answers are unmapped, plus questions where
    the automatic and annotated page mappings disagree."""
    with open(QANTA_MAP_REPORT_PATH) as f:
        report = json.load(f)
    match_report = report['match_report']
    db = QantaDatabase()
    qb_lookup: Dict[int, Question] = {q.qanta_id: q for q in db.all_questions}

    train_rows = unmapped_rows(match_report, report['train_unmatched'])
    test_rows = unmapped_rows(match_report, report['test_unmatched'])

    train_df = pd.DataFrame.from_records(train_rows, columns=UNMAPPED_COLUMNS)
    test_df = pd.DataFrame.from_records(test_rows, columns=UNMAPPED_COLUMNS)
    train_df.to_csv(os.path.join(output_dir, 'unmapped_train.csv'))
    test_df.to_csv(os.path.join(output_dir, 'unmapped_test.csv'))

    disagree_rows = []
    for qanta_id, row in match_report.items():
        if row['result'] == 'disagree':
            q = qb_lookup[int(qanta_id)]
            # Slice out the last sentence of the question text
            start, end = q.tokenizations[-1]
            is_train = q.fold in (GUESSER_TRAIN_FOLD, BUZZER_TRAIN_FOLD)
            disagree_rows.append((
                'disagree', None, q.proto_id, q.qdb_id, q.qanta_id, is_train,
                q.text[start:end], q.answer,
                row['automatic_page'], row['annotated_page']
            ))

    disagree_df = pd.DataFrame.from_records(disagree_rows, columns=DISAGREE_COLUMNS)
    disagree_df[disagree_df.is_train].to_csv(os.path.join(output_dir, 'disagree_train.csv'))
    disagree_df[~disagree_df.is_train].to_csv(os.path.join(output_dir, 'disagree_test.csv'))
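# Usage sketch: the output directory must already exist, since to_csv does not
# create intermediate directories.
#
#   os.makedirs('data/external/answer_mapping', exist_ok=True)
#   create_answer_mapping_csvs()
#
# This produces unmapped_train.csv, unmapped_test.csv, disagree_train.csv, and
# disagree_test.csv under the output directory.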
def create_report(self, directory: str, fold):
    """Compute accuracy/recall statistics for this guesser on the given fold
    and pickle them alongside the supporting dataframes."""
    with open(os.path.join(directory, 'guesser_params.pickle'), 'rb') as f:
        params = pickle.load(f)

    qdb = QantaDatabase()
    guesser_train = qdb.guess_train_questions
    questions_by_fold = qdb.by_fold()
    guesser_report_questions = questions_by_fold[fold]

    train_pages = {q.page for q in guesser_train}
    dev_pages = {q.page for q in guesser_report_questions}
    # Fraction of distinct answers in this fold that never occur in training
    unanswerable_answer_percent = len(dev_pages - train_pages) / len(dev_pages)

    answerable = 0
    for q in guesser_report_questions:
        if q.page in train_pages:
            answerable += 1
    # Fraction of questions whose answer never occurs in training
    unanswerable_question_percent = 1 - answerable / len(guesser_report_questions)

    train_example_counts = Counter(q.page for q in guesser_train)

    dev_df = pd.DataFrame({
        'page': [q.page for q in guesser_report_questions],
        'qanta_id': [q.qanta_id for q in guesser_report_questions],
        'text_length': [len(q.text) for q in guesser_report_questions],
        'n_train': [train_example_counts[q.page] for q in guesser_report_questions],
        'category': [q.category for q in guesser_report_questions]
    })

    char_guess_df = AbstractGuesser.load_guesses(directory, folds=[fold], output_type='char')
    char_df = char_guess_df.merge(dev_df, on='qanta_id')
    char_df['correct'] = (char_df.guess == char_df.page).astype('int')
    # Series.clip_upper() was removed in pandas 1.0; clip(upper=...) is the
    # equivalent call
    char_df['char_percent'] = (char_df['char_index'] / char_df['text_length']).clip(upper=1.0)

    first_guess_df = AbstractGuesser.load_guesses(directory, folds=[fold], output_type='first')
    first_df = first_guess_df.merge(dev_df, on='qanta_id').sort_values('score', ascending=False)
    first_df['correct'] = (first_df.guess == first_df.page).astype('int')
    grouped_first_df = first_df.groupby('qanta_id')
    # Accuracy uses only the top-scoring guess per question; recall counts a
    # question as covered if any of its guesses is correct
    first_accuracy = grouped_first_df.nth(0).correct.mean()
    first_recall = grouped_first_df.agg({'correct': 'max'}).correct.mean()

    full_guess_df = AbstractGuesser.load_guesses(directory, folds=[fold], output_type='full')
    full_df = full_guess_df.merge(dev_df, on='qanta_id').sort_values('score', ascending=False)
    full_df['correct'] = (full_df.guess == full_df.page).astype('int')
    grouped_full_df = full_df.groupby('qanta_id')
    full_accuracy = grouped_full_df.nth(0).correct.mean()
    full_recall = grouped_full_df.agg({'correct': 'max'}).correct.mean()

    with open(os.path.join(directory, f'guesser_report_{fold}.pickle'), 'wb') as f:
        pickle.dump({
            'first_accuracy': first_accuracy,
            'first_recall': first_recall,
            'full_accuracy': full_accuracy,
            'full_recall': full_recall,
            'char_df': char_df,
            'first_df': first_df,
            'full_df': full_df,
            'n_guesses': conf['n_guesses'],
            'unanswerable_answer_percent': unanswerable_answer_percent,
            'unanswerable_question_percent': unanswerable_question_percent,
            'guesser_name': self.display_name(),
            'guesser_params': params
        }, f)
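# A minimal sketch for reading the report back (hypothetical helper; the path
# mirrors the write above):
def load_guesser_report(directory, fold):
    with open(os.path.join(directory, f'guesser_report_{fold}.pickle'), 'rb') as f:
        return pickle.load(f)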