def ingestion_cli(start_idx):
    """
    Ingest a QuizDB tossup dump and assign pages to its questions.

    The input format is for Jason's HS project, but can be changed.

    The original answer-mapping code was designed to map everything over
    multiple passes; it does not yield a callable that maps an arbitrary
    answer line to a QB answer. Rather than implement that, this uses a
    hack: combine the existing preprocessed questions with the new questions,
    let the code build the answer map for both at the same time, then apply
    the mapping to the new questions only. There are some edge cases, but
    this should work in general (hopefully).

    Args:
        start_idx: Integer ``qanta_id`` given to the first new question;
            each following question gets the next integer.

    Side effects:
        Writes ``automatic_report.json``, the answer-map files (through
        ``write_answer_map``), ``qanta.acf-regionals-2018.json`` and
        ``mapping_report.json`` under ``data/external/high_school_project/``.
    """
    with open(QANTA_PREPROCESSED_DATASET_PATH) as f:
        unmapped_questions = json.load(f)['questions']

    with open('data/external/high_school_project/quizdb-20190313164802.json') as f:
        raw_questions = json.load(f)['data']['tossups']

    # Build the new question records; ids are consecutive from start_idx.
    new_questions = [
        {
            'qanta_id': idx,
            'text': q['text'],
            'answer': q['answer'],
            'page': None,
            'category': None,
            'subcategory': None,
            'tournament': q['tournament']['name'],
            'difficulty': q['tournament']['difficulty'],
            'year': int(q['tournament']['year']),
            'proto_id': None,
            'qdb_id': q['id'],
            'dataset': 'quizdb.org',
            'fold': 'guesstest'
        }
        for idx, q in enumerate(raw_questions, start=start_idx)
    ]

    # The answer map is built over old + new questions together (see the
    # docstring), but only the new questions are mapped further down.
    questions = unmapped_questions + new_questions
    answer_map, amb_answer_map, unbound_answers, report = create_answer_map(questions)
    with safe_open('data/external/high_school_project/automatic_report.json', 'w') as f:
        json.dump(report, f)

    write_answer_map(
        answer_map, amb_answer_map, unbound_answers,
        'data/external/high_school_project/answer_map.json',
        'data/external/high_school_project/unbound_answers.json'
    )

    with open('data/internal/page_assignment/unmappable.yaml') as f:
        # yaml.load() without an explicit Loader is rejected by PyYAML >= 6;
        # safe_load parses plain YAML on every PyYAML version.
        unmappable = yaml.safe_load(f)

    page_assigner = PageAssigner()
    mapping_report = unmapped_to_mapped_questions(
        new_questions,
        answer_map, amb_answer_map,
        unmappable, page_assigner
    )
    add_sentences_(new_questions)

    with open('data/external/high_school_project/qanta.acf-regionals-2018.json', 'w') as f:
        json.dump(format_qanta_json(new_questions, DS_VERSION), f)

    with open('data/external/high_school_project/mapping_report.json', 'w') as f:
        json.dump(mapping_report, f)
def main():
    """Run ``check_page`` on every page referenced by a ``PageAssigner``.

    The titles returned by ``read_wiki_titles()`` are passed to each
    ``check_page`` call. The direct protobowl, direct quizdb and unambiguous
    mappings are checked value by value; each ambiguous entry is a collection
    of options whose ``'page'`` field is checked.
    """
    titles = read_wiki_titles()
    assigner = PageAssigner()

    # (log message, PageAssigner attribute) for the mappings whose values
    # are the pages themselves. Attributes are looked up lazily so each log
    # line is emitted before its mapping is touched.
    flat_sections = (
        ('Checking direct protobowl mappings...', 'protobowl_direct'),
        ('Checking direct quizdb mappings...', 'quizdb_direct'),
        ('Checking unambiguous mappings...', 'unambiguous'),
    )
    for message, attr_name in flat_sections:
        log.info(message)
        for mapped_page in getattr(assigner, attr_name).values():
            check_page(mapped_page, titles)

    log.info('Checking ambiguous mappings...')
    for candidates in assigner.ambiguous.values():
        for candidate in candidates:
            check_page(candidate['page'], titles)
def run(self):
    """Map the folded QANTA questions to pages and write the results.

    Loads the answer maps from ``ANSWER_MAP_PATH``, the questions from
    ``QANTA_FOLDED_DATASET_PATH`` and the unmappable ids from
    ``data/internal/page_assignment/unmappable.yaml``, runs
    ``unmapped_to_mapped_questions`` (which updates the questions in place),
    then writes the formatted questions to ``QANTA_MAPPED_DATASET_PATH`` and
    the mapping report to ``QANTA_MAP_REPORT_PATH``.
    """
    with open(ANSWER_MAP_PATH) as f:
        content = json.load(f)
    answer_map = content['answer_map']
    ambig_answer_map = content['ambig_answer_map']

    with open(QANTA_FOLDED_DATASET_PATH) as f:
        qanta_questions = json.load(f)['questions']

    with open('data/internal/page_assignment/unmappable.yaml') as f:
        # yaml.load() without an explicit Loader is rejected by PyYAML >= 6;
        # safe_load parses plain YAML on every PyYAML version.
        unmappable = yaml.safe_load(f)

    page_assigner = PageAssigner()
    mapping_report = unmapped_to_mapped_questions(
        qanta_questions, answer_map, ambig_answer_map, unmappable, page_assigner
    )

    with open(QANTA_MAPPED_DATASET_PATH, 'w') as f:
        json.dump(format_qanta_json(qanta_questions, DS_VERSION), f)

    with open(QANTA_MAP_REPORT_PATH, 'w') as f:
        json.dump(mapping_report, f)
def unmapped_to_mapped_questions(
    unmapped_qanta_questions,
    answer_map,
    ambig_answer_map,
    unmappable,
    page_assigner: PageAssigner,
):
    """Assign a page to each question from annotated and automatic matches.

    For every question, the annotated match comes from
    ``page_assigner.maybe_assign`` and the automatic match from
    ``answer_map``, falling back to ``ambig_answer_map``. The chosen page is
    written into ``q["page"]`` in place; when the two sources disagree the
    annotated page wins.

    Args:
        unmapped_qanta_questions: Iterable of question dicts with the keys
            ``answer``, ``qanta_id``, ``proto_id``, ``qdb_id``, ``fold`` and
            ``text``. Mutated in place.
        answer_map: Mapping from answer string to page.
        ambig_answer_map: Mapping from answer string to ``(page, keyword)``
            pairs. A page is used only when its keyword is among the
            lowercased whitespace tokens of the question text and no other
            keyword for that answer matches.
        unmappable: Mapping with ``"proto"`` and ``"quizdb"`` collections of
            ids that must not be mapped.
        page_assigner: Object whose ``maybe_assign`` returns
            ``(page, error)``.

    Returns:
        Dict with ``"train_unmatched"`` and ``"test_unmatched"`` (questions
        left without a page, split by fold) and ``"match_report"`` (per
        ``qanta_id`` outcome details).
    """
    blocked_proto_ids = set(unmappable["proto"])
    blocked_qdb_ids = set(unmappable["quizdb"])
    unmatched_train = []
    unmatched_test = []
    report_by_id = {}

    def file_unmatched(question, fold):
        # Unmatched questions are split by whether their fold is a train fold.
        if fold == GUESSER_TRAIN_FOLD or fold == BUZZER_TRAIN_FOLD:
            unmatched_train.append(question)
        else:
            unmatched_test.append(question)

    for question in unmapped_qanta_questions:
        answer = question["answer"]
        qanta_id = int(question["qanta_id"])
        proto_id = question["proto_id"]
        qdb_id = question["qdb_id"]
        fold = question["fold"]

        if proto_id in blocked_proto_ids or qdb_id in blocked_qdb_ids:
            report_by_id[qanta_id] = {
                "result": "none",
                "annotated_error": None,
                "automatic_error": "Unmappable answer",
                "annotated_page": None,
                "automatic_page": None,
            }
            file_unmatched(question, fold)
            continue

        annotated_page, annotated_error = page_assigner.maybe_assign(
            answer=answer,
            question_text=question["text"],
            qdb_id=qdb_id,
            proto_id=proto_id,
        )
        automatic_page = answer_map.get(answer)

        # Keyword disambiguation: the page survives only while exactly one
        # keyword has matched; any further match clears it.
        ambig_error = None
        ambig_page = None
        if answer in ambig_answer_map:
            tokens = set(question["text"].lower().split())
            for candidate_page, keyword in ambig_answer_map[answer]:
                if keyword not in tokens:
                    continue
                if ambig_page is None and ambig_error is None:
                    ambig_page = candidate_page
                elif ambig_error is None:
                    ambig_page = None
                    ambig_error = "Ambig Matches: " + candidate_page
                else:
                    ambig_page = None
                    ambig_error += " " + candidate_page

        # Merge the direct and keyword-based automatic matches.
        automatic_error = None
        if ambig_page is None:
            if automatic_page is None:
                # Neither source produced a page.
                automatic_error = "No match"
        elif automatic_page is None:
            # Safe case: fill in a match where there was none.
            automatic_page = ambig_page
        elif automatic_page != ambig_page:
            # Keep automatic_page, but emit a warning.
            automatic_error = (
                f"Ambiguity Warning: {automatic_page} {ambig_page}"
            )

        # Decide the outcome label and which page (if any) to store.
        if annotated_page is None and automatic_page is None:
            outcome = "none"
        elif automatic_page is None:
            outcome = "annotated"
            question["page"] = annotated_page
        elif annotated_page is None:
            outcome = "automatic"
            question["page"] = automatic_page
        elif annotated_page == automatic_page:
            outcome = "annotated+automatic"
            question["page"] = automatic_page
        else:
            outcome = "disagree"
            question["page"] = annotated_page

        report_by_id[qanta_id] = {
            "result": outcome,
            "annotated_error": annotated_error,
            "automatic_error": automatic_error,
            "annotated_page": annotated_page,
            "automatic_page": automatic_page,
        }
        if outcome == "none":
            file_unmatched(question, fold)

    return {
        "train_unmatched": unmatched_train,
        "test_unmatched": unmatched_test,
        "match_report": report_by_id,
    }
def ingestion_cli(start_idx):
    """
    Ingest a QuizDB tossup dump and assign pages to its questions.

    The input format is for Jason's HS project, but can be changed.

    The original answer-mapping code was designed to map everything over
    multiple passes; it does not yield a callable that maps an arbitrary
    answer line to a QB answer. Rather than implement that, this uses a
    hack: combine the existing preprocessed questions with the new questions,
    let the code build the answer map for both at the same time, then apply
    the mapping to the new questions only. There are some edge cases, but
    this should work in general (hopefully).

    Args:
        start_idx: Integer ``qanta_id`` given to the first new question;
            each following question gets the next integer.

    Side effects:
        Writes ``automatic_report.json``, the answer-map files (through
        ``write_answer_map``), ``qanta.acf-regionals-2018.json`` and
        ``mapping_report.json`` under ``data/external/high_school_project/``.
    """
    with open(QANTA_PREPROCESSED_DATASET_PATH) as f:
        unmapped_questions = json.load(f)["questions"]

    with open("data/external/high_school_project/quizdb-20190313164802.json") as f:
        raw_questions = json.load(f)["data"]["tossups"]

    # Build the new question records; ids are consecutive from start_idx.
    new_questions = [
        {
            "qanta_id": idx,
            "text": q["text"],
            "answer": q["answer"],
            "page": None,
            "category": None,
            "subcategory": None,
            "tournament": q["tournament"]["name"],
            "difficulty": q["tournament"]["difficulty"],
            "year": int(q["tournament"]["year"]),
            "proto_id": None,
            "qdb_id": q["id"],
            "dataset": "quizdb.org",
            "fold": "guesstest",
        }
        for idx, q in enumerate(raw_questions, start=start_idx)
    ]

    # The answer map is built over old + new questions together (see the
    # docstring), but only the new questions are mapped further down.
    questions = unmapped_questions + new_questions
    answer_map, amb_answer_map, unbound_answers, report = create_answer_map(
        questions)
    with safe_open("data/external/high_school_project/automatic_report.json",
                   "w") as f:
        json.dump(report, f)

    write_answer_map(
        answer_map,
        amb_answer_map,
        unbound_answers,
        "data/external/high_school_project/answer_map.json",
        "data/external/high_school_project/unbound_answers.json",
    )

    with open("data/internal/page_assignment/unmappable.yaml") as f:
        # yaml.load() without an explicit Loader is rejected by PyYAML >= 6;
        # safe_load parses plain YAML on every PyYAML version.
        unmappable = yaml.safe_load(f)

    page_assigner = PageAssigner()
    mapping_report = unmapped_to_mapped_questions(new_questions, answer_map,
                                                  amb_answer_map, unmappable,
                                                  page_assigner)
    add_sentences_(new_questions)

    with open(
            "data/external/high_school_project/qanta.acf-regionals-2018.json",
            "w") as f:
        json.dump(format_qanta_json(new_questions, DS_VERSION), f)

    with open("data/external/high_school_project/mapping_report.json",
              "w") as f:
        json.dump(mapping_report, f)
def unmapped_to_mapped_questions(unmapped_qanta_questions, answer_map, ambig_answer_map,
                                 unmappable, page_assigner: PageAssigner):
    """Assign a page to each question from annotated and automatic matches.

    The annotated match comes from ``page_assigner.maybe_assign``; the
    automatic match comes from ``answer_map``, with ``ambig_answer_map``
    (answer -> ``(page, keyword)`` pairs) used as a keyword-based fallback.
    The selected page is stored in ``q['page']`` in place, and the annotated
    page takes precedence when the two sources disagree.

    Args:
        unmapped_qanta_questions: Iterable of question dicts providing
            ``answer``, ``qanta_id``, ``proto_id``, ``qdb_id``, ``fold`` and
            ``text``. Mutated in place.
        answer_map: Mapping from answer string to page.
        ambig_answer_map: Mapping from answer string to ``(page, keyword)``
            pairs.
        unmappable: Mapping with ``'proto'`` and ``'quizdb'`` collections of
            ids to leave unmapped.
        page_assigner: Object whose ``maybe_assign`` returns
            ``(page, error)``.

    Returns:
        Dict with ``'train_unmatched'``, ``'test_unmatched'`` and
        ``'match_report'`` (keyed by integer ``qanta_id``).
    """
    skip_proto = set(unmappable['proto'])
    skip_qdb = set(unmappable['quizdb'])
    train_leftovers = []
    test_leftovers = []
    outcomes = {}

    def make_entry(result, annotated_error, automatic_error, annotated_page, automatic_page):
        # Every report entry has the same five keys in this order.
        return {
            'result': result,
            'annotated_error': annotated_error,
            'automatic_error': automatic_error,
            'annotated_page': annotated_page,
            'automatic_page': automatic_page
        }

    def leftovers_for(fold):
        # Train folds go to one list, everything else to the other.
        if fold == GUESSER_TRAIN_FOLD or fold == BUZZER_TRAIN_FOLD:
            return train_leftovers
        return test_leftovers

    for item in unmapped_qanta_questions:
        answer = item['answer']
        qanta_id = int(item['qanta_id'])
        proto_id = item['proto_id']
        qdb_id = item['qdb_id']
        fold = item['fold']

        if proto_id in skip_proto or qdb_id in skip_qdb:
            outcomes[qanta_id] = make_entry('none', None, 'Unmappable answer', None, None)
            leftovers_for(fold).append(item)
            continue

        annotated_page, annotated_error = page_assigner.maybe_assign(
            answer=answer, question_text=item['text'],
            qdb_id=qdb_id, proto_id=proto_id
        )
        automatic_page = answer_map[answer] if answer in answer_map else None

        # Keyword fallback: keep a page only if exactly one keyword matches.
        keyword_page = None
        keyword_error = None
        if answer in ambig_answer_map:
            text_words = set(item['text'].lower().split())
            for page, keyword in ambig_answer_map[answer]:
                if keyword in text_words:
                    first_hit = keyword_page is None and keyword_error is None
                    if first_hit:
                        keyword_page = page
                    else:
                        keyword_page = None
                        if keyword_error is None:
                            keyword_error = 'Ambig Matches: ' + page
                        else:
                            keyword_error += ' ' + page

        have_automatic = automatic_page is not None
        have_keyword = keyword_page is not None
        automatic_error = None
        if have_automatic and have_keyword:
            if automatic_page != keyword_page:
                # Use automatic_page, but emit a warning
                automatic_error = f'Ambiguity Warning: {automatic_page} {keyword_page}'
        elif have_keyword:
            # Safe case: attempt a match when there isn't one already
            automatic_page = keyword_page
        elif not have_automatic:
            # Both are None, so there is no automatic match
            automatic_error = 'No match'

        has_annotated = annotated_page is not None
        has_automatic = automatic_page is not None
        if not has_annotated and not has_automatic:
            result = 'none'
        elif has_annotated and not has_automatic:
            result = 'annotated'
            item['page'] = annotated_page
        elif has_automatic and not has_annotated:
            result = 'automatic'
            item['page'] = automatic_page
        elif annotated_page == automatic_page:
            result = 'annotated+automatic'
            item['page'] = automatic_page
        else:
            result = 'disagree'
            item['page'] = annotated_page

        outcomes[qanta_id] = make_entry(
            result, annotated_error, automatic_error, annotated_page, automatic_page
        )
        if result == 'none':
            leftovers_for(fold).append(item)

    return {
        'train_unmatched': train_leftovers,
        'test_unmatched': test_leftovers,
        'match_report': outcomes
    }
def unmapped_to_mapped_questions(unmapped_qanta_questions, answer_map, ambig_answer_map,
                                 unmappable, page_assigner: PageAssigner):
    """Assign a page to each question from annotated and automatic matches.

    Each question gets an annotated candidate from
    ``page_assigner.maybe_assign`` and an automatic candidate from
    ``answer_map`` (or, failing that, from the keyword pairs in
    ``ambig_answer_map``). The winning page is written to ``q['page']`` in
    place; the annotated page is preferred when both exist and differ.

    Args:
        unmapped_qanta_questions: Iterable of question dicts providing
            ``answer``, ``qanta_id``, ``proto_id``, ``qdb_id``, ``fold`` and
            ``text``. Mutated in place.
        answer_map: Mapping from answer string to page.
        ambig_answer_map: Mapping from answer string to ``(page, keyword)``
            pairs.
        unmappable: Mapping with ``'proto'`` and ``'quizdb'`` collections of
            ids to leave unmapped.
        page_assigner: Object whose ``maybe_assign`` returns
            ``(page, error)``.

    Returns:
        Dict with ``'train_unmatched'``, ``'test_unmatched'`` and
        ``'match_report'`` (keyed by integer ``qanta_id``).
    """
    proto_blocklist = set(unmappable['proto'])
    qdb_blocklist = set(unmappable['quizdb'])
    train_misses = []
    test_misses = []
    report = {}

    def page_from_keywords(text, options):
        # Return the page of the single option whose keyword occurs in the
        # lowercased, whitespace-split text; None once a second one matches.
        tokens = set(text.lower().split())
        selected = None
        conflict = None
        for page, keyword in options:
            if keyword not in tokens:
                continue
            if selected is None and conflict is None:
                selected = page
                continue
            selected = None
            if conflict is None:
                conflict = 'Ambig Matches: ' + page
            else:
                conflict += ' ' + page
        return selected

    def remember_miss(entry, fold):
        # Misses are split by whether the fold is one of the train folds.
        if fold == GUESSER_TRAIN_FOLD or fold == BUZZER_TRAIN_FOLD:
            train_misses.append(entry)
        else:
            test_misses.append(entry)

    for entry in unmapped_qanta_questions:
        answer = entry['answer']
        qanta_id = int(entry['qanta_id'])
        proto_id = entry['proto_id']
        qdb_id = entry['qdb_id']
        fold = entry['fold']

        if proto_id in proto_blocklist or qdb_id in qdb_blocklist:
            report[qanta_id] = {
                'result': 'none',
                'annotated_error': None,
                'automatic_error': 'Unmappable answer',
                'annotated_page': None,
                'automatic_page': None
            }
            remember_miss(entry, fold)
            continue

        annotated_page, annotated_error = page_assigner.maybe_assign(
            answer=answer, question_text=entry['text'], qdb_id=qdb_id, proto_id=proto_id
        )

        automatic_page = None
        if answer in answer_map:
            automatic_page = answer_map[answer]

        keyword_page = None
        if answer in ambig_answer_map:
            keyword_page = page_from_keywords(entry['text'], ambig_answer_map[answer])

        automatic_error = None
        if keyword_page is not None:
            if automatic_page is None:
                # Safe case: attempt a match when there isn't one already
                automatic_page = keyword_page
            elif automatic_page != keyword_page:
                # Use automatic_page, but emit a warning
                automatic_error = f'Ambiguity Warning: {automatic_page} {keyword_page}'
        elif automatic_page is None:
            # Both are None, so there is no automatic match
            automatic_error = 'No match'

        # Pick the outcome label and the page to store (None means "leave
        # q['page'] untouched").
        if annotated_page is not None:
            if automatic_page is None:
                label = 'annotated'
                page_to_store = annotated_page
            elif annotated_page == automatic_page:
                label = 'annotated+automatic'
                page_to_store = automatic_page
            else:
                label = 'disagree'
                page_to_store = annotated_page
        elif automatic_page is not None:
            label = 'automatic'
            page_to_store = automatic_page
        else:
            label = 'none'
            page_to_store = None

        if label != 'none':
            entry['page'] = page_to_store

        report[qanta_id] = {
            'result': label,
            'annotated_error': annotated_error,
            'automatic_error': automatic_error,
            'annotated_page': annotated_page,
            'automatic_page': automatic_page
        }

        if label == 'none':
            remember_miss(entry, fold)

    return {
        'train_unmatched': train_misses,
        'test_unmatched': test_misses,
        'match_report': report
    }