Пример #1
0
def ingestion_cli(start_idx):
    """
    Convert a QuizDB tossup dump into qanta-format questions, map their answers to pages,
    and write the resulting dataset and reports under data/external/high_school_project/.

    Input format is for jason's HS project, but can be changed. The original code for answer
    mapping was designed to map everything over multiple passes, not to yield a callable that
    maps an arbitrary answer line to a QB answer. Rather than implement that, a hacky way to get
    similar functionality for a new dataset is to combine the already preprocessed questions
    with the new questions, build the answer map for both at the same time, and then only use
    the mappings for the new questions. There are some edge cases, but this should in general
    work (hopefully).

    start_idx is the qanta_id given to the first new question; each following question gets
    the next consecutive id.
    """
    with open(QANTA_PREPROCESSED_DATASET_PATH) as f:
        unmapped_questions = json.load(f)['questions']

    with open('data/external/high_school_project/quizdb-20190313164802.json') as f:
        raw_questions = json.load(f)['data']['tossups']

    # Page/category fields start empty; the page is filled in by the mapping step below.
    new_questions = [
        {
            'qanta_id': start_idx + offset,
            'text': q['text'],
            'answer': q['answer'],
            'page': None,
            'category': None,
            'subcategory': None,
            'tournament': q['tournament']['name'],
            'difficulty': q['tournament']['difficulty'],
            'year': int(q['tournament']['year']),
            'proto_id': None,
            'qdb_id': q['id'],
            'dataset': 'quizdb.org',
            'fold': 'guesstest'
        }
        for offset, q in enumerate(raw_questions)
    ]

    # The answer map is built over old + new questions together (see docstring).
    questions = unmapped_questions + new_questions
    answer_map, amb_answer_map, unbound_answers, report = create_answer_map(questions)
    with safe_open('data/external/high_school_project/automatic_report.json', 'w') as f:
        json.dump(report, f)

    write_answer_map(
        answer_map, amb_answer_map, unbound_answers,
        'data/external/high_school_project/answer_map.json',
        'data/external/high_school_project/unbound_answers.json'
    )
    with open('data/internal/page_assignment/unmappable.yaml') as f:
        # safe_load instead of a bare yaml.load(f): PyYAML >= 6 requires an explicit Loader,
        # and older releases could construct arbitrary Python objects from the file.
        unmappable = yaml.safe_load(f)

    # Only the new questions are mapped; the old ones were needed just for the answer map.
    page_assigner = PageAssigner()
    mapping_report = unmapped_to_mapped_questions(
        new_questions,
        answer_map, amb_answer_map,
        unmappable, page_assigner
    )

    # NOTE(review): presumably adds sentence data to each question in place -- confirm.
    add_sentences_(new_questions)
    with open('data/external/high_school_project/qanta.acf-regionals-2018.json', 'w') as f:
        json.dump(format_qanta_json(new_questions, DS_VERSION), f)

    with open('data/external/high_school_project/mapping_report.json', 'w') as f:
        json.dump(mapping_report, f)
Пример #2
0
def main():
    """Run check_page on every page referenced by the PageAssigner mappings."""
    wiki_titles = read_wiki_titles()
    page_assigner = PageAssigner()

    # Mappings whose values are page names directly, checked in this order.
    flat_mappings = (
        ('Checking direct protobowl mappings...', 'protobowl_direct'),
        ('Checking direct quizdb mappings...', 'quizdb_direct'),
        ('Checking unambiguous mappings...', 'unambiguous'),
    )
    for message, attribute in flat_mappings:
        log.info(message)
        for page in getattr(page_assigner, attribute).values():
            check_page(page, wiki_titles)

    # Ambiguous mappings hold a collection of options per answer, each carrying a 'page' key.
    log.info('Checking ambiguous mappings...')
    for options in page_assigner.ambiguous.values():
        for candidate in options:
            check_page(candidate['page'], wiki_titles)
Пример #3
0
    def run(self):
        """
        Assign pages to the folded qanta questions and write the mapped dataset
        (QANTA_MAPPED_DATASET_PATH) and the mapping report (QANTA_MAP_REPORT_PATH).
        """
        with open(ANSWER_MAP_PATH) as f:
            content = json.load(f)
        answer_map = content['answer_map']
        ambig_answer_map = content['ambig_answer_map']

        with open(QANTA_FOLDED_DATASET_PATH) as f:
            qanta_questions = json.load(f)['questions']

        with open('data/internal/page_assignment/unmappable.yaml') as f:
            # safe_load instead of a bare yaml.load(f): PyYAML >= 6 requires an explicit
            # Loader, and older releases could construct arbitrary Python objects.
            unmappable = yaml.safe_load(f)

        # The questions are updated in place; the return value is only the report.
        page_assigner = PageAssigner()
        mapping_report = unmapped_to_mapped_questions(qanta_questions,
                                                      answer_map,
                                                      ambig_answer_map,
                                                      unmappable,
                                                      page_assigner)

        with open(QANTA_MAPPED_DATASET_PATH, 'w') as f:
            json.dump(format_qanta_json(qanta_questions, DS_VERSION), f)

        with open(QANTA_MAP_REPORT_PATH, 'w') as f:
            json.dump(mapping_report, f)
Пример #4
0
def unmapped_to_mapped_questions(
    unmapped_qanta_questions,
    answer_map,
    ambig_answer_map,
    unmappable,
    page_assigner: PageAssigner,
):
    """
    Assign a page to each question in place and report how every decision was made.

    Each question gets an annotated candidate (``page_assigner.maybe_assign``) and an
    automatic candidate (``answer_map``, falling back to a keyword match through
    ``ambig_answer_map``). If both exist and disagree, the annotated page is used. Questions
    whose proto/quizdb id is listed in ``unmappable``, or that have no candidate at all, keep
    their current ``"page"`` and are collected as unmatched.

    Returns a dict with ``"train_unmatched"`` and ``"test_unmatched"`` (lists of questions)
    and ``"match_report"`` (one entry per ``qanta_id``).
    """
    skip_proto_ids = set(unmappable["proto"])
    skip_qdb_ids = set(unmappable["quizdb"])
    unmatched_train = []
    unmatched_test = []
    report_by_id = {}

    for q in unmapped_qanta_questions:
        answer = q["answer"]
        qanta_id = int(q["qanta_id"])
        proto_id = q["proto_id"]
        qdb_id = q["qdb_id"]
        fold = q["fold"]

        if proto_id in skip_proto_ids or qdb_id in skip_qdb_ids:
            # Explicitly unmappable: no mapping is consulted, the question ends up unmatched.
            annotated_page = annotated_error = None
            automatic_page = None
            automatic_error = "Unmappable answer"
        else:
            annotated_page, annotated_error = page_assigner.maybe_assign(
                answer=answer, question_text=q["text"], qdb_id=qdb_id, proto_id=proto_id
            )
            automatic_page = answer_map.get(answer)

            # Keyword disambiguation: usable only if exactly one option's keyword is in the text.
            # NOTE(review): keyword_error is built here but never added to the report.
            keyword_page = None
            keyword_error = None
            if answer in ambig_answer_map:
                tokens = set(q["text"].lower().split())
                for candidate, keyword in ambig_answer_map[answer]:
                    if keyword not in tokens:
                        continue
                    if keyword_page is None and keyword_error is None:
                        keyword_page = candidate
                    elif keyword_error is None:
                        keyword_page = None
                        keyword_error = "Ambig Matches: " + candidate
                    else:
                        keyword_page = None
                        keyword_error += " " + candidate

            automatic_error = None
            if automatic_page is None:
                if keyword_page is None:
                    # Neither the answer map nor the keyword match produced a page.
                    automatic_error = "No match"
                else:
                    # Safe case: the keyword match fills in for a missing direct match.
                    automatic_page = keyword_page
            elif keyword_page is not None and automatic_page != keyword_page:
                # Keep the direct match, but record that the keyword match disagreed.
                automatic_error = f"Ambiguity Warning: {automatic_page} {keyword_page}"

        # Reconcile the annotated and automatic candidates.
        if annotated_page is None and automatic_page is None:
            outcome, chosen_page = "none", None
        elif automatic_page is None:
            outcome, chosen_page = "annotated", annotated_page
        elif annotated_page is None:
            outcome, chosen_page = "automatic", automatic_page
        elif annotated_page == automatic_page:
            outcome, chosen_page = "annotated+automatic", automatic_page
        else:
            outcome, chosen_page = "disagree", annotated_page

        report_by_id[qanta_id] = {
            "result": outcome,
            "annotated_error": annotated_error,
            "automatic_error": automatic_error,
            "annotated_page": annotated_page,
            "automatic_page": automatic_page,
        }

        if outcome == "none":
            if fold == GUESSER_TRAIN_FOLD or fold == BUZZER_TRAIN_FOLD:
                unmatched_train.append(q)
            else:
                unmatched_test.append(q)
        else:
            q["page"] = chosen_page

    return {
        "train_unmatched": unmatched_train,
        "test_unmatched": unmatched_test,
        "match_report": report_by_id,
    }
Пример #5
0
def ingestion_cli(start_idx):
    """
    Convert a QuizDB tossup dump into qanta-format questions, map their answers to pages,
    and write the resulting dataset and reports under data/external/high_school_project/.

    Input format is for jason's HS project, but can be changed. The original code for answer
    mapping was designed to map everything over multiple passes, not to yield a callable that
    maps an arbitrary answer line to a QB answer. Rather than implement that, a hacky way to get
    similar functionality for a new dataset is to combine the already preprocessed questions
    with the new questions, build the answer map for both at the same time, and then only use
    the mappings for the new questions. There are some edge cases, but this should in general
    work (hopefully).

    start_idx is the qanta_id given to the first new question; each following question gets
    the next consecutive id.
    """
    with open(QANTA_PREPROCESSED_DATASET_PATH) as f:
        unmapped_questions = json.load(f)["questions"]

    with open("data/external/high_school_project/quizdb-20190313164802.json"
              ) as f:
        raw_questions = json.load(f)["data"]["tossups"]

    # Page/category fields start empty; the page is filled in by the mapping step below.
    new_questions = [
        {
            "qanta_id": start_idx + offset,
            "text": q["text"],
            "answer": q["answer"],
            "page": None,
            "category": None,
            "subcategory": None,
            "tournament": q["tournament"]["name"],
            "difficulty": q["tournament"]["difficulty"],
            "year": int(q["tournament"]["year"]),
            "proto_id": None,
            "qdb_id": q["id"],
            "dataset": "quizdb.org",
            "fold": "guesstest",
        }
        for offset, q in enumerate(raw_questions)
    ]

    # The answer map is built over old + new questions together (see docstring).
    questions = unmapped_questions + new_questions
    answer_map, amb_answer_map, unbound_answers, report = create_answer_map(
        questions)
    with safe_open("data/external/high_school_project/automatic_report.json",
                   "w") as f:
        json.dump(report, f)

    write_answer_map(
        answer_map,
        amb_answer_map,
        unbound_answers,
        "data/external/high_school_project/answer_map.json",
        "data/external/high_school_project/unbound_answers.json",
    )
    with open("data/internal/page_assignment/unmappable.yaml") as f:
        # safe_load instead of a bare yaml.load(f): PyYAML >= 6 requires an explicit Loader,
        # and older releases could construct arbitrary Python objects from the file.
        unmappable = yaml.safe_load(f)

    # Only the new questions are mapped; the old ones were needed just for the answer map.
    page_assigner = PageAssigner()
    mapping_report = unmapped_to_mapped_questions(new_questions, answer_map,
                                                  amb_answer_map, unmappable,
                                                  page_assigner)

    # NOTE(review): presumably adds sentence data to each question in place -- confirm.
    add_sentences_(new_questions)
    with open(
            "data/external/high_school_project/qanta.acf-regionals-2018.json",
            "w") as f:
        json.dump(format_qanta_json(new_questions, DS_VERSION), f)

    with open("data/external/high_school_project/mapping_report.json",
              "w") as f:
        json.dump(mapping_report, f)
Пример #6
0
def unmapped_to_mapped_questions(unmapped_qanta_questions, answer_map,
                                 ambig_answer_map, unmappable,
                                 page_assigner: PageAssigner):
    """
    Set q['page'] on every question that can be mapped and describe each decision.

    An annotated page comes from page_assigner.maybe_assign; an automatic page comes from
    answer_map or, failing that, from a unique keyword hit in ambig_answer_map. When both are
    present and differ, the annotated page is used. Questions listed in unmappable, or with no
    candidate page at all, are left as they are and returned as unmatched.

    Returns a dict with 'train_unmatched', 'test_unmatched' and 'match_report'.
    """
    blocked_proto = set(unmappable['proto'])
    blocked_qdb = set(unmappable['quizdb'])
    train_leftovers = []
    test_leftovers = []
    outcomes = {}

    def record(qid, result, ann_error, auto_error, ann_page, auto_page):
        # One report entry per qanta_id, always with the same five keys.
        outcomes[qid] = {
            'result': result,
            'annotated_error': ann_error,
            'automatic_error': auto_error,
            'annotated_page': ann_page,
            'automatic_page': auto_page
        }

    def leave_unmatched(question, question_fold):
        # The two training folds go to the train bucket; every other fold goes to test.
        if question_fold == GUESSER_TRAIN_FOLD or question_fold == BUZZER_TRAIN_FOLD:
            train_leftovers.append(question)
        else:
            test_leftovers.append(question)

    for q in unmapped_qanta_questions:
        answer = q['answer']
        qanta_id = int(q['qanta_id'])
        proto_id = q['proto_id']
        qdb_id = q['qdb_id']
        fold = q['fold']

        # Ids flagged as unmappable skip every mapping source.
        if proto_id in blocked_proto or qdb_id in blocked_qdb:
            record(qanta_id, 'none', None, 'Unmappable answer', None, None)
            leave_unmatched(q, fold)
            continue

        annotated_page, annotated_error = page_assigner.maybe_assign(
            answer=answer,
            question_text=q['text'],
            qdb_id=qdb_id,
            proto_id=proto_id)
        automatic_page = answer_map.get(answer)

        # Keyword disambiguation: a page is kept only when a single option matches.
        # NOTE(review): hint_error is built here but never makes it into the report.
        hint_page = None
        hint_error = None
        if answer in ambig_answer_map:
            text_words = set(q['text'].lower().split())
            for option_page, keyword in ambig_answer_map[answer]:
                if keyword not in text_words:
                    continue
                if hint_page is None and hint_error is None:
                    hint_page = option_page
                elif hint_error is None:
                    hint_page = None
                    hint_error = 'Ambig Matches: ' + option_page
                else:
                    hint_page = None
                    hint_error += ' ' + option_page

        automatic_error = None
        if automatic_page is None and hint_page is None:
            # Nothing automatic to offer.
            automatic_error = 'No match'
        elif automatic_page is None:
            # Safe case: use the keyword hit since there is no direct match to contradict.
            automatic_page = hint_page
        elif hint_page is not None and automatic_page != hint_page:
            # The direct match wins, but the conflict is reported as a warning.
            automatic_error = f'Ambiguity Warning: {automatic_page} {hint_page}'

        if annotated_page is None and automatic_page is None:
            record(qanta_id, 'none', annotated_error, automatic_error,
                   annotated_page, automatic_page)
            leave_unmatched(q, fold)
            continue

        if automatic_page is None:
            result, page = 'annotated', annotated_page
        elif annotated_page is None:
            result, page = 'automatic', automatic_page
        elif annotated_page == automatic_page:
            result, page = 'annotated+automatic', automatic_page
        else:
            result, page = 'disagree', annotated_page

        q['page'] = page
        record(qanta_id, result, annotated_error, automatic_error,
               annotated_page, automatic_page)

    return {
        'train_unmatched': train_leftovers,
        'test_unmatched': test_leftovers,
        'match_report': outcomes
    }
Пример #7
0
def unmapped_to_mapped_questions(unmapped_qanta_questions, answer_map, ambig_answer_map,
                                 unmappable, page_assigner: PageAssigner):
    """
    Fill in q['page'] for each mappable question and build a per-question match report.

    Two sources are reconciled: the annotated page from page_assigner.maybe_assign and the
    automatic page from answer_map (or, when that has no entry, a unique keyword match from
    ambig_answer_map). The annotated page is used when the two disagree. Questions flagged in
    unmappable, or without any candidate page, are not modified and are returned as unmatched.

    Returns a dict with 'train_unmatched', 'test_unmatched' and 'match_report'.
    """
    unmappable_proto_ids = set(unmappable['proto'])
    unmappable_qdb_ids = set(unmappable['quizdb'])
    still_unmatched_train = []
    still_unmatched_test = []
    report = {}

    for q in unmapped_qanta_questions:
        answer = q['answer']
        qanta_id = int(q['qanta_id'])
        proto_id = q['proto_id']
        qdb_id = q['qdb_id']
        fold = q['fold']

        if proto_id in unmappable_proto_ids or qdb_id in unmappable_qdb_ids:
            # Known-unmappable ids bypass every mapping source.
            annotated_page, annotated_error = None, None
            automatic_page, automatic_error = None, 'Unmappable answer'
        else:
            annotated_page, annotated_error = page_assigner.maybe_assign(
                answer=answer, question_text=q['text'], qdb_id=qdb_id, proto_id=proto_id
            )
            automatic_page = answer_map.get(answer)

            # Try to disambiguate by keyword; more than one matching option cancels the match.
            # NOTE(review): disambiguation_error is built but never added to the report.
            disambiguated_page = None
            disambiguation_error = None
            if answer in ambig_answer_map:
                lowered_words = set(q['text'].lower().split())
                for option, keyword in ambig_answer_map[answer]:
                    if keyword not in lowered_words:
                        continue
                    if disambiguated_page is None and disambiguation_error is None:
                        disambiguated_page = option
                        continue
                    disambiguated_page = None
                    if disambiguation_error is None:
                        disambiguation_error = 'Ambig Matches: ' + option
                    else:
                        disambiguation_error += ' ' + option

            automatic_error = None
            if automatic_page is not None:
                if disambiguated_page is not None and automatic_page != disambiguated_page:
                    # Use automatic_page, but emit a warning.
                    automatic_error = (
                        f'Ambiguity Warning: {automatic_page} {disambiguated_page}'
                    )
            elif disambiguated_page is not None:
                # Safe case: there is no direct match, so the keyword match is adopted.
                automatic_page = disambiguated_page
            else:
                # Both are None, so there is no automatic match.
                automatic_error = 'No match'

        has_annotated = annotated_page is not None
        has_automatic = automatic_page is not None
        if has_annotated and has_automatic:
            if annotated_page == automatic_page:
                result, assigned_page = 'annotated+automatic', automatic_page
            else:
                result, assigned_page = 'disagree', annotated_page
        elif has_annotated:
            result, assigned_page = 'annotated', annotated_page
        elif has_automatic:
            result, assigned_page = 'automatic', automatic_page
        else:
            result, assigned_page = 'none', None

        if has_annotated or has_automatic:
            q['page'] = assigned_page
        elif fold == GUESSER_TRAIN_FOLD or fold == BUZZER_TRAIN_FOLD:
            still_unmatched_train.append(q)
        else:
            still_unmatched_test.append(q)

        report[qanta_id] = {
            'result': result,
            'annotated_error': annotated_error,
            'automatic_error': automatic_error,
            'annotated_page': annotated_page,
            'automatic_page': automatic_page
        }

    return {
        'train_unmatched': still_unmatched_train,
        'test_unmatched': still_unmatched_test,
        'match_report': report
    }