예제 #1
0
파일: command.py 프로젝트: ymedhat95/qb
def ingestion_cli(start_idx):
    """
    Map a new QuizDB tossup dump onto QB answers and write the results to disk.

    The input format is the one used for Jason's high-school project, but it can be changed.
    The original answer-mapping code was designed to map everything over multiple passes, not
    to yield a callable that maps an arbitrary answer line to a QB answer. Rather than
    implement that, a hacky way to get similar functionality for a new dataset is to combine
    the existing questions with the new questions, have the code map answers for both at the
    same time, and then only use the mappings for the new questions. There are some edge
    cases, but this should work in general (hopefully).

    :param start_idx: ``qanta_id`` assigned to the first new question; every following
        question receives the next consecutive integer.
    """
    with open(QANTA_PREPROCESSED_DATASET_PATH) as f:
        unmapped_questions = json.load(f)['questions']

    with open('data/external/high_school_project/quizdb-20190313164802.json') as f:
        raw_questions = json.load(f)['data']['tossups']

    # Convert each raw tossup into a qanta-style question record; fields that are not
    # taken from the dump are set to None.
    new_questions = [
        {
            'qanta_id': qanta_id,
            'text': q['text'],
            'answer': q['answer'],
            'page': None,
            'category': None,
            'subcategory': None,
            'tournament': q['tournament']['name'],
            'difficulty': q['tournament']['difficulty'],
            'year': int(q['tournament']['year']),
            'proto_id': None,
            'qdb_id': q['id'],
            'dataset': 'quizdb.org',
            'fold': 'guesstest'
        }
        for qanta_id, q in enumerate(raw_questions, start=start_idx)
    ]

    # Build the answer map over the existing and the new questions together
    # (see the docstring for why both sets are combined).
    questions = unmapped_questions + new_questions
    answer_map, amb_answer_map, unbound_answers, report = create_answer_map(questions)
    with safe_open('data/external/high_school_project/automatic_report.json', 'w') as f:
        json.dump(report, f)

    write_answer_map(
        answer_map, amb_answer_map, unbound_answers,
        'data/external/high_school_project/answer_map.json',
        'data/external/high_school_project/unbound_answers.json'
    )
    with open('data/internal/page_assignment/unmappable.yaml') as f:
        # yaml.load() without an explicit Loader is deprecated since PyYAML 5.1 and raises
        # TypeError on PyYAML >= 6; safe_load also refuses to build arbitrary Python objects.
        # NOTE(review): assumes unmappable.yaml only holds plain YAML types -- confirm.
        unmappable = yaml.safe_load(f)

    page_assigner = PageAssigner()
    # Only the new questions are handed to the page-assignment step.
    # NOTE(review): presumably this (and add_sentences_ below) updates new_questions in
    # place, since the list is serialized afterwards -- confirm against the helpers.
    mapping_report = unmapped_to_mapped_questions(
        new_questions,
        answer_map, amb_answer_map,
        unmappable, page_assigner
    )

    add_sentences_(new_questions)
    with open('data/external/high_school_project/qanta.acf-regionals-2018.json', 'w') as f:
        json.dump(format_qanta_json(new_questions, DS_VERSION), f)

    with open('data/external/high_school_project/mapping_report.json', 'w') as f:
        json.dump(mapping_report, f)
예제 #2
0
def main():
    """
    Run ``check_page`` on every page referenced by a ``PageAssigner``.

    The direct protobowl, direct quizdb, and unambiguous mappings are checked value by
    value; each ambiguous entry is iterated and the ``'page'`` of every option is checked.
    All checks use the titles returned by ``read_wiki_titles``.
    """
    known_titles = read_wiki_titles()
    page_assigner = PageAssigner()

    # (log message, PageAssigner attribute) for the mappings whose values are pages.
    # Attributes are resolved lazily inside the loop so each one is read right after
    # its log line, as before.
    flat_mappings = (
        ('Checking direct protobowl mappings...', 'protobowl_direct'),
        ('Checking direct quizdb mappings...', 'quizdb_direct'),
        ('Checking unambiguous mappings...', 'unambiguous'),
    )
    for message, attr_name in flat_mappings:
        log.info(message)
        for mapped_page in getattr(page_assigner, attr_name).values():
            check_page(mapped_page, known_titles)

    log.info('Checking ambiguous mappings...')
    for candidates in page_assigner.ambiguous.values():
        for candidate in candidates:
            check_page(candidate['page'], known_titles)
예제 #3
0
파일: pipeline.py 프로젝트: theJasonFan/qb
    def run(self):
        """
        Assign pages to the folded qanta questions and write the mapped dataset.

        Reads the answer maps from ``ANSWER_MAP_PATH``, the questions from
        ``QANTA_FOLDED_DATASET_PATH`` and the unmappable list from
        ``data/internal/page_assignment/unmappable.yaml``, runs
        ``unmapped_to_mapped_questions`` over the questions, then writes the formatted
        questions to ``QANTA_MAPPED_DATASET_PATH`` and the resulting report to
        ``QANTA_MAP_REPORT_PATH``.
        """
        with open(ANSWER_MAP_PATH) as f:
            content = json.load(f)
            answer_map = content['answer_map']
            ambig_answer_map = content['ambig_answer_map']
        with open(QANTA_FOLDED_DATASET_PATH) as f:
            qanta_questions = json.load(f)['questions']

        with open('data/internal/page_assignment/unmappable.yaml') as f:
            # yaml.load() without an explicit Loader is deprecated since PyYAML 5.1 and
            # raises TypeError on PyYAML >= 6; safe_load also refuses to build arbitrary
            # Python objects.
            # NOTE(review): assumes unmappable.yaml only holds plain YAML types -- confirm.
            unmappable = yaml.safe_load(f)

        page_assigner = PageAssigner()
        # NOTE(review): presumably updates qanta_questions in place, since the same list
        # is serialized below -- confirm against unmapped_to_mapped_questions.
        mapping_report = unmapped_to_mapped_questions(qanta_questions,
                                                      answer_map,
                                                      ambig_answer_map,
                                                      unmappable,
                                                      page_assigner)

        with open(QANTA_MAPPED_DATASET_PATH, 'w') as f:
            json.dump(format_qanta_json(qanta_questions, DS_VERSION), f)

        with open(QANTA_MAP_REPORT_PATH, 'w') as f:
            json.dump(mapping_report, f)
예제 #4
0
파일: command.py 프로젝트: NPSDC/qb
def ingestion_cli(start_idx):
    """
    Map a new QuizDB tossup dump onto QB answers and write the results to disk.

    The input format is the one used for Jason's high-school project, but it can be changed.
    The original answer-mapping code was designed to map everything over multiple passes, not
    to yield a callable that maps an arbitrary answer line to a QB answer. Rather than
    implement that, a hacky way to get similar functionality for a new dataset is to combine
    the existing questions with the new questions, have the code map answers for both at the
    same time, and then only use the mappings for the new questions. There are some edge
    cases, but this should work in general (hopefully).

    :param start_idx: ``qanta_id`` assigned to the first new question; every following
        question receives the next consecutive integer.
    """
    with open(QANTA_PREPROCESSED_DATASET_PATH) as f:
        unmapped_questions = json.load(f)["questions"]

    with open("data/external/high_school_project/quizdb-20190313164802.json"
              ) as f:
        raw_questions = json.load(f)["data"]["tossups"]

    # Convert each raw tossup into a qanta-style question record; fields that are not
    # taken from the dump are set to None.
    new_questions = [
        {
            "qanta_id": qanta_id,
            "text": q["text"],
            "answer": q["answer"],
            "page": None,
            "category": None,
            "subcategory": None,
            "tournament": q["tournament"]["name"],
            "difficulty": q["tournament"]["difficulty"],
            "year": int(q["tournament"]["year"]),
            "proto_id": None,
            "qdb_id": q["id"],
            "dataset": "quizdb.org",
            "fold": "guesstest",
        }
        for qanta_id, q in enumerate(raw_questions, start=start_idx)
    ]

    # Build the answer map over the existing and the new questions together
    # (see the docstring for why both sets are combined).
    questions = unmapped_questions + new_questions
    answer_map, amb_answer_map, unbound_answers, report = create_answer_map(
        questions)
    with safe_open("data/external/high_school_project/automatic_report.json",
                   "w") as f:
        json.dump(report, f)

    write_answer_map(
        answer_map,
        amb_answer_map,
        unbound_answers,
        "data/external/high_school_project/answer_map.json",
        "data/external/high_school_project/unbound_answers.json",
    )
    with open("data/internal/page_assignment/unmappable.yaml") as f:
        # yaml.load() without an explicit Loader is deprecated since PyYAML 5.1 and raises
        # TypeError on PyYAML >= 6; safe_load also refuses to build arbitrary Python objects.
        # NOTE(review): assumes unmappable.yaml only holds plain YAML types -- confirm.
        unmappable = yaml.safe_load(f)

    page_assigner = PageAssigner()
    # Only the new questions are handed to the page-assignment step.
    # NOTE(review): presumably this (and add_sentences_ below) updates new_questions in
    # place, since the list is serialized afterwards -- confirm against the helpers.
    mapping_report = unmapped_to_mapped_questions(new_questions, answer_map,
                                                  amb_answer_map, unmappable,
                                                  page_assigner)

    add_sentences_(new_questions)
    with open(
            "data/external/high_school_project/qanta.acf-regionals-2018.json",
            "w") as f:
        json.dump(format_qanta_json(new_questions, DS_VERSION), f)

    with open("data/external/high_school_project/mapping_report.json",
              "w") as f:
        json.dump(mapping_report, f)