예제 #1
0
파일: pipeline.py 프로젝트: theJasonFan/qb
    def run(self):
        """Split the guesser train fold into train/val and write torch-format files.

        Reads the qanta train and dev datasets, holds out 10% of the guesser
        training fold as validation, and writes all three splits as qanta JSON.
        """
        with open(QANTA_TRAIN_DATASET_PATH) as f:
            questions = json.load(f)['questions']
        all_guess_train = [q for q in questions if q['fold'] == GUESSER_TRAIN_FOLD]

        # Fixed seed so the 90/10 split is reproducible across runs.
        guess_train, guess_val = train_test_split(
            all_guess_train, random_state=42, train_size=.9
        )

        with open(QANTA_DEV_DATASET_PATH) as f:
            guess_dev = [
                q for q in json.load(f)['questions'] if q['fold'] == GUESSER_DEV_FOLD
            ]

        # Write each split in qanta JSON format.
        for out_path, split in (
            (QANTA_TORCH_TRAIN_LOCAL_PATH, guess_train),
            (QANTA_TORCH_VAL_LOCAL_PATH, guess_val),
            (QANTA_TORCH_DEV_LOCAL_PATH, guess_dev),
        ):
            with open(out_path, 'w') as f:
                json.dump(format_qanta_json(split, DS_VERSION), f)
예제 #2
0
파일: pipeline.py 프로젝트: Pinafore/qb
    def run(self):
        """Partition mapped questions into train/dev/test dataset files.

        Only questions with a non-null wikipedia page are kept; a question's
        split is determined by whether its fold name contains 'train', 'dev',
        or 'test'.
        """
        with open(QANTA_MAPPED_DATASET_PATH) as f:
            questions = [q for q in json.load(f)['questions'] if q['page'] is not None]

        splits = {
            QANTA_TRAIN_DATASET_PATH: [q for q in questions if 'train' in q['fold']],
            QANTA_DEV_DATASET_PATH: [q for q in questions if 'dev' in q['fold']],
            QANTA_TEST_DATASET_PATH: [q for q in questions if 'test' in q['fold']],
        }
        for out_path, split_questions in splits.items():
            with open(out_path, 'w') as f:
                json.dump(format_qanta_json(split_questions, DS_VERSION), f)
예제 #3
0
파일: command.py 프로젝트: ymedhat95/qb
def ingestion_cli(start_idx):
    """
    Input format is for jason's HS project, but can be changed. The original code for answer
    mapping was designed to map everything over multiple passes, not yield a callable function to map
    an arbitrary answer line to a QB answer. Rather than implement this, a hacky way to achieve similar
    functionality to map a new dataset is to combine already mapped questions with new questions, have
    the code map answer for both at the same time, then only use the mappings from the new questions.
    There are some edge cases, but this should in general work (hopefully).
    """
    with open(QANTA_PREPROCESSED_DATASET_PATH) as f:
        unmapped_questions = json.load(f)['questions']

    with open('data/external/high_school_project/quizdb-20190313164802.json') as f:
        raw_questions = json.load(f)['data']['tossups']

    # Convert raw quizdb tossups into qanta question records with sequential ids
    # starting at start_idx; pages are unmapped (None) at this stage.
    new_questions = []
    idx = start_idx
    for q in raw_questions:
        new_questions.append({
            'qanta_id': idx,
            'text': q['text'],
            'answer': q['answer'],
            'page': None,
            'category': None,
            'subcategory': None,
            'tournament': q['tournament']['name'],
            'difficulty': q['tournament']['difficulty'],
            'year': int(q['tournament']['year']),
            'proto_id': None,
            'qdb_id': q['id'],
            'dataset': 'quizdb.org',
            'fold': 'guesstest'
        })
        idx += 1
    # Map answers for old + new questions together (see docstring for why).
    questions = unmapped_questions + new_questions
    answer_map, amb_answer_map, unbound_answers, report = create_answer_map(questions)
    with safe_open('data/external/high_school_project/automatic_report.json', 'w') as f:
        json.dump(report, f)

    write_answer_map(
        answer_map, amb_answer_map, unbound_answers,
        'data/external/high_school_project/answer_map.json',
        'data/external/high_school_project/unbound_answers.json'
    )
    with open('data/internal/page_assignment/unmappable.yaml') as f:
        # Fix: yaml.load without an explicit Loader is deprecated (PyYAML >= 5.1)
        # and can construct arbitrary Python objects; this file needs only plain YAML.
        unmappable = yaml.safe_load(f)

    page_assigner = PageAssigner()
    # Only the new questions are mapped here, using the maps built from the
    # combined question set above.
    mapping_report = unmapped_to_mapped_questions(
        new_questions,
        answer_map, amb_answer_map,
        unmappable, page_assigner
    )

    add_sentences_(new_questions)
    with open('data/external/high_school_project/qanta.acf-regionals-2018.json', 'w') as f:
        json.dump(format_qanta_json(new_questions, DS_VERSION), f)

    with open('data/external/high_school_project/mapping_report.json', 'w') as f:
        json.dump(mapping_report, f)
예제 #4
0
def adversarial_to_json(adversarial_json, json_dir):
    """Convert adversarially written questions into a qanta expo-fold dataset file."""
    from qanta.datasets.quiz_bowl import QantaDatabase

    # Case-insensitive lookup from lowercased page title to its canonical form.
    db = QantaDatabase()
    lookup = {q.page.lower(): q.page for q in db.mapped_questions}

    rows = []
    with open(adversarial_json) as f:
        for i, q in enumerate(json.load(f)):
            answer = q['answer'].strip().replace(' ', '_')
            if answer in lookup:
                answer = lookup[answer]
            else:
                # Keep the raw answer but flag it for manual inspection.
                log.warning(f'Could not find: {answer}')
            rows.append({
                'text': q['question'].strip(),
                'page': answer,
                'answer': '',
                'qanta_id': 1000000 + i,
                'proto_id': None,
                'qdb_id': None,
                'category': '',
                'subcategory': '',
                'tournament': '',
                'difficulty': '',
                'dataset': 'adversarial',
                'year': -1,
                'fold': 'expo',
                'gameplay': False
            })

    from qanta.ingestion.preprocess import add_sentences_, format_qanta_json
    from qanta.util.constants import DS_VERSION

    add_sentences_(rows, parallel=False)
    out_path = path.join(json_dir, f'qanta.expo.{DS_VERSION}.json')
    with open(out_path, 'w') as f:
        json.dump(format_qanta_json(rows, DS_VERSION), f)
예제 #5
0
파일: pipeline.py 프로젝트: NPSDC/qb
 def run(self):
     """Preprocess the unmapped dataset: add sentences and answer prompts, then write it out."""
     with open(QANTA_UNMAPPED_DATASET_PATH) as f:
         questions = json.load(f)["questions"]
     # Both helpers mutate the question dicts in place (trailing-underscore convention).
     add_sentences_(questions)
     add_answer_prompts_(questions)
     with open(QANTA_PREPROCESSED_DATASET_PATH, "w") as f:
         json.dump(format_qanta_json(questions, DS_VERSION), f)
예제 #6
0
파일: pipeline.py 프로젝트: theJasonFan/qb
    def run(self):
        """Write train/dev/test dataset files from the mapped dataset."""
        with open(QANTA_MAPPED_DATASET_PATH) as f:
            mapped = json.load(f)['questions']
        # Only questions successfully mapped to a wikipedia page are kept.
        questions = [q for q in mapped if q['page'] is not None]

        def write_fold(substr, out_path):
            # Fold membership is by substring (e.g. 'guesstrain' contains 'train').
            fold_questions = [q for q in questions if substr in q['fold']]
            with open(out_path, 'w') as f:
                json.dump(format_qanta_json(fold_questions, DS_VERSION), f)

        write_fold('train', QANTA_TRAIN_DATASET_PATH)
        write_fold('dev', QANTA_DEV_DATASET_PATH)
        write_fold('test', QANTA_TEST_DATASET_PATH)
예제 #7
0
파일: pipeline.py 프로젝트: Pinafore/qb
    def run(self):
        """Create guesser train/val/dev splits and write them in torch-local format."""
        with open(QANTA_TRAIN_DATASET_PATH) as f:
            train_fold = [
                q for q in json.load(f)['questions']
                if q['fold'] == GUESSER_TRAIN_FOLD
            ]

        # 90/10 train/validation split with a fixed seed for reproducibility.
        guess_train, guess_val = train_test_split(
            train_fold, random_state=42, train_size=.9
        )

        with open(QANTA_DEV_DATASET_PATH) as f:
            guess_dev = [
                q for q in json.load(f)['questions']
                if q['fold'] == GUESSER_DEV_FOLD
            ]

        outputs = (
            (QANTA_TORCH_TRAIN_LOCAL_PATH, guess_train),
            (QANTA_TORCH_VAL_LOCAL_PATH, guess_val),
            (QANTA_TORCH_DEV_LOCAL_PATH, guess_dev),
        )
        for out_path, split in outputs:
            with open(out_path, 'w') as f:
                json.dump(format_qanta_json(split, DS_VERSION), f)
예제 #8
0
파일: pipeline.py 프로젝트: NPSDC/qb
    def run(self):
        """Split the mapped dataset into train/dev/test JSON files."""
        with open(QANTA_MAPPED_DATASET_PATH) as f:
            # Drop questions that were never mapped to a wikipedia page.
            questions = [
                q for q in json.load(f)["questions"] if q["page"] is not None
            ]

        for name, out_path in (
            ("train", QANTA_TRAIN_DATASET_PATH),
            ("dev", QANTA_DEV_DATASET_PATH),
            ("test", QANTA_TEST_DATASET_PATH),
        ):
            # Fold names contain the split name as a substring.
            subset = [q for q in questions if name in q["fold"]]
            with open(out_path, "w") as f:
                json.dump(format_qanta_json(subset, DS_VERSION), f)
예제 #9
0
파일: pipeline.py 프로젝트: theJasonFan/qb
    def run(self):
        """Assign folds to preprocessed questions and write the folded dataset."""
        with open(QANTA_PREPROCESSED_DATASET_PATH) as f:
            questions = json.load(f)['questions']

        with open(PROTOBOWL_QUESTION_PLAYER_COUNTS) as f:
            player_counts = json.load(f)

        # Mutates the questions in place (trailing-underscore convention).
        assign_folds_(questions, player_counts)

        with open(QANTA_FOLDED_DATASET_PATH, 'w') as f:
            json.dump(format_qanta_json(questions, DS_VERSION), f)
예제 #10
0
파일: pipeline.py 프로젝트: Pinafore/qb
    def run(self):
        """Fold-assign the preprocessed dataset using protobowl gameplay counts."""
        with open(QANTA_PREPROCESSED_DATASET_PATH) as f:
            qanta_questions = json.load(f)['questions']
        with open(PROTOBOWL_QUESTION_PLAYER_COUNTS) as f:
            question_player_counts = json.load(f)

        # In-place fold assignment, then serialize the updated questions.
        assign_folds_(qanta_questions, question_player_counts)
        with open(QANTA_FOLDED_DATASET_PATH, 'w') as f:
            json.dump(format_qanta_json(qanta_questions, DS_VERSION), f)
예제 #11
0
파일: pipeline.py 프로젝트: Pinafore/qb
 def run(self):
     """Merge protobowl and quizdb tossups into the unmapped qanta dataset."""
     protobowl_questions = Protobowl.parse_tossups(PROTOBOWL_TOSSUPS_PATH)
     # Quizdb tossups need tournament/category/subcategory metadata to parse.
     tournaments = QuizdbOrg.parse_tournaments(QDB_TOURNAMENTS_PATH)
     categories = QuizdbOrg.parse_categories(QDB_CATEGORIES_PATH)
     subcategories = QuizdbOrg.parse_subcategories(QDB_SUBCATEGORIES_PATH)
     quizdb_questions = QuizdbOrg.parse_tossups(
         tournaments, categories, subcategories, QDB_TOSSUPS_PATH
     )
     merged = merge_datasets(protobowl_questions, quizdb_questions)
     # safe_path presumably ensures the parent directory exists — verify.
     with open(safe_path(QANTA_UNMAPPED_DATASET_PATH), 'w') as f:
         json.dump(format_qanta_json(merged, DS_VERSION), f)
예제 #12
0
def format_additional():
    """
    Additional questions were added to dataset, this processes the csv version to match
    the dataset format while verifying that page info is valid.
    """
    # Verify then load the wikipedia title set used to validate pages.
    titles_path = "data/external/wikipedia/wikipedia-titles.2018.04.18.json"
    titles_checksum = "6fa134836b3a7e3b562cdaa8ad353f2d"
    verify_checksum(titles_checksum, titles_path)
    with open(titles_path) as f:
        titles = set(json.load(f))

    trick_path = "data/external/datasets/trick-additional.csv"
    trick_checksum = "905594aab776ddb10b0d7f36d30633a2"
    verify_checksum(trick_checksum, trick_path)
    with open(trick_path) as f:
        # Ignore header row
        rows = list(csv.reader(f))[1:]

    questions = []
    for _, text, page in rows:
        page = page.replace(" ", "_")
        if page not in titles:
            # Informational only: the question is still kept.
            log.info(f"Page not in titles: {page}")
        questions.append(
            {
                "text": text,
                "answer": page,
                "page": page,
                "fold": "advtest",
                "year": 2018,
                "dataset": "trickme",
                "proto_id": None,
                "qdb_id": None,
                "difficulty": None,
                "category": None,
                "subcategory": None,
                "qanta_id": None,
                "tournament": TOURNAMENT_DEC_15,
                "gameplay": False,
                "interface": "ir-r2",
            }
        )
    add_sentences_(questions, parallel=False)
    dataset = format_qanta_json(questions, "2018.04.18")
    # Record input checksums so consumers can detect stale regenerations.
    dataset["dependent_checksums"] = {
        "trick-additional.csv": trick_checksum,
        "wikipedia-titles.2018.04.18.json": titles_checksum,
    }
    path_formatted = "data/external/datasets/qanta.trick-additional-ir-round2.json"
    with open(path_formatted, "w") as f:
        json.dump(dataset, f)
    log.info(f"File: {path_formatted} Checksum: {md5sum(path_formatted)}")
예제 #13
0
def format_additional():
    """
    Additional questions were added to dataset, this processes the csv version to match
    the dataset format while verifying that page info is valid.
    """
    titles_checksum = '6fa134836b3a7e3b562cdaa8ad353f2d'
    titles_file = 'data/external/wikipedia/wikipedia-titles.2018.04.18.json'
    verify_checksum(titles_checksum, titles_file)
    with open(titles_file) as f:
        titles = set(json.load(f))

    trick_checksum = '905594aab776ddb10b0d7f36d30633a2'
    trick_file = 'data/external/datasets/trick-additional.csv'
    verify_checksum(trick_checksum, trick_file)
    with open(trick_file) as f:
        # Ignore header row
        rows = list(csv.reader(f))[1:]

    questions = []
    for _, text, page in rows:
        page = page.replace(' ', '_')
        if page not in titles:
            # Logged for review; the question is still included.
            log.info(f'Page not in titles: {page}')
        questions.append({
            'text': text,
            'answer': page,
            'page': page,
            'fold': 'advtest',
            'year': 2018,
            'dataset': 'trickme',
            'proto_id': None,
            'qdb_id': None,
            'difficulty': None,
            'category': None,
            'subcategory': None,
            'qanta_id': None,
            'tournament': TOURNAMENT_DEC_15,
            'gameplay': False,
            'interface': 'ir-r2'
        })
    add_sentences_(questions, parallel=False)
    dataset = format_qanta_json(questions, '2018.04.18')
    # Track the checksums of the inputs this file was generated from.
    dataset['dependent_checksums'] = {
        'trick-additional.csv': trick_checksum,
        'wikipedia-titles.2018.04.18.json': titles_checksum
    }
    path_formatted = 'data/external/datasets/qanta.trick-additional-ir-round2.json'
    with open(path_formatted, 'w') as f:
        json.dump(dataset, f)
    log.info(f'File: {path_formatted} Checksum: {md5sum(path_formatted)}')
예제 #14
0
파일: pipeline.py 프로젝트: theJasonFan/qb
 def run(self):
     """Build the unmapped qanta dataset by merging protobowl and quizdb questions."""
     protobowl_questions = Protobowl.parse_tossups(PROTOBOWL_TOSSUPS_PATH)
     # Parse quizdb metadata required to interpret its tossups.
     tournaments = QuizdbOrg.parse_tournaments(QDB_TOURNAMENTS_PATH)
     categories = QuizdbOrg.parse_categories(QDB_CATEGORIES_PATH)
     subcategories = QuizdbOrg.parse_subcategories(QDB_SUBCATEGORIES_PATH)
     quizdb_questions = QuizdbOrg.parse_tossups(
         tournaments, categories, subcategories, QDB_TOSSUPS_PATH
     )
     qanta_questions = merge_datasets(protobowl_questions, quizdb_questions)
     # safe_path presumably prepares the output directory — confirm against its definition.
     with open(safe_path(QANTA_UNMAPPED_DATASET_PATH), 'w') as f:
         json.dump(format_qanta_json(qanta_questions, DS_VERSION), f)
예제 #15
0
파일: trickme.py 프로젝트: Pinafore/qb
def split_ds(id_model_path, expo_path, version, rnn_out, es_out):
    """Split expo questions into ES and RNN sets using a qanta_id -> model lookup.

    Raises ValueError if any question maps to an unknown source.
    """
    with open(id_model_path) as f:
        lookup = json.load(f)

    with open(expo_path) as f:
        questions = json.load(f)['questions']

    # Bucket each question by the model its id is assigned to.
    buckets = {'es': [], 'rnn': []}
    for q in questions:
        source = lookup[str(q['qanta_id'])]
        if source not in buckets:
            raise ValueError('Unhandled question source')
        buckets[source].append(q)

    with open(rnn_out, 'w') as f:
        json.dump(format_qanta_json(buckets['rnn'], version), f)

    with open(es_out, 'w') as f:
        json.dump(format_qanta_json(buckets['es'], version), f)
예제 #16
0
def split_ds(id_model_path, expo_path, version, rnn_out, es_out):
    """Partition expo questions into RNN and ES datasets via an id->model map."""
    with open(id_model_path) as f:
        id_to_model = json.load(f)

    with open(expo_path) as f:
        questions = json.load(f)['questions']

    es_questions = []
    rnn_questions = []
    for question in questions:
        model = id_to_model[str(question['qanta_id'])]
        if model == 'es':
            es_questions.append(question)
        elif model == 'rnn':
            rnn_questions.append(question)
        else:
            # Fail loudly on any unexpected model label.
            raise ValueError('Unhandled question source')

    for out_path, subset in ((rnn_out, rnn_questions), (es_out, es_questions)):
        with open(out_path, 'w') as f:
            json.dump(format_qanta_json(subset, version), f)
예제 #17
0
def split_ds(id_model_path, expo_path, version, rnn_out, es_out):
    """Route each expo question to the ES or RNN output file by its model assignment."""
    with open(id_model_path) as f:
        lookup = json.load(f)

    with open(expo_path) as f:
        questions = json.load(f)["questions"]

    routed = {"es": [], "rnn": []}
    for q in questions:
        model = lookup[str(q["qanta_id"])]
        try:
            routed[model].append(q)
        except KeyError:
            raise ValueError("Unhandled question source")

    with open(rnn_out, "w") as f:
        json.dump(format_qanta_json(routed["rnn"], version), f)

    with open(es_out, "w") as f:
        json.dump(format_qanta_json(routed["es"], version), f)
예제 #18
0
파일: pipeline.py 프로젝트: theJasonFan/qb
    def run(self):
        """Map folded questions' answers to wikipedia pages and write the result + report."""
        with open(ANSWER_MAP_PATH) as f:
            content = json.load(f)
            answer_map = content['answer_map']
            ambig_answer_map = content['ambig_answer_map']
        with open(QANTA_FOLDED_DATASET_PATH) as f:
            qanta_questions = json.load(f)['questions']

        with open('data/internal/page_assignment/unmappable.yaml') as f:
            # Fix: yaml.load without an explicit Loader is deprecated (PyYAML >= 5.1)
            # and can construct arbitrary Python objects; safe_load suffices here.
            unmappable = yaml.safe_load(f)

        page_assigner = PageAssigner()
        # qanta_questions is presumably updated in place — the mapped dataset
        # below is written from it after this call.
        mapping_report = unmapped_to_mapped_questions(qanta_questions,
                                                      answer_map,
                                                      ambig_answer_map,
                                                      unmappable,
                                                      page_assigner)

        with open(QANTA_MAPPED_DATASET_PATH, 'w') as f:
            json.dump(format_qanta_json(qanta_questions, DS_VERSION), f)

        with open(QANTA_MAP_REPORT_PATH, 'w') as f:
            json.dump(mapping_report, f)
예제 #19
0
파일: pipeline.py 프로젝트: Pinafore/qb
    def run(self):
        """Apply answer-to-page mapping to the folded dataset; write dataset and report."""
        with open(ANSWER_MAP_PATH) as f:
            content = json.load(f)
            answer_map = content['answer_map']
            ambig_answer_map = content['ambig_answer_map']
        with open(QANTA_FOLDED_DATASET_PATH) as f:
            qanta_questions = json.load(f)['questions']

        with open('data/internal/page_assignment/unmappable.yaml') as f:
            # Fix: yaml.load without an explicit Loader is deprecated (PyYAML >= 5.1)
            # and unsafe; this file only needs plain-YAML parsing.
            unmappable = yaml.safe_load(f)

        page_assigner = PageAssigner()
        # qanta_questions is presumably mutated in place; the mapped dataset is
        # serialized from it below.
        mapping_report = unmapped_to_mapped_questions(
            qanta_questions,
            answer_map, ambig_answer_map,
            unmappable, page_assigner
        )

        with open(QANTA_MAPPED_DATASET_PATH, 'w') as f:
            json.dump(format_qanta_json(qanta_questions, DS_VERSION), f)

        with open(QANTA_MAP_REPORT_PATH, 'w') as f:
            json.dump(mapping_report, f)
예제 #20
0
파일: trickme.py 프로젝트: theJasonFan/qb
    def parse_tossups(
            qanta_ds_path='data/external/datasets/qanta.mapped.2018.04.18.json',
            trick_path='data/external/datasets/trickme_questions_12-15-2018.json',
            start_idx=2000000,
            version='2018.04.18'):
        """Parse trickme (adversarial) tossups into a qanta-format dataset.

        Answers are matched against pages from the mapped qanta dataset, with
        per-index overrides from DIRECT_MAP. Questions with empty text/answer
        are dropped; unresolvable answers raise ValueError.
        """
        # Build the set of known pages and a case/space-insensitive lookup to them.
        with open(qanta_ds_path) as f:
            qanta_ds = json.load(f)['questions']
        answer_set = {q['page'] for q in qanta_ds if q['page'] is not None}
        lookup = {a.lower().replace(' ', '_'): a for a in answer_set}
        with open(trick_path) as f:
            questions = []
            for i, q in enumerate(json.load(f)):
                # Input records are inconsistent about field capitalization.
                if 'Question' in q:
                    text = q['Question']
                elif 'question' in q:
                    text = q['question']
                else:
                    raise ValueError(
                        'Could not find question field in question')

                if 'Answer' in q:
                    answer = q['Answer'].replace(' ', '_')
                elif 'answer' in q:
                    answer = q['answer'].replace(' ', '_')
                else:
                    raise ValueError('Could not find answer field in question')
                # Silently drop records missing either text or answer.
                if len(answer) == 0 or len(text) == 0:
                    continue
                if i in DIRECT_MAP:
                    # Manual override keyed by input position: (expected answer, page).
                    m_ans, m_page = DIRECT_MAP[i]
                    if m_ans == answer:
                        if m_page is None:
                            continue  # Skip this explicitly
                        elif m_page in answer_set:
                            page = m_page
                        else:
                            raise ValueError(
                                f'{m_page} not in answer set\n Q: {text}')
                    else:
                        # The override no longer matches the input — data drifted.
                        raise ValueError(f'Mapping error: {answer} != {m_ans}')
                elif answer in lookup:
                    page = lookup[answer]
                else:
                    raise ValueError(
                        f'Could not find: idx: {i} Q:"{text}" \nA: "{answer}"')
                q_out = {
                    'text': text,
                    'answer': answer,
                    'page': page,
                    'fold': 'advtest',
                    'year': 2018,
                    'dataset': 'trickme',
                    'proto_id': None,
                    'qdb_id': None,
                    'trickme_id': i,
                    'difficulty': None,
                    'category': None,
                    'subcategory': None,
                    'qanta_id': start_idx + i,
                    'tournament':
                    'Adversarial Question Writing UMD December 15',
                    'gameplay': False
                }
                # Optional fields carried over from the input record when present.
                if 'email' in q:
                    q_out['author_email'] = q['email']
                if 'category' in q and q['category'] != "None":
                    q_out['category'] = q['category']
                questions.append(q_out)
            add_sentences_(questions, parallel=True)
            dataset = format_qanta_json(questions, version)
            return dataset
예제 #21
0
def trick_to_ds(
    answer_map_path,
    qanta_ds_path,
    wiki_titles_path,
    trick_path,
    id_model_path,
    out_path,
    start_idx,
    version,
    fold,
    year,
    tournament,
    separate_rounds,
):
    """Convert trickme (adversarial) questions into qanta-format dataset files.

    Answers are resolved to wikipedia pages via (in order): exact title/page
    match, a case/space-insensitive page lookup, then an explicit answer map.
    Unresolvable answers are logged and skipped. If separate_rounds is true,
    one output file is written per round; otherwise everything goes to
    out_path. The qanta_id -> interface-model map is written to id_model_path.
    """
    with open(answer_map_path) as f:
        # Fix: yaml.load without an explicit Loader is deprecated (PyYAML >= 5.1)
        # and can construct arbitrary Python objects; safe_load suffices.
        answer_map = yaml.safe_load(f)

    with open(qanta_ds_path) as f:
        qanta_ds = json.load(f)["questions"]
    answer_set = {q["page"] for q in qanta_ds if q["page"] is not None}
    with open(wiki_titles_path) as f:
        titles = set(json.load(f))
    # Case/space-insensitive lookup to canonical pages.
    lookup = {a.lower().replace(" ", "_"): a for a in answer_set}
    id_model_map = {}
    skipped = 0
    with open(trick_path) as f:
        questions = []
        for i, q in enumerate(json.load(f)):
            # Input records are inconsistent about field capitalization.
            if "Question" in q:
                text = q["Question"]
            elif "question" in q:
                text = q["question"]
            else:
                raise ValueError("Could not find question field in question")

            if "Answer" in q:
                answer = q["Answer"].replace(" ", "_")
            elif "answer" in q:
                answer = q["answer"].replace(" ", "_")
            else:
                raise ValueError("Could not find answer field in question")

            if "trick_id" in q:
                trick_id = q["trick_id"]
            else:
                trick_id = None

            if len(answer) == 0:
                raise ValueError(f"Empty answer for trick_id={trick_id}")
            elif len(text) == 0:
                raise ValueError(f"Empty text for trick_id={trick_id}")

            # Resolve the answer to a page, most-exact match first.
            if answer in titles or answer in answer_set:
                page = answer
            elif answer in lookup:
                page = lookup[answer]
            elif answer in answer_map:
                m_page = answer_map[answer]
                if m_page is None:
                    if "model" in q:
                        log.info(
                            f'Explicitly Skipping {answer}, int-model: {q["model"]}'
                        )
                    else:
                        log.info(f"Explicitly Skipping {answer}")
                    continue  # Skip this explicitly
                elif m_page in answer_set:
                    page = m_page
                else:
                    raise ValueError(f"{m_page} not in answer set\n Q: {text}")
            else:
                log.error(
                    f'Unhandled Skipping: idx: {i} trick_id: {trick_id} A: "{answer}"\nQ:"{text}"'
                )
                skipped += 1
                continue

            q_out = {
                "text": text,
                "answer": answer,
                "page": page,
                "fold": fold,
                "year": year,
                "dataset": "trickme",
                "proto_id": None,
                "qdb_id": None,
                "difficulty": None,
                "category": None,
                "subcategory": None,
                "qanta_id": start_idx + i,
                "tournament": tournament,
                "gameplay": False,
                "trick_id": trick_id,
            }
            # Optional fields carried over from the input record when present.
            if "email" in q:
                q_out["author_email"] = q["email"]
            if "category" in q and q["category"] != "None":
                q_out["category"] = q["category"]
            if "round" in q:
                q_out["round"] = q["round"]
            if "model" in q:
                id_model_map[q_out["qanta_id"]] = q["model"]
            questions.append(q_out)
        log.info(f"Total: {len(questions)} Skipped: {skipped}")
        add_sentences_(questions, parallel=False)
        if separate_rounds:
            rounds = defaultdict(list)
            for q in questions:
                rounds[q["round"]].append(q)
            for name, round_questions in rounds.items():
                dataset = format_qanta_json(round_questions, version)
                # Insert the round name before the .json extension (or append it).
                file_name = out_path.split(".")
                if file_name[-1] == "json":
                    file_name.pop()
                    file_name.extend([name, "json"])
                else:
                    file_name.extend([name, "json"])
                round_out_path = ".".join(file_name)
                log.info(f"Writing round {name} to {round_out_path}")
                with open(round_out_path, "w") as f:
                    json.dump(dataset, f)
        else:
            dataset = format_qanta_json(questions, version)

            with open(out_path, "w") as f:
                json.dump(dataset, f)

        with open(id_model_path, "w") as f:
            json.dump(id_model_map, f)
예제 #22
0
파일: trickme.py 프로젝트: Pinafore/qb
def trick_to_ds(answer_map_path, qanta_ds_path, wiki_titles_path, trick_path,
                id_model_path, out_path,
                start_idx, version, fold, year, tournament,
                separate_rounds):
    """Convert trickme (adversarial) questions into qanta-format dataset files.

    Answers are resolved to pages via exact title/page match, then a
    case/space-insensitive lookup, then an explicit answer map; unresolvable
    answers are logged and skipped. Output is per-round if separate_rounds,
    otherwise a single file at out_path. The qanta_id -> interface-model map
    is written to id_model_path.
    """
    with open(answer_map_path) as f:
        # Fix: yaml.load without an explicit Loader is deprecated (PyYAML >= 5.1)
        # and can construct arbitrary Python objects; safe_load suffices.
        answer_map = yaml.safe_load(f)

    with open(qanta_ds_path) as f:
        qanta_ds = json.load(f)['questions']
    answer_set = {q['page'] for q in qanta_ds if q['page'] is not None}
    with open(wiki_titles_path) as f:
        titles = set(json.load(f))
    # Case/space-insensitive lookup to canonical pages.
    lookup = {a.lower().replace(' ', '_'): a for a in answer_set}
    id_model_map = {}
    skipped = 0
    with open(trick_path) as f:
        questions = []
        for i, q in enumerate(json.load(f)):
            # Input records are inconsistent about field capitalization.
            if 'Question' in q:
                text = q['Question']
            elif 'question' in q:
                text = q['question']
            else:
                raise ValueError('Could not find question field in question')

            if 'Answer' in q:
                answer = q['Answer'].replace(' ', '_')
            elif 'answer' in q:
                answer = q['answer'].replace(' ', '_')
            else:
                raise ValueError('Could not find answer field in question')

            if 'trick_id' in q:
                trick_id = q['trick_id']
            else:
                trick_id = None

            if len(answer) == 0:
                raise ValueError(f'Empty answer for trick_id={trick_id}')
            elif len(text) == 0:
                raise ValueError(f'Empty text for trick_id={trick_id}')

            # Resolve the answer to a page, most-exact match first.
            if answer in titles or answer in answer_set:
                page = answer
            elif answer in lookup:
                page = lookup[answer]
            elif answer in answer_map:
                m_page = answer_map[answer]
                if m_page is None:
                    if 'model' in q:
                        log.info(f'Explicitly Skipping {answer}, int-model: {q["model"]}')
                    else:
                        log.info(f'Explicitly Skipping {answer}')
                    continue  # Skip this explicitly
                elif m_page in answer_set:
                    page = m_page
                else:
                    raise ValueError(f'{m_page} not in answer set\n Q: {text}')
            else:
                log.error(f'Unhandled Skipping: idx: {i} trick_id: {trick_id} A: "{answer}"\nQ:"{text}"')
                skipped += 1
                continue

            q_out = {
                'text': text,
                'answer': answer,
                'page': page,
                'fold': fold,
                'year': year,
                'dataset': 'trickme',
                'proto_id': None,
                'qdb_id': None,
                'difficulty': None,
                'category': None,
                'subcategory': None,
                'qanta_id': start_idx + i,
                'tournament': tournament,
                'gameplay': False,
                'trick_id': trick_id
            }
            # Optional fields carried over from the input record when present.
            if 'email' in q:
                q_out['author_email'] = q['email']
            if 'category' in q and q['category'] != "None":
                q_out['category'] = q['category']
            if 'round' in q:
                q_out['round'] = q['round']
            if 'model' in q:
                id_model_map[q_out['qanta_id']] = q['model']
            questions.append(q_out)
        log.info(f'Total: {len(questions)} Skipped: {skipped}')
        add_sentences_(questions, parallel=False)
        if separate_rounds:
            rounds = defaultdict(list)
            for q in questions:
                rounds[q['round']].append(q)
            for name, round_questions in rounds.items():
                dataset = format_qanta_json(round_questions, version)
                # Insert the round name before the .json extension (or append it).
                file_name = out_path.split('.')
                if file_name[-1] == 'json':
                    file_name.pop()
                    file_name.extend([name, 'json'])
                else:
                    file_name.extend([name, 'json'])
                round_out_path = '.'.join(file_name)
                log.info(f'Writing round {name} to {round_out_path}')
                with open(round_out_path, 'w') as f:
                    json.dump(dataset, f)
        else:
            dataset = format_qanta_json(questions, version)

            with open(out_path, 'w') as f:
                json.dump(dataset, f)

        with open(id_model_path, 'w') as f:
            json.dump(id_model_map, f)
예제 #23
0
파일: command.py 프로젝트: NPSDC/qb
def ingestion_cli(start_idx):
    """
    Ingest the high school project quizdb tossups into qanta format.

    Input format is for jason's HS project, but can be changed. The original code for answer
    mapping was designed to map everything over multiple passes, not yield a callable function to map
    an arbitrary answer line to a QB answer. Rather than implement this, a hacky way to achieve similar
    functionality to map a new dataset is to combine already mapped questions with new questions, have
    the code map answer for both at the same time, then only use the mappings from the new questions.
    There are some edge cases, but this should in general work (hopefully).

    Args:
        start_idx: first qanta_id assigned to the newly ingested questions;
            ids increment sequentially from here.
    """
    with open(QANTA_PREPROCESSED_DATASET_PATH) as f:
        unmapped_questions = json.load(f)["questions"]

    with open("data/external/high_school_project/quizdb-20190313164802.json"
              ) as f:
        raw_questions = json.load(f)["data"]["tossups"]

    # Convert each raw quizdb tossup into a qanta question dict with a fresh id.
    new_questions = []
    idx = start_idx
    for q in raw_questions:
        new_questions.append({
            "qanta_id": idx,
            "text": q["text"],
            "answer": q["answer"],
            "page": None,
            "category": None,
            "subcategory": None,
            "tournament": q["tournament"]["name"],
            "difficulty": q["tournament"]["difficulty"],
            "year": int(q["tournament"]["year"]),
            "proto_id": None,
            "qdb_id": q["id"],
            "dataset": "quizdb.org",
            "fold": "guesstest",
        })
        idx += 1
    # Map answers for old + new questions together (see docstring), then only
    # the new questions' mappings are used downstream.
    questions = unmapped_questions + new_questions
    answer_map, amb_answer_map, unbound_answers, report = create_answer_map(
        questions)
    with safe_open("data/external/high_school_project/automatic_report.json",
                   "w") as f:
        json.dump(report, f)

    write_answer_map(
        answer_map,
        amb_answer_map,
        unbound_answers,
        "data/external/high_school_project/answer_map.json",
        "data/external/high_school_project/unbound_answers.json",
    )
    with open("data/internal/page_assignment/unmappable.yaml") as f:
        # safe_load: yaml.load without an explicit Loader is deprecated since
        # PyYAML 5.1 and unsafe on untrusted input.
        unmappable = yaml.safe_load(f)

    page_assigner = PageAssigner()
    mapping_report = unmapped_to_mapped_questions(new_questions, answer_map,
                                                  amb_answer_map, unmappable,
                                                  page_assigner)

    # Add sentence tokenizations in place, then write the mapped dataset and
    # the mapping report.
    add_sentences_(new_questions)
    with open(
            "data/external/high_school_project/qanta.acf-regionals-2018.json",
            "w") as f:
        json.dump(format_qanta_json(new_questions, DS_VERSION), f)

    with open("data/external/high_school_project/mapping_report.json",
              "w") as f:
        json.dump(mapping_report, f)
예제 #24
0
def trick_to_ds(answer_map_path, qanta_ds_path, wiki_titles_path, trick_path,
                id_model_path, out_path, start_idx, version, fold, year,
                tournament, separate_rounds):
    """
    Convert trickme (adversarial) questions into qanta dataset format.

    Each input question's answer is resolved to a page by, in order: an exact
    match against wikipedia titles or the existing answer set, a lowercased
    underscore lookup, or an explicit answer_map entry (where a None entry
    means "skip deliberately"). Questions whose answers cannot be resolved are
    logged and skipped.

    Args:
        answer_map_path: YAML mapping of raw answers to pages (None = skip).
        qanta_ds_path: existing qanta dataset, used to build the answer set.
        wiki_titles_path: JSON list of valid wikipedia titles.
        trick_path: JSON list of trickme questions to ingest.
        id_model_path: output JSON path mapping qanta_id -> generating model.
        out_path: output dataset path (one file per round if separate_rounds).
        start_idx: first qanta_id to assign (offset by enumeration index).
        version: dataset version passed to format_qanta_json.
        fold, year, tournament: metadata applied to every output question.
        separate_rounds: if True, group questions by their 'round' field and
            write one dataset file per round.

    Raises:
        ValueError: on a question missing text/answer fields, with an empty
            text/answer, or whose mapped page is not in the answer set.
    """
    with open(answer_map_path) as f:
        # safe_load: yaml.load without an explicit Loader is deprecated since
        # PyYAML 5.1 and unsafe on untrusted input.
        answer_map = yaml.safe_load(f)

    with open(qanta_ds_path) as f:
        qanta_ds = json.load(f)['questions']
    answer_set = {q['page'] for q in qanta_ds if q['page'] is not None}
    with open(wiki_titles_path) as f:
        titles = set(json.load(f))
    # Normalized (lowercase, underscored) answer -> canonical page.
    lookup = {a.lower().replace(' ', '_'): a for a in answer_set}
    id_model_map = {}
    skipped = 0
    with open(trick_path) as f:
        questions = []
        for i, q in enumerate(json.load(f)):
            # Field names vary by capitalization across input batches.
            if 'Question' in q:
                text = q['Question']
            elif 'question' in q:
                text = q['question']
            else:
                raise ValueError('Could not find question field in question')

            if 'Answer' in q:
                answer = q['Answer'].replace(' ', '_')
            elif 'answer' in q:
                answer = q['answer'].replace(' ', '_')
            else:
                raise ValueError('Could not find answer field in question')

            if 'trick_id' in q:
                trick_id = q['trick_id']
            else:
                trick_id = None

            if len(answer) == 0:
                raise ValueError(f'Empty answer for trick_id={trick_id}')
            elif len(text) == 0:
                raise ValueError(f'Empty text for trick_id={trick_id}')

            # Resolve the answer to a page: exact match, normalized lookup,
            # then the explicit answer map (None entry = deliberate skip).
            if answer in titles or answer in answer_set:
                page = answer
            elif answer in lookup:
                page = lookup[answer]
            elif answer in answer_map:
                m_page = answer_map[answer]
                if m_page is None:
                    if 'model' in q:
                        log.info(
                            f'Explicitly Skipping {answer}, int-model: {q["model"]}'
                        )
                    else:
                        log.info(f'Explicitly Skipping {answer}')
                    continue  # Skip this explicitly
                elif m_page in answer_set:
                    page = m_page
                else:
                    raise ValueError(f'{m_page} not in answer set\n Q: {text}')
            else:
                log.error(
                    f'Unhandled Skipping: idx: {i} trick_id: {trick_id} A: "{answer}"\nQ:"{text}"'
                )
                skipped += 1
                continue

            q_out = {
                'text': text,
                'answer': answer,
                'page': page,
                'fold': fold,
                'year': year,
                'dataset': 'trickme',
                'proto_id': None,
                'qdb_id': None,
                'difficulty': None,
                'category': None,
                'subcategory': None,
                'qanta_id': start_idx + i,
                'tournament': tournament,
                'gameplay': False,
                'trick_id': trick_id
            }
            # Optional per-question metadata, carried over when present.
            if 'email' in q:
                q_out['author_email'] = q['email']
            if 'category' in q and q['category'] != "None":
                q_out['category'] = q['category']
            if 'round' in q:
                q_out['round'] = q['round']
            if 'model' in q:
                id_model_map[q_out['qanta_id']] = q['model']
            questions.append(q_out)
        log.info(f'Total: {len(questions)} Skipped: {skipped}')
        add_sentences_(questions, parallel=False)
        if separate_rounds:
            rounds = defaultdict(list)
            for q in questions:
                rounds[q['round']].append(q)
            for name, round_questions in rounds.items():
                dataset = format_qanta_json(round_questions, version)
                # Insert the round name before the json extension, e.g.
                # out.json -> out.<name>.json (or append if no .json suffix).
                file_name = out_path.split('.')
                if file_name[-1] == 'json':
                    file_name.pop()
                file_name.extend([name, 'json'])
                round_out_path = '.'.join(file_name)
                log.info(f'Writing round {name} to {round_out_path}')
                with open(round_out_path, 'w') as f:
                    json.dump(dataset, f)
        else:
            dataset = format_qanta_json(questions, version)

            with open(out_path, 'w') as f:
                json.dump(dataset, f)

        with open(id_model_path, 'w') as f:
            json.dump(id_model_map, f)
예제 #25
0
def nonnaqt_to_json(csv_input, json_dir):
    """
    Convert a non-NAQT sentence-level CSV into qanta-format JSON datasets.

    The CSV must have a header row followed by rows of
    (qnum, sent, text, page, fold). Sentences are grouped by question number,
    ordered by sentence index, joined with single spaces, and per-sentence
    character-offset tokenizations are recorded. Writes mapped/train/dev/test
    dataset files plus a 90/10 torchtext train/val split into json_dir.

    Args:
        csv_input: path to the input CSV file.
        json_dir: directory where the output JSON files are written.

    Raises:
        ValueError: if any CSV row does not have exactly 5 columns.
    """
    question_sentences = defaultdict(list)
    with open(csv_input) as f:
        csv_rows = list(csv.reader(f))
        for r in csv_rows[1:]:  # skip the header row
            if len(r) != 5:
                raise ValueError('Invalid csv row, must have 5 columns')
            qnum, sent, text, page, fold = r
            qnum = int(qnum)
            sent = int(sent)
            question_sentences[qnum].append({
                'qnum': qnum,
                'sent': sent,
                'text': text,
                'page': page,
                'fold': fold
            })

    questions = []
    for sentences in tqdm.tqdm(question_sentences.values()):
        ordered_sentences = sorted(sentences, key=lambda s: s['sent'])
        text = ' '.join(s['text'] for s in ordered_sentences)
        # Character span (start, end) of each sentence within the joined text.
        tokenizations = []
        position = 0
        for s in ordered_sentences:
            length = len(s['text'])
            tokenizations.append((position, position + length))
            position += length + 1  # +1 for the joining space
        q = ordered_sentences[0]
        questions.append({
            'answer': '',
            'category': '',
            'subcategory': '',
            'tournament': '',
            'year': -1,
            'dataset': 'non_naqt',
            'difficulty': '',
            'first_sentence': ordered_sentences[0]['text'],
            'qanta_id': q['qnum'],
            'fold': q['fold'],
            'gameplay': False,
            'page': q['page'],
            'proto_id': None,
            'qdb_id': None,
            'text': text,
            'tokenizations': tokenizations
        })

    train_questions = [q for q in questions if q['fold'] == 'guesstrain']
    dev_questions = [q for q in questions if q['fold'] == 'guessdev']
    test_questions = [q for q in questions if q['fold'] == 'test']
    # Normalize the plain 'test' fold name to 'guesstest' for consistency.
    for q in test_questions:
        q['fold'] = 'guesstest'

    from qanta.ingestion.preprocess import format_qanta_json
    from qanta.util.constants import DS_VERSION

    with open(path.join(json_dir, f'qanta.mapped.{DS_VERSION}.json'),
              'w') as f:
        json.dump(format_qanta_json(questions, DS_VERSION), f)

    with open(path.join(json_dir, f'qanta.train.{DS_VERSION}.json'), 'w') as f:
        json.dump(format_qanta_json(train_questions, DS_VERSION), f)

    with open(path.join(json_dir, f'qanta.dev.{DS_VERSION}.json'), 'w') as f:
        json.dump(format_qanta_json(dev_questions, DS_VERSION), f)

    with open(path.join(json_dir, f'qanta.test.{DS_VERSION}.json'), 'w') as f:
        json.dump(format_qanta_json(test_questions, DS_VERSION), f)

    # Deterministic 90/10 split of guesstrain for the torchtext guesser.
    from sklearn.model_selection import train_test_split
    guess_train, guess_val = train_test_split(train_questions,
                                              random_state=42,
                                              train_size=.9)
    with open(path.join(json_dir, f'qanta.torchtext.train.{DS_VERSION}.json'),
              'w') as f:
        json.dump(format_qanta_json(guess_train, DS_VERSION), f)

    with open(path.join(json_dir, f'qanta.torchtext.val.{DS_VERSION}.json'),
              'w') as f:
        json.dump(format_qanta_json(guess_val, DS_VERSION), f)

    with open(path.join(json_dir, f'qanta.torchtext.dev.{DS_VERSION}.json'),
              'w') as f:
        json.dump(format_qanta_json(dev_questions, DS_VERSION), f)
예제 #26
0
파일: pipeline.py 프로젝트: theJasonFan/qb
 def run(self):
     """Load the unmapped dataset, add sentence tokenizations, and write the preprocessed dataset."""
     with open(QANTA_UNMAPPED_DATASET_PATH) as in_f:
         questions = json.load(in_f)['questions']
     # Tokenizes in place before serializing back out.
     add_sentences_(questions)
     formatted = format_qanta_json(questions, DS_VERSION)
     with open(QANTA_PREPROCESSED_DATASET_PATH, 'w') as out_f:
         json.dump(formatted, out_f)
예제 #27
0
파일: pipeline.py 프로젝트: Pinafore/qb
 def run(self):
     """Add sentence tokenizations to the unmapped qanta dataset and persist the result."""
     with open(QANTA_UNMAPPED_DATASET_PATH) as source:
         dataset = json.load(source)
     qanta_questions = dataset['questions']
     add_sentences_(qanta_questions)  # mutates the questions in place
     with open(QANTA_PREPROCESSED_DATASET_PATH, 'w') as sink:
         json.dump(format_qanta_json(qanta_questions, DS_VERSION), sink)