예제 #1
0
def prepro(ds,
           fold,
           num_tokens_per_group,
           num_paragraphs,
           pad=True,
           n_samples=None):
    fold_funcs = {
        'train': lambda: ds.get_train(),
        'dev': lambda: ds.get_dev(),
        'test': lambda: ds.get_test()
    }
    qs = fold_funcs[fold]()
    if n_samples is not None:
        qs = qs[:n_samples]
    evidence = ds.evidence

    prep = None
    extract = ExtractMultiParagraphsPerQuestion(
        MergeParagraphs(num_tokens_per_group),
        ShallowOpenWebRanker(num_paragraphs),
        prep,
        intern=True)

    answers = {}
    batches = {}
    for q in tqdm(qs, ncols=80, desc='preprocessing'):
        pre = extract.preprocess([q], evidence)
        if len(pre.data) == 0:
            continue
        assert len(pre.data) < 2
        assert q.question_id not in answers
        assert q.question_id not in batches
        mpq = pre.data[0]
        pq_batch = [
            ParagraphAndQuestion(p.get_context(), q.question, None,
                                 q.question_id, p.doc_id)
            for p in mpq.paragraphs
            # document paragraph question?
        ]
        if pad:
            for i in range(num_paragraphs - len(pq_batch)):
                pq_batch.append(
                    ParagraphAndQuestion([], q.question, None, q.question_id,
                                         None))

        answers[q.question_id] = mpq.answer_text
        batches[q.question_id] = pq_batch

    voc = {w for bs in batches.values() for b in bs for w in b.question}
    voc.update({w for bs in batches.values() for b in bs for w in b.context})
    return answers, batches, voc
예제 #2
0
def prepare_data(model, train_config, dataset_oversampling, n_processes):
    extract = ExtractMultiParagraphsPerQuestion(
        MergeParagraphs(train_config.n_tokens),
        ShallowOpenWebRanker(train_config.num_paragraphs),
        model.preprocessor, intern=True
    )
    trivia_qa_test = RandomParagraphSetDatasetBuilder(
        train_config.test_batch_size,
        "merge" if train_config.trivia_qa_mode == "merge" else "group", True,
        train_config.oversample
    )
    trivia_qa_train = StratifyParagraphSetsBuilder(
        train_config.train_batch_size,
        train_config.trivia_qa_mode == "merge",
        True, train_config.oversample
    )

    datas = []
    for name, sampling in dataset_oversampling.items():
        for s in range(sampling):
            ds = TriviaQaSpanCorpus(name)
            ds.corpus_name = ds.corpus_name + '_{}'.format(s)
            datas.append(ds)

    data = MultiDataset(datas)
    data = PreprocessedData(data, extract, trivia_qa_train, trivia_qa_test, eval_on_verified=False)
    data.preprocess(n_processes, 1000)
    return data
def main():
    data = TriviaQaOpenDataset()
    # data = TriviaQaWebDataset()
    print("Loading...")
    all_questions = data.get_dev()

    questions = [
        q for q in all_questions if any(
            len(x.answer_spans) > 0 for x in q.all_docs)
    ]
    print(
        "%d/%d (%.4f) have an answer" % (len(questions), len(all_questions),
                                         len(questions) / len(all_questions)))

    np.random.shuffle(questions)

    pre = ExtractMultiParagraphsPerQuestion(MergeParagraphs(400),
                                            TopTfIdf(NltkPlusStopWords(), 20),
                                            require_an_answer=False)
    print("Done")

    out = preprocess_par(questions[:2000], data.evidence, pre, 2, 1000)

    n_counts = np.zeros(20)
    n_any = np.zeros(20)
    n_any_all = np.zeros(20)

    for q in out.data:
        for i, p in enumerate(q.paragraphs):
            n_counts[i] += 1
            n_any[i] += len(p.answer_spans) > 0

        for i, p in enumerate(q.paragraphs):
            if len(p.answer_spans) > 0:
                n_any_all[i:] += 1
                break

    print(n_any_all / out.true_len)
    print(n_any / n_counts)
    print(n_counts)
예제 #4
0
def get_questions(per_document, dataset, splitter, para_filter, preprocessor,
                  n_processes, batch_size):
    test_questions = dataset.get_dev()
    corpus = dataset.evidence
    print("Building question/paragraph pairs...")

    # Loads the relevant questions/documents, selects the right paragraphs, and runs the model's preprocessor
    if per_document:
        prep = ExtractMultiParagraphs(splitter,
                                      para_filter,
                                      preprocessor,
                                      require_an_answer=False)
    else:
        prep = ExtractMultiParagraphsPerQuestion(splitter,
                                                 para_filter,
                                                 preprocessor,
                                                 require_an_answer=False)
    prepped_data = preprocess_par(test_questions, corpus, prep, n_processes,
                                  1000)

    data = []
    for q in prepped_data.data:
        for i, p in enumerate(q.paragraphs):
            if q.answer_text is None:
                ans = None
            else:
                ans = TokenSpans(q.answer_text, p.answer_spans)
            data.append(
                DocumentParagraphQuestion(q.question_id, p.doc_id,
                                          (p.start, p.end), q.question, p.text,
                                          ans, i))

    # Reverse so our first batch will be the largest (so OOMs happen early)
    questions = sorted(data,
                       key=lambda x: (x.n_context_words, len(x.question)),
                       reverse=True)
    test_questions = ParagraphAndQuestionDataset(
        questions, FixedOrderBatcher(batch_size, True))
    n_questions = len(questions)
    return test_questions, n_questions
예제 #5
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "corpus",
        choices=["en", "fr", "de", "ru", "pt", "zh", "pl", "uk", "ta"])
    parser.add_argument(
        'mode',
        choices=["confidence", "merge", "shared-norm", "sigmoid", "paragraph"])
    # Note I haven't tested modes other than `shared-norm` on this corpus, so
    # some things might need adjusting
    parser.add_argument("-t",
                        "--n_tokens",
                        default=400,
                        type=int,
                        help="Paragraph size")
    args = parser.parse_args()
    mode = args.mode
    corpus = args.corpus

    model = get_model(100, 140, mode, WithIndicators())

    extract = ExtractMultiParagraphsPerQuestion(MergeParagraphs(args.n_tokens),
                                                ShallowOpenWebRanker(16),
                                                model.preprocessor,
                                                intern=True)

    oversample = [
        1
    ] * 2  # Sample the top two answer-containing paragraphs twice

    if mode == "paragraph":
        n_epochs = 120
        test = RandomParagraphSetDatasetBuilder(120, "flatten", True,
                                                oversample)
        train = StratifyParagraphsBuilder(ClusteredBatcher(
            60, ContextLenBucketedKey(3), True),
                                          oversample,
                                          only_answers=True)
    elif mode == "confidence" or mode == "sigmoid":
        if mode == "sigmoid":
            n_epochs = 640
        else:
            n_epochs = 160
        test = RandomParagraphSetDatasetBuilder(120, "flatten", True,
                                                oversample)
        train = StratifyParagraphsBuilder(
            ClusteredBatcher(60, ContextLenBucketedKey(3), True), oversample)
    else:
        n_epochs = 80
        test = RandomParagraphSetDatasetBuilder(
            120, "merge" if mode == "merge" else "group", True, oversample)
        train = StratifyParagraphSetsBuilder(30, mode == "merge", True,
                                             oversample)

    data = XQADataset(corpus)

    data = PreprocessedData(data, extract, train, test, eval_on_verified=False)

    data.preprocess(1, 1000)

    # dump preprocessed dev data for bert
    data.cache_preprocess("dev_data_%s.pkl" % args.corpus)
예제 #6
0
파일: ablate_xqa.py 프로젝트: zengyy8/XQA
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("corpus", choices=["en", "en_trans_de", "en_trans_zh"])
    parser.add_argument(
        'mode',
        choices=["confidence", "merge", "shared-norm", "sigmoid", "paragraph"])
    # Note I haven't tested modes other than `shared-norm` on this corpus, so
    # some things might need adjusting
    parser.add_argument("name", help="Where to store the model")
    parser.add_argument("-t",
                        "--n_tokens",
                        default=400,
                        type=int,
                        help="Paragraph size")
    parser.add_argument(
        '-n',
        '--n_processes',
        type=int,
        default=2,
        help="Number of processes (i.e., select which paragraphs to train on) "
        "the data with")
    args = parser.parse_args()
    mode = args.mode
    corpus = args.corpus

    out = args.name + "-" + datetime.now().strftime("%m%d-%H%M%S")

    model = get_model(100, 140, mode, WithIndicators())

    extract = ExtractMultiParagraphsPerQuestion(MergeParagraphs(args.n_tokens),
                                                ShallowOpenWebRanker(16),
                                                model.preprocessor,
                                                intern=True)

    eval = [
        LossEvaluator(),
        MultiParagraphSpanEvaluator(8,
                                    "triviaqa",
                                    mode != "merge",
                                    per_doc=False)
    ]
    oversample = [
        1
    ] * 2  # Sample the top two answer-containing paragraphs twice

    if mode == "paragraph":
        n_epochs = 120
        test = RandomParagraphSetDatasetBuilder(120, "flatten", True,
                                                oversample)
        train = StratifyParagraphsBuilder(ClusteredBatcher(
            60, ContextLenBucketedKey(3), True),
                                          oversample,
                                          only_answers=True)
    elif mode == "confidence" or mode == "sigmoid":
        if mode == "sigmoid":
            n_epochs = 640
        else:
            n_epochs = 160
        test = RandomParagraphSetDatasetBuilder(120, "flatten", True,
                                                oversample)
        train = StratifyParagraphsBuilder(
            ClusteredBatcher(60, ContextLenBucketedKey(3), True), oversample)
    else:
        n_epochs = 80
        test = RandomParagraphSetDatasetBuilder(
            120, "merge" if mode == "merge" else "group", True, oversample)
        train = StratifyParagraphSetsBuilder(30, mode == "merge", True,
                                             oversample)

    data = XQADataset(corpus)

    params = TrainParams(SerializableOptimizer("Adadelta",
                                               dict(learning_rate=1)),
                         num_epochs=n_epochs,
                         ema=0.999,
                         max_checkpoints_to_keep=2,
                         async_encoding=10,
                         log_period=30,
                         eval_period=1800,
                         save_period=1800,
                         best_weights=("dev", "b8/question-text-f1"),
                         eval_samples=dict(dev=None, train=6000))

    data = PreprocessedData(data, extract, train, test, eval_on_verified=False)

    data.preprocess(args.n_processes, 1000)

    with open(__file__, "r") as f:
        notes = f.read()
    notes = "Mode: " + args.mode + "\n" + notes
    trainer.start_training(data, model, params, eval, model_dir.ModelDir(out),
                           notes)
예제 #7
0
def main():
    parser = argparse.ArgumentParser(description='Evaluate a model on TriviaQA data')
    parser.add_argument('model', help='model directory')
    parser.add_argument('-p', '--paragraph_output', type=str,
                        help="Save fine grained results for each paragraph in csv format")
    parser.add_argument('-o', '--official_output', type=str, help="Build an offical output file with the model's"
                                                                  " most confident span for each (question, doc) pair")
    parser.add_argument('--no_ema', action="store_true", help="Don't use EMA weights even if they exist")
    parser.add_argument('--n_processes', type=int, default=None,
                        help="Number of processes to do the preprocessing (selecting paragraphs+loading context) with")
    parser.add_argument('-i', '--step', type=int, default=None, help="checkpoint to load, default to latest")
    parser.add_argument('-n', '--n_sample', type=int, default=None, help="Number of questions to evaluate on")
    parser.add_argument('-a', '--async', type=int, default=10)
    parser.add_argument('-t', '--tokens', type=int, default=400,
                        help="Max tokens per a paragraph")
    parser.add_argument('-g', '--n_paragraphs', type=int, default=15,
                        help="Number of paragraphs to run the model on")
    parser.add_argument('-f', '--filter', type=str, default=None, choices=["tfidf", "truncate", "linear"],
                        help="How to select paragraphs")
    parser.add_argument('-b', '--batch_size', type=int, default=200,
                        help="Batch size, larger sizes might be faster but wll take more memory")
    parser.add_argument('--max_answer_len', type=int, default=8,
                        help="Max answer span to select")
    parser.add_argument('-c', '--corpus',
                        choices=["web-dev", "web-test", "web-verified-dev", "web-train",
                                 "open-dev", "open-train"],
                        default="web-verified-dev")
    args = parser.parse_args()

    model_dir = ModelDir(args.model)
    model = model_dir.get_model()

    if args.corpus.startswith('web'):
        dataset = TriviaQaWebDataset()
        corpus = dataset.evidence
        if args.corpus == "web-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "web-test":
            test_questions = dataset.get_test()
        elif args.corpus == "web-verified-dev":
            test_questions = dataset.get_verified()
        elif args.corpus == "web-train":
            test_questions = dataset.get_train()
        else:
            raise RuntimeError()
    else:
        dataset = TriviaQaOpenDataset()
        corpus = dataset.evidence
        if args.corpus == "open-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "open-train":
            test_questions = dataset.get_train()
        else:
            raise RuntimeError()

    splitter = MergeParagraphs(args.tokens)

    per_document = not args.corpus.startswith("open")

    filter_name = args.filter
    if filter_name is None:
        if args.corpus.startswith("open"):
            filter_name = "linear"
        else:
            filter_name = "tfidf"

    print("Selecting %d paragraphs using %s method per %s" % (args.n_paragraphs, filter_name,
                                                              ("question-document pair" if per_document else "question")))

    if filter_name == "tfidf":
        para_filter = TopTfIdf(NltkPlusStopWords(punctuation=True), args.n_paragraphs)
    elif filter_name == "truncate":
        para_filter = FirstN(args.n_paragraphs)
    elif filter_name == "linear":
        para_filter = ShallowOpenWebRanker(args.n_paragraphs)
    else:
        raise ValueError()

    n_questions = args.n_sample
    if n_questions is not None:
        test_questions.sort(key=lambda x:x.question_id)
        np.random.RandomState(0).shuffle(test_questions)
        test_questions = test_questions[:n_questions]

    print("Building question/paragraph pairs...")
    # Loads the relevant questions/documents, selects the right paragraphs, and runs the model's preprocessor
    if per_document:
        prep = ExtractMultiParagraphs(splitter, para_filter, model.preprocessor, require_an_answer=False)
    else:
        prep = ExtractMultiParagraphsPerQuestion(splitter, para_filter, model.preprocessor, require_an_answer=False)
    prepped_data = preprocess_par(test_questions, corpus, prep, args.n_processes, 1000)

    data = []
    for q in prepped_data.data:
        for i, p in enumerate(q.paragraphs):
            if q.answer_text is None:
                ans = None
            else:
                ans = TokenSpans(q.answer_text, p.answer_spans)
            data.append(DocumentParagraphQuestion(q.question_id, p.doc_id,
                                                 (p.start, p.end), q.question, p.text,
                                                  ans, i))

    # Reverse so our first batch will be the largest (so OOMs happen early)
    questions = sorted(data, key=lambda x: (x.n_context_words, len(x.question)), reverse=True)

    print("Done, starting eval")

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    test_questions = ParagraphAndQuestionDataset(questions, FixedOrderBatcher(args.batch_size, True))

    evaluation = trainer.test(model,
                             [RecordParagraphSpanPrediction(args.max_answer_len, True)],
                              {args.corpus:test_questions}, ResourceLoader(), checkpoint, not args.no_ema, args.async)[args.corpus]

    if not all(len(x) == len(data) for x in evaluation.per_sample.values()):
        raise RuntimeError()

    df = pd.DataFrame(evaluation.per_sample)

    if args.official_output is not None:
        print("Saving question result")

        # I didn't store the unormalized filenames exactly, so unfortunately we have to reload
        # the source data to get exact filename to output an official test script
        fns = {}
        print("Loading proper filenames")
        if args.corpus == 'web-test':
            source = join(TRIVIA_QA, "qa", "web-test-without-answers.json")
        elif args.corpus == "web-dev":
            source = join(TRIVIA_QA, "qa", "web-dev.json")
        else:
            raise NotImplementedError()

        with open(join(source)) as f:
            data = json.load(f)["Data"]
        for point in data:
            for doc in point["EntityPages"]:
                filename = doc["Filename"]
                fn = join("wikipedia", filename[:filename.rfind(".")])
                fn = normalize_wiki_filename(fn)
                fns[(point["QuestionId"], fn)] = filename

        answers = {}
        scores = {}
        for q_id, doc_id, start, end, txt, score in df[["question_id", "doc_id", "para_start", "para_end",
                                                        "text_answer", "predicted_score"]].itertuples(index=False):
            filename = dataset.evidence.file_id_map[doc_id]
            if filename.startswith("web"):
                true_name = filename[4:] + ".txt"
            else:
                true_name = fns[(q_id, filename)]

            key = q_id + "--" + true_name
            prev_score = scores.get(key)
            if prev_score is None or prev_score < score:
                scores[key] = score
                answers[key] = txt

        with open(args.official_output, "w") as f:
            json.dump(answers, f)

    if per_document:
        group_by = ["question_id", "doc_id"]
    else:
        group_by = ["question_id"]

    # Print a table of scores as more paragraphs are used
    df.sort_values(group_by + ["rank"], inplace=True)
    f1 = compute_model_scores(df, "predicted_score", "text_f1", group_by)
    em = compute_model_scores(df, "predicted_score", "text_em", group_by)
    table = [["N Paragraphs", "EM", "F1"]]
    table += list([str(i+1), "%.4f" % e, "%.4f" % f] for i, (e, f) in enumerate(zip(em, f1)))
    print_table(table)

    output_file = args.paragraph_output
    if output_file is not None:
        print("Saving paragraph result")
        if output_file.endswith("json"):
            with open(output_file, "w") as f:
                json.dump(evaluation.per_sample, f)
        elif output_file.endswith("pkl"):
            with open(output_file, "wb") as f:
                pickle.dump(evaluation.per_sample, f)
        elif output_file.endswith("csv"):

            df.to_csv(output_file, index=False)
        else:
            raise ValueError("Unrecognized file format")
예제 #8
0
파일: cache_test.py 프로젝트: zengyy8/XQA
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--n_processes',
        type=int,
        default=1,
        help=
        "Number of processes to do the preprocessing (selecting paragraphs+loading context) with"
    )
    parser.add_argument('-a', '--async', type=int, default=10)
    parser.add_argument('-t',
                        '--tokens',
                        type=int,
                        default=400,
                        help="Max tokens per a paragraph")
    parser.add_argument('-n',
                        '--n_sample',
                        type=int,
                        default=None,
                        help="Number of questions to evaluate on")
    parser.add_argument('-g',
                        '--n_paragraphs',
                        type=int,
                        default=15,
                        help="Number of paragraphs to run the model on")
    parser.add_argument('-f',
                        '--filter',
                        type=str,
                        default=None,
                        choices=["tfidf", "truncate", "linear"],
                        help="How to select paragraphs")
    parser.add_argument(
        '-c',
        '--corpus',
        choices=[
            "en_dev", "en_test", "fr_dev", "fr_test", "de_dev", "de_test",
            "ru_dev", "ru_test", "pt_dev", "pt_test", "zh_dev", "zh_test",
            "pl_dev", "pl_test", "uk_dev", "uk_test", "ta_dev", "ta_test",
            "fr_trans_en_dev", "fr_trans_en_test", "de_trans_en_dev",
            "de_trans_en_test", "ru_trans_en_dev", "ru_trans_en_test",
            "pt_trans_en_dev", "pt_trans_en_test", "zh_trans_en_dev",
            "zh_trans_en_test", "pl_trans_en_dev", "pl_trans_en_test",
            "uk_trans_en_dev", "uk_trans_en_test", "ta_trans_en_dev",
            "ta_trans_en_test"
        ],
        required=True)
    args = parser.parse_args()

    corpus_name = args.corpus[:args.corpus.rfind("_")]
    eval_set = args.corpus[args.corpus.rfind("_") + 1:]
    dataset = XQADataset(corpus_name)
    if eval_set == "dev":
        test_questions = dataset.get_dev()
    elif eval_set == "test":
        test_questions = dataset.get_test()
    else:
        raise AssertionError()

    corpus = dataset.evidence
    splitter = MergeParagraphs(args.tokens)

    per_document = args.corpus.startswith(
        "web")  # wiki and web are both multi-document

    filter_name = args.filter
    if filter_name is None:
        # Pick default depending on the kind of data we are using
        if per_document:
            filter_name = "tfidf"
        else:
            filter_name = "linear"

    print("Selecting %d paragraphs using method \"%s\" per %s" %
          (args.n_paragraphs, filter_name,
           ("question-document pair" if per_document else "question")))

    if filter_name == "tfidf":
        para_filter = TopTfIdf(NltkPlusStopWords(punctuation=True),
                               args.n_paragraphs)
    elif filter_name == "truncate":
        para_filter = FirstN(args.n_paragraphs)
    elif filter_name == "linear":
        para_filter = ShallowOpenWebRanker(args.n_paragraphs)
    else:
        raise ValueError()

    n_questions = args.n_sample
    if n_questions is not None:
        test_questions.sort(key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(test_questions)
        test_questions = test_questions[:n_questions]

    preprocessor = WithIndicators()
    print("Building question/paragraph pairs...")
    # Loads the relevant questions/documents, selects the right paragraphs, and runs the model's preprocessor
    if per_document:
        prep = ExtractMultiParagraphs(splitter,
                                      para_filter,
                                      preprocessor,
                                      require_an_answer=False)
    else:
        prep = ExtractMultiParagraphsPerQuestion(splitter,
                                                 para_filter,
                                                 preprocessor,
                                                 require_an_answer=False)
    prepped_data = preprocess_par(test_questions, corpus, prep,
                                  args.n_processes, 1000)

    data = []
    for q in prepped_data.data:
        for i, p in enumerate(q.paragraphs):
            if q.answer_text is None:
                ans = None
            else:
                ans = TokenSpans(q.answer_text, p.answer_spans)
            data.append(
                DocumentParagraphQuestion(q.question_id, p.doc_id,
                                          (p.start, p.end), q.question, p.text,
                                          ans, i))

    # Reverse so our first batch will be the largest (so OOMs happen early)
    questions = sorted(data,
                       key=lambda x: (x.n_context_words, len(x.question)),
                       reverse=True)

    # dump eval data for bert
    import pickle
    pickle.dump(questions,
                open("%s_%d.pkl" % (args.corpus, args.n_paragraphs), "wb"))
def main():
    parser = argparse.ArgumentParser(
        description='Evaluate a model on TriviaQA data')
    parser.add_argument('model', help='model directory')
    parser.add_argument(
        '-p',
        '--paragraph_output',
        type=str,
        help="Save fine grained results for each paragraph in csv format")
    parser.add_argument('-o',
                        '--official_output',
                        type=str,
                        help="Build an offical output file with the model's"
                        " most confident span for each (question, doc) pair")
    parser.add_argument('--no_ema',
                        action="store_true",
                        help="Don't use EMA weights even if they exist")
    parser.add_argument(
        '--n_processes',
        type=int,
        default=None,
        help=
        "Number of processes to do the preprocessing (selecting paragraphs+loading context) with"
    )
    parser.add_argument('-i',
                        '--step',
                        type=int,
                        default=None,
                        help="checkpoint to load, default to latest")
    parser.add_argument('-n',
                        '--n_sample',
                        type=int,
                        default=None,
                        help="Number of questions to evaluate on")
    parser.add_argument('-a', '--async', type=int, default=10)
    parser.add_argument('-t',
                        '--tokens',
                        type=int,
                        default=400,
                        help="Max tokens per a paragraph")
    parser.add_argument('-g',
                        '--n_paragraphs',
                        type=int,
                        default=15,
                        help="Number of paragraphs to run the model on")
    parser.add_argument('-f',
                        '--filter',
                        type=str,
                        default=None,
                        choices=["tfidf", "truncate", "linear"],
                        help="How to select paragraphs")
    parser.add_argument(
        '-b',
        '--batch_size',
        type=int,
        default=200,
        help="Batch size, larger sizes might be faster but wll take more memory"
    )
    parser.add_argument('--max_answer_len',
                        type=int,
                        default=8,
                        help="Max answer span to select")
    parser.add_argument('-c',
                        '--corpus',
                        choices=[
                            "web-dev", "web-test", "web-verified-dev",
                            "web-train", "open-dev", "open-train", "wiki-dev",
                            "wiki-test"
                        ],
                        default="web-verified-dev")
    parser.add_argument("-s",
                        "--source_dir",
                        type=str,
                        default=None,
                        help="where to take input files")
    parser.add_argument("--n_span_per_q",
                        type=int,
                        default=1,
                        help="where to take input files")
    args = parser.parse_args()

    dataset_name = args.source_dir.split('/')[-1]
    model_name = args.model.split('/')[-1]
    ElasticLogger().write_log('INFO',
                              'Start Evaluation',
                              context_dict={
                                  'model': model_name,
                                  'dataset': dataset_name
                              })

    model_dir = ModelDir(args.model)
    model = model_dir.get_model()

    if args.corpus.startswith('web'):
        dataset = TriviaQaWebDataset()
        if args.corpus == "web-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "web-test":
            test_questions = dataset.get_test()
        elif args.corpus == "web-verified-dev":
            test_questions = dataset.get_verified()
        elif args.corpus == "web-train":
            test_questions = dataset.get_train()
        else:
            raise AssertionError()
    elif args.corpus.startswith("wiki"):
        dataset = TriviaQaWikiDataset()
        if args.corpus == "wiki-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "wiki-test":
            test_questions = dataset.get_test()
        else:
            raise AssertionError()
    else:
        dataset = TriviaQaOpenDataset(args.source_dir)
        if args.corpus == "open-dev":
            # just loading the pkl that was saved in build_span_corpus
            test_questions = dataset.get_dev()
        elif args.corpus == "open-train":
            test_questions = dataset.get_train()
        else:
            raise AssertionError()

    ### ALON debuging
    #test_questions = test_questions[0:5]

    corpus = dataset.evidence
    splitter = MergeParagraphs(args.tokens)

    per_document = args.corpus.startswith(
        "web")  # wiki and web are both multi-document
    #per_document = True

    filter_name = args.filter
    if filter_name is None:
        # Pick default depending on the kind of data we are using
        if per_document:
            filter_name = "tfidf"
        else:
            filter_name = "linear"

    print("Selecting %d paragraphs using method \"%s\" per %s" %
          (args.n_paragraphs, filter_name,
           ("question-document pair" if per_document else "question")))

    if filter_name == "tfidf":
        para_filter = TopTfIdf(NltkPlusStopWords(punctuation=True),
                               args.n_paragraphs)
    elif filter_name == "truncate":
        para_filter = FirstN(args.n_paragraphs)
    elif filter_name == "linear":
        para_filter = ShallowOpenWebRanker(args.n_paragraphs)
    else:
        raise ValueError()

    n_questions = args.n_sample
    docqa.config.SPANS_PER_QUESTION = args.n_span_per_q
    #n_questions = 1
    if n_questions is not None:
        test_questions.sort(key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(test_questions)
        test_questions = test_questions[:n_questions]

    print("Building question/paragraph pairs...")
    # Loads the relevant questions/documents, selects the right paragraphs, and runs the model's preprocessor
    if per_document:
        prep = ExtractMultiParagraphs(splitter,
                                      para_filter,
                                      model.preprocessor,
                                      require_an_answer=False)
    else:
        prep = ExtractMultiParagraphsPerQuestion(splitter,
                                                 para_filter,
                                                 model.preprocessor,
                                                 require_an_answer=False)
    prepped_data = preprocess_par(test_questions, corpus, prep,
                                  args.n_processes, 1000)

    data = []
    for q in prepped_data.data:
        for i, p in enumerate(q.paragraphs):
            if q.answer_text is None:
                ans = None
            else:
                ans = TokenSpans(q.answer_text, p.answer_spans)
            data.append(
                DocumentParagraphQuestion(q.question_id, p.doc_id,
                                          (p.start, p.end), q.question, p.text,
                                          ans, i))

    # Reverse so our first batch will be the largest (so OOMs happen early)
    questions = sorted(data,
                       key=lambda x: (x.n_context_words, len(x.question)),
                       reverse=True)

    print("Done, starting eval")

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    test_questions = ParagraphAndQuestionDataset(
        questions, FixedOrderBatcher(args.batch_size, True))

    evaluation = trainer.test(
        model, [RecordParagraphSpanPrediction(args.max_answer_len, True)],
        {args.corpus: test_questions}, ResourceLoader(), checkpoint,
        not args.no_ema, args. async)[args.corpus]

    if not all(len(x) == len(data) for x in evaluation.per_sample.values()):
        raise RuntimeError()

    df = pd.DataFrame(evaluation.per_sample)

    if args.official_output is not None:
        print("Saving question result")

        fns = {}
        if per_document:
            # I didn't store the unormalized filenames exactly, so unfortunately we have to reload
            # the source data to get exact filename to output an official test script
            print("Loading proper filenames")
            if args.corpus == 'web-test':
                source = join(TRIVIA_QA, "qa", "web-test-without-answers.json")
            elif args.corpus == "web-dev":
                source = join(TRIVIA_QA, "qa", "web-dev.json")
            else:
                raise AssertionError()

            with open(join(source)) as f:
                data = json.load(f)["Data"]
            for point in data:
                for doc in point["EntityPages"]:
                    filename = doc["Filename"]
                    fn = join("wikipedia", filename[:filename.rfind(".")])
                    fn = normalize_wiki_filename(fn)
                    fns[(point["QuestionId"], fn)] = filename

        answers = {}
        scores = {}
        for q_id, doc_id, start, end, txt, score in df[[
                "question_id", "doc_id", "para_start", "para_end",
                "text_answer", "predicted_score"
        ]].itertuples(index=False):
            filename = dataset.evidence.file_id_map[doc_id]
            if per_document:
                if filename.startswith("web"):
                    true_name = filename[4:] + ".txt"
                else:
                    true_name = fns[(q_id, filename)]
                # Alon Patch for triviaqa test results
                true_name = true_name.replace('TriviaQA_Org/', '')
                key = q_id + "--" + true_name
            else:
                key = q_id

            prev_score = scores.get(key)
            if prev_score is None or prev_score < score:
                scores[key] = score
                answers[key] = txt

        with open(args.official_output, "w") as f:
            json.dump(answers, f)

    output_file = args.paragraph_output
    if output_file is not None:
        print("Saving paragraph result")
        df.to_csv(output_file, index=False)

    print("Computing scores")

    if per_document:
        group_by = ["question_id", "doc_id"]
    else:
        group_by = ["question_id"]

    # Print a table of scores as more paragraphs are used
    df.sort_values(group_by + ["rank"], inplace=True)
    df_scores = df.copy(deep=True)
    df_scores['predicted_score'] = df_scores['predicted_score'].apply(
        lambda x: pd.Series(x).max())

    em = compute_ranked_scores(df_scores, "predicted_score", "text_em",
                               group_by)
    f1 = compute_ranked_scores(df_scores, "predicted_score", "text_f1",
                               group_by)
    table = [["N Paragraphs", "EM", "F1"]]
    table += list([str(i + 1), "%.4f" % e, "%.4f" % f]
                  for i, (e, f) in enumerate(zip(em, f1)))

    table_df = pd.DataFrame(table[1:], columns=table[0]).drop(['N Paragraphs'],
                                                              axis=1)
    ElasticLogger().write_log('INFO', 'Results', context_dict={'model': model_name, 'dataset': dataset_name, \
                                                            'max_EM':table_df.max().ix['EM'], \
                                                            'max_F1':table_df.max().ix['F1'], \
                                                            'result_table': str(table_df)})

    df_flat = []
    for id, question in df.iterrows():
        for text_answer, predicted_span, predicted_score in zip(
                question['text_answer'], question['predicted_span'],
                question['predicted_score']):
            new_question = dict(question.copy())
            new_question.update({
                'text_answer': text_answer,
                'predicted_span': predicted_span,
                'predicted_score': predicted_score
            })
            df_flat.append(new_question)

    results_df = pd.DataFrame(df_flat)
    #Alon: outputing the estimates for all the
    #results_df = results_df.groupby(['question_id', 'text_answer']).apply(lambda df: df.ix[df['predicted_score'].argmax()]).reset_index(drop=True)
    results_df.sort_values(by=['question_id', 'predicted_score'],
                           ascending=False).set_index([
                               'question_id', 'text_answer'
                           ])[['question', 'predicted_score',
                               'text_em']].to_csv('results.csv')

    print_table(table)
예제 #10
0
def main():
    parser = argparse.ArgumentParser(
        description='Train a model on TriviaQA unfiltered')
    parser.add_argument(
        'mode',
        choices=["confidence", "merge", "shared-norm", "sigmoid", "paragraph"])
    parser.add_argument("name", help="Where to store the model")
    parser.add_argument("-t",
                        "--n_tokens",
                        default=400,
                        type=int,
                        help="Paragraph size")
    parser.add_argument(
        '-n',
        '--n_processes',
        type=int,
        default=2,
        help="Number of processes (i.e., select which paragraphs to train on) "
        "the data with")
    parser.add_argument("-s",
                        "--source_dir",
                        type=str,
                        default=None,
                        help="where to take input files")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=None,
                        help="Max number of epoches to train on ")
    parser.add_argument("--char_th",
                        type=int,
                        default=None,
                        help="char level embeddings")
    parser.add_argument("--hl_dim",
                        type=int,
                        default=None,
                        help="hidden layer dim size")
    parser.add_argument("--regularization",
                        type=int,
                        default=None,
                        help="hidden layer dim size")
    parser.add_argument("--LR",
                        type=float,
                        default=1.0,
                        help="hidden layer dim size")
    parser.add_argument("--save_every",
                        type=int,
                        default=1800,
                        help="save period")

    parser.add_argument("--init_from",
                        type=str,
                        default=None,
                        help="model to init from")
    args = parser.parse_args()
    mode = args.mode

    #out = args.name + "-" + datetime.now().strftime("%m%d-%H%M%S")
    out = join('models', args.name)

    char_th = 100
    hl_dim = 140
    if args.char_th is not None:
        print(args.char_th)
        char_th = int(args.char_th)
        out += '--th' + str(char_th)
    if args.hl_dim is not None:
        print(args.hl_dim)
        hl_dim = int(args.hl_dim)
        out += '--hl' + str(hl_dim)

    if args.init_from is None:
        model = get_model(char_th, hl_dim, mode, WithIndicators())
    else:
        md = model_dir.ModelDir(args.init_from)
        model = md.get_model()

    extract = ExtractMultiParagraphsPerQuestion(MergeParagraphs(args.n_tokens),
                                                ShallowOpenWebRanker(16),
                                                model.preprocessor,
                                                intern=True)

    eval = [
        LossEvaluator(),
        MultiParagraphSpanEvaluator(8,
                                    "triviaqa",
                                    mode != "merge",
                                    per_doc=False)
    ]
    oversample = [1] * 4

    if mode == "paragraph":
        n_epochs = 120
        test = RandomParagraphSetDatasetBuilder(120, "flatten", True,
                                                oversample)
        train = StratifyParagraphsBuilder(ClusteredBatcher(
            60, ContextLenBucketedKey(3), True),
                                          oversample,
                                          only_answers=True)
    elif mode == "confidence" or mode == "sigmoid":
        if mode == "sigmoid":
            n_epochs = 640
        else:
            n_epochs = 160
        test = RandomParagraphSetDatasetBuilder(120, "flatten", True,
                                                oversample)
        train = StratifyParagraphsBuilder(
            ClusteredBatcher(60, ContextLenBucketedKey(3), True), oversample)
    else:
        n_epochs = 80
        test = RandomParagraphSetDatasetBuilder(
            120, "merge" if mode == "merge" else "group", True, oversample)
        train = StratifyParagraphSetsBuilder(30, mode == "merge", True,
                                             oversample)

    if args.n_epochs is not None:
        n_epochs = args.n_epochs
        out += '--' + str(n_epochs)

    if args.LR != 1.0:
        out += '--' + str(args.LR)

    data = TriviaQaOpenDataset(args.source_dir)

    async_encoding = 10
    #async_encoding = 0
    params = TrainParams(SerializableOptimizer("Adadelta",
                                               dict(learning_rate=args.LR)),
                         num_epochs=n_epochs,
                         num_of_steps=250000,
                         ema=0.999,
                         max_checkpoints_to_keep=2,
                         async_encoding=async_encoding,
                         log_period=30,
                         eval_period=1800,
                         save_period=args.save_every,
                         eval_samples=dict(dev=None, train=6000),
                         regularization_weight=None)

    data = PreprocessedData(data, extract, train, test, eval_on_verified=False)

    data.preprocess(args.n_processes, 1000)

    with open(__file__, "r") as f:
        notes = f.read()
    notes = "Mode: " + args.mode + "\n" + notes

    if args.init_from is not None:
        init_from = model_dir.ModelDir(args.init_from).get_best_weights()
        if init_from is None:
            init_from = model_dir.ModelDir(
                args.init_from).get_latest_checkpoint()
    else:
        init_from = None

    trainer.start_training(data,
                           model,
                           params,
                           eval,
                           model_dir.ModelDir(out),
                           notes,
                           initialize_from=init_from)
예제 #11
0
    parser.add_argument('--input_file',
                        required=True,
                        help="input file, e.g. train_data.pkl")
    parser.add_argument('--output_train_file',
                        required=True,
                        help="output train file, e.g. train_output.json")
    parser.add_argument('--num_epoch',
                        required=True,
                        type=int,
                        help="num_epoch, e.g. 10")
    args = parser.parse_args()

    mode = "shared-norm"
    model = get_model(100, 140, mode, WithIndicators())
    extract = ExtractMultiParagraphsPerQuestion(MergeParagraphs(400),
                                                ShallowOpenWebRanker(16),
                                                model.preprocessor,
                                                intern=True)

    oversample = [1] * 2
    train = StratifyParagraphSetsBuilder(30, mode == "merge", True, oversample)
    test = RandomParagraphSetDatasetBuilder(
        120, "merge" if mode == "merge" else "group", True, oversample)

    data = PreprocessedData(None, extract, train, test, eval_on_verified=False)
    data.load_preprocess(args.input_file)

    outputs = []
    training_data = data.get_train()
    for i in range(args.num_epoch):
        for batch in training_data.get_epoch():
            last_qid = None