Example #1
def main():
    parser = argparse.ArgumentParser("Preprocess SQuAD data")
    parser.add_argument("--train_file", default=config.SQUAD_TRAIN)
    parser.add_argument("--dev_file", default=config.SQUAD_DEV)
    parser.add_argument("--weighted-questions", action='store_true')

    if not exists(config.CORPUS_DIR):
        mkdir(config.CORPUS_DIR)

    target_dir = join(config.CORPUS_DIR, SquadCorpus.NAME)
    if exists(target_dir) and len(listdir(target_dir)) > 0:
        raise ValueError("Files already exist in " + target_dir)

    args = parser.parse_known_args()[0]
    tokenizer = NltkAndPunctTokenizer()

    print("Parsing train...")
    train = list(
        parse_squad_data(args.train_file,
                         "train",
                         tokenizer,
                         weighted_samples=args.weighted_questions))

    print("Parsing dev...")
    dev = list(parse_squad_data(args.dev_file, "dev", tokenizer))

    print("Saving...")
    SquadCorpus.make_corpus(train, dev)
    print("Done")
Example #2
def main():
    parser = argparse.ArgumentParser("Preprocess SQuAD data")
    #basedir = join(expanduser("~"), "data", "squad")
    basedir = join(expanduser("~"), "azayats", "data", "squad")
    parser.add_argument("--train_file",
                        default=join(basedir, "train-v1.1.json"))
    parser.add_argument("--dev_file", default=join(basedir, "dev-v1.1.json"))

    if not exists(config.CORPUS_DIR):
        mkdir(config.CORPUS_DIR)

    target_dir = join(config.CORPUS_DIR, SquadCorpus.NAME)
    if exists(target_dir) and len(listdir(target_dir)) > 0:
        raise ValueError("Files already exist in " + target_dir)

    args = parser.parse_args()
    tokenizer = NltkAndPunctTokenizer()

    print("Parsing train...")
    train = list(parse_squad_data(args.train_file, "train", tokenizer))

    print("Parsing dev...")
    dev = list(parse_squad_data(args.dev_file, "dev", tokenizer))

    print("Saving...")
    SquadCorpus.make_corpus(train, dev)
    print("Done")
Example #3
def main():
    data = SquadCorpus()

    string_f1 = 0
    mapped_string_f1 = 0

    docs = data.get_train()
    n_questions = 0

    for doc in tqdm(docs):
        for para in doc.paragraphs:
            words = flatten_iterable(para.text)
            for question in para.questions:
                n_questions += 1
                span_answer = question.answer[0]
                span_str = " ".join(
                    words[span_answer.
                          para_word_start:span_answer.para_word_end + 1])
                raw_answer = span_answer.text
                mapped_str = para.get_original_text(
                    span_answer.para_word_start, span_answer.para_word_end)

                string_f1 += f1_score(raw_answer, span_str)
                mapped_string_f1 += f1_score(raw_answer, mapped_str)

    print(string_f1 / n_questions)
    print(mapped_string_f1 / n_questions)
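# A minimal sketch of the token-level F1 compared above. docqa's f1_score is
# assumed to follow the official SQuAD metric; this version omits the official
# answer-string normalization:
from collections import Counter

def token_f1(prediction: str, ground_truth: str) -> float:
    pred_tokens = prediction.split()
    gold_tokens = ground_truth.split()
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)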
Example #4
def main():
    #Namespace(directory= 'C:/Users/boidiyv/document-qa-master',dump=False, fake=False, verbose=False)
    parser = argparse.ArgumentParser("Preprocess SQuAD data")
    #parser = argparse.ArgumentParser()

    parser.add_argument('--document-qa/docqa/squad', type=Path)
    parser.add_argument("--train_file", default=config.SQUAD_TRAIN)
    parser.add_argument("--dev_file", default=config.SQUAD_DEV)
    #parser.add_argument("--document-qa-master",type=lambda p: Path(p).absolute(),default=Path(__file__).absolute().parent / "document-qa-master",help="Path to the data directory" )
    
    
    if not exists(config.CORPUS_DIR):
        mkdir(config.CORPUS_DIR)

    target_dir = join(config.CORPUS_DIR, SquadCorpus.NAME)
    if exists(target_dir) and len(listdir(target_dir)) > 0:
        raise ValueError("Files already exist in " + target_dir)

    # Parse an empty argument list so the defaults are used (handy in notebooks)
    args = parser.parse_args([])
    tokenizer = NltkAndPunctTokenizer()

    print("Parsing train...")
    train = list(parse_squad_data(args.train_file, "train", tokenizer))
    print(train)

    print("Parsing dev...")
    dev = list(parse_squad_data(args.dev_file, "dev", tokenizer))

    print("Saving...")
    SquadCorpus.make_corpus(train, dev)
    print("Done")
Example #5
def show_unk(corpus: SquadCorpus,
             vec_name: str,
             context: bool = True,
             question: bool = True):
    vecs = corpus.get_pruned_word_vecs(vec_name)
    docs = corpus.get_train()

    lower_unk = Counter()
    unk = Counter()

    for doc in docs:
        for para in doc.paragraphs:
            if context:
                for sent in para.text:
                    for word in sent:
                        if word not in vecs:
                            unk[word] += 1
                        word = word.lower()
                        if word not in vecs:
                            lower_unk[word] += 1
            if question:
                for q in para.questions:  # avoid shadowing the `question` flag
                    for word in q.words:
                        if word not in vecs:
                            unk[word] += 1
                        word = word.lower()
                        if word not in vecs:
                            lower_unk[word] += 1

    print("\n".join("%s: %d" % (k, v) for k, v in lower_unk.most_common()))
Example #6
def show_squad_errors(answers):
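    # Walks dev-set answers in retrieval-rank order and flags questions where a
    # higher-confidence paragraph produced a lower-F1 answer than an earlier one.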
    print("Loading answers..")
    answer_df = pd.read_csv(answers)

    print("Loading questions..")
    corpus = SquadCorpus()
    questions = {}
    docs = {}
    for doc in corpus.get_dev():
        for para in doc.paragraphs:
            for q in para.questions:
                questions[q.question_id] = q
                docs[q.question_id] = doc

    answer_df.sort_values(["question_id", "rank"], inplace=True)
    grouped = list(answer_df.groupby(["question_id"]))
    np.random.shuffle(grouped)

    for question_id, group in grouped:
        q = questions[question_id]
        doc = docs[question_id]
        cur_best_score = group.text_f1.iloc[0]
        cur_best_conf = group.predicted_score.iloc[0]
        cur_best_ix = group.index[0]
        for i in range(1, len(group)):
            ix = group.index[i]
            conf = group.predicted_score[ix]
            if conf > cur_best_conf:
                score = group.text_f1[ix]
                if score < cur_best_score:
                    # We hurt ourselves!
                    print("Oh no!")
                    print(" ".join(q.words))
                    print(q.answer.answer_text)
                    print("Best score was %.4f (conf=%.4f), but now is %.4f (conf=%.4f)" % (
                        cur_best_score, cur_best_conf, score, conf
                    ))
                    cur_para = doc.paragraphs[group.para_number[cur_best_ix]]
                    new_para = doc.paragraphs[group.para_number[ix]]

                    p1_s, p1_e = group.predicted_start[cur_best_ix], group.predicted_end[cur_best_ix]
                    p2_s, p2_e = group.predicted_start[ix], group.predicted_end[ix]

                    print(" ".join(display_para(cur_para.get_context(), None, q.words, p1_s, p1_e)))
                    print()
                    print(" ".join(display_para(new_para.get_context(), None, q.words, p2_s, p2_e)))
                    input()
                else:
                    cur_best_score = score
                    cur_best_ix = ix
                    cur_best_conf = conf
Example #7
def show_in_context_unks(corpus: SquadCorpus, vec_name):
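    # Prints a ~20-token window around each word missing from the pruned word
    # vectors, marking the unknown word with {{{...}}}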
    data = corpus.get_train()
    np.random.shuffle(data)
    vecs = corpus.get_pruned_word_vecs(vec_name)

    for doc in data:
        paragraphs = list(doc.paragraphs)
        np.random.shuffle(paragraphs)
        for para in paragraphs:
            sentences = list(para.text) + [x.words for x in para.questions]
            np.random.shuffle(sentences)
            for words in sentences:
                for i, word in enumerate(words):
                    if word.lower() not in vecs:
                        words[i] = "{{{" + word + "}}}"
                        print(" ".join(words[max(0, i -
                                                 10):min(len(words), i + 10)]))
                        words[i] = word
Example #8
def main():
    corpus = SquadCorpus()
    prepro = SquadTfIdfRanker(NltkPlusStopWords(True), OPTS.num_per_orig, True)
    orig_data = corpus.get_train() if OPTS.split == 'train' else corpus.get_dev()
    orig_lens = [
        len(p.text[0]) for doc in orig_data for p in doc.paragraphs
        for q in p.questions
    ]
    new_data = preprocess_par(orig_data,
                              corpus.evidence,
                              prepro,
                              n_processes=1)
    new_lens = [len(p.text) for q in new_data for p in q.paragraphs]
    print('%d original, mean %.2f words' %
          (len(orig_lens), np.mean(orig_lens)))
    print('%d new, mean %.2f words' % (len(new_lens), np.mean(new_lens)))
    if OPTS.out_file:
        write_output(OPTS.split, new_data, OPTS.out_file)
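# A rough sketch of the TF-IDF ranking idea behind SquadTfIdfRanker; the real
# implementation lives in docqa, so the names and details below are illustrative:
from sklearn.feature_extraction.text import TfidfVectorizer

def rank_paragraphs(question: str, paragraphs: list) -> list:
    """Return paragraph indices sorted from most to least similar to the question."""
    tfidf = TfidfVectorizer(strip_accents="unicode", stop_words="english")
    para_features = tfidf.fit_transform(paragraphs)
    q_features = tfidf.transform([question])
    scores = (para_features @ q_features.T).toarray().ravel()
    return sorted(range(len(paragraphs)), key=lambda i: -scores[i])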
Example #9
def main():
    corpus = SquadCorpus()
    if OPTS.normalize_before_ranking:
        normalizer = WordNormalizer()
    else:
        normalizer = None
    if OPTS.use_vec_dist:
        word_vecs = corpus.get_pruned_word_vecs('glove.840B.300d')
        prepro = SquadVectorTfIdfRanker(NltkPlusStopWords(True), OPTS.num_per_orig,
                                        True, word_vecs, word_normalizer=normalizer)
    else:
        prepro = SquadTfIdfRanker(NltkPlusStopWords(True), OPTS.num_per_orig,
                                  True, word_normalizer=normalizer)
    orig_data = corpus.get_train() if OPTS.split == 'train' else corpus.get_dev()
    orig_lens = [len(p.text[0]) for doc in orig_data for p in doc.paragraphs
                 for q in p.questions]
    new_data = preprocess_par(orig_data, corpus.evidence, prepro, n_processes=1)
    new_lens = [len(p.text) for q in new_data for p in q.paragraphs]
    print('%d original, mean %.2f words' % (len(orig_lens), np.mean(orig_lens)))
    print('%d new, mean %.2f words' % (len(new_lens), np.mean(new_lens)))
    if OPTS.out_file:
        write_output(OPTS.split, new_data, OPTS.out_file)
Example #10
def show_features(corpus: SquadCorpus, vec_name):
    print("Loading train docs")
    data = corpus.get_train()
    np.random.shuffle(data)
    data = data[:100]

    print("Loading vectors")
    vecs = corpus.get_pruned_word_vecs(vec_name)
    fe = BasicWordFeatures()

    grouped_by_features = defaultdict(Counter)

    print("start")

    for doc in data:
        paragraphs = list(doc.paragraphs)
        np.random.shuffle(paragraphs)
        for para in paragraphs:
            sentences = list(para.text) + [x.words for x in para.questions]
            np.random.shuffle(sentences)
            for words in sentences:
                for word in words:
                    if word.lower() not in vecs:
                        x = fe.get_word_features(word)
                        for feature_ix, val in enumerate(x):
                            if val > 0:
                                grouped_by_features[feature_ix][word] += 1

    for i in sorted(grouped_by_features.keys()):
        name = BasicWordFeatures.features_names[i]
        if name in ["Len"]:
            continue
        vals = grouped_by_features[i]
        print()
        print("*" * 30)
        print("%s-%d %d (%d)" % (name, i, len(vals), sum(vals.values())))
        for k, v in vals.most_common(30):
            print("%s: %d" % (k, v))
Example #11
def show_nums(corpus: SquadCorpus):
    n_regex = re.compile(".*[0-9].*")
    data = corpus.get_train()
    np.random.shuffle(data)

    for doc in data:
        paragraphs = list(doc.paragraphs)
        np.random.shuffle(paragraphs)
        for para in paragraphs:
            sentences = list(para.context) + [x.words for x in para.questions]
            np.random.shuffle(sentences)
            for words in sentences:
                for word in words:
                    if n_regex.match(word) is not None:
                        print(word)
Example #12
def check_preprocess_squad():
    data = SquadCorpus().get_train()
    remove_cross = WithIndicators(True)
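    # Sanity checks: the inverse span mapping must have one entry per token, its
    # start offsets must be non-decreasing, and the answer spans must survive encoding.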

    for doc in tqdm(data):
        for para in doc.paragraphs:
            q = para.questions[np.random.randint(0, len(para.questions))]

            text, ans, inv = remove_cross.encode_paragraph(
                q.words, para.text, para.paragraph_num == 0,
                q.answer.answer_spans, para.spans)
            if len(inv) != len(text):
                raise ValueError()
            for i in range(len(inv) - 1):
                if inv[i, 0] > inv[i + 1, 0]:
                    raise ValueError()
            for (s1, e1), (s2, e2) in zip(ans, q.answer.answer_spans):
                if tuple(inv[s1]) != tuple(para.spans[s2]):
                    raise ValueError()
                if tuple(inv[e1]) != tuple(para.spans[e2]):
                    raise ValueError()
Example #13
def build_corpus_subset(output):
    docs = SquadCorpus().get_dev()
    titles = [clean_title(doc.title) for doc in docs]
    for i, t in enumerate(titles):
        if t == "Sky (United Kingdom)":
            titles[i] = "Sky UK"

    with sqlite3.connect(DOCUMENT_READER_DB) as conn:
        c = conn.cursor()

        c.execute("CREATE TEMPORARY TABLE squad_docs(id)")
        c.executemany("INSERT INTO squad_docs VALUES (?)",
                      [(x, ) for x in titles])

        c.execute("ATTACH DATABASE ? AS db2", (output, ))
        c.execute("CREATE TABLE db2.documents (id PRIMARY KEY, text);")

        c.execute(
            "INSERT INTO db2.documents SELECT * FROM documents WHERE id in squad_docs"
        )
        c.close()
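# Hypothetical usage, assuming DOCUMENT_READER_DB names an existing SQLite file
# with a documents(id, text) table:
#   build_corpus_subset("squad_dev_docs.db")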
Example #14
def main():
    parser = argparse.ArgumentParser(
        description='Train a model on document-level SQuAD')
    parser.add_argument(
        'mode',
        choices=["paragraph", "confidence", "shared-norm", "merge", "sigmoid"])
    parser.add_argument("name", help="Output directory")
    args = parser.parse_args()
    mode = args.mode
    out = args.name + "-" + datetime.now().strftime("%m%d-%H%M%S")

    corpus = SquadCorpus()
    if mode == "merge":
        # Adds paragraph start tokens, since we will be concatenating paragraphs together
        pre = WithIndicators(True, para_tokens=False, doc_start_token=False)
    else:
        pre = None

    model = get_model(50, 100, args.mode, pre)

    if mode == "paragraph":
        # Run in the "standard" known-paragraph setting
        if model.preprocessor is not None:
            raise NotImplementedError()
        n_epochs = 26
        train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True,
                                          False)
        eval_batching = ClusteredBatcher(45, ContextLenKey(), False, False)
        data = DocumentQaTrainingData(corpus, None, train_batching,
                                      eval_batching)
        eval = [LossEvaluator(), SpanEvaluator(bound=[17], text_eval="squad")]
    else:
        eval_set_mode = {
            "confidence": "flatten",
            "sigmoid": "flatten",
            "shared-norm": "group",
            "merge": "merge"
        }[mode]
        eval_dataset = RandomParagraphSetDatasetBuilder(
            100, eval_set_mode, True, 0)

        if mode == "confidence" or mode == "sigmoid":
            if mode == "sigmoid":
                # needs to be trained for a really long time, for reasons unknown; even this might be too few
                n_epochs = 100
            else:
                n_epochs = 50  # more epochs since we only "see" the label every other epoch or so
            train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3),
                                              True, False)
            data = PreprocessedData(
                SquadCorpus(),
                SquadTfIdfRanker(NltkPlusStopWords(True), 4, True,
                                 model.preprocessor),
                StratifyParagraphsBuilder(train_batching, 1),
                eval_dataset,
                eval_on_verified=False,
            )
        else:
            n_epochs = 26
            data = PreprocessedData(
                SquadCorpus(),
                SquadTfIdfRanker(NltkPlusStopWords(True), 4, True,
                                 model.preprocessor),
                StratifyParagraphSetsBuilder(25, args.mode == "merge", True,
                                             1),
                eval_dataset,
                eval_on_verified=False,
            )

        eval = [LossEvaluator(), MultiParagraphSpanEvaluator(17, "squad")]
        data.preprocess(1)

    with open(__file__, "r") as f:
        notes = f.read()
        notes = args.mode + "\n" + notes

    trainer.start_training(data, model, train_params(n_epochs), eval,
                           model_dir.ModelDir(out), notes)
Example #15
def main():
    show_unk(SquadCorpus(), "glove.840B.300d")
Example #16
def main():
    parser = argparse.ArgumentParser("Train rejection model on SQuAD")

    parser.add_argument("--corpus_dir", type=str, default="~/data/document-qa")
    parser.add_argument("--output_dir",
                        type=str,
                        default="~/model/document-qa/squad")
    parser.add_argument("--lm_dir", type=str, default="~/data/lm")
    parser.add_argument("--exp_id", type=str, default="rejection")

    parser.add_argument("--lr", type=float, default=0.5)
    parser.add_argument("--epoch", type=int, default=20)

    parser.add_argument("--dim", type=int, default=100)
    parser.add_argument("--batch_size", type=int, default=45)

    parser.add_argument("--l2", type=float, default=0)
    parser.add_argument("--mode",
                        choices=["input", "output", "both", "none"],
                        default="both")
    parser.add_argument("--top_layer_only", action="store_true")

    args = parser.parse_args()

    print("Arguments : ", args)

    out = (args.output_dir + "_" + args.exp_id + "_lr" + str(args.lr) +
           "-" + datetime.now().strftime("%m%d-%H%M%S"))
    dim = args.dim
    batch_size = args.batch_size
    out = expanduser(out)
    lm_dir = expanduser(args.lm_dir)
    corpus_dir = expanduser(args.corpus_dir)

    print("Make global recurrent_layer...")
    recurrent_layer = CudnnGru(
        dim, w_init=tf.keras.initializers.TruncatedNormal(stddev=0.05))
    params = trainer.TrainParams(trainer.SerializableOptimizer(
        "Adadelta", dict(learning_rate=args.lr)),
                                 ema=0.999,
                                 max_checkpoints_to_keep=2,
                                 async_encoding=10,
                                 num_epochs=args.epoch,
                                 log_period=30,
                                 eval_period=1200,
                                 save_period=1200,
                                 best_weights=("dev", "b17/text-f1"),
                                 eval_samples=dict(dev=None, train=8000))
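    # best_weights=("dev", "b17/text-f1") keeps the checkpoint with the best dev
    # span F1 at answer bound 17, the metric produced by SpanEvaluator below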

    lm_reduce = MapperSeq(
        ElmoLayer(args.l2,
                  layer_norm=False,
                  top_layer_only=args.top_layer_only),
        DropoutLayer(0.5),
    )

    model = AttentionWithElmo(
        encoder=DocumentAndQuestionEncoder(SingleSpanAnswerEncoder()),
        lm_model=SquadContextConcatSkip(lm_dir=lm_dir),
        append_before_atten=(args.mode == "both" or args.mode == "output"),
        append_embed=(args.mode == "both" or args.mode == "input"),
        max_batch_size=128,
        word_embed=FixedWordEmbedder(vec_name="glove.840B.300d",
                                     word_vec_init_scale=0,
                                     learn_unk=False,
                                     cpu=True),
        char_embed=CharWordEmbedder(LearnedCharEmbedder(word_size_th=14,
                                                        char_th=49,
                                                        char_dim=20,
                                                        init_scale=0.05,
                                                        force_cpu=True),
                                    MaxPool(Conv1d(100, 5, 0.8)),
                                    shared_parameters=True),
        embed_mapper=SequenceMapperSeq(
            VariationalDropoutLayer(0.8),
            recurrent_layer,
            VariationalDropoutLayer(0.8),
        ),
        lm_reduce=None,
        lm_reduce_shared=lm_reduce,
        per_sentence=False,
        memory_builder=NullBiMapper(),
        attention=BiAttention(TriLinear(bias=True), True),
        match_encoder=SequenceMapperSeq(
            FullyConnected(dim * 2, activation="relu"),
            ResidualLayer(
                SequenceMapperSeq(
                    VariationalDropoutLayer(0.8),
                    recurrent_layer,
                    VariationalDropoutLayer(0.8),
                    StaticAttentionSelf(TriLinear(bias=True),
                                        ConcatWithProduct()),
                    FullyConnected(dim * 2, activation="relu"),
                )), VariationalDropoutLayer(0.8)),
        predictor=BoundsPredictor(
            ChainBiMapper(first_layer=recurrent_layer,
                          second_layer=recurrent_layer)))

    batcher = ClusteredBatcher(batch_size, ContextLenKey(), False, False)
    data = DocumentQaTrainingData(SquadCorpus(corpus_dir), None, batcher,
                                  batcher)

    with open(__file__, "r") as f:
        notes = f.read()
        notes = str(sorted(args.__dict__.items(),
                           key=lambda x: x[0])) + "\n" + notes

    trainer.start_training(
        data, model, params,
        [LossEvaluator(),
         SpanEvaluator(bound=[17], text_eval="squad")], ModelDir(out), notes)
Example #17
def main():
    parser = argparse.ArgumentParser(description='Evaluate a model on SQuAD')
    parser.add_argument('model', help='model directory to evaluate')
    parser.add_argument("-o", "--official_output", type=str,
                        help="where to output an official result file")
    parser.add_argument('-n', '--sample_questions', type=int, default=None,
                        help="(for testing) run on a subset of questions")
    parser.add_argument('--answer_bounds', nargs='+', type=int, default=[17],
                        help="Max size of answer")
    parser.add_argument('-b', '--batch_size', type=int, default=200,
                        help="Batch size, larger sizes can be faster but uses more memory")
    parser.add_argument('-s', '--step', default=None,
                        help="Weights to load, can be a checkpoint step or 'latest'")
    # Add ja_test choice to test Multilingual QA dataset.
    parser.add_argument(
        '-c', '--corpus', choices=["dev", "train", "ja_test", "pred"], default="dev")
    parser.add_argument('--no_ema', action="store_true",
                        help="Don't use EMA weights even if they exist")
    # Add ja_test choice to test Multilingual QA pipeline.
    parser.add_argument('-p', '--pred_filepath', default=None,
                        help="The csv file path if you try pred mode")
    args = parser.parse_args()

    model_dir = ModelDir(args.model)

    corpus = SquadCorpus()
    if args.corpus == "dev":
        questions = corpus.get_dev()
    # Add ja_test choice to test Multilingual QA pipeline.
    elif args.corpus == "ja_test":
        questions = corpus.get_ja_test()
    # This is for prediction mode for MLQA pipeline.
    elif args.corpus == "pred":
        questions = create_pred_dataset(args.pred_filepath)
    else:
        questions = corpus.get_train()
    questions = split_docs(questions)

    if args.sample_questions:
        # Sort for determinism, then shuffle with a fixed seed before sampling
        questions = sorted(questions, key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(questions)
        questions = questions[:args.sample_questions]

    questions.sort(key=lambda x: x.n_context_words, reverse=True)
    dataset = ParagraphAndQuestionDataset(
        questions, FixedOrderBatcher(args.batch_size, True))

    evaluators = [SpanEvaluator(args.answer_bounds, text_eval="squad")]
    if args.official_output is not None:
        evaluators.append(RecordSpanPrediction(args.answer_bounds[0]))

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    model = model_dir.get_model()

    evaluation = trainer.test(model, evaluators, {args.corpus: dataset},
                              corpus.get_resource_loader(), checkpoint, not args.no_ema)[args.corpus]

    # Print the scalar results in a two column table
    scalars = evaluation.scalars
    cols = list(sorted(scalars.keys()))
    table = [cols]
    header = ["Metric", ""]
    table.append([("%s" % scalars[x] if x in scalars else "-") for x in cols])
    print_table([header] + transpose_lists(table))

    # Save the official output
    if args.official_output is not None:
        quid_to_para = {}
        for x in questions:
            quid_to_para[x.question_id] = x.paragraph

        q_id_to_answers = {}
        q_ids = evaluation.per_sample["question_id"]
        spans = evaluation.per_sample["predicted_span"]
        for q_id, (start, end) in zip(q_ids, spans):
            text = quid_to_para[q_id].get_original_text(start, end)
            q_id_to_answers[q_id] = text

        with open(args.official_output, "w") as f:
            json.dump(q_id_to_answers, f)
Example #18
def main():
    parser = argparse.ArgumentParser(
        description='Evaluate a model on document-level SQuAD')
    parser.add_argument('model', help='model to use')
    parser.add_argument(
        'output',
        type=str,
        help="Store the per-paragraph results in csv format in this file")
    parser.add_argument('-n',
                        '--n_sample',
                        type=int,
                        default=None,
                        help="(for testing) sample documents")
    parser.add_argument(
        '-s',
        '--async',
        type=int,
        default=10,
        help="Encoding batch asynchronously, queueing up to this many")
    parser.add_argument('-a',
                        '--answer_bound',
                        type=int,
                        default=17,
                        help="Max answer span length")
    parser.add_argument('-p',
                        '--n_paragraphs',
                        type=int,
                        default=None,
                        help="Max number of paragraphs to use")
    parser.add_argument(
        '-b',
        '--batch_size',
        type=int,
        default=200,
        help="Batch size, larger sizes can be faster but uses more memory")
    parser.add_argument('-c',
                        '--corpus',
                        choices=["dev", "train", "doc-rd-dev"],
                        default="dev")
    parser.add_argument('--no_ema',
                        action="store_true",
                        help="Don't use EMA weights even if they exist")
    args = parser.parse_args()

    model_dir = ModelDir(args.model)
    print("Loading data")

    questions = []
    ranker = SquadTfIdfRanker(NltkPlusStopWords(True),
                              args.n_paragraphs,
                              force_answer=False)

    if args.corpus == "doc-rd-dev":
        docs = SquadCorpus().get_dev()
        if args.n_sample is not None:
            docs.sort(key=lambda x: x.doc_id)
            np.random.RandomState(0).shuffle(docs)
            docs = docs[:args.n_sample]

        print("Fetching document reader docs...")
        doc_rd_versions = get_doc_rd_doc(docs)
        print("Ranking and matching with questions...")
        for doc in tqdm(docs):
            doc_questions = flatten_iterable(x.questions
                                             for x in doc.paragraphs)
            paragraphs = doc_rd_versions[doc.title]
            ranks = ranker.rank([x.words for x in doc_questions],
                                [x.text for x in paragraphs])
            for i, question in enumerate(doc_questions):
                para_ranks = np.argsort(ranks[i])
                for para_rank, para_num in enumerate(
                        para_ranks[:args.n_paragraphs]):
                    # Just use dummy answers spans for these pairs
                    questions.append(
                        RankedParagraphQuestion(
                            question.words,
                            TokenSpans(question.answer.answer_text,
                                       np.zeros((0, 2), dtype=np.int32)),
                            question.question_id, paragraphs[para_num],
                            para_rank, para_num))
        rl = ResourceLoader()
    else:
        if args.corpus == "dev":
            docs = SquadCorpus().get_dev()
        else:
            docs = SquadCorpus().get_train()
        rl = SquadCorpus().get_resource_loader()

        if args.n_sample is not None:
            docs.sort(key=lambda x: x.doc_id)
            np.random.RandomState(0).shuffle(docs)
            docs = docs[:args.n_sample]

        for q in ranker.ranked_questions(docs):
            for i, p in enumerate(q.paragraphs):
                questions.append(
                    RankedParagraphQuestion(
                        q.question, TokenSpans(q.answer_text, p.answer_spans),
                        q.question_id,
                        ParagraphWithInverse([p.text], p.original_text,
                                             p.spans), i, p.paragraph_num))

    print("Split %d docs into %d paragraphs" % (len(docs), len(questions)))

    questions = sorted(questions,
                       key=lambda x: (x.n_context_words, len(x.question)),
                       reverse=True)
    for q in questions:
        if len(q.answer.answer_spans.shape) != 2:
            raise ValueError()

    checkpoint = model_dir.get_best_weights()
    if checkpoint is not None:
        print("Using best weights")
    else:
        print("Using latest checkpoint")
        checkpoint = model_dir.get_latest_checkpoint()
        if checkpoint is None:
            raise ValueError("No checkpoints found")

    data = ParagraphAndQuestionDataset(
        questions, FixedOrderBatcher(args.batch_size, True))

    model = model_dir.get_model()
    # `async` is a reserved word in Python 3.7+, so read the flag via getattr
    evaluation = trainer.test(
        model, [RecordParagraphSpanPrediction(args.answer_bound, True)],
        {args.corpus: data}, rl, checkpoint, not args.no_ema,
        getattr(args, "async"))[args.corpus]

    print("Saving result")
    output_file = args.output

    df = pd.DataFrame(evaluation.per_sample)

    df.sort_values(["question_id", "rank"], inplace=True, ascending=True)
    group_by = ["question_id"]
    f1 = compute_ranked_scores(df, "predicted_score", "text_f1", group_by)
    em = compute_ranked_scores(df, "predicted_score", "text_em", group_by)
    table = [["N Paragraphs", "EM", "F1"]]
    table += list([str(i + 1), "%.4f" % e, "%.4f" % f]
                  for i, (e, f) in enumerate(zip(em, f1)))
    print_table(table)

    df.to_csv(output_file, index=False)
Example #19
def main():
    data = split_docs(SquadCorpus().get_train())
    np.random.shuffle(data)
    for point in data:
        print(" ".join(point.question))
Example #20
def main():
    parser = argparse.ArgumentParser("Train our ELMo model on SQuAD")
    parser.add_argument("loss_mode", choices=['default', 'confidence'])
    parser.add_argument("output_dir")
    parser.add_argument("--dim", type=int, default=90)
    parser.add_argument("--l2", type=float, default=0)
    parser.add_argument("--mode",
                        choices=["input", "output", "both", "none"],
                        default="both")
    parser.add_argument("--top_layer_only", action="store_true")
    parser.add_argument("--no-tfidf",
                        action='store_true',
                        help="Don't add TF-IDF negative examples")
    args = parser.parse_args()

    out = args.output_dir + "-" + datetime.now().strftime("%m%d-%H%M%S")

    dim = args.dim
    recurrent_layer = CudnnGru(dim, w_init=TruncatedNormal(stddev=0.05))

    if args.loss_mode == 'default':
        n_epochs = 24
        answer_encoder = SingleSpanAnswerEncoder()
        predictor = BoundsPredictor(
            ChainBiMapper(first_layer=recurrent_layer,
                          second_layer=recurrent_layer))
        batcher = ClusteredBatcher(45, ContextLenKey(), False, False)
        data = DocumentQaTrainingData(SquadCorpus(), None, batcher, batcher)
    elif args.loss_mode == 'confidence':
        if args.no_tfidf:
            prepro = SquadDefault()
            n_epochs = 15
        else:
            prepro = SquadTfIdfRanker(NltkPlusStopWords(True), 4, True)
            n_epochs = 50
        answer_encoder = DenseMultiSpanAnswerEncoder()
        predictor = ConfidencePredictor(ChainBiMapper(
            first_layer=recurrent_layer,
            second_layer=recurrent_layer,
        ),
                                        AttentionEncoder(),
                                        FullyConnected(80, activation="tanh"),
                                        aggregate="sum")
        eval_dataset = RandomParagraphSetDatasetBuilder(
            100, 'flatten', True, 0)
        train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True,
                                          False)
        data = PreprocessedData(SquadCorpus(),
                                prepro,
                                StratifyParagraphsBuilder(train_batching, 1),
                                eval_dataset,
                                eval_on_verified=False)
        data.preprocess(1)

    params = trainer.TrainParams(trainer.SerializableOptimizer(
        "Adadelta", dict(learning_rate=1.0)),
                                 ema=0.999,
                                 max_checkpoints_to_keep=2,
                                 async_encoding=10,
                                 num_epochs=n_epochs,
                                 log_period=30,
                                 eval_period=1200,
                                 save_period=1200,
                                 best_weights=("dev", "b17/text-f1"),
                                 eval_samples=dict(dev=None, train=8000))

    lm_reduce = MapperSeq(
        ElmoLayer(args.l2,
                  layer_norm=False,
                  top_layer_only=args.top_layer_only),
        DropoutLayer(0.5),
    )
    model = AttentionWithElmo(
        encoder=DocumentAndQuestionEncoder(answer_encoder),
        lm_model=SquadContextConcatSkip(),
        append_before_atten=(args.mode == "both" or args.mode == "output"),
        append_embed=(args.mode == "both" or args.mode == "input"),
        max_batch_size=128,
        word_embed=FixedWordEmbedder(vec_name="glove.840B.300d",
                                     word_vec_init_scale=0,
                                     learn_unk=False,
                                     cpu=True),
        char_embed=CharWordEmbedder(LearnedCharEmbedder(word_size_th=14,
                                                        char_th=49,
                                                        char_dim=20,
                                                        init_scale=0.05,
                                                        force_cpu=True),
                                    MaxPool(Conv1d(100, 5, 0.8)),
                                    shared_parameters=True),
        embed_mapper=SequenceMapperSeq(
            VariationalDropoutLayer(0.8),
            recurrent_layer,
            VariationalDropoutLayer(0.8),
        ),
        lm_reduce=None,
        lm_reduce_shared=lm_reduce,
        per_sentence=False,
        memory_builder=NullBiMapper(),
        attention=BiAttention(TriLinear(bias=True), True),
        match_encoder=SequenceMapperSeq(
            FullyConnected(dim * 2, activation="relu"),
            ResidualLayer(
                SequenceMapperSeq(
                    VariationalDropoutLayer(0.8),
                    recurrent_layer,
                    VariationalDropoutLayer(0.8),
                    StaticAttentionSelf(TriLinear(bias=True),
                                        ConcatWithProduct()),
                    FullyConnected(dim * 2, activation="relu"),
                )), VariationalDropoutLayer(0.8)),
        predictor=predictor)

    with open(__file__, "r") as f:
        notes = f.read()
        notes = str(sorted(args.__dict__.items(),
                           key=lambda x: x[0])) + "\n" + notes

    trainer.start_training(
        data, model, params,
        [LossEvaluator(),
         SpanEvaluator(bound=[17], text_eval="squad")], ModelDir(out), notes)
Example #21
def main():
    parser = argparse.ArgumentParser(description='Evaluate a model on SQuAD')
    parser.add_argument('model', help='model directory to evaluate')
    parser.add_argument("-o", "--official_output", type=str, help="where to output an official result file")
    parser.add_argument('-n', '--sample_questions', type=int, default=None,
                        help="(for testing) run on a subset of questions")
    parser.add_argument('--answer_bounds', nargs='+', type=int, default=[17],
                        help="Max size of answer")
    parser.add_argument('-b', '--batch_size', type=int, default=200,
                        help="Batch size, larger sizes can be faster but uses more memory")
    parser.add_argument('-s', '--step', default=None,
                        help="Weights to load, can be a checkpoint step or 'latest'")
    parser.add_argument('-c', '--corpus', choices=["dev", "train"], default="dev")
    parser.add_argument('--no_ema', action="store_true", help="Don't use EMA weights even if they exist")
    parser.add_argument('--none_prob', action="store_true", help="Output none probability for samples")
    parser.add_argument('--elmo', action="store_true", help="Use elmo model")
    parser.add_argument('--per_question_loss_file', type=str, default=None,
            help="Run question by question and output a question_id -> loss output to this file")
    args = parser.parse_known_args()[0]

    model_dir = ModelDir(args.model)

    corpus = SquadCorpus()
    if args.corpus == "dev":
        questions = corpus.get_dev()
    else:
        questions = corpus.get_train()
    questions = split_docs(questions)

    if args.sample_questions:
        # Sort for determinism, then shuffle with a fixed seed before sampling
        questions = sorted(questions, key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(questions)
        questions = questions[:args.sample_questions]

    questions.sort(key=lambda x: x.n_context_words, reverse=True)
    dataset = ParagraphAndQuestionDataset(questions, FixedOrderBatcher(args.batch_size, True))

    evaluators = [SpanEvaluator(args.answer_bounds, text_eval="squad")]
    if args.official_output is not None:
        evaluators.append(RecordSpanPrediction(args.answer_bounds[0]))
    if args.per_question_loss_file is not None:
        evaluators.append(RecordSpanPredictionScore(args.answer_bounds[0], args.batch_size, args.none_prob))

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    model = model_dir.get_model()
    if args.elmo:
        model.lm_model.lm_vocab_file = './elmo-params/squad_train_dev_all_unique_tokens.txt'
        model.lm_model.options_file = './elmo-params/options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json'
        model.lm_model.weight_file = './elmo-params/squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5'
        model.lm_model.embed_weights_file = None


    evaluation = trainer.test(model, evaluators, {args.corpus: dataset},
                              corpus.get_resource_loader(), checkpoint, not args.no_ema)[args.corpus]

    # Print the scalar results in a two column table
    scalars = evaluation.scalars
    cols = list(sorted(scalars.keys()))
    table = [cols]
    header = ["Metric", ""]
    table.append([("%s" % scalars[x] if x in scalars else "-") for x in cols])
    print_table([header] + transpose_lists(table))

    # Save the official output
    if args.official_output is not None:
        quid_to_para = {}
        for x in questions:
            quid_to_para[x.question_id] = x.paragraph

        q_id_to_answers = {}
        q_ids = evaluation.per_sample["question_id"]
        spans = evaluation.per_sample["predicted_span"]
        for q_id, (start, end) in zip(q_ids, spans):
            text = quid_to_para[q_id].get_original_text(start, end)
            q_id_to_answers[q_id] = text

        with open(args.official_output, "w") as f:
            json.dump(q_id_to_answers, f)

    if args.per_question_loss_file is not None:
        print("Saving result")
        output_file = args.per_question_loss_file
        ids = evaluation.per_sample["question_ids"]
        f1s = evaluation.per_sample["text_f1"]
        ems = evaluation.per_sample["text_em"]
        losses = evaluation.per_sample["loss"]

        if args.none_prob:
            none_probs = evaluation.per_sample["none_probs"]
            """
            results = {question_id: {'f1': float(f1), 'em': float(em), 'loss': float(loss), 'none_prob': float(none_prob)} for question_id, f1, em, loss, none_prob in zip(ids, f1s, ems, losses, none_probs)}
            """
            results = {question_id: float(none_prob) for question_id, none_prob in zip(ids, none_probs)}
        else:
            results = {question_id: {'f1': float(f1), 'em': float(em), 'loss': float(loss)} for question_id, f1, em, loss in zip(ids, f1s, ems, losses)}


        with open(output_file, 'w') as f:
            json.dump(results, f)