예제 #1
0
def decode(vocab_file, model_file, input_file, no_prog):
    """Decode every instance of ``input_file`` and print each result.

    Args:
        vocab_file: Path to a pickle file. Its ``"vocab"`` entry must hold the
            ``"word"`` dump and the ``"table"`` dumps; its ``"author"`` entry
            is read only when author conditioning is enabled.
        model_file: Path forwarded to ``Reporter.parse_config``. Author
            conditioning is enabled only when this path contains the
            substring ``"writer"``.
        input_file: Path to a JSON file; its items are decoded one by one.
        no_prog: Forwarded to ``tqdm`` as ``disable``; a truthy value hides
            the progress bar.
    """
    # NOTE(security): unpickling can execute arbitrary code, so only load
    # vocabulary files that come from a trusted source.
    with open(vocab_file, "rb") as f:
        d = pickle.load(f)
    wv = WordVocab.from_dump(d["vocab"]["word"])
    tv = {k: TableVocab.from_dump(v) for k, v in d["vocab"]["table"].items()}
    # The author table is only needed for checkpoints whose path mentions
    # "writer"; every other model is decoded without it.
    writer = d["author"] if "writer" in model_file else None
    model = Reporter.parse_config(tv=tv,
                                  wv=wv,
                                  writer=writer,
                                  model_file=model_file)

    with open(input_file) as f:
        inputs = json.load(f)
    for ins in tqdm(inputs, total=len(inputs), ncols=80, disable=no_prog):
        # Authors missing from the table fall back to id 0.
        author = writer.get(ins.get("author"), 0) if writer else None
        print(model.decode(make_table(ins), writer=author))
예제 #2
0
def train(vocab_file, valid_file, nh_vocab, nh_rnn, writer, learning_rate,
          lr_decay, batch_size, n_epoch, log_dir):
    """Train a ``Reporter`` model and save checkpoints under ``log_dir``.

    A sub-directory named after the current Unix timestamp is created inside
    ``log_dir``. After every ``trainer.fit_partial`` call the parameters are
    saved there as ``<model>_<trainer.iter>.dy``. When validation data is
    given and ``trainer.iter`` has reached 5, corpus BLEU is computed on it
    and the best-scoring parameters are also saved as ``<model>_best.dy``.
    A ``KeyboardInterrupt`` stops training without propagating.

    Args:
        vocab_file: Path to a pickle file providing ``"data"`` (``"text"`` and
            ``"table"``), ``"vocab"`` (``"word"`` and ``"table"`` dumps) and,
            when ``writer`` is truthy, ``"author"``.
        valid_file: Path to a JSON validation file; falsy disables validation.
            Each item needs a ``"summary"`` entry that is joined with spaces
            and tokenized to form the BLEU reference.
        nh_vocab: Forwarded to ``Reporter`` as ``nh_vocab``.
        nh_rnn: Forwarded to ``Reporter`` as ``nh_rnn``.
        writer: Truthy to enable author conditioning using the ``"author"``
            table from the vocabulary pickle.
        learning_rate: Forwarded to ``Trainer`` as ``lr``.
        lr_decay: Forwarded to ``Trainer`` as ``decay``.
        batch_size: Forwarded to ``Trainer`` as ``batch_size``.
        n_epoch: Number of ``trainer.fit_partial`` passes to run.
        log_dir: Parent directory for the timestamped run directory; it is
            created if it does not exist yet.
    """

    def log(message):
        # Every status line is prefixed with the current wall-clock time.
        print(str(datetime.datetime.now()) + " " + message)

    log_dir = os.path.join(log_dir, str(int(time.time())))

    # Initialize...
    log("Log dir at {}".format(log_dir))
    # makedirs also creates a missing parent directory, but still raises if
    # the timestamped run directory itself already exists.
    os.makedirs(log_dir)
    log("Loading dataset...")
    # NOTE(security): unpickling can execute arbitrary code, so only load
    # vocabulary files that come from a trusted source.
    with open(vocab_file, "rb") as f:
        d = pickle.load(f)
    texts, tables = d["data"]["text"], d["data"]["table"]
    wv = WordVocab.from_dump(d["vocab"]["word"])
    tv = {k: TableVocab.from_dump(v) for k, v in d["vocab"]["table"].items()}
    # From here on ``writer`` is the author table (or None), not the flag.
    writer = d["author"] if writer else None

    log("Vectorizing...")
    data = list(vectorize(texts, tables, wv, tv, writer))

    valid = None
    if valid_file:
        with open(valid_file) as f:
            valid = json.load(f)

    # Model
    model = Reporter(tv=tv,
                     wv=wv,
                     nh_vocab=nh_vocab,
                     nh_rnn=nh_rnn,
                     writer=writer)
    log("Model configurations...")
    log(str(model))

    # Trainer
    trainer = Trainer(model,
                      lr=learning_rate,
                      decay=lr_decay,
                      batch_size=batch_size)
    log("Trainer configurations...")
    log(str(trainer))

    try:
        best = 0.
        log("Start training...")
        for _ in range(n_epoch):
            trainer.fit_partial(data)
            pc_name = str(model) + "_{}.dy".format(trainer.iter)
            model.pc.save(os.path.join(log_dir, pc_name))

            # Validation is skipped until trainer.iter reaches 5.
            if not (valid and trainer.iter >= 5):
                continue

            pred = []
            # The context manager closes the bar even when decoding raises or
            # the user interrupts. The total has one extra step so that the
            # final update can display the BLEU postfix.
            with tqdm(desc="Evaluation: ", total=len(valid) + 1,
                      ncols=80) as prog:
                for ins in valid:
                    # Authors missing from the table fall back to id 0 rather
                    # than handing None to a writer-conditioned model.
                    author = (writer.get(ins.get("author"), 0)
                              if writer else None)
                    p = model.decode(make_table(ins), writer=author)
                    pred.append(p.split())
                    prog.update()

                # One reference per validation item: its tokenized summary.
                references = [[nltk.word_tokenize(' '.join(v["summary"]))]
                              for v in valid]
                bleu = nltk.translate.bleu_score.corpus_bleu(references, pred)
                prog.set_postfix(BLEU=bleu)
                prog.update()

            if bleu > best:
                best = bleu
                log("Save best model...")
                model.pc.save(
                    os.path.join(log_dir, str(model) + "_best.dy"))

    except KeyboardInterrupt:
        print("KeyboardInterrupted...")