Example #1
def build_vocab(info):
    # Check if vocab exists
    if config.skip_existing and data.vocab_exists(info):
        nbprint('Skipping Vocab (file exists)')
        return

    # Build vocab
    current_vocab_builder = get_vocab_builder(info)
    current_vocab_builder.build_vocab()
    vocab = current_vocab_builder.get_vocab()

    # Save Vocab
    data.save_vocab(vocab, info)
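
The data module's persistence helpers are not shown in these snippets. Below is a minimal sketch of what a pickle-based data.vocab_exists / data.save_vocab pair could look like, assuming the file path can be derived from info; the project's actual implementation may differ.

# Hypothetical sketch of the data-module helpers used above, assuming
# vocabularies are pickled to a path derived from `info`.
import os
import pickle

def _vocab_path(info):
    # Assumption: str(info) is enough to name the file.
    return os.path.join("vocab", "%s.pkl" % info)

def vocab_exists(info):
    return os.path.isfile(_vocab_path(info))

def save_vocab(vocab, info):
    os.makedirs("vocab", exist_ok=True)
    with open(_vocab_path(info), "wb") as f:
        pickle.dump(vocab, f)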
Example #2
    help='file name of pre-trained model [model/textcnn.model]')

if __name__ == '__main__':
    conf = Config()
    conf.dump()
    args = parser.parse_args()

    if not os.path.isdir("logs"):
        os.mkdir("logs")
    if not os.path.isdir("model"):
        os.mkdir("model")

    print("Loading data...")
    train_iter, text_field, label_field = data.fasttext_dataloader(
        "data/train.txt", conf.batch_size)
    data.save_vocab(text_field.vocab, "model/text.vocab")
    data.save_vocab(label_field.vocab, "model/label.vocab")

    # Update configurations
    conf.embed_num = len(text_field.vocab)
    conf.class_num = len(label_field.vocab) - 1
    conf.kernel_sizes = [int(k) for k in conf.kernel_sizes.split(',')]

    # model
    if os.path.exists(args.model):
        print('Loading model from {}...'.format(args.model))
        cnn = torch.load(args.model)
    else:
        cnn = model.TextCNN(conf)

    print(cnn)
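
At inference time the saved vocabularies have to be read back. A minimal counterpart to data.save_vocab, assuming pickle-based storage (load_vocab is a hypothetical name, not confirmed by the source):

# Hypothetical loader matching a pickle-based data.save_vocab.
import pickle

def load_vocab(path):
    with open(path, "rb") as f:
        return pickle.load(f)

# Usage: rebuild the mappings saved during training.
# text_vocab = load_vocab("model/text.vocab")
# label_vocab = load_vocab("model/label.vocab")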
Example #3
def _train(config):
    if config.dataset == "imdb":
        train_data, valid_data, test_data = create_imdb_data(config)
    else:
        train_data, valid_data, test_data = create_twenty_newsgroup_data(
            config)

    vocab_freq = train_data.get_word_lists()

    print("Data loaded!")
    vocab = GloVEVocab(vocab_freq,
                       config.embedding_file,
                       threshold=config.min_occurence)

    print("Vocab built! Size (%d)" % vocab.size())

    model = UsedModel(config, vocab, 2 if config.dataset == 'imdb' else 20)

    # create session
    gpu_configuration = gpu_config()
    sess = tf.Session(config=gpu_configuration)
    with sess.as_default():
        model.build_graph()
        print("Graph built!")
        model.add_train_op()
        print("Train op added!")
    sess.run(tf.global_variables_initializer())
    print("Variables initialized")

    if config.continue_training:
        start_e, steps, out_dir, ckpt_dir = restore_from_last_ckpt(
            config, model, sess)

        # backup new argv
        with open(os.path.join(out_dir, 'argv.txt'), 'a') as f:
            f.write("\n")
            f.write(" ".join(sys.argv))

        print("Continue training after epoch %d, step %d" % (start_e, steps))

    else:
        if config.model_name == 'default':
            c_time = time.strftime("%m_%d_%H_%M_%S", time.localtime())
            config.model_name = UsedModel.__name__ + "_%s" % c_time

        if config.debug:
            config.checkpoint_size = 10

        if not config.debug:
            out_dir = os.path.join(config.out_root, config.model_name)
            if os.path.exists(out_dir):
                raise ValueError("Output directory already exists!")
            else:
                os.makedirs(out_dir)

            # back up src directory
            os.system("cp -r src %s" % os.path.join(out_dir, 'src'))
            # back up argv
            with open(os.path.join(out_dir, "argv.txt"), 'w') as f:
                f.write(" ".join(sys.argv))

            # back up environ
            with open(os.path.join(out_dir, 'recreate_environ.sh'), 'w') as f:
                for var, val in os.environ.items():
                    f.write("export %s=\"%s\"\n" % (var, val))

            os.system("chmod +x %s" %
                      os.path.join(out_dir, 'recreate_environ.sh'))

            ckpt_dir = os.path.join(out_dir, "ckpts")
            # Ensure the checkpoint directory exists before models are saved.
            os.makedirs(ckpt_dir, exist_ok=True)

            vocab_loc = os.path.join(out_dir, "vocab.pkl")
            save_vocab(vocab, vocab_loc)

            print("Initialized output at %s" % out_dir)

        steps = 0
        start_e = -1

        print("Started training!")

    # construct graph handler
    summary_handler = SummaryHandler(
        os.path.join(config.summary_save_path, config.model_name),
        ['LOSS', 'ACCURACY'])

    for e in range(config.num_epochs):
        total_loss = []
        grad_norms = []
        for batches in tqdm(train_data.get_batches(config.batch_size)):

            if steps != 0 or not config.start_eval:
                steps += 1

            if steps > 10 and config.debug:
                exit(0)

            is_training = True

            fd = model.encode(batches, is_training)
            loss, grad_norm = model.train_step(sess, fd)
            total_loss.append(loss)
            grad_norms.append(grad_norm)

            if steps % config.checkpoint_size == 0:
                accuracy = eval_model(config, valid_data, vocab, model, sess)
                print("Result at step %d: %f" % (steps, accuracy))
                print("avg lost: %f" % (sum(total_loss) / len(total_loss)))
                print("avg grad norm: %f" %
                      (sum(grad_norms) / len(grad_norms)))

                if not config.debug:
                    summary_handler.write_summaries(
                        sess, {
                            'ITERATION': steps,
                            'LOSS': avg(total_loss),
                            'ACCURACY': accuracy
                        })

                    if start_e > 0:
                        epoch = e + start_e
                    else:
                        epoch = e

                    model.save_to(
                        sess,
                        os.path.join(
                            ckpt_dir, 'epoch_%04d_step%08d_acc(%f)' %
                            (epoch, steps, accuracy)))

    summary_handler.close_writer()
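
restore_from_last_ckpt is not shown, but since model.save_to encodes the epoch, step, and accuracy in the checkpoint name, part of its job is presumably to parse that 'epoch_%04d_step%08d_acc(%f)' pattern back out. A sketch under that assumption (the function name is hypothetical):

# Hypothetical helper: recover (epoch, step, accuracy) from a checkpoint
# name written in the 'epoch_%04d_step%08d_acc(%f)' format used above.
import re

def parse_ckpt_name(name):
    m = re.match(r"epoch_(\d{4})_step(\d{8})_acc\(([\d.]+)\)", name)
    if m is None:
        raise ValueError("unexpected checkpoint name: %s" % name)
    return int(m.group(1)), int(m.group(2)), float(m.group(3))

# parse_ckpt_name("epoch_0003_step00001500_acc(0.821000)")
# -> (3, 1500, 0.821)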
Example #4
        print("Training epoch %d/%d, loss: %10.4f" % (epoch, epochs, loss))
        if epoch % save_interval == 0:
            save_model(model, epoch)
    print("Training finished.")


if __name__ == '__main__':
    args = parse_arguments()
    hidden_size = 512
    embed_size = 256
    assert torch.cuda.is_available()

    traindata, en_field, zh_field = data.translate_dataloader("data/en-zh.txt",
                                                              args.batch_size,
                                                              shuffle=True)
    data.save_vocab(en_field.vocab, "models/english.vocab")
    data.save_vocab(zh_field.vocab, "models/chinese.vocab")

    en_size = len(en_field.vocab)
    zh_size = len(zh_field.vocab)
    zh_pad = zh_field.vocab.stoi['<pad>']

    if os.path.exists(args.model):
        seq2seq = torch.load(args.model)
        seq2seq = seq2seq.cuda()
    else:
        encoder = model.Encoder(en_size,
                                embed_size,
                                hidden_size,
                                n_layers=2,
                                dropout=0.5)
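
The snippet ends before the decoder is built. A plausible continuation of the else branch, assuming model.Decoder and model.Seq2Seq exist with interfaces symmetric to the Encoder shown (none of these names are confirmed by the source):

        # Hypothetical continuation; not part of the original snippet.
        decoder = model.Decoder(embed_size,
                                hidden_size,
                                zh_size,
                                n_layers=1,
                                dropout=0.5)
        seq2seq = model.Seq2Seq(encoder, decoder).cuda()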
Example #5
def _train(config):
    import_model(config)

    train_data = load_processed_dataset(config, 'train')
    train_commonsense_list = []

    vocab_freq = train_data.get_word_lists()
    relations_vocab = COMMONSENSE_REL_LOOKUP.values()

    valid_data = load_processed_dataset(config, 'valid')

    print("Data loaded!")
    vocab = GenModelVocab(vocab_freq,
                          config.embedding_size,
                          forced_word_list=relations_vocab,
                          cs=train_commonsense_list,
                          threshold=config.min_occurence)

    print("Vocab built! Size (%d)" % vocab.size())

    model = UsedModel(config, vocab)

    # create session
    gpu_configuration = gpu_config()
    sess = tf.Session(config=gpu_configuration)
    with sess.as_default():
        model.build_graph()
        print("Graph built!")
        model.add_train_op()
        print("Train op added!")
    sess.run(tf.global_variables_initializer())
    print("Variables initialized")

    if config.continue_training:
        start_e, steps, out_dir, ckpt_dir = restore_from_last_ckpt(
            config, model, sess)

        # backup new argv
        with open(os.path.join(out_dir, 'argv.txt'), 'a') as f:
            f.write("\n")
            f.write(" ".join(sys.argv))

        print("Continue training after epoch %d, step %d" % (start_e, steps))

    else:
        if config.model_name == 'default':
            c_time = time.strftime("%m_%d_%H_%M_%S", time.localtime())
            commonsense = ""
            if config.load_commonsense:
                commonsense = "_with_cs"

            if config.shuffle_cs:
                commonsense = "_shuffle_CS"

            config.model_name = UsedModel.__name__ + commonsense + "_%s" % c_time

        if config.debug:
            config.checkpoint_size = 10

        if not config.debug:
            # XXX factor into class eventually
            out_dir = os.path.join(config.out_root, config.model_name)
            if os.path.exists(out_dir):
                raise ValueError("Output directory already exists!")
            else:
                os.makedirs(out_dir)

            # back up src directory
            os.system("cp -r src %s" % os.path.join(out_dir, 'src'))
            # back up argv
            with open(os.path.join(out_dir, "argv.txt"), 'w') as f:
                f.write(" ".join(sys.argv))

            # back up environ
            with open(os.path.join(out_dir, 'recreate_environ.sh'), 'w') as f:
                for var, val in os.environ.items():
                    f.write("export %s=\"%s\"\n" % (var, val))

            os.system("chmod +x %s" %
                      os.path.join(out_dir, 'recreate_environ.sh'))

            ckpt_dir = os.path.join(out_dir, "ckpts")
            # Ensure the checkpoint directory exists before models are saved.
            os.makedirs(ckpt_dir, exist_ok=True)

            vocab_loc = os.path.join(out_dir, "vocab.pkl")
            save_vocab(vocab, vocab_loc)

            print("Initialized output at %s" % out_dir)

        steps = 0
        start_e = -1

        print("Started training!")

    # construct graph handler
    if not config.multiple_choice:
        summary_handler = SummaryHandler(
            os.path.join(config.summary_save_path, config.model_name), [
                'LOSS', 'PERPLEXITY', 'EVAL_PERPLEXITY', 'BLEU_1', 'BLEU_4',
                'METEOR', 'ROUGE', 'CIDER'
            ])
    else:
        summary_handler = SummaryHandler(
            os.path.join(config.summary_save_path, config.model_name),
            ['LOSS', 'ACCURACY'])

    for e in range(config.num_epochs):
        total_loss = []
        for batches in tqdm(
                train_data.get_batches(config.batch_size,
                                       front_heavy=(e == 0))):

            if steps != 0 or not config.start_eval:
                steps += 1

            if steps > 10 and config.debug:
                exit(0)

            is_training = True

            fd = model.encode(batches, is_training)
            oovs = model.get_batch_oov()

            loss = model.train_step(sess, fd)
            total_loss.append(loss)

            if steps % config.checkpoint_size == 0:
                if not config.multiple_choice:
                    bleu1, bleu4, meteor, rouge, cider, eval_loss, preds = eval_dataset(
                        config, valid_data, vocab, model, sess)

                    print("Result at step %d:" % steps)
                    print("Bleu1: ", bleu1)
                    print("Bleu4: ", bleu4)
                    print("Meteor: ", meteor)
                    print("Rouge-L: ", rouge)
                    print("CIDEr: ", cider)

                    if not config.debug:
                        write_summaries(sess, summary_handler, avg(total_loss),
                                        eval_loss, bleu1, bleu4, meteor, rouge,
                                        cider, steps)

                        if start_e > 0:
                            epoch = e + start_e
                        else:
                            epoch = e

                        model.save_to(
                            sess,
                            os.path.join(
                                ckpt_dir,
                                "epoch_%04d_step%08d_bleu1(%f)_bleu4(%f)_meteor(%f)_rogue(%f)"
                                % (epoch, steps, bleu1, bleu4, meteor, rouge)))
                        print("Model saved!")
                else:
                    accuracy = eval_multiple_choice_dataset(
                        config, valid_data, vocab, model, sess)
                    print("Result at step %d: %f" % (steps, accuracy))

                    if not config.debug:
                        summary_handler.write_summaries(
                            sess, {
                                'ITERATION': steps,
                                'LOSS': avg(total_loss),
                                'ACCURACY': accuracy
                            })

                        if start_e > 0:
                            epoch = e + start_e
                        else:
                            epoch = e

                        model.save_to(
                            sess,
                            os.path.join(
                                ckpt_dir, 'epoch_%04d_step%08d_acc(%f)' %
                                (epoch, steps, accuracy)))

    summary_handler.close_writer()
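
The avg helper used when writing summaries is not defined in the snippet; a minimal version consistent with how it is applied to the loss list:

# Minimal sketch of the avg helper assumed by the summary code above.
def avg(values):
    return sum(values) / len(values) if values else 0.0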