Example #1
def do_training(arguments, vocab):
    logging.debug("Init training")
    n_epochs = arguments.epochs
    batch_size = arguments.batch_size

    # prep data
    logging.info(">> Loading in data")

    logging.info("tokenizing train data ...")
    training_data = vocab.tokenize_conll(arguments.train)
    logging.info("... tokenized train data")

    if arguments.dev_mode:
        training_data = training_data[:100]

    logging.info("tokenizing dev data ...")
    dev_data = vocab.tokenize_conll(arguments.dev)
    logging.info("... tokenized dev data")

    # instantiate model
    logging.info("creating model ...")
    model = DependencyParser(vocab, arguments.upos_dim, arguments.word_dim,
                             arguments.hidden_dim)
    logging.info("... model created")

    callbacks = []
    tensorboard_logger = None
    if arguments.tb_dest:
        tensorboard_logger = TensorboardLoggerCallback(arguments.tb_dest)
        callbacks.append(tensorboard_logger)

    logging.info("creating ModelSaveCallback ...")
    save_callback = ModelSaveCallback(arguments.model_file)
    callbacks.append(save_callback)
    logging.info("... ModelSaveCallback created")

    # prep params
    logging.info("creating Model ...")
    parser = ParserModel(model,
                         decoder="eisner",
                         loss="kiperwasser",
                         optimizer="adam",
                         strategy="bucket",
                         vocab=vocab)
    logging.info("... Model created")

    logging.info("training Model ...")
    parser.train(training_data,
                 arguments.dev,
                 dev_data,
                 epochs=n_epochs,
                 batch_size=batch_size,
                 callbacks=callbacks,
                 patience=arguments.patience)
    logging.info("...Model trained")

    logging.info("Model maxed on dev at epoch %s " %
                 (save_callback.best_epoch))

    return parser
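
For context, a minimal sketch (not part of the original example) of how do_training might be driven. The Namespace fields simply mirror the attributes the function reads; the file paths and dimension values are hypothetical placeholders.

import argparse

# Hypothetical invocation sketch; paths and dimensions are placeholders.
args = argparse.Namespace(
    train="train.conllu",      # hypothetical CoNLL train file
    dev="dev.conllu",          # hypothetical CoNLL dev file
    epochs=30,
    batch_size=32,
    dev_mode=False,            # True truncates the train set to 100 sentences
    upos_dim=25,               # hypothetical embedding/hidden sizes
    word_dim=100,
    hidden_dim=100,
    tb_dest=None,              # set a directory to enable TensorBoard logging
    model_file="model.bin",    # hypothetical save path
    patience=-1,
)

vocab = Vocabulary().fit(args.train)
parser = do_training(args, vocab)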
Example #2
File: cli.py Project: paulmayer/uniparse
def _train_model(_, args):
    train_file = args.train
    dev_file = args.dev
    epochs = args.epochs
    vocab_dest = args.vocab
    model_dest = args.parameter_file
    batch_size = args.batch_size
    embedding_file = None

    model_class = INCLUDED_MODELS.get(args.model_name)

    if not model_class:
        raise ValueError("Model %s doesn't exist." % args.model_name)

    # Disable patience if there is no dev. set
    patience = args.patience if dev_file else -1

    vocab = Vocabulary().fit(train_file, embedding_file)
    word_embeddings = vocab.load_embedding() if embedding_file else None
    if word_embeddings is not None:
        print("> Embedding shape", word_embeddings.shape)

    # save vocab for reproducibility later
    print("> Saving vocabulary to", vocab_dest)
    vocab.save(vocab_dest)

    # prep data
    print(">> Loading in data")
    training_data = vocab.tokenize_conll(train_file)

    dev_data = vocab.tokenize_conll(dev_file) if dev_file else None

    # instantiate model
    model = model_class(vocab, word_embeddings)

    # 'best' only saves models that improve results on the dev. set
    # 'epoch' saves models on each epoch to a file appended with the epoch number
    save_mode = "best" if dev_file else "epoch"
    save_callback = ModelSaveCallback(model_dest, mode=save_mode)
    callbacks = [save_callback]

    # prep params
    parser = Model(
        model,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    parser.train(
        training_data,
        dev_file,
        dev_data,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks,
        patience=patience,
    )
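
The save_mode branch above is the key detail: per the comments in the example, "best" overwrites the target file only when the dev score improves, while "epoch" writes one file per epoch. A minimal sketch of the two configurations (the filename is a placeholder):

# With a dev set: keep only the best-scoring model.
save_best = ModelSaveCallback("model.params", mode="best")

# Without a dev set: save every epoch (the epoch number is appended to the filename).
save_each_epoch = ModelSaveCallback("model.params", mode="epoch")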
Example #3
def do_training_big_datasets(arguments, vocab, embs, subset_size):
    logging.debug("Init training with big dataset (there is no dev mode)")
    n_epochs = arguments.epochs
    batch_size = arguments.batch_size

    logging.info("tokenizing dev data ...")
    dev_data = vocab.tokenize_conll(arguments.dev)
    logging.info("... tokenized dev data")

    # instantiate model
    logging.info("creating model ...")
    model = DependencyParser(vocab, embs, arguments.no_update_pretrained_emb)
    logging.info("... model created")

    callbacks = []
    if arguments.tb_dest:
        tensorboard_logger = TensorboardLoggerCallback(arguments.tb_dest)
        callbacks.append(tensorboard_logger)

    logging.info("creating ModelSaveCallback ...")
    save_callback = ModelSaveCallback(arguments.model_file)
    callbacks.append(save_callback)
    logging.info("... ModelSaveCallback created")

    # prep params
    logging.info("creating Model ...")
    parser = ParserModel(model,
                         decoder="eisner",
                         loss="kiperwasser",
                         optimizer="adam",
                         strategy="bucket",
                         vocab=vocab)
    logging.info("... Model created")

    logging.info("training Model ...")
    parser.train_big_datasets(arguments.train,
                              arguments.dev,
                              dev_data,
                              epochs=n_epochs,
                              batch_size=batch_size,
                              callbacks=callbacks,
                              patience=arguments.patience,
                              subset_size=subset_size)
    logging.info("...Model trained")

    logging.info("Model maxed on dev at epoch %s " %
                 (save_callback.best_epoch))

    return parser
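
Note the contrast with Example #1: parser.train(...) consumes a fully tokenized corpus, while train_big_datasets(...) takes the raw CoNLL path plus a subset_size, presumably tokenizing and training on subset_size sentences at a time to bound memory. A self-contained sketch of that chunking idea (illustration only, not the uniparse implementation):

# Illustration only: yield fixed-size slices of a sentence list, the way
# subset-wise training presumably walks a corpus too large to hold tokenized.
def iter_subsets(sentences, subset_size):
    for start in range(0, len(sentences), subset_size):
        yield sentences[start:start + subset_size]

for subset in iter_subsets(list(range(10)), subset_size=4):
    print(subset)  # [0, 1, 2, 3], [4, 5, 6, 7], [8, 9]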
Example #4
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--train",
        dest="train",
        help="Annotated CONLL train file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument(
        "--dev",
        dest="dev",
        help="Annotated CONLL dev file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument(
        "--test",
        dest="test",
        help="Annotated CONLL dev test",
        metavar="FILE",
        required=True,
    )
    parser.add_argument("--decoder", dest="decoder", required=True)
    parser.add_argument("--model", dest="model", required=True)

    parser.add_argument("--batch_size", default=32)
    parser.add_argument("--epochs", default=15, type=int)

    arguments, unknown = parser.parse_known_args()

    vocab = Vocabulary()
    vocab = vocab.fit(arguments.train)

    # prep data
    training_data = vocab.tokenize_conll(arguments.train)
    dev_data = vocab.tokenize_conll(arguments.dev)
    test_data = vocab.tokenize_conll(arguments.test)

    model = DependencyParser(vocab)

    save_callback = ModelSaveCallback(arguments.model)

    # prep params
    parser = Model(
        model,
        decoder=arguments.decoder,
        loss="hinge",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )

    parser.train(
        training_data,
        arguments.dev,
        dev_data,
        epochs=arguments.epochs,
        batch_size=arguments.batch_size,
        callbacks=[save_callback],
    )

    # load best model
    model.load_from_file(arguments.model)

    metrics = parser.evaluate(
        arguments.test, test_data, batch_size=arguments.batch_size
    )

    print(metrics)
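
Since main() pulls all of its configuration from argparse, a smoke test could drive it programmatically as below; the script and file names are hypothetical placeholders, and "eisner" matches the decoder string used in the other examples.

import sys

# Hypothetical invocation; file names are placeholders.
sys.argv = [
    "train.py",
    "--train", "train.conllu",
    "--dev", "dev.conllu",
    "--test", "test.conllu",
    "--decoder", "eisner",
    "--model", "model.bin",
]
main()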
Example #5
parser.add_argument("--batch_size", default=32)
parser.add_argument("--epochs", default=15)

arguments, unknown = parser.parse_known_args()

vocab = Vocabulary()
vocab = vocab.fit(arguments.train)

# prep data
training_data = vocab.tokenize_conll(arguments.train)
dev_data = vocab.tokenize_conll(arguments.dev)
test_data = vocab.tokenize_conll(arguments.test)

model = DependencyParser(vocab)

save_callback = ModelSaveCallback(arguments.model)

# prep params
parser = ParserModel(model,
                     decoder=arguments.decoder,
                     loss="hinge",
                     optimizer="adam",
                     strategy="bucket",
                     vocab=vocab)

parser.train(training_data,
             arguments.dev,
             dev_data,
             epochs=arguments.epochs,
             batch_size=arguments.batch_size,
             callbacks=[save_callback])
Example #6
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--train",
        dest="train",
        help="Annotated CONLL train file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument(
        "--dev",
        dest="dev",
        help="Annotated CONLL dev file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument(
        "--test",
        dest="test",
        help="Annotated CONLL dev test",
        metavar="FILE",
        required=True,
    )
    parser.add_argument("--epochs", dest="epochs", type=int, default=30)
    parser.add_argument("--tb_dest", dest="tb_dest")
    parser.add_argument("--vocab_dest", dest="vocab_dest")
    parser.add_argument("--model_dest", dest="model_dest", required=True)
    parser.add_argument(
        "--embs", dest="embs", help="pre-trained embeddings file name", required=False
    )
    parser.add_argument(
        "--no_update_pretrained_emb",
        dest="no_update_pretrained_emb",
        help="don't update the pretrained embeddings during training",
        default=False,
        action="store_true",
    )
    parser.add_argument("--patience", dest="patience", type=int, default=-1)

    arguments, unknown = parser.parse_known_args()

    n_epochs = arguments.epochs

    vocab = Vocabulary()
    if arguments.embs:
        vocab = vocab.fit(arguments.train, arguments.embs)
        embs = vocab.load_embedding()
        print("shape", embs.shape)
    else:
        vocab = vocab.fit(arguments.train)
        embs = None

    # save vocab for reproducibility later
    if arguments.vocab_dest:
        print("> saving vocab to", arguments.vocab_dest)
        vocab.save(arguments.vocab_dest)

    # prep data
    print(">> Loading in data")
    training_data = vocab.tokenize_conll(arguments.train)
    dev_data = vocab.tokenize_conll(arguments.dev)
    test_data = vocab.tokenize_conll(arguments.test)

    # instantiate model
    model = DependencyParser(vocab, embs)

    callbacks = []
    tensorboard_logger = None
    if arguments.tb_dest:
        tensorboard_logger = TensorboardLoggerCallback(arguments.tb_dest)
        callbacks.append(tensorboard_logger)

    save_callback = ModelSaveCallback(arguments.model_dest)
    callbacks.append(save_callback)

    # prep params
    parser = Model(
        model,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    parser.train(
        training_data,
        arguments.dev,
        dev_data,
        epochs=n_epochs,
        batch_size=32,
        callbacks=callbacks,
        patience=arguments.patience,
    )
    parser.load_from_file(arguments.model_dest)

    metrics = parser.evaluate(arguments.test, test_data, batch_size=32)
    test_UAS = metrics["nopunct_uas"]
    test_LAS = metrics["nopunct_las"]

    print(metrics)

    if arguments.tb_dest and tensorboard_logger:
        tensorboard_logger.raw_write("test_UAS", test_UAS)
        tensorboard_logger.raw_write("test_LAS", test_LAS)

    print()
    print(">>> Model maxed on dev at epoch", save_callback.best_epoch)
    print(">>> Test score:", test_UAS, test_LAS)
Example #7
def main():
    """Main function."""
    argparser = argparse.ArgumentParser()

    argparser.add_argument("--train", required=True)
    argparser.add_argument("--dev", required=True)
    argparser.add_argument("--test", required=True)
    argparser.add_argument("--emb", dest="emb")
    argparser.add_argument("--epochs", dest="epochs", type=int, default=283)
    argparser.add_argument("--vocab_dest", dest="vocab_dest", required=True)
    argparser.add_argument("--model_dest", dest="model_dest", required=True)

    argparser.add_argument("--lstm_layers", dest="lstm_layers", type=int, default=3)
    argparser.add_argument("--dropout", type=int, default=0.33)

    arguments, _ = argparser.parse_known_args()

    # [Data]
    min_occur_count = 2
    train_file = arguments.train
    dev_file = arguments.dev
    vocab_destination = arguments.vocab_dest
    model_destination = arguments.model_dest

    # [Network]
    word_dims = 100
    tag_dims = 100
    lstm_hiddens = 400
    mlp_arc_size = 500
    mlp_rel_size = 100
    lstm_layers = arguments.lstm_layers
    dropout_emb = arguments.dropout
    dropout_lstm_input = arguments.dropout
    dropout_lstm_hidden = arguments.dropout
    dropout_mlp = arguments.dropout

    # [Hyperparameters for optimizer]
    learning_rate = 2e-3
    decay = 0.75
    decay_steps = 5000
    beta_1 = 0.9
    beta_2 = 0.9
    epsilon = 1e-12

    # [Run]
    batch_scale = 5000  # for scaled batching
    n_epochs = arguments.epochs

    vocab = Vocabulary()
    vocab = vocab.fit(train_file, arguments.emb, min_occur_count)
    embs = vocab.load_embedding(True) if arguments.emb else None

    vocab.save(vocab_destination)

    model = DozatManning(
        vocab,
        word_dims,
        tag_dims,
        dropout_emb,
        lstm_layers,
        lstm_hiddens,
        dropout_lstm_input,
        dropout_lstm_hidden,
        mlp_arc_size,
        mlp_rel_size,
        dropout_mlp,
        pretrained_embeddings=embs,
    )

    optimizer = dy.AdamTrainer(
        model.parameter_collection, learning_rate, beta_1, beta_2, epsilon
    )

    # Callbacks
    custom_learning_update_callback = UpdateParamsCallback(
        optimizer, learning_rate, decay, decay_steps
    )
    save_callback = ModelSaveCallback(model_destination)
    callbacks = [custom_learning_update_callback, save_callback]

    parser = Model(
        model,
        decoder="cle",
        loss="crossentropy",
        optimizer=optimizer,
        strategy="scaled_batch",
        vocab=vocab,
    )

    # Prep data
    training_data = vocab.tokenize_conll(arguments.train)
    dev_data = vocab.tokenize_conll(arguments.dev)
    test_data = vocab.tokenize_conll(arguments.test)

    parser.train(
        training_data,
        dev_file,
        dev_data,
        epochs=n_epochs,
        batch_size=batch_scale,
        callbacks=callbacks,
    )

    parser.load_from_file(model_destination)

    metrics = parser.evaluate(arguments.test, test_data, batch_size=batch_scale)
    test_uas = metrics["nopunct_uas"]
    test_las = metrics["nopunct_las"]

    print()
    print(metrics)
    print(">> Test score:", test_uas, test_las)
Example #8
embs = vocab.load_embedding(
    variance_normalize=True) if arguments.embedding_file else None

# save vocab for reproducibility later
if vocab_destination:
    vocab.save(vocab_destination)
    print("> saving vocab to", vocab_destination)
""" """
model = BaseParser(vocab, word_dims, tag_dims, dropout_emb, lstm_layers,
                   lstm_hiddens, dropout_lstm_input, dropout_lstm_hidden,
                   mlp_arc_size, mlp_rel_size, dropout_mlp, embs, orth_init)
""" Instantiate custom optimizer """
optimizer = dy.AdamTrainer(model.parameter_collection, learning_rate, beta_1,
                           beta_2, epsilon)
""" Callbacks """
custom_learning_update_callback = UpdateParamsCallback()
save_callback = ModelSaveCallback(model_destination)
if arguments.tb_dest:
    tensorboard_logger = TensorboardLoggerCallback(tensorboard_destination)
    callbacks = [
        tensorboard_logger, custom_learning_update_callback, save_callback
    ]
else:
    callbacks = [custom_learning_update_callback, save_callback]

parser = ParserModel(model,
                     decoder="cle",
                     loss="crossentropy",
                     optimizer=optimizer,
                     strategy="scaled_batch",
                     vocab=vocab)
""" Prep data """