def do_training(arguments, vocab):
    logging.debug("Init training")
    n_epochs = arguments.epochs
    batch_size = arguments.batch_size

    # prep data
    logging.info(">> Loading in data")
    logging.info("tokenizing train data ...")
    training_data = vocab.tokenize_conll(arguments.train)
    logging.info("... tokenized train data")

    if arguments.dev_mode:
        training_data = training_data[:100]

    logging.info("tokenizing dev data ...")
    dev_data = vocab.tokenize_conll(arguments.dev)
    logging.info("... tokenized dev data")

    # instantiate model
    logging.info("creating model ...")
    model = DependencyParser(vocab, arguments.upos_dim, arguments.word_dim, arguments.hidden_dim)
    logging.info("... model created")

    callbacks = []
    tensorboard_logger = None
    if arguments.tb_dest:
        tensorboard_logger = TensorboardLoggerCallback(arguments.tb_dest)
        callbacks.append(tensorboard_logger)

    logging.info("creating ModelSaveCallback ...")
    save_callback = ModelSaveCallback(arguments.model_file)
    callbacks.append(save_callback)
    logging.info("... ModelSaveCallback created")

    # prep params
    logging.info("creating Model ...")
    parser = ParserModel(
        model,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    logging.info("... Model created")

    logging.info("training Model ...")
    parser.train(
        training_data,
        arguments.dev,
        dev_data,
        epochs=n_epochs,
        batch_size=batch_size,
        callbacks=callbacks,
        patience=arguments.patience,
    )
    logging.info("... Model trained")
    logging.info("Model maxed on dev at epoch %s" % save_callback.best_epoch)

    return parser
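# Note (added comment): the ParserModel configuration above pairs the Eisner
# dynamic-programming decoder for projective dependency trees with the margin-based
# hinge loss of Kiperwasser & Goldberg (2016), optimized with Adam over bucketed batches.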
def _train_model(_, args):
    train_file = args.train
    dev_file = args.dev
    epochs = args.epochs
    vocab_dest = args.vocab
    model_dest = args.parameter_file
    batch_size = args.batch_size
    embedding_file = None

    model_class = INCLUDED_MODELS.get(args.model_name)
    if not model_class:
        raise ValueError("Model %s doesn't exist." % args.model_name)

    # Disable patience if there is no dev. set
    patience = args.patience if dev_file else -1

    vocab = Vocabulary().fit(train_file, embedding_file)
    word_embeddings = vocab.load_embedding() if embedding_file else None
    if word_embeddings is not None:
        print("> Embedding shape", word_embeddings.shape)

    # save vocab for reproducibility later
    print("> Saving vocabulary to", vocab_dest)
    vocab.save(vocab_dest)

    # prep data
    print(">> Loading in data")
    training_data = vocab.tokenize_conll(train_file)
    dev_data = vocab.tokenize_conll(dev_file) if dev_file else None

    # instantiate model
    model = model_class(vocab, word_embeddings)

    # 'best' only saves models that improve results on the dev. set
    # 'epoch' saves models on each epoch to a file appended with the epoch number
    save_mode = "best" if dev_file else "epoch"
    save_callback = ModelSaveCallback(model_dest, mode=save_mode)
    callbacks = [save_callback]

    # prep params
    parser = Model(
        model,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    parser.train(
        training_data,
        dev_file,
        dev_data,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks,
        patience=patience,
    )
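# Note (added sketch, assumption): `_train_model` expects an `INCLUDED_MODELS` registry,
# defined elsewhere, that maps CLI model names to model classes. The names and classes
# below are hypothetical placeholders; only the dict-of-classes shape is what the lookup
# `INCLUDED_MODELS.get(args.model_name)` relies on:
#
#     INCLUDED_MODELS = {
#         "kiperwasser": DependencyParser,
#         "dozat-manning": DozatManning,
#     }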
def do_training_big_datasets(arguments, vocab, embs, subset_size):
    logging.debug("Init training with big dataset (there is no dev mode)")
    n_epochs = arguments.epochs
    batch_size = arguments.batch_size

    logging.info("tokenizing dev data ...")
    dev_data = vocab.tokenize_conll(arguments.dev)
    logging.info("... tokenized dev data")

    # instantiate model
    logging.info("creating model ...")
    model = DependencyParser(vocab, embs, arguments.no_update_pretrained_emb)
    logging.info("... model created")

    callbacks = []
    if arguments.tb_dest:
        tensorboard_logger = TensorboardLoggerCallback(arguments.tb_dest)
        callbacks.append(tensorboard_logger)

    logging.info("creating ModelSaveCallback ...")
    save_callback = ModelSaveCallback(arguments.model_file)
    callbacks.append(save_callback)
    logging.info("... ModelSaveCallback created")

    # prep params
    logging.info("creating Model ...")
    parser = ParserModel(
        model,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    logging.info("... Model created")

    logging.info("training Model ...")
    parser.train_big_datasets(
        arguments.train,
        arguments.dev,
        dev_data,
        epochs=n_epochs,
        batch_size=batch_size,
        callbacks=callbacks,
        patience=arguments.patience,
        subset_size=subset_size,
    )
    logging.info("... Model trained")
    logging.info("Model maxed on dev at epoch %s" % save_callback.best_epoch)

    return parser
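# Note (added comment, assumption): `train_big_datasets` is handed the raw path to the
# training file rather than pre-tokenized data, presumably so it can tokenize and train
# on `subset_size` sentences at a time instead of holding the whole corpus in memory.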
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--train",
        dest="train",
        help="Annotated CONLL train file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument(
        "--dev",
        dest="dev",
        help="Annotated CONLL dev file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument(
        "--test",
        dest="test",
        help="Annotated CONLL test file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument("--decoder", dest="decoder", required=True)
    parser.add_argument("--model", dest="model", required=True)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--epochs", type=int, default=15)

    arguments, unknown = parser.parse_known_args()

    vocab = Vocabulary()
    vocab = vocab.fit(arguments.train)

    # prep data
    training_data = vocab.tokenize_conll(arguments.train)
    dev_data = vocab.tokenize_conll(arguments.dev)
    test_data = vocab.tokenize_conll(arguments.test)

    model = DependencyParser(vocab)
    save_callback = ModelSaveCallback(arguments.model)

    # prep params
    parser = Model(
        model,
        decoder=arguments.decoder,
        loss="hinge",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    parser.train(
        training_data,
        arguments.dev,
        dev_data,
        epochs=arguments.epochs,
        batch_size=arguments.batch_size,
        callbacks=[save_callback],
    )

    # load best model
    model.load_from_file(arguments.model)

    metrics = parser.evaluate(
        arguments.test, test_data, batch_size=arguments.batch_size
    )
    print(metrics)
parser.add_argument("--batch_size", default=32) parser.add_argument("--epochs", default=15) arguments, unknown = parser.parse_known_args() vocab = Vocabulary() vocab = vocab.fit(arguments.train) # prep data training_data = vocab.tokenize_conll(arguments.train) dev_data = vocab.tokenize_conll(arguments.dev) test_data = vocab.tokenize_conll(arguments.test) model = DependencyParser(vocab) save_callback = ModelSaveCallback(arguments.model) # prep params parser = ParserModel(model, decoder=arguments.decoder, loss="hinge", optimizer="adam", strategy="bucket", vocab=vocab) parser.train(training_data, arguments.dev, dev_data, epochs=arguments.epochs, batch_size=arguments.batch_size, callbacks=[save_callback])
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--train",
        dest="train",
        help="Annotated CONLL train file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument(
        "--dev",
        dest="dev",
        help="Annotated CONLL dev file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument(
        "--test",
        dest="test",
        help="Annotated CONLL test file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument("--epochs", dest="epochs", type=int, default=30)
    parser.add_argument("--tb_dest", dest="tb_dest")
    parser.add_argument("--vocab_dest", dest="vocab_dest")
    parser.add_argument("--model_dest", dest="model_dest", required=True)
    parser.add_argument(
        "--embs", dest="embs", help="pre-trained embeddings file name", required=False
    )
    parser.add_argument(
        "--no_update_pretrained_emb",
        dest="no_update_pretrained_emb",
        help="don't update the pretrained embeddings during training",
        default=False,
        action="store_true",
    )
    parser.add_argument("--patience", dest="patience", type=int, default=-1)

    arguments, unknown = parser.parse_known_args()

    n_epochs = arguments.epochs

    vocab = Vocabulary()
    if arguments.embs:
        vocab = vocab.fit(arguments.train, arguments.embs)
        embs = vocab.load_embedding()
        print("shape", embs.shape)
    else:
        vocab = vocab.fit(arguments.train)
        embs = None

    # save vocab for reproducibility later
    if arguments.vocab_dest:
        print("> saving vocab to", arguments.vocab_dest)
        vocab.save(arguments.vocab_dest)

    # prep data
    print(">> Loading in data")
    training_data = vocab.tokenize_conll(arguments.train)
    dev_data = vocab.tokenize_conll(arguments.dev)
    test_data = vocab.tokenize_conll(arguments.test)

    # instantiate model
    model = DependencyParser(vocab, embs)

    callbacks = []
    tensorboard_logger = None
    if arguments.tb_dest:
        tensorboard_logger = TensorboardLoggerCallback(arguments.tb_dest)
        callbacks.append(tensorboard_logger)

    save_callback = ModelSaveCallback(arguments.model_dest)
    callbacks.append(save_callback)

    # prep params
    parser = Model(
        model,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    parser.train(
        training_data,
        arguments.dev,
        dev_data,
        epochs=n_epochs,
        batch_size=32,
        callbacks=callbacks,
        patience=arguments.patience,
    )
    parser.load_from_file(arguments.model_dest)

    metrics = parser.evaluate(arguments.test, test_data, batch_size=32)
    test_UAS = metrics["nopunct_uas"]
    test_LAS = metrics["nopunct_las"]

    print(metrics)

    if arguments.tb_dest and tensorboard_logger:
        tensorboard_logger.raw_write("test_UAS", test_UAS)
        tensorboard_logger.raw_write("test_LAS", test_LAS)

    print()
    print(">>> Model maxed on dev at epoch", save_callback.best_epoch)
    print(">>> Test score:", test_UAS, test_LAS)
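# Example invocation of the script above (added note; the script name and data paths
# are hypothetical placeholders, but the flags match the argparse setup):
#
#   python train.py --train train.conllu --dev dev.conllu --test test.conllu \
#       --model_dest model.params --vocab_dest vocab.pkl --epochs 30 \
#       --embs embeddings.vec --patience 3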
def main(): """Main function.""" argparser = argparse.ArgumentParser() argparser.add_argument("--train", required=True) argparser.add_argument("--dev", required=True) argparser.add_argument("--test", required=True) argparser.add_argument("--emb", dest="emb") argparser.add_argument("--epochs", dest="epochs", type=int, default=283) argparser.add_argument("--vocab_dest", dest="vocab_dest", required=True) argparser.add_argument("--model_dest", dest="model_dest", required=True) argparser.add_argument("--lstm_layers", dest="lstm_layers", type=int, default=3) argparser.add_argument("--dropout", type=int, default=0.33) arguments, _ = argparser.parse_known_args() # [Data] min_occur_count = 2 train_file = arguments.train dev_file = arguments.dev vocab_destination = arguments.vocab_dest model_destination = arguments.model_dest # [Network] word_dims = 100 tag_dims = 100 lstm_hiddens = 400 mlp_arc_size = 500 mlp_rel_size = 100 lstm_layers = arguments.lstm_layers dropout_emb = arguments.dropout dropout_lstm_input = arguments.dropout dropout_lstm_hidden = arguments.dropout dropout_mlp = arguments.dropout # [Hyperparameters for optimizer] learning_rate = 2e-3 decay = 0.75 decay_steps = 5000 beta_1 = 0.9 beta_2 = 0.9 epsilon = 1e-12 # [Run] batch_scale = 5000 # for scaled batching n_epochs = arguments.epochs vocab = Vocabulary() vocab = vocab.fit(train_file, arguments.emb, min_occur_count) embs = vocab.load_embedding(True) if arguments.emb else None vocab.save(vocab_destination) model = DozatManning( vocab, word_dims, tag_dims, dropout_emb, lstm_layers, lstm_hiddens, dropout_lstm_input, dropout_lstm_hidden, mlp_arc_size, mlp_rel_size, dropout_mlp, pretrained_embeddings=embs, ) optimizer = dy.AdamTrainer( model.parameter_collection, learning_rate, beta_1, beta_2, epsilon ) # Callbacks custom_learning_update_callback = UpdateParamsCallback( optimizer, learning_rate, decay, decay_steps ) save_callback = ModelSaveCallback(model_destination) callbacks = [custom_learning_update_callback, save_callback] parser = Model( model, decoder="cle", loss="crossentropy", optimizer=optimizer, strategy="scaled_batch", vocab=vocab, ) # Prep data training_data = vocab.tokenize_conll(arguments.train) dev_data = vocab.tokenize_conll(arguments.dev) test_data = vocab.tokenize_conll(arguments.test) parser.train( training_data, dev_file, dev_data, epochs=n_epochs, batch_size=batch_scale, callbacks=callbacks, ) parser.load_from_file(model_destination) metrics = parser.evaluate(arguments.test, test_data, batch_size=batch_scale) test_uas = metrics["nopunct_uas"] test_las = metrics["nopunct_las"] print() print(metrics) print(">> Test score:", test_uas, test_las)
embs = vocab.load_embedding(variance_normalize=True) if arguments.embedding_file else None

# save vocab for reproducing later
if vocab_destination:
    vocab.save(vocab_destination)
    print("> saving vocab to", vocab_destination)

""" """
model = BaseParser(
    vocab,
    word_dims,
    tag_dims,
    dropout_emb,
    lstm_layers,
    lstm_hiddens,
    dropout_lstm_input,
    dropout_lstm_hidden,
    mlp_arc_size,
    mlp_rel_size,
    dropout_mlp,
    embs,
    orth_init,
)

""" Instantiate custom optimizer """
optimizer = dy.AdamTrainer(
    model.parameter_collection, learning_rate, beta_1, beta_2, epsilon
)

""" Callbacks """
custom_learning_update_callback = UpdateParamsCallback()
save_callback = ModelSaveCallback(model_destination)
if arguments.tb_dest:
    tensorboard_logger = TensorboardLoggerCallback(tensorboard_destination)
    callbacks = [
        tensorboard_logger,
        custom_learning_update_callback,
        save_callback,
    ]
else:
    callbacks = [custom_learning_update_callback, save_callback]

parser = ParserModel(
    model,
    decoder="cle",
    loss="crossentropy",
    optimizer=optimizer,
    strategy="scaled_batch",
    vocab=vocab,
)

""" Prep data """