def _run_model(_, args):
    """Parse a CoNLL file with a saved model and write the predictions.

    Args:
        _: Unused positional argument (kept for the caller's signature).
        args: Namespace-like object providing ``test`` (input file),
            ``output`` (destination file), ``vocab`` (saved vocabulary),
            ``parameter_file`` (saved model parameters), ``batch_size``
            and ``model_name`` (key into ``INCLUDED_MODELS``).

    Raises:
        ValueError: If ``args.model_name`` is not a key of ``INCLUDED_MODELS``.
    """
    run_file = args.test
    out_file = args.output
    vocab_file = args.vocab
    model_file = args.parameter_file
    batch_size = args.batch_size

    # Fail early with a clear message instead of calling ``None`` below.
    model_class = INCLUDED_MODELS.get(args.model_name)
    if not model_class:
        raise ValueError("Model %s doesn't exist." % args.model_name)

    vocab = Vocabulary().load(vocab_file)
    # No pre-trained word embeddings are passed when running a saved model.
    model = model_class(vocab, None)
    parser = Model(
        model,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    parser.load_from_file(model_file)

    run_data = vocab.tokenize_conll(run_file)
    predictions = parser.run(run_data, batch_size)
    write_predictions_to_file(
        predictions, reference_file=run_file, output_file=out_file, vocab=vocab
    )
    print(">> Wrote predictions to conllu file %s" % out_file)
def _eval_model(_, args):
    """Evaluate a saved model on a CoNLL file and print its metrics.

    Args:
        _: Unused positional argument (kept for the caller's signature).
        args: Namespace-like object providing ``filename`` (file to evaluate
            on), ``vocab`` (saved vocabulary), ``parameter_file`` (saved model
            parameters), ``batch_size`` and ``model_name`` (key into
            ``INCLUDED_MODELS``).

    Raises:
        ValueError: If ``args.model_name`` is not a key of ``INCLUDED_MODELS``.
    """
    test_file = args.filename
    vocab_file = args.vocab
    model_file = args.parameter_file
    batch_size = args.batch_size

    # Fail early with a clear message instead of calling ``None`` below.
    model_class = INCLUDED_MODELS.get(args.model_name)
    if not model_class:
        raise ValueError("Model %s doesn't exist." % args.model_name)

    vocab = Vocabulary().load(vocab_file)
    # No pre-trained word embeddings are passed when evaluating a saved model.
    model = model_class(vocab, None)
    parser = Model(
        model,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    parser.load_from_file(model_file)

    test_data = vocab.tokenize_conll(test_file)
    metrics = parser.evaluate(test_file, test_data, batch_size=batch_size)
    # One metric per line, rounded to three decimals.
    for key, value in metrics.items():
        print(key, round(value, 3))
def _train_model(_, args):
    """Train a parser model, saving its vocabulary and parameters.

    Args:
        _: Unused positional argument (kept for the caller's signature).
        args: Namespace-like object providing ``train`` and ``dev`` (CoNLL
            files; ``dev`` may be falsy), ``epochs``, ``vocab`` (where the
            vocabulary is saved), ``parameter_file`` (where the model is
            saved), ``batch_size``, ``patience`` and ``model_name`` (key into
            ``INCLUDED_MODELS``).

    Raises:
        ValueError: If ``args.model_name`` is not a key of ``INCLUDED_MODELS``.
    """
    train_file = args.train
    dev_file = args.dev
    epochs = args.epochs
    vocab_dest = args.vocab
    model_dest = args.parameter_file
    batch_size = args.batch_size
    embedding_file = None

    model_class = INCLUDED_MODELS.get(args.model_name)
    if not model_class:
        # Report the attribute that was actually looked up (``model_name``).
        raise ValueError("Model %s doesn't exist." % args.model_name)

    # Disable patience if there is no dev. set
    patience = args.patience if dev_file else -1

    vocab = Vocabulary().fit(train_file, embedding_file)
    word_embeddings = vocab.load_embedding() if embedding_file else None
    # Explicit None test: truthiness of an array-like object is ambiguous.
    if word_embeddings is not None:
        print("> Embedding shape", word_embeddings.shape)

    # save vocab for reproducibility later
    print("> Saving vocabulary to", vocab_dest)
    vocab.save(vocab_dest)

    # prep data
    print(">> Loading in data")
    training_data = vocab.tokenize_conll(train_file)
    dev_data = vocab.tokenize_conll(dev_file) if dev_file else None

    # instantiate model
    model = model_class(vocab, word_embeddings)

    # 'best' only saves models that improve results on the dev. set
    # 'epoch' saves models on each epoch to a file appended with the epoch number
    save_mode = "best" if dev_file else "epoch"
    save_callback = ModelSaveCallback(model_dest, mode=save_mode)
    callbacks = [save_callback]

    # prep params
    parser = Model(
        model,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    parser.train(
        training_data,
        dev_file,
        dev_data,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks,
        patience=patience,
    )
def main():
    """Train a ``DependencyParser`` from the command line and evaluate it.

    Reads ``--train``, ``--dev``, ``--test``, ``--decoder`` and ``--model``
    (all required) plus ``--batch_size`` and ``--epochs`` from the command
    line, trains with a hinge loss, reloads the parameters saved at
    ``--model`` and prints the metrics obtained on the test file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--train",
        dest="train",
        help="Annotated CONLL train file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument(
        "--dev",
        dest="dev",
        help="Annotated CONLL dev file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument(
        "--test",
        dest="test",
        help="Annotated CONLL test file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument("--decoder", dest="decoder", required=True)
    parser.add_argument("--model", dest="model", required=True)
    # type=int so a value given on the command line is not left as a string.
    parser.add_argument("--batch_size", default=32, type=int)
    parser.add_argument("--epochs", default=15, type=int)
    # Unrecognised options are tolerated and ignored.
    arguments, _ = parser.parse_known_args()

    vocab = Vocabulary()
    vocab = vocab.fit(arguments.train)

    # prep data
    training_data = vocab.tokenize_conll(arguments.train)
    dev_data = vocab.tokenize_conll(arguments.dev)
    test_data = vocab.tokenize_conll(arguments.test)

    model = DependencyParser(vocab)
    save_callback = ModelSaveCallback(arguments.model)

    # prep params
    parser = Model(
        model,
        decoder=arguments.decoder,
        loss="hinge",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    parser.train(
        training_data,
        arguments.dev,
        dev_data,
        epochs=arguments.epochs,
        batch_size=arguments.batch_size,
        callbacks=[save_callback],
    )

    # load best model
    model.load_from_file(arguments.model)

    metrics = parser.evaluate(
        arguments.test, test_data, batch_size=arguments.batch_size
    )
    print(metrics)
def main():
    """Command-line entry point: train, reload the saved model, score on test.

    Options are read with argparse (unknown options are ignored). The
    vocabulary is fitted on the training file, optionally together with a
    pre-trained embedding file, and a ``DependencyParser`` is trained with a
    fixed batch size of 32. Test UAS/LAS are printed and, when ``--tb_dest``
    is given, also written through the tensorboard callback.
    """
    cli = argparse.ArgumentParser()
    conll_inputs = (
        ("train", "Annotated CONLL train file"),
        ("dev", "Annotated CONLL dev file"),
        ("test", "Annotated CONLL dev test"),
    )
    for option, description in conll_inputs:
        cli.add_argument(
            "--" + option,
            dest=option,
            help=description,
            metavar="FILE",
            required=True,
        )
    cli.add_argument("--epochs", dest="epochs", type=int, default=30)
    cli.add_argument("--tb_dest", dest="tb_dest")
    cli.add_argument("--vocab_dest", dest="vocab_dest")
    cli.add_argument("--model_dest", dest="model_dest", required=True)
    cli.add_argument(
        "--embs", dest="embs", help="pre-trained embeddings file name", required=False
    )
    # NOTE(review): this flag is parsed but never read below - confirm intent.
    cli.add_argument(
        "--no_update_pretrained_emb",
        dest="no_update_pretrained_emb",
        help="don't update the pretrained embeddings during training",
        default=False,
        action="store_true",
    )
    cli.add_argument("--patience", dest="patience", type=int, default=-1)
    opts, _ = cli.parse_known_args()

    max_epochs = opts.epochs

    # Fit the vocabulary, with or without a pre-trained embedding file.
    vocab = Vocabulary()
    if not opts.embs:
        vocab = vocab.fit(opts.train)
        pretrained = None
    else:
        vocab = vocab.fit(opts.train, opts.embs)
        pretrained = vocab.load_embedding()
        print("shape", pretrained.shape)

    # Persist the vocabulary only when a destination was supplied.
    if opts.vocab_dest:
        print("> saving vocab to", opts.vocab_dest)
        vocab.save(opts.vocab_dest)

    print(">> Loading in data")
    train_set = vocab.tokenize_conll(opts.train)
    dev_set = vocab.tokenize_conll(opts.dev)
    test_set = vocab.tokenize_conll(opts.test)

    network = DependencyParser(vocab, pretrained)

    # The tensorboard callback (if any) comes before the save callback.
    tensorboard_logger = (
        TensorboardLoggerCallback(opts.tb_dest) if opts.tb_dest else None
    )
    checkpointer = ModelSaveCallback(opts.model_dest)
    if tensorboard_logger is None:
        hooks = [checkpointer]
    else:
        hooks = [tensorboard_logger, checkpointer]

    trainer = Model(
        network,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    trainer.train(
        train_set,
        opts.dev,
        dev_set,
        epochs=max_epochs,
        batch_size=32,
        callbacks=hooks,
        patience=opts.patience,
    )

    # Score the parameters written to model_dest during training.
    trainer.load_from_file(opts.model_dest)
    scores = trainer.evaluate(opts.test, test_set, batch_size=32)
    uas = scores["nopunct_uas"]
    las = scores["nopunct_las"]
    print(scores)

    if opts.tb_dest and tensorboard_logger:
        tensorboard_logger.raw_write("test_UAS", uas)
        tensorboard_logger.raw_write("test_LAS", las)

    print()
    print(">>> Model maxed on dev at epoch", checkpointer.best_epoch)
    print(">>> Test score:", uas, las)
def main():
    """Train a ``DozatManning`` parser from the command line and evaluate it.

    Required options are ``--train``, ``--dev``, ``--test``, ``--vocab_dest``
    and ``--model_dest``; ``--emb``, ``--epochs``, ``--lstm_layers`` and
    ``--dropout`` are optional. Unknown options are ignored. After training,
    the parameters saved at ``--model_dest`` are reloaded and the test
    metrics are printed.
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--train", required=True)
    argparser.add_argument("--dev", required=True)
    argparser.add_argument("--test", required=True)
    argparser.add_argument("--emb", dest="emb")
    argparser.add_argument("--epochs", dest="epochs", type=int, default=283)
    argparser.add_argument("--vocab_dest", dest="vocab_dest", required=True)
    argparser.add_argument("--model_dest", dest="model_dest", required=True)
    argparser.add_argument("--lstm_layers", dest="lstm_layers", type=int, default=3)
    # Dropout is a fraction, so it must be parsed as a float; with type=int
    # any fractional value given on the command line was rejected.
    argparser.add_argument("--dropout", type=float, default=0.33)
    arguments, _ = argparser.parse_known_args()

    # [Data]
    min_occur_count = 2
    train_file = arguments.train
    dev_file = arguments.dev
    vocab_destination = arguments.vocab_dest
    model_destination = arguments.model_dest

    # [Network]
    word_dims = 100
    tag_dims = 100
    lstm_hiddens = 400
    mlp_arc_size = 500
    mlp_rel_size = 100
    lstm_layers = arguments.lstm_layers
    # The single --dropout value is shared by every dropout setting.
    dropout_emb = arguments.dropout
    dropout_lstm_input = arguments.dropout
    dropout_lstm_hidden = arguments.dropout
    dropout_mlp = arguments.dropout

    # [Hyperparameters for optimizer]
    learning_rate = 2e-3
    decay = 0.75
    decay_steps = 5000
    beta_1 = 0.9
    beta_2 = 0.9
    epsilon = 1e-12

    # [Run]
    batch_scale = 5000  # for scaled batching
    n_epochs = arguments.epochs

    vocab = Vocabulary()
    vocab = vocab.fit(train_file, arguments.emb, min_occur_count)
    embs = vocab.load_embedding(True) if arguments.emb else None
    vocab.save(vocab_destination)

    model = DozatManning(
        vocab,
        word_dims,
        tag_dims,
        dropout_emb,
        lstm_layers,
        lstm_hiddens,
        dropout_lstm_input,
        dropout_lstm_hidden,
        mlp_arc_size,
        mlp_rel_size,
        dropout_mlp,
        pretrained_embeddings=embs,
    )

    optimizer = dy.AdamTrainer(
        model.parameter_collection, learning_rate, beta_1, beta_2, epsilon
    )

    # Callbacks
    custom_learning_update_callback = UpdateParamsCallback(
        optimizer, learning_rate, decay, decay_steps
    )
    save_callback = ModelSaveCallback(model_destination)
    callbacks = [custom_learning_update_callback, save_callback]

    parser = Model(
        model,
        decoder="cle",
        loss="crossentropy",
        optimizer=optimizer,
        strategy="scaled_batch",
        vocab=vocab,
    )

    # Prep data
    training_data = vocab.tokenize_conll(arguments.train)
    dev_data = vocab.tokenize_conll(arguments.dev)
    test_data = vocab.tokenize_conll(arguments.test)

    parser.train(
        training_data,
        dev_file,
        dev_data,
        epochs=n_epochs,
        batch_size=batch_scale,
        callbacks=callbacks,
    )

    # Evaluate the parameters written to model_destination during training.
    parser.load_from_file(model_destination)
    metrics = parser.evaluate(arguments.test, test_data, batch_size=batch_scale)
    test_uas = metrics["nopunct_uas"]
    test_las = metrics["nopunct_las"]

    print()
    print(metrics)
    print(">> Test score:", test_uas, test_las)
lstm_hiddens, dropout_lstm_input, dropout_lstm_hidden, mlp_arc_size, mlp_rel_size, dropout_mlp, embs, orth_init) """ Instantiate custom optimizer """ optimizer = dy.AdamTrainer(model.parameter_collection, learning_rate, beta_1, beta_2, epsilon) """ Callbacks """ custom_learning_update_callback = UpdateParamsCallback() save_callback = ModelSaveCallback(model_destination) if arguments.tb_dest: tensorboard_logger = TensorboardLoggerCallback(tensorboard_destination) callbacks = [tensorboard_logger, custom_learning_update_callback, save_callback] else: callbacks = [custom_learning_update_callback, save_callback] parser = Model( model, decoder="cle", loss="crossentropy", optimizer=optimizer, strategy="scaled_batch", vocab=vocab) """ Prep data """ training_data = vocab.tokenize_conll(arguments.train) dev_data = vocab.tokenize_conll(arguments.dev) test_data = vocab.tokenize_conll(arguments.test) parser.train(training_data, dev_file, dev_data, epochs=n_epochs, batch_size=batch_scale, callbacks=callbacks) parser.load_from_file(model_destination) metrics = parser.evaluate(arguments.test, test_data, batch_size=batch_scale) test_UAS = metrics["nopunct_uas"] test_LAS = metrics["nopunct_las"] tensorboard_logger.raw_write("test_UAS", test_UAS)
vocab = Vocabulary() vocab = vocab.fit(arguments.train) # prep data training_data = vocab.tokenize_conll(arguments.train) dev_data = vocab.tokenize_conll(arguments.dev) test_data = vocab.tokenize_conll(arguments.test) model = DependencyParser(vocab) save_callback = ModelSaveCallback(arguments.model) # prep params parser = Model(model, decoder=arguments.decoder, loss="hinge", optimizer="adam", strategy="bucket", vocab=vocab) parser.train(training_data, arguments.dev, dev_data, epochs=arguments.epochs, batch_size=arguments.batch_size, callbacks=[save_callback]) # load best model model.load_from_file(arguments.model) metrics = parser.evaluate(arguments.test, test_data,
# instantiate model model = DependencyParser(vocab, embs, arguments.no_update_pretrained_emb) callbacks = [] tensorboard_logger = None if arguments.tb_dest: tensorboard_logger = TensorboardLoggerCallback(arguments.tb_dest) callbacks.append(tensorboard_logger) save_callback = ModelSaveCallback(arguments.model_dest) callbacks.append(save_callback) # prep params parser = Model(model, decoder="eisner", loss="kiperwasser", optimizer="adam", strategy="bucket", vocab=vocab) parser.train(training_data, arguments.dev, dev_data, epochs=n_epochs, batch_size=32, callbacks=callbacks, patience=arguments.patience) parser.load_from_file(arguments.model_dest) metrics = parser.evaluate(arguments.test, test_data, batch_size=32) test_UAS = metrics["nopunct_uas"] test_LAS = metrics["nopunct_las"] print(metrics) if arguments.tb_dest and tensorboard_logger: tensorboard_logger.raw_write("test_UAS", test_UAS) tensorboard_logger.raw_write("test_LAS", test_LAS) print() print(">>> Model maxed on dev at epoch", save_callback.best_epoch)