def load_or_create_vocab_and_embs(arguments):
    vocab = Vocabulary(arguments.only_words)

    # load or create vocabulary
    try:
        vocab.load(arguments.vocab_file)
    except Exception:
        if arguments.embs is None:
            vocab = vocab.fit(arguments.train)
        else:
            vocab = vocab.fit(arguments.train, arguments.embs)

        # save vocab for reproducibility later
        logging.info("> saving vocab to %s" % arguments.vocab_file)
        vocab.save(arguments.vocab_file)

    if arguments.embs is None:
        embs = None
    else:
        embs = vocab.load_embedding()
        logging.info("shape %s" % (embs.shape,))

    return vocab, embs
def load_or_create_vocab(arguments):
    vocab = Vocabulary(arguments.only_words)

    # load or create vocabulary
    try:
        vocab.load(arguments.vocab_file)
    except Exception:
        vocab = vocab.fit(arguments.train)

        # save vocab for reproducibility later
        logging.info("> saving vocab to %s" % arguments.vocab_file)
        vocab.save(arguments.vocab_file)

    return vocab
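# A minimal usage sketch (not part of the original scripts) showing how the two
# helpers above could be wired into an entry point. The flag names below are
# assumptions chosen only to match the attribute names the helpers read
# (train, vocab_file, embs, only_words); the real scripts may define them elsewhere.
import argparse
import logging


def _example_cli():
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--train", required=True)
    argparser.add_argument("--vocab_file", required=True)
    argparser.add_argument("--embs", default=None)
    argparser.add_argument("--only_words", default=False, action="store_true")
    arguments, _ = argparser.parse_known_args()

    logging.basicConfig(level=logging.INFO)
    vocab, embs = load_or_create_vocab_and_embs(arguments)
    # load_or_create_vocab(arguments) does the same when no embeddings are needed.
    return vocab, embs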
parser = argparse.ArgumentParser()
parser.add_argument("--train", required=True)
parser.add_argument("--dev", required=True)
parser.add_argument("--test", required=True)
parser.add_argument("--model", required=True)
arguments, unknown = parser.parse_known_args()

TRAIN_FILE = arguments.train
DEV_FILE = arguments.dev
TEST_FILE = arguments.test
MODEL_FILE = arguments.model

n_epochs = 5

vocab = Vocabulary()
vocab.fit(TRAIN_FILE)

print(">> Loading in data")
TRAIN = vocab.tokenize_conll(arguments.train)
DEV = vocab.tokenize_conll(arguments.dev)
TEST = vocab.tokenize_conll(arguments.test)

encoder = BetaEncodeHandler()

print("> pre-encoding edges")
s = time.time()
TRAIN = pre_encode(encoder, TRAIN, accumulate_vocab=True)
DEV = pre_encode(encoder, DEV)
TEST = pre_encode(encoder, TEST)
print(">> done pre-encoding", time.time() - s)

# 5m is completely arbitrary
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--train",
        dest="train",
        help="Annotated CONLL train file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument(
        "--dev",
        dest="dev",
        help="Annotated CONLL dev file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument(
        "--test",
        dest="test",
        help="Annotated CONLL test file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument("--epochs", dest="epochs", type=int, default=30)
    parser.add_argument("--tb_dest", dest="tb_dest")
    parser.add_argument("--vocab_dest", dest="vocab_dest")
    parser.add_argument("--model_dest", dest="model_dest", required=True)
    parser.add_argument(
        "--embs", dest="embs", help="pre-trained embeddings file name", required=False
    )
    parser.add_argument(
        "--no_update_pretrained_emb",
        dest="no_update_pretrained_emb",
        help="don't update the pretrained embeddings during training",
        default=False,
        action="store_true",
    )
    parser.add_argument("--patience", dest="patience", type=int, default=-1)
    arguments, unknown = parser.parse_known_args()

    n_epochs = arguments.epochs

    vocab = Vocabulary()
    if arguments.embs:
        vocab = vocab.fit(arguments.train, arguments.embs)
        embs = vocab.load_embedding()
        print("shape", embs.shape)
    else:
        vocab = vocab.fit(arguments.train)
        embs = None

    # save vocab for reproducibility later
    if arguments.vocab_dest:
        print("> saving vocab to", arguments.vocab_dest)
        vocab.save(arguments.vocab_dest)

    # prep data
    print(">> Loading in data")
    training_data = vocab.tokenize_conll(arguments.train)
    dev_data = vocab.tokenize_conll(arguments.dev)
    test_data = vocab.tokenize_conll(arguments.test)

    # instantiate model
    model = DependencyParser(vocab, embs)

    callbacks = []
    tensorboard_logger = None
    if arguments.tb_dest:
        tensorboard_logger = TensorboardLoggerCallback(arguments.tb_dest)
        callbacks.append(tensorboard_logger)

    save_callback = ModelSaveCallback(arguments.model_dest)
    callbacks.append(save_callback)

    # prep params
    parser = Model(
        model,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    parser.train(
        training_data,
        arguments.dev,
        dev_data,
        epochs=n_epochs,
        batch_size=32,
        callbacks=callbacks,
        patience=arguments.patience,
    )
    parser.load_from_file(arguments.model_dest)

    metrics = parser.evaluate(arguments.test, test_data, batch_size=32)
    test_UAS = metrics["nopunct_uas"]
    test_LAS = metrics["nopunct_las"]

    print(metrics)

    if arguments.tb_dest and tensorboard_logger:
        tensorboard_logger.raw_write("test_UAS", test_UAS)
        tensorboard_logger.raw_write("test_LAS", test_LAS)

    print()
    print(">>> Model maxed on dev at epoch", save_callback.best_epoch)
    print(">>> Test score:", test_UAS, test_LAS)
parser.add_argument("--epochs", dest="epochs", type=int, default=30) parser.add_argument("--tb_dest", dest="tb_dest") parser.add_argument("--vocab_dest", dest="vocab_dest") parser.add_argument("--model_dest", dest="model_dest", required=True) parser.add_argument("--embs", dest="embs", help="pre-trained embeddings file name", required=False) parser.add_argument("--no_update_pretrained_emb", dest="no_update_pretrained_emb", help="don't update the pretrained embeddings during training", default=False, action='store_true') parser.add_argument("--patience", dest='patience', type=int, default=-1) parser.add_argument("--dev_mode", dest='dev_mode', default=False, help='small subset of training examples, for code testing', action='store_true') arguments, unknown = parser.parse_known_args() n_epochs = arguments.epochs vocab = Vocabulary() if arguments.embs == None: vocab = vocab.fit(arguments.train) embs = None else: vocab = vocab.fit(arguments.train, arguments.embs) embs = vocab.load_embedding() print('shape',embs.shape) # save vocab for reproducability later if arguments.vocab_dest: print("> saving vocab to", arguments.vocab_dest) vocab.save(arguments.vocab_dest) # prep data print(">> Loading in data") training_data = vocab.tokenize_conll(arguments.train) if arguments.dev_mode:
ARGPARSER = argparse.ArgumentParser()
ARGPARSER.add_argument("--train", required=True)
ARGPARSER.add_argument("--dev", required=True)
ARGPARSER.add_argument("--test", required=True)
ARGPARSER.add_argument("--model", required=True)
ARGUMENTS, UNK = ARGPARSER.parse_known_args()

TRAIN_FILE = ARGUMENTS.train
DEV_FILE = ARGUMENTS.dev
TEST_FILE = ARGUMENTS.test
MODEL_FILE = ARGUMENTS.model

N_EPOCHS = 5

VOCAB = Vocabulary()
VOCAB.fit(TRAIN_FILE)

print("> Loading in data")
TRAIN = VOCAB.tokenize_conll(ARGUMENTS.train)
DEV = VOCAB.tokenize_conll(ARGUMENTS.dev)
TEST = VOCAB.tokenize_conll(ARGUMENTS.test)

ENCODER = BetaEncodeHandler()

print("> Pre-encoding edges")
START_TIME = time.time()
TRAIN = pre_encode(ENCODER, TRAIN, accumulate_vocab=True)
DEV = pre_encode(ENCODER, DEV)
TEST = pre_encode(ENCODER, TEST)
print(">> Done pre-encoding edges", time.time() - START_TIME)

# 5m is completely arbitrary but fits all features for PTB.
def main(): """Main function.""" argparser = argparse.ArgumentParser() argparser.add_argument("--train", required=True) argparser.add_argument("--dev", required=True) argparser.add_argument("--test", required=True) argparser.add_argument("--emb", dest="emb") argparser.add_argument("--epochs", dest="epochs", type=int, default=283) argparser.add_argument("--vocab_dest", dest="vocab_dest", required=True) argparser.add_argument("--model_dest", dest="model_dest", required=True) argparser.add_argument("--lstm_layers", dest="lstm_layers", type=int, default=3) argparser.add_argument("--dropout", type=int, default=0.33) arguments, _ = argparser.parse_known_args() # [Data] min_occur_count = 2 train_file = arguments.train dev_file = arguments.dev vocab_destination = arguments.vocab_dest model_destination = arguments.model_dest # [Network] word_dims = 100 tag_dims = 100 lstm_hiddens = 400 mlp_arc_size = 500 mlp_rel_size = 100 lstm_layers = arguments.lstm_layers dropout_emb = arguments.dropout dropout_lstm_input = arguments.dropout dropout_lstm_hidden = arguments.dropout dropout_mlp = arguments.dropout # [Hyperparameters for optimizer] learning_rate = 2e-3 decay = 0.75 decay_steps = 5000 beta_1 = 0.9 beta_2 = 0.9 epsilon = 1e-12 # [Run] batch_scale = 5000 # for scaled batching n_epochs = arguments.epochs vocab = Vocabulary() vocab = vocab.fit(train_file, arguments.emb, min_occur_count) embs = vocab.load_embedding(True) if arguments.emb else None vocab.save(vocab_destination) model = DozatManning( vocab, word_dims, tag_dims, dropout_emb, lstm_layers, lstm_hiddens, dropout_lstm_input, dropout_lstm_hidden, mlp_arc_size, mlp_rel_size, dropout_mlp, pretrained_embeddings=embs, ) optimizer = dy.AdamTrainer( model.parameter_collection, learning_rate, beta_1, beta_2, epsilon ) # Callbacks custom_learning_update_callback = UpdateParamsCallback( optimizer, learning_rate, decay, decay_steps ) save_callback = ModelSaveCallback(model_destination) callbacks = [custom_learning_update_callback, save_callback] parser = Model( model, decoder="cle", loss="crossentropy", optimizer=optimizer, strategy="scaled_batch", vocab=vocab, ) # Prep data training_data = vocab.tokenize_conll(arguments.train) dev_data = vocab.tokenize_conll(arguments.dev) test_data = vocab.tokenize_conll(arguments.test) parser.train( training_data, dev_file, dev_data, epochs=n_epochs, batch_size=batch_scale, callbacks=callbacks, ) parser.load_from_file(model_destination) metrics = parser.evaluate(arguments.test, test_data, batch_size=batch_scale) test_uas = metrics["nopunct_uas"] test_las = metrics["nopunct_las"] print() print(metrics) print(">> Test score:", test_uas, test_las)
if arguments.lstm_layers != 3:
    print(
        ">> WARNING: running with a different number of BiLSTM layers than the original (%d)"
        % arguments.lstm_layers
    )
if arguments.embedding_file is None:
    print(">> WARNING: running without pretrained embeddings")
if arguments.no_orth_init:
    print(">> WARNING: running without orthogonal initialization on parameters")
if arguments.dropout < 0.33:
    print(">> WARNING: running model with less dropout")

vocab = Vocabulary()
vocab = vocab.fit(train_file, pretrained_embeddings_file, min_occur_count)
embs = vocab.load_embedding(variance_normalize=True) if arguments.embedding_file else None

# save vocab for reproducing later
if vocab_destination:
    vocab.save(vocab_destination)
    print("> saving vocab to", vocab_destination)

model = BaseParser(
    vocab,
    word_dims,
    tag_dims,
    dropout_emb,
    lstm_layers,
    lstm_hiddens,
    dropout_lstm_input,
    dropout_lstm_hidden,
    mlp_arc_size,
    mlp_rel_size,
    dropout_mlp,
    embs,
    orth_init,
)

# Instantiate custom optimizer
optimizer = dy.AdamTrainer(
    model.parameter_collection, learning_rate, beta_1, beta_2, epsilon
)

# Callbacks
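# Illustration only (an assumption about what the `orth_init` flag toggles inside
# BaseParser; the parser's own initializer is not shown here): orthogonal
# initialization is commonly derived from the Q factor of a QR decomposition of
# a random Gaussian matrix, e.g.:
import numpy as np


def orthonormal_matrix(rows, cols, seed=0):
    """Return a (rows, cols) matrix with orthonormal rows or columns."""
    rng = np.random.RandomState(seed)
    a = rng.randn(max(rows, cols), min(rows, cols))
    q, _ = np.linalg.qr(a)  # q has orthonormal columns
    return q if rows >= cols else q.T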