Example #1
def load_or_create_vocab_and_embs(arguments):
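    """Load a saved vocabulary if one exists; otherwise fit a new one on the
    training data (and optional pretrained embeddings) and save it. Returns the
    vocabulary and the loaded embedding matrix (or None)."""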

    vocab = Vocabulary(arguments.only_words)

    # load or create vocabulary
    try:
        vocab.load(arguments.vocab_file)
    except Exception:
        # vocab file missing or unreadable; build a fresh vocabulary
        if arguments.embs is None:
            vocab = vocab.fit(arguments.train)
        else:
            vocab = vocab.fit(arguments.train, arguments.embs)

        # save vocab for reproducibility later
        logging.info("> saving vocab to %s" % (arguments.vocab_file))
        vocab.save(arguments.vocab_file)

    if arguments.embs is None:
        embs = None
    else:
        embs = vocab.load_embedding()
        logging.info("shape %s" % (embs.shape,))

    return vocab, embs
Example #2
def load_or_create_vocab(arguments):
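    """Load a saved vocabulary if one exists; otherwise fit a new one on the
    training data and save it for later reuse."""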

    vocab = Vocabulary(arguments.only_words)

    # load or create vocabulary
    try:
        vocab.load(arguments.vocab_file)
    except Exception:
        # vocab file missing or unreadable; build a fresh vocabulary
        vocab = vocab.fit(arguments.train)

        # save vocab for reproducibility later
        logging.info("> saving vocab to %s" % (arguments.vocab_file))
        vocab.save(arguments.vocab_file)

    return vocab
Example #3
parser = argparse.ArgumentParser()
parser.add_argument("--train", required=True)
parser.add_argument("--dev", required=True)
parser.add_argument("--test", required=True)
parser.add_argument("--model", required=True)

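# parse_known_args() tolerates extra command-line flags: anything not defined above ends up in `unknown` instead of raising an error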
arguments, unknown = parser.parse_known_args()

TRAIN_FILE = arguments.train
DEV_FILE = arguments.dev
TEST_FILE = arguments.test
MODEL_FILE = arguments.model
n_epochs = 5

vocab = Vocabulary()
vocab.fit(TRAIN_FILE)

print(">> Loading in data")
TRAIN = vocab.tokenize_conll(arguments.train)
DEV = vocab.tokenize_conll(arguments.dev)
TEST = vocab.tokenize_conll(arguments.test)

encoder = BetaEncodeHandler()
print("> pre-encoding edges")
s = time.time()
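# only the training split is allowed to add entries to the encoder vocabulary; dev/test reuse it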
TRAIN = pre_encode(encoder, TRAIN, accumulate_vocab=True)
DEV = pre_encode(encoder, DEV)
TEST = pre_encode(encoder, TEST)
print(">> done pre-encoding", time.time() - s)

# 5m is completely arbitrary
Example #4
def main():
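    """Train a dependency parser (Eisner decoding, Kiperwasser-style loss) and evaluate it on the held-out test set."""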
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--train",
        dest="train",
        help="Annotated CONLL train file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument(
        "--dev",
        dest="dev",
        help="Annotated CONLL dev file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument(
        "--test",
        dest="test",
        help="Annotated CONLL dev test",
        metavar="FILE",
        required=True,
    )
    parser.add_argument("--epochs", dest="epochs", type=int, default=30)
    parser.add_argument("--tb_dest", dest="tb_dest")
    parser.add_argument("--vocab_dest", dest="vocab_dest")
    parser.add_argument("--model_dest", dest="model_dest", required=True)
    parser.add_argument(
        "--embs", dest="embs", help="pre-trained embeddings file name", required=False
    )
    parser.add_argument(
        "--no_update_pretrained_emb",
        dest="no_update_pretrained_emb",
        help="don't update the pretrained embeddings during training",
        default=False,
        action="store_true",
    )
    parser.add_argument("--patience", dest="patience", type=int, default=-1)

    arguments, unknown = parser.parse_known_args()

    n_epochs = arguments.epochs

    vocab = Vocabulary()
    if arguments.embs:
        vocab = vocab.fit(arguments.train, arguments.embs)
        embs = vocab.load_embedding()
        print("shape", embs.shape)
    else:
        vocab = vocab.fit(arguments.train)
        embs = None

    # save vocab for reproducibility later
    if arguments.vocab_dest:
        print("> saving vocab to", arguments.vocab_dest)
        vocab.save(arguments.vocab_dest)

    # prep data
    print(">> Loading in data")
    training_data = vocab.tokenize_conll(arguments.train)
    dev_data = vocab.tokenize_conll(arguments.dev)
    test_data = vocab.tokenize_conll(arguments.test)

    # instantiate model
    model = DependencyParser(vocab, embs)

    callbacks = []
    tensorboard_logger = None
    if arguments.tb_dest:
        tensorboard_logger = TensorboardLoggerCallback(arguments.tb_dest)
        callbacks.append(tensorboard_logger)

    save_callback = ModelSaveCallback(arguments.model_dest)
    callbacks.append(save_callback)

    # prep params
    parser = Model(
        model,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    parser.train(
        training_data,
        arguments.dev,
        dev_data,
        epochs=n_epochs,
        batch_size=32,
        callbacks=callbacks,
        patience=arguments.patience,
    )
    parser.load_from_file(arguments.model_dest)

    metrics = parser.evaluate(arguments.test, test_data, batch_size=32)
    test_UAS = metrics["nopunct_uas"]
    test_LAS = metrics["nopunct_las"]

    print(metrics)

    if arguments.tb_dest and tensorboard_logger:
        tensorboard_logger.raw_write("test_UAS", test_UAS)
        tensorboard_logger.raw_write("test_LAS", test_LAS)

    print()
    print(">>> Model maxed on dev at epoch", save_callback.best_epoch)
    print(">>> Test score:", test_UAS, test_LAS)
Example #5
parser.add_argument("--epochs", dest="epochs", type=int, default=30)
parser.add_argument("--tb_dest", dest="tb_dest")
parser.add_argument("--vocab_dest", dest="vocab_dest")
parser.add_argument("--model_dest", dest="model_dest", required=True)
parser.add_argument("--embs", dest="embs", help="pre-trained embeddings file name", required=False)
parser.add_argument("--no_update_pretrained_emb", dest="no_update_pretrained_emb", help="don't update the pretrained embeddings during training", default=False, action='store_true')
parser.add_argument("--patience", dest='patience', type=int, default=-1)
parser.add_argument("--dev_mode", dest='dev_mode', default=False, help='small subset of training examples, for code testing', action='store_true')

arguments, unknown = parser.parse_known_args()

n_epochs = arguments.epochs

vocab = Vocabulary()
if arguments.embs is None:
    vocab = vocab.fit(arguments.train)
    embs = None
else:
    vocab = vocab.fit(arguments.train, arguments.embs)
    embs = vocab.load_embedding()
    print('shape', embs.shape)

# save vocab for reproducibility later
if arguments.vocab_dest:
    print("> saving vocab to", arguments.vocab_dest)
    vocab.save(arguments.vocab_dest)

# prep data
print(">> Loading in data")
training_data = vocab.tokenize_conll(arguments.train)
if arguments.dev_mode:
Example #6
    ARGPARSER = argparse.ArgumentParser()
    ARGPARSER.add_argument("--train", required=True)
    ARGPARSER.add_argument("--dev", required=True)
    ARGPARSER.add_argument("--test", required=True)
    ARGPARSER.add_argument("--model", required=True)

    ARGUMENTS, UNK = ARGPARSER.parse_known_args()

    TRAIN_FILE = ARGUMENTS.train
    DEV_FILE = ARGUMENTS.dev
    TEST_FILE = ARGUMENTS.test
    MODEL_FILE = ARGUMENTS.model
    N_EPOCHS = 5

    VOCAB = Vocabulary()
    VOCAB.fit(TRAIN_FILE)

    print("> Loading in data")
    TRAIN = VOCAB.tokenize_conll(ARGUMENTS.train)
    DEV = VOCAB.tokenize_conll(ARGUMENTS.dev)
    TEST = VOCAB.tokenize_conll(ARGUMENTS.test)

    ENCODER = BetaEncodeHandler()
    print("> Pre-encoding edges")
    START_TIME = time.time()
    TRAIN = pre_encode(ENCODER, TRAIN, accumulate_vocab=True)
    DEV = pre_encode(ENCODER, DEV)
    TEST = pre_encode(ENCODER, TEST)
    print(">> Done pre-encoding edges", time.time() - START_TIME)

    # 5m is completely arbitrary but fits all features for PTB.
Example #7
def main():
    """Main function."""
    argparser = argparse.ArgumentParser()

    argparser.add_argument("--train", required=True)
    argparser.add_argument("--dev", required=True)
    argparser.add_argument("--test", required=True)
    argparser.add_argument("--emb", dest="emb")
    argparser.add_argument("--epochs", dest="epochs", type=int, default=283)
    argparser.add_argument("--vocab_dest", dest="vocab_dest", required=True)
    argparser.add_argument("--model_dest", dest="model_dest", required=True)

    argparser.add_argument("--lstm_layers", dest="lstm_layers", type=int, default=3)
    argparser.add_argument("--dropout", type=int, default=0.33)

    arguments, _ = argparser.parse_known_args()

    # [Data]
    min_occur_count = 2
    train_file = arguments.train
    dev_file = arguments.dev
    vocab_destination = arguments.vocab_dest
    model_destination = arguments.model_dest

    # [Network]
    word_dims = 100
    tag_dims = 100
    lstm_hiddens = 400
    mlp_arc_size = 500
    mlp_rel_size = 100
    lstm_layers = arguments.lstm_layers
    dropout_emb = arguments.dropout
    dropout_lstm_input = arguments.dropout
    dropout_lstm_hidden = arguments.dropout
    dropout_mlp = arguments.dropout

    # [Hyperparameters for optimizer]
    learning_rate = 2e-3
    decay = 0.75
    decay_steps = 5000
    beta_1 = 0.9
    beta_2 = 0.9
    epsilon = 1e-12

    # [Run]
    batch_scale = 5000  # for scaled batching
    n_epochs = arguments.epochs

    vocab = Vocabulary()
    vocab = vocab.fit(train_file, arguments.emb, min_occur_count)
    embs = vocab.load_embedding(True) if arguments.emb else None

    vocab.save(vocab_destination)

    model = DozatManning(
        vocab,
        word_dims,
        tag_dims,
        dropout_emb,
        lstm_layers,
        lstm_hiddens,
        dropout_lstm_input,
        dropout_lstm_hidden,
        mlp_arc_size,
        mlp_rel_size,
        dropout_mlp,
        pretrained_embeddings=embs,
    )

    optimizer = dy.AdamTrainer(
        model.parameter_collection, learning_rate, beta_1, beta_2, epsilon
    )

    # Callbacks
    custom_learning_update_callback = UpdateParamsCallback(
        optimizer, learning_rate, decay, decay_steps
    )
    save_callback = ModelSaveCallback(model_destination)
    callbacks = [custom_learning_update_callback, save_callback]

    parser = Model(
        model,
        decoder="cle",
        loss="crossentropy",
        optimizer=optimizer,
        strategy="scaled_batch",
        vocab=vocab,
    )

    # Prep data
    training_data = vocab.tokenize_conll(arguments.train)
    dev_data = vocab.tokenize_conll(arguments.dev)
    test_data = vocab.tokenize_conll(arguments.test)

    parser.train(
        training_data,
        dev_file,
        dev_data,
        epochs=n_epochs,
        batch_size=batch_scale,
        callbacks=callbacks,
    )

    parser.load_from_file(model_destination)

    metrics = parser.evaluate(arguments.test, test_data, batch_size=batch_scale)
    test_uas = metrics["nopunct_uas"]
    test_las = metrics["nopunct_las"]

    print()
    print(metrics)
    print(">> Test score:", test_uas, test_las)
Example #8
if arguments.lstm_layers != 3:
    print(
        ">> WARNING: running with more or less bilstm layers than origin (%d)"
        % arguments.lstm_layers)

if arguments.embedding_file is None:
    print(">> WARNING: Running without pretrained embeddings.")

if arguments.no_orth_init:
    print(">> Warning: running without orthogonal initilization on parameters")

if arguments.dropout < 0.33:
    print(">> Warning: running model with less dropout")

vocab = Vocabulary()
vocab = vocab.fit(train_file, pretrained_embeddings_file, min_occur_count)
embs = vocab.load_embedding(
    variance_normalize=True) if arguments.embedding_file else None

# save vocab for reproducibility later
if vocab_destination:
    vocab.save(vocab_destination)
    print("> saving vocab to", vocab_destination)
""" """
model = BaseParser(vocab, word_dims, tag_dims, dropout_emb, lstm_layers,
                   lstm_hiddens, dropout_lstm_input, dropout_lstm_hidden,
                   mlp_arc_size, mlp_rel_size, dropout_mlp, embs, orth_init)
""" Instantiate custom optimizer """
optimizer = dy.AdamTrainer(model.parameter_collection, learning_rate, beta_1,
                           beta_2, epsilon)
""" Callbacks """