Example No. 1
def batch_predict(model,
                  data_loader,
                  loss_function,
                  conf_matrix,
                  loadtext="Evaluating",
                  timer=None,
                  mean_loss=False,
                  **kwargs):
    n_batches = len(data_loader)
    loss = 0

    model = model.eval()
    with torch.no_grad():
        for batch_n, (X, Y, sent_len) in enumerate(data_loader):
            stderr_print("{} |{}|".format(
                loadtext, util.loadbar(batch_n / (n_batches - 1))),
                         end="\r")
            sys.stderr.flush()  # the progress bar is written to stderr, so flush that stream

            model.init_hidden(batch_size=len(sent_len))

            Y_h = model(X, sent_len)
            loss += loss_function(Y_h, Y.view(-1))

            pred_tags = Y_h.max(dim=1)[1]
            conf_matrix.add(pred_tags, Y)

            if timer is not None:
                timer.tick()

    stderr_print("\x1b[2K", end="")

    if mean_loss:
        return loss / len(data_loader.dataset)
    return loss
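The progress display above leans on two small console helpers that are not part of the snippet. Below is a minimal sketch of what they could look like, assuming stderr_print simply forwards to print on stderr and util.loadbar renders a fixed-width text bar; the names come from the examples, but the bodies and signatures here are assumptions, not the project's actual code.

import sys

def stderr_print(*args, **kwargs):
    # Print progress messages to stderr so they do not interfere with the
    # tagged output written to stdout.
    print(*args, file=sys.stderr, **kwargs)

def loadbar(fraction, width=40):
    # Render a text progress bar for a completion fraction in [0, 1].
    fraction = min(max(fraction, 0.0), 1.0)
    filled = int(round(fraction * width))
    return "#" * filled + "-" * (width - filled)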
Example No. 2
def predict(model, X, L):

    model = model.eval()

    stderr_print("Predicting ... ", end="")

    model.init_hidden(batch_size=L.size(0))

    Y_h = model(X, L)

    predictions = Y_h.max(dim=1)[1]

    stderr_print("DONE")

    return predictions
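A hypothetical call, assuming model is an already trained tagger (for instance loaded as in Example No. 3 below). X is a padded batch of word indices with shape (batch, max_len) and L holds the true sentence lengths; the concrete numbers are illustrative only.

import torch

X = torch.tensor([[4, 12, 7, 0],
                  [9, 3, 0, 0]])   # two padded sentences of word indices
L = torch.tensor([3, 2])           # their real (unpadded) lengths
tags = predict(model, X, L)        # flat tensor of predicted tag indices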
Example No. 3
def main():
    """
    Load a trained model from file (-m) and use it to PoS-tag input file.
    If no file is specified, read from stdin.
    :return:
    """

    # parse args (file/stdin, )
    input_file = args.input_file
    model_file = args.model_file
    language = args.language

    # load model
    stderr_print(f"Loading model from {model_file} ... ")
    loaded = torch.load(model_file)

    try:
        model = loaded["model"]
    except KeyError:
        raise Exception("Failed to load model.")

    try:
        emb_file = loaded["emb_file"]
        stderr_print(f"Embedding file: {emb_file}")
    except KeyError:
        raise Exception("No embedding file specified.")

    try:
        tag_file = loaded["tag_file"]
        stderr_print(f"Tag file: {tag_file}")
    except KeyError:
        raise Exception("No tag file specified.")

    try:
        padding_emb = loaded["padding_emb"]
        stderr_print(f"Padding embedding loaded.")
    except KeyError:
        padding_emb = None
        warn(f"No padding embedding specified, defaulting to embedding[0].")

    try:
        unknown_emb = loaded["unknown_emb"]
        stderr_print(f"'Unknown' embedding loaded.")
    except KeyError:
        unknown_emb = None
        warn(f"No 'unknown' embedding specified, defaulting to embedding[1].")

    if language is None:
        try:
            language = loaded["language"]
            stderr_print(f"Language: {language}")
        except KeyError:
            language = "english"
            warn(f"No language specified, defaulting to english.")

    stderr_print("DONE")

    # First we read the word embeddings file
    # This function returns a word-to-index dictionary and the embedding tensor
    stderr_print(f"Loading embeddings from {emb_file} ... ", end="")
    word2i, _, embeddings = datautil.load_embeddings(emb_file)
    if padding_emb is not None:
        embeddings[0] = padding_emb
    if unknown_emb is not None:
        embeddings[1] = unknown_emb
    stderr_print("DONE")

    # Load and index POS tag list
    stderr_print(f"Loading tagset from {tag_file} ... ", end="")
    tag2i, i2tag = datautil.load_postags(tag_file)
    tagset_size = len(tag2i)
    stderr_print("DONE")

    # Read input text from file
    # Read from stdin if no file (-f) is specified
    if input_file is not None:
        stderr_print(f"Reading from {input_file}...")
        fin = open(input_file, "r")
    else:
        stderr_print("Reading from standard input...")
        fin = sys.stdin

    sent_ids, X, L, X_words = datautil.prepare_raw_text(fin,
                                                        word2i,
                                                        pad_id=0,
                                                        unk_id=1,
                                                        language=language)
    sent_ids, X, L = datautil.sort_batch(sent_ids, X, L)

    if input_file is not None:
        fin.close()

    # Predict
    Y_h = predict(model, X, L)

    # Reshape flattened output tensor, match tag labels
    # and pair them with input words
    Y_h = Y_h.view(len(X_words), -1)
    Y_h, L, sent_ids = datautil.sort_batch(Y_h, L, sent_ids, descending=False)
    paired = datautil.pair_words_with_tags(X_words, Y_h, L, i2tag)

    # Print to output in the word_TAG format
    datautil.print_words_with_tags(paired)
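This main() reads a module-level args object. The following is a plausible argparse setup matching the flags mentioned in the docstring and comments (-m for the model file, -f for the input file); the language flag shown here is an assumption, as its real name is not part of the snippet.

import argparse

parser = argparse.ArgumentParser(
    description="PoS-tag raw text with a trained model.")
parser.add_argument("-m", "--model-file", required=True,
                    help="path to a model saved by the training script")
parser.add_argument("-f", "--input-file", default=None,
                    help="input text file; read from stdin if omitted")
parser.add_argument("-l", "--language", default=None,
                    help="override the language stored with the model")
args = parser.parse_args()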
Example No. 4
def main():
    # Prepare data

    # First we read the word embeddings file
    # This function returns a word-to-index dictionary and the embedding tensor
    stderr_print("Loading embeddings ... ", end="")
    word2i, _, embeddings = datautil.load_embeddings(dataparams["emb_file"])
    stderr_print("DONE")

    # Load and index POS tag list
    stderr_print("Loading tagset ... ", end="")
    tag2i, i2tag = datautil.load_postags(dataparams["tag_file"])
    hyperparams["tagset_size"] = len(tag2i)
    hyperparams["padding_id"] = 0
    stderr_print("DONE")

    # Read and index datasets, create tensors
    # Each dataset is a tuple: (input_tensor, targets_tensor, sentence_length_tensor)
    stderr_print("Loading datasets ... ", end="")
    train_data = datautil.prepare_data(dataparams["train_file"], word2i, tag2i,
                                       dataparams["input_len"])
    dev_data = datautil.prepare_data(dataparams["dev_file"], word2i, tag2i,
                                     dataparams["input_len"])
    test_data = datautil.prepare_data(dataparams["test_file"], word2i, tag2i,
                                      dataparams["input_len"])

    # Create dataloaders
    # These objects will create batches of data
    train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=hyperparams["batch_size"],
        shuffle=True,
        num_workers=8,
        collate_fn=datautil.pad_sort_batch)
    dev_loader = torch.utils.data.DataLoader(
        dev_data,
        batch_size=hyperparams["batch_size"],
        shuffle=False,
        num_workers=8,
        collate_fn=datautil.pad_sort_batch)
    test_loader = torch.utils.data.DataLoader(
        test_data,
        batch_size=hyperparams["batch_size"],
        shuffle=False,
        num_workers=8,
        collate_fn=datautil.pad_sort_batch)
    stderr_print("DONE")

    # Set up the model
    hyperparams["loss_function"] = hyperparams["loss_function"](ignore_index=0)
    model = tagger.RNNTagger(embedding_tensor=embeddings, **hyperparams)
    print()
    print("Hyperparameters:")
    print("\n".join([f"{k}: {v}" for k, v in hyperparams.items()]))
    print()
    print("Number of trainable parameters:",
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    # Set up the confusion matrix to record our predictions
    conf_matrix = util.ConfusionMatrix(hyperparams["tagset_size"],
                                       ignore_index=0,
                                       class_dict=i2tag)

    # Train the model
    model, training_log = train(model,
                                train_loader=train_loader,
                                dev_loader=dev_loader,
                                conf_matrix=conf_matrix,
                                **hyperparams,
                                **dataparams)

    # Save model and training log
    torch.save(
        {
            "model": model,
            "emb_file": dataparams["emb_file"],
            "tag_file": dataparams["tag_file"],
            "padding_emb": embeddings[0],
            "unknown_emb": embeddings[1],
            "language": dataparams["language"]
        }, f"{dataparams['output_dir']}/{TIMESTAMP}.model")
    print(f"Best model saved to {dataparams['output_dir']}/{TIMESTAMP}.model")

    if dataparams["save_log"]:
        util.dictlist_to_csv(
            training_log, f"{dataparams['output_dir']}/{TIMESTAMP}-log.csv")
        print(
            f"Training log saved to {dataparams['output_dir']}/{TIMESTAMP}-log.csv"
        )

    # Evaluate model on dev data
    print()
    print("Evaluating on dev data:")
    conf_matrix.reset()
    loss = batch_predict(model,
                         data_loader=dev_loader,
                         conf_matrix=conf_matrix,
                         mean_loss=True,
                         **hyperparams)

    conf_matrix.print_class_stats()
    print(f"Dev set accuracy: {conf_matrix.accuracy():.4f}")
    print(f"Dev set mean loss: {loss:8g}")
    print()
    if dataparams["save_conf_matrix"]:
        conf_matrix.matrix_to_csv(
            f"{dataparams['output_dir']}/{TIMESTAMP}-confmat-dev.csv")
        print(
            f"Confusion matrix saved to {dataparams['output_dir']}/{TIMESTAMP}-confmat-dev.csv"
        )

    # Evaluate model on test data
    if evaluate_on_test_data:
        print()
        print("Evaluating on test data:")
        conf_matrix.reset()
        loss = batch_predict(model,
                             data_loader=test_loader,
                             conf_matrix=conf_matrix,
                             mean_loss=True,
                             **hyperparams)

        conf_matrix.print_class_stats()
        print(f"Test set accuracy: {conf_matrix.accuracy():.4f}")
        print(f"Test set mean loss: {loss:8g}")
        print()
        if dataparams["save_conf_matrix"]:
            conf_matrix.matrix_to_csv(
                f"{dataparams['output_dir']}/{TIMESTAMP}-confmat-test.csv")
            print(
                f"Confusion matrix saved to {dataparams['output_dir']}/{TIMESTAMP}-confmat-test.csv"
            )
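This main() relies on module-level hyperparams and dataparams dictionaries, a TIMESTAMP constant, and an evaluate_on_test_data flag. The keys below are the ones main() and train() actually read; every value (and the TIMESTAMP format) is an illustrative placeholder, not the original configuration.

import time
import torch

TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")   # exact format is an assumption
evaluate_on_test_data = True

hyperparams = {
    "batch_size": 32,
    "number_of_epochs": 10,
    "learning_rate": 0.001,
    "loss_function": torch.nn.CrossEntropyLoss,   # instantiated in main()
    "optimizer": torch.optim.Adam,                # instantiated in train()
    # "tagset_size" and "padding_id" are filled in by main() itself;
    # tagger.RNNTagger presumably expects further architecture settings
    # (hidden size etc.) that are not visible in these examples
}

dataparams = {
    "emb_file": "embeddings.vec",
    "tag_file": "tagset.txt",
    "train_file": "train.conll",
    "dev_file": "dev.conll",
    "test_file": "test.conll",
    "input_len": 50,
    "language": "english",
    "output_dir": "output",
    "save_log": True,
    "save_conf_matrix": True,
}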
Example No. 5
def train(model, train_loader, dev_loader, number_of_epochs, loss_function,
          optimizer, learning_rate, output_dir, conf_matrix, **kwargs):
    # Initialize the optimizer
    optimizer = optimizer(model.parameters(), lr=learning_rate)

    # Set up variables for logging and timing
    n_train_batches = len(train_loader)
    n_dev_batches = len(dev_loader)
    total_iterations = number_of_epochs * (n_train_batches + n_dev_batches)

    training_log = []
    best_model = (None, None)

    train_confm = conf_matrix.copy()
    dev_confm = conf_matrix.copy()

    timer = util.Timer()
    timer.start()

    print("Training started.")
    print()
    print("epoch\ttr_loss\tva_loss\ttr_acc\tva_acc\ttr_f1\tva_f1")

    for epoch in range(1, number_of_epochs + 1):

        # Switch model to training mode
        model = model.train()
        train_confm.reset()
        train_loss = 0

        # Training minibatch loop
        for batch_n, (X, Y, L) in enumerate(train_loader):
            stderr_print("Epoch {:>3d}: Training   |{}| {}".format(
                epoch, util.loadbar(batch_n / (n_train_batches - 1)),
                timer.remaining(total_iterations)),
                         end="\r")

            # Reset the gradient descent and the hidden layers
            model.zero_grad()
            model.hidden = model.init_hidden(batch_size=len(L))

            # Pass the input X through the network.
            # We also need to pass the lengths vector L
            # since the sentences have different lengths
            Y_h = model(X, L)

            pred_tags = Y_h.max(dim=1)[1]
            train_confm.add(pred_tags, Y)

            # We compute the loss and update the weights with gradient descent
            loss = loss_function(Y_h, Y.view(-1))
            loss.backward()
            optimizer.step()
            train_loss += loss.detach()  # detach so each batch's graph can be freed

            timer.tick()

        train_loss /= len(train_loader.dataset)
        stderr_print("\x1b[2K", end="")

        # Switch model to evaluation mode
        model = model.eval()
        dev_confm.reset()

        # Validation minibatch loop
        # It has the same flow as the training loop above, except that it runs
        # under torch.no_grad() and never calls optimizer.step(), so the
        # weights are not updated
        dev_loss = batch_predict(model,
                                 dev_loader,
                                 loss_function,
                                 dev_confm,
                                 loadtext="Evaluating",
                                 timer=timer,
                                 mean_loss=True)

        # Record the results
        results = {
            "epoch": epoch,
            "train_loss": train_loss.item(),
            "dev_loss": dev_loss.item(),
            "train_acc": train_confm.accuracy().item(),
            "dev_acc": dev_confm.accuracy().item(),
            "train_f1": train_confm.f_score(mean=True).item(),
            "dev_f1": dev_confm.f_score(mean=True).item()
        }
        training_log.append(results)
        print("{epoch:d}\t{train_loss:.5f}\t{dev_loss:.5f}\t{train_acc:.4f}\t"
              "{dev_acc:.4f}\t{train_f1:.4f}\t{dev_f1:.4f}".format(**results))

        # Save the current model if it has the lowest validation loss so far
        if best_model[0] is None or best_model[0] > results["dev_loss"]:
            torch.save(model, f"{output_dir}/{TIMESTAMP}.check")
            best_model = (results["dev_loss"], epoch)

    print()
    print("Training finished in {:02d}:{:02d}:{:02d}.".format(
        *timer.since_start()))

    # Load the best model
    if best_model[1] != epoch:
        print(
            f"Loading model with the lowest validation loss (Epoch {best_model[1]})."
        )
        model = torch.load(f"{output_dir}/{TIMESTAMP}.check")

    # Clean up checkpoint file
    os.remove(f"{output_dir}/{TIMESTAMP}.check")

    return model, training_log
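Several of the examples assume a util.ConfusionMatrix object with add, reset, copy, accuracy and f_score methods. The sketch below is a minimal implementation of that assumed interface, not the project's actual class; the real one also provides print_class_stats and matrix_to_csv, which are omitted here.

import torch

class ConfusionMatrix:
    def __init__(self, n_classes, ignore_index=None, class_dict=None):
        self.n_classes = n_classes
        self.ignore_index = ignore_index
        self.class_dict = class_dict
        self.matrix = torch.zeros(n_classes, n_classes)

    def reset(self):
        # Clear all counts in place.
        self.matrix.zero_()

    def copy(self):
        # Return a fresh, empty matrix with the same configuration.
        return ConfusionMatrix(self.n_classes, self.ignore_index, self.class_dict)

    def add(self, predictions, targets):
        # Count (gold, predicted) pairs, skipping padded positions.
        for p, t in zip(predictions.view(-1).tolist(), targets.view(-1).tolist()):
            if t != self.ignore_index:
                self.matrix[t, p] += 1

    def accuracy(self):
        return self.matrix.diag().sum() / self.matrix.sum()

    def f_score(self, mean=False):
        tp = self.matrix.diag()
        precision = tp / self.matrix.sum(dim=0).clamp(min=1)
        recall = tp / self.matrix.sum(dim=1).clamp(min=1)
        f1 = 2 * precision * recall / (precision + recall).clamp(min=1e-12)
        return f1.mean() if mean else f1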