Пример #1
0
def bcn(config, data_file, embeddings, device, dataset, embeddings_type):
    #   extensions : add 2 languages, use a combination of CoVe embeddings (like ELMo)

    name = "test_model"
    torch.manual_seed(123)

    inputs = data.Field(lower=True, include_lengths=True, batch_first=True)
    labels = data.Field(sequential=False, unk_token=None)

    print('Generating train, dev, test splits')

    if dataset == 'IWSLT':
        # using the IWSLT 2016 TED talk translation task
        train, dev, test = datasets.IWSLT.splits(root=data_file,
                                                 exts=['.en', '.de'],
                                                 fields=[inputs, inputs])
    elif dataset == 'SST-2':
        train, dev, test = datasets.SST.splits(
            text_field=inputs,
            label_field=labels,
            root=data_file,
            fine_grained=False,
            train_subtrees=True,
            filter_pred=lambda ex: ex.label != 'neutral')
    elif dataset == 'SST-5':
        train, dev, test = datasets.SST.splits(text_field=inputs,
                                               label_field=labels,
                                               root=data_file,
                                               fine_grained=True,
                                               train_subtrees=True)
    elif dataset == 'IMDB':
        train, test = datasets.IMDB.splits(text_field=inputs,
                                           label_field=labels,
                                           root=data_file)
        train, dev = train.split(
            split_ratio=0.9,
            stratified=True)  # 0.9 in order to be close to the paper
    elif dataset == 'TREC-6':
        train, test = datasets.TREC.splits(text_field=inputs,
                                           label_field=labels,
                                           root=data_file,
                                           fine_grained=False)
        train, dev = train.split(split_ratio=0.9, stratified=True)
    elif dataset == 'TREC-50':
        train, test = datasets.TREC.splits(text_field=inputs,
                                           label_field=labels,
                                           root=data_file,
                                           fine_grained=True)
        train, dev = train.split()
    elif dataset == 'SNLI':
        train, dev, test = datasets.SNLI.splits(text_field=inputs,
                                                label_field=labels,
                                                root=data_file)
    else:
        print('Invalid dataset name detected...')
        return

    print('Building vocabulary')
    inputs.build_vocab(train, dev, test)
    inputs.vocab.load_vectors(
        vectors=GloVe(name='840B', dim=300, cache=embeddings))

    labels.build_vocab(train, dev, test)

    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train, dev, test),
        batch_size=config["train_batch_size"],
        device=torch.device(device) if device >= 0 else None,
        sort_within_batch=True)

    model = BCN(config=config,
                n_vocab=len(inputs.vocab),
                vocabulary=inputs.vocab.vectors,
                embeddings=embeddings,
                num_labels=len(labels.vocab.freqs),
                embeddings_type=embeddings_type)

    bcn_params = [
        p for n, p in model.named_parameters()
        if "mtlstm" not in n and p.requires_grad
    ]

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(bcn_params, lr=0.001)

    if device != -1:
        model.to(device)
    print(model)
    total_params = sum(p.numel() for p in model.parameters())
    total_trainable_params = sum(p.numel() for p in bcn_params
                                 if p.requires_grad)

    print("Total Params:", number_h(total_params))
    print("Total Trainable Params:", number_h(total_trainable_params))

    #####################################
    # Training Pipeline
    #####################################
    trainer = BCNTrainer(model=model,
                         train_loader=train_iter,
                         valid_loader=dev_iter,
                         criterion=criterion,
                         device="cpu" if device == -1 else 'cuda',
                         config=config,
                         optimizers=[optimizer])

    print('Generating CoVe')

    ####################################################################
    # Experiment: logging and visualizing the training process
    ####################################################################
    exp = Experiment(name, config, src_dirs=None, output_dir=EXP_DIR)
    exp.add_metric("ep_loss", "line", "epoch loss class", ["TRAIN", "VAL"])
    exp.add_metric("ep_f1", "line", "epoch f1", ["TRAIN", "VAL"])
    exp.add_metric("ep_acc", "line", "epoch accuracy", ["TRAIN", "VAL"])

    exp.add_value("epoch", title="epoch summary")
    exp.add_value("progress", title="training progress")

    ####################################################################
    # Training Loop
    ####################################################################
    best_loss = None
    early_stopping = EarlyStopping("min", config["patience"])

    for epoch in range(1, config["epochs"] + 1):
        train_loss = trainer.train_epoch()
        print(model.w, model.gama)
        val_loss, y, y_pred = trainer.eval_epoch()

        # Calculate accuracy and f1-macro on the evaluation set
        exp.update_metric("ep_loss", train_loss.item(), "TRAIN")
        exp.update_metric("ep_loss", val_loss.item(), "VAL")
        exp.update_metric("ep_f1", 0, "TRAIN")
        exp.update_metric("ep_f1", f1_macro(y, y_pred), "VAL")
        exp.update_metric("ep_acc", 0, "TRAIN")
        exp.update_metric("ep_acc", acc(y, y_pred), "VAL")

        print()
        epoch_log = exp.log_metrics(["ep_loss", "ep_f1", "ep_acc"])
        print(epoch_log)
        exp.update_value("epoch", epoch_log)

        # Save the model if the val loss is the best we've seen so far.
        if not best_loss or val_loss < best_loss:
            best_loss = val_loss
            trainer.best_acc = acc(y, y_pred)
            trainer.best_f1 = f1_macro(y, y_pred)
            trainer.checkpoint(name=name)

        if early_stopping.stop(val_loss):
            print("Early Stopping (according to cls loss)....")
            break

        print("\n" * 2)

    return best_loss, trainer.best_acc, trainer.best_f1
Пример #2
0
def sent_clf(dataset, config, opts, transfer=False):
    from logger.experiment import Experiment

    opts.name = config["name"]
    X_train, y_train, X_val, y_val = dataset
    vocab = None
    if transfer:
        opts.transfer = config["pretrained_lm"]
        checkpoint = load_checkpoint(opts.transfer)
        config["vocab"].update(checkpoint["config"]["vocab"])
        dict_pattern_rename(checkpoint["config"]["model"],
                            {"rnn_": "bottom_rnn_"})
        config["model"].update(checkpoint["config"]["model"])
        vocab = checkpoint["vocab"]

    ####################################################################
    # Load Preprocessed Datasets
    ####################################################################
    if config["preprocessor"] == "twitter":
        preprocessor = twitter_preprocessor()
    else:
        preprocessor = None

    print("Building training dataset...")
    train_set = ClfDataset(X_train,
                           y_train,
                           vocab=vocab,
                           preprocess=preprocessor,
                           vocab_size=config["vocab"]["size"],
                           seq_len=config["data"]["seq_len"])

    print("Building validation dataset...")
    val_set = ClfDataset(X_val,
                         y_val,
                         seq_len=train_set.seq_len,
                         preprocess=preprocessor,
                         vocab=train_set.vocab)

    src_lengths = [len(x) for x in train_set.data]
    val_lengths = [len(x) for x in val_set.data]

    # select sampler & dataloader
    train_sampler = BucketBatchSampler(src_lengths, config["batch_size"], True)
    val_sampler = SortedSampler(val_lengths)
    val_sampler_train = SortedSampler(src_lengths)

    train_loader = DataLoader(train_set,
                              batch_sampler=train_sampler,
                              num_workers=opts.cores,
                              collate_fn=ClfCollate())
    val_loader = DataLoader(val_set,
                            sampler=val_sampler,
                            batch_size=config["batch_size"],
                            num_workers=opts.cores,
                            collate_fn=ClfCollate())
    val_loader_train_dataset = DataLoader(train_set,
                                          sampler=val_sampler_train,
                                          batch_size=config["batch_size"],
                                          num_workers=opts.cores,
                                          collate_fn=ClfCollate())
    ####################################################################
    # Model
    ####################################################################
    ntokens = len(train_set.vocab)
    model = Classifier(ntokens, len(set(train_set.labels)), **config["model"])
    model.to(opts.device)

    clf_criterion = nn.CrossEntropyLoss()
    lm_criterion = nn.CrossEntropyLoss(ignore_index=0)

    embed_parameters = filter(lambda p: p.requires_grad,
                              model.embed.parameters())
    bottom_parameters = filter(
        lambda p: p.requires_grad,
        chain(model.bottom_rnn.parameters(), model.vocab.parameters()))
    if config["model"]["has_att"]:
        top_parameters = filter(
            lambda p: p.requires_grad,
            chain(model.top_rnn.parameters(), model.attention.parameters(),
                  model.classes.parameters()))
    else:
        top_parameters = filter(
            lambda p: p.requires_grad,
            chain(model.top_rnn.parameters(), model.classes.parameters()))

    embed_optimizer = optim.ASGD(embed_parameters, lr=0.0001)
    rnn_optimizer = optim.ASGD(bottom_parameters)
    top_optimizer = Adam(top_parameters, lr=config["top_lr"])
    ####################################################################
    # Training Pipeline
    ####################################################################

    # Trainer: responsible for managing the training process
    trainer = SentClfTrainer(model,
                             train_loader,
                             val_loader, (lm_criterion, clf_criterion),
                             [embed_optimizer, rnn_optimizer, top_optimizer],
                             config,
                             opts.device,
                             valid_loader_train_set=val_loader_train_dataset,
                             unfreeze_embed=config["unfreeze_embed"],
                             unfreeze_rnn=config["unfreeze_rnn"])

    ####################################################################
    # Experiment: logging and visualizing the training process
    ####################################################################

    exp = Experiment(opts.name,
                     config,
                     src_dirs=opts.source,
                     output_dir=EXP_DIR)
    exp.add_metric("ep_loss_lm", "line", "epoch loss lm", ["TRAIN", "VAL"])
    exp.add_metric("ep_loss_cls", "line", "epoch loss class", ["TRAIN", "VAL"])
    exp.add_metric("ep_f1", "line", "epoch f1", ["TRAIN", "VAL"])
    exp.add_metric("ep_acc", "line", "epoch accuracy", ["TRAIN", "VAL"])

    exp.add_value("epoch", title="epoch summary")
    exp.add_value("progress", title="training progress")

    ####################################################################
    # Resume Training from a previous checkpoint
    ####################################################################
    if transfer:
        print("Transferring Encoder weights ...")
        dict_pattern_rename(checkpoint["model"], {
            "encoder": "bottom_rnn",
            "decoder": "vocab"
        })
        load_state_dict_subset(model, checkpoint["model"])
    print(model)

    ####################################################################
    # Training Loop
    ####################################################################
    best_loss = None
    early_stopping = EarlyStopping("min", config["patience"])

    for epoch in range(0, config["epochs"]):

        train_loss = trainer.train_epoch()
        val_loss, y, y_pred = trainer.eval_epoch(val_set=True)
        _, y_train, y_pred_train = trainer.eval_epoch(train_set=True)
        exp.update_metric("ep_loss_lm", train_loss[0], "TRAIN")
        exp.update_metric("ep_loss_lm", val_loss[0], "VAL")

        exp.update_metric("ep_loss_cls", train_loss[1], "TRAIN")
        exp.update_metric("ep_loss_cls", val_loss[1], "VAL")

        exp.update_metric("ep_f1", f1_macro(y_train, y_pred_train), "TRAIN")
        exp.update_metric("ep_f1", f1_macro(y, y_pred), "VAL")

        exp.update_metric("ep_acc", acc(y_train, y_pred_train), "TRAIN")
        exp.update_metric("ep_acc", acc(y, y_pred), "VAL")

        print()
        epoch_log = exp.log_metrics(
            ["ep_loss_lm", "ep_loss_cls", "ep_f1", "ep_acc"])
        print(epoch_log)
        exp.update_value("epoch", epoch_log)

        # Save the model if the val loss is the best we've seen so far.
        if not best_loss or val_loss[1] < best_loss:
            best_loss = val_loss[1]
            trainer.best_acc = acc(y, y_pred)
            trainer.best_f1 = f1_macro(y, y_pred)
            trainer.checkpoint(name=opts.name, timestamp=True)

        if early_stopping.stop(val_loss[1]):
            print("Early Stopping (according to classification loss)....")
            break

        print("\n" * 2)

    return best_loss, trainer.best_acc, trainer.best_f1
def clf_features_baseline_runner(yaml,
                                 word2idx,
                                 idx2word,
                                 weights,
                                 cluster=False):
    if cluster is False:
        from logger.experiment import Experiment

    # torch.manual_seed(0)
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False

    opts, config = train_options(yaml)
    ####################################################################
    # Data Loading and Preprocessing
    ####################################################################
    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset(
        config["data"]["dataset"])

    # load word embeddings
    # if config["data"]["embeddings"] == "wiki.en.vec":
    #     word2idx, idx2word, weights = load_word_vectors_from_fasttext(
    #         os.path.join(EMB_DIR, config["data"]["embeddings"]),
    #         config["data"]["embeddings_dim"])
    # else:
    #     word2idx, idx2word, weights = load_word_vectors(
    #         os.path.join(EMB_DIR, config["data"]["embeddings"]),
    #         config["data"]["embeddings_dim"])

    ####################################################################
    # Linguistic Features Loading and Selection
    ####################################################################
    # Any features/lexicon should be in the form of a dictionary
    # For example: lex = {'word':[0., 1., ..., 0.]}

    # load affect features
    print("Loading linguistic features...")
    # todo: streamline feature loading pipeline
    features, feat_length = load_features(config["data"]["features"])
    # assert ... same len

    # build dataset
    print("Building training dataset...")
    train_set = ClfDataset(X_train,
                           y_train,
                           word2idx,
                           feat_length=feat_length,
                           features_dict=features)

    print("Building validation dataset...")
    val_set = ClfDataset(X_test,
                         y_test,
                         word2idx,
                         features_dict=features,
                         feat_length=feat_length)

    test_set = ClfDataset(X_test,
                          y_test,
                          word2idx,
                          features_dict=features,
                          feat_length=feat_length)
    train_set.truncate(500)
    val_set.truncate(100)

    src_lengths = [len(x) for x in train_set.data]
    val_lengths = [len(x) for x in val_set.data]
    test_lengths = [len(x) for x in test_set.data]

    # select sampler & dataloader
    train_sampler = BucketBatchSampler(src_lengths, config["batch_size"], True)
    val_sampler = SortedSampler(val_lengths)
    val_sampler_train = SortedSampler(src_lengths)
    test_sampler = SortedSampler(test_lengths)

    train_loader = DataLoader(train_set,
                              batch_sampler=train_sampler,
                              num_workers=opts.cores,
                              collate_fn=ClfCollate_withFeatures())
    val_loader = DataLoader(val_set,
                            sampler=val_sampler,
                            batch_size=config["batch_size"],
                            num_workers=opts.cores,
                            collate_fn=ClfCollate_withFeatures())
    val_loader_train_dataset = DataLoader(train_set,
                                          sampler=val_sampler_train,
                                          batch_size=config["batch_size"],
                                          num_workers=opts.cores,
                                          collate_fn=ClfCollate_withFeatures())
    test_loader = DataLoader(test_set,
                             sampler=test_sampler,
                             batch_size=config["batch_size"],
                             num_workers=opts.cores,
                             collate_fn=ClfCollate_withFeatures())

    ####################################################################
    # Model
    ####################################################################
    model = BaselineConcClassifier(ntokens=weights.shape[0],
                                   nclasses=len(set(train_set.labels)),
                                   feat_size=feat_length,
                                   **config["model"])
    model.word_embedding.embedding.weight = nn.Parameter(
        torch.from_numpy(weights), requires_grad=False)
    model.to(opts.device)
    print(model)

    ####################################################################
    # Count total parameters of model
    ####################################################################
    total_params = sum(p.numel() for p in model.parameters())
    total_trainable_params = sum(p.numel() for p in model.parameters()
                                 if p.requires_grad)

    print("Total Params:", number_h(total_params))
    print("Total Trainable Params:", number_h(total_trainable_params))

    if config["class_weights"]:
        class_weights = class_weigths(train_set.labels, to_pytorch=True)
        class_weights = class_weights.to(opts.device)

        criterion = nn.CrossEntropyLoss(weight=class_weights)
    else:
        criterion = nn.CrossEntropyLoss()

    clf_parameters = filter(lambda p: p.requires_grad, model.parameters())
    clf_optimizer = Adam(clf_parameters, weight_decay=1e-5)

    ####################################################################
    # Training Pipeline
    ####################################################################
    _outputs = []

    def batch_callback(i_batch, losses, batch_outputs):
        _outputs.append(batch_outputs)

        if trainer.step % config["log_interval"] == 0:
            outputs = list(zip(*_outputs))
            _losses = numpy.array(losses[-config["log_interval"]:]).mean(0)
            exp.update_metric("clf-loss", _losses)
            _y_hat = torch.cat(outputs[0]).max(-1)[1].cpu().data.numpy()
            _y = torch.cat(outputs[1]).cpu().data.numpy()
            f1 = f1_score(_y, _y_hat, average='macro')
            exp.update_metric("f1-train", f1)

            losses_log = exp.log_metrics(["clf-loss", 'f1-train'])
            exp.update_value("progress",
                             trainer.progress_log + "\n" + losses_log)

            # clean lines and move cursor back up N lines
            print("\n\033[K" + losses_log)
            print("\033[F" * (len(losses_log.split("\n")) + 2))

            _outputs.clear()

    # Trainer: responsible for managing the training process
    trainer = ClfTrainer_withFeatures(
        model=model,
        train_loader=train_loader,
        valid_loader=val_loader,
        valid_loader_train_set=val_loader_train_dataset,
        test_loader=test_loader,
        criterion=criterion,
        optimizers=clf_optimizer,
        config=config,
        device=opts.device,
        batch_end_callbacks=None)

    ####################################################################
    # Experiment: logging and visualizing the training process
    ####################################################################
    if cluster is False:
        exp = Experiment(opts.name,
                         config,
                         src_dirs=opts.source,
                         output_dir=EXP_DIR)

        exp.add_metric("ep_loss", "line", "epoch loss", ["TRAIN", "VAL"])
        exp.add_metric("ep_f1", "line", "epoch f1", ["TRAIN", "VAL"])
        exp.add_metric("ep_acc", "line", "epoch accuracy", ["TRAIN", "VAL"])
        exp.add_metric("ep_pre", "line", "epoch precision", ["TRAIN", "VAL"])
        exp.add_metric("ep_rec", "line", "epoch recall", ["TRAIN", "VAL"])

        exp.add_value("epoch", title="epoch summary", vis_type="text")
        exp.add_value("progress", title="training progress", vis_type="text")

    ####################################################################
    # Training Loop
    ####################################################################
    best_loss = None
    early_stopping = Early_stopping("min", config["patience"])

    for epoch in range(config["epochs"]):
        train_loss = trainer.train_epoch()
        val_loss, y, y_pred = trainer.eval_epoch(val_set=True)
        _, y_train, y_pred_train = trainer.eval_epoch(train_set=True)

        # Calculate accuracy and f1-macro on the evaluation set
        if cluster is False:
            exp.update_metric("ep_loss", train_loss.item(), "TRAIN")
            exp.update_metric("ep_loss", val_loss.item(), "VAL")

            exp.update_metric("ep_f1", f1_macro(y_train, y_pred_train),
                              "TRAIN")
            exp.update_metric("ep_f1", f1_macro(y, y_pred), "VAL")

            exp.update_metric("ep_acc", acc(y_train, y_pred_train), "TRAIN")
            exp.update_metric("ep_acc", acc(y, y_pred), "VAL")

            exp.update_metric("ep_pre", precision_macro(y_train, y_pred_train),
                              "TRAIN")
            exp.update_metric("ep_pre", precision_macro(y, y_pred), "VAL")

            exp.update_metric("ep_rec", recall_macro(y_train, y_pred_train),
                              "TRAIN")
            exp.update_metric("ep_rec", recall_macro(y, y_pred), "VAL")

            print()
            epoch_log = exp.log_metrics(
                ["ep_loss", "ep_f1", "ep_acc", "ep_pre", "ep_rec"])
            print(epoch_log)
            exp.update_value("epoch", epoch_log)

            exp.save()
        else:
            print("epoch: {}, train loss: {}, val loss: {}, f1: {}".format(
                epoch, train_loss.item(), val_loss.item(), f1_macro(y,
                                                                    y_pred)))

        # Save the model if the validation loss is the best we've seen so far.
        if not best_loss or val_loss < best_loss:
            best_loss = val_loss
            trainer.best_val_loss = best_loss
            trainer.acc = acc(y, y_pred)
            trainer.f1 = f1_macro(y, y_pred)
            trainer.precision = precision_macro(y, y_pred)
            trainer.recall = recall_macro(y, y_pred)

            trainer.checkpoint(name=opts.name, verbose=False)

        if early_stopping.stop(val_loss):
            print("Early Stopping...")
            break

        print("\n")

    # return trainer.best_val_loss, trainer.acc, trainer.f1, trainer.precision, trainer.recall
    #################
    # Test
    #################
    _, y_test_, y_test_predicted = trainer.eval_epoch(test_set=True)
    f1_test = f1_macro(y_test_, y_test_predicted)
    acc_test = acc(y_test_, y_test_predicted)
    print("#" * 33)
    print("F1 for test set: {}".format(f1_test))
    print("Accuracy for test set: {}".format(acc_test))
    print("#" * 33)

    return trainer.best_val_loss, trainer.acc, trainer.f1, trainer.precision, trainer.recall, f1_test, acc_test
Пример #4
0
exp.add_metric("loss", "line")
exp.add_metric("ppl", "line", "perplexity")
exp.add_metric("ep_loss", "line", "epoch loss", ["TRAIN", "VAL"])
exp.add_metric("ep_ppl", "line", "epoch perplexity", ["TRAIN", "VAL"])
exp.add_value("progress", title="training progress")
exp.add_value("epoch", title="epoch summary")

####################################################################
# Training Loop
####################################################################
best_loss = None
for epoch in range(config["epochs"]):
    train_loss = trainer.train_epoch()
    val_loss = trainer.eval_epoch()
    exp.update_metric("ep_loss", train_loss, "TRAIN")
    exp.update_metric("ep_loss", val_loss, "VAL")
    exp.update_metric("ep_ppl", math.exp(train_loss), "TRAIN")
    exp.update_metric("ep_ppl", math.exp(val_loss), "VAL")
    print()
    epoch_log = exp.log_metrics(["ep_loss", "ep_ppl"])
    print(epoch_log)
    exp.update_value("epoch", epoch_log)

    # Save the model if the val loss is the best we've seen so far.
    if not best_loss or val_loss < best_loss:
        best_loss = val_loss
        trainer.checkpoint(name=opts.name)

    print("\n" * 2)