os.path.join(EMB_DIR, config["data"]["embeddings"]),
        config["data"]["embeddings_dim"])
else:
    word2idx, idx2word, weights = load_word_vectors(
        os.path.join(EMB_DIR, config["data"]["embeddings"]),
        config["data"]["embeddings_dim"])

checkpoint_name = "Psych_exp_baseline"

state = load_checkpoint(checkpoint_name)

# features, feat_length = load_features(config["data"]["features"])

test_set = ClfDataset(X_test, y_test, word2idx, name="psych_test")
test_lengths = [len(x) for x in test_set.data]
test_sampler = SortedSampler(test_lengths)
test_loader = DataLoader(test_set,
                         sampler=test_sampler,
                         batch_size=config["batch_size"],
                         num_workers=opts.cores,
                         collate_fn=ClfCollate())

model = Classifier(ntokens=weights.shape[0], nclasses=7, **config["model"])
model.load_state_dict(state["model"])

posteriors_list = []
predicted_list = []
#####################################################################
# Load Trained Model
#####################################################################
model.to(device)
def sum_clf_test(dataset,
                 config,
                 opts,
                 transfer=False,
                 output_dir=None,
                 checkpoint_name='scv2_aux_ft_gu_last'):
    opts.name = config["name"]
    X_test, y_test, posts_test, pids, human_summaries = dataset
    vocab = None
    if transfer:
        opts.transfer = config["pretrained_lm"]
        checkpoint = load_checkpoint(opts.transfer)
        config["vocab"].update(checkpoint["config"]["vocab"])
        dict_pattern_rename(checkpoint["config"]["model"],
                            {"rnn_": "bottom_rnn_"})
        config["model"].update(checkpoint["config"]["model"])
        vocab = checkpoint["vocab"]

    ####################################################################
    # Load Preprocessed Datasets
    ####################################################################
    if config["preprocessor"] == "twitter":
        preprocessor = twitter_preprocessor()
    else:
        preprocessor = None

    ####################################################################
    # Model
    ####################################################################
    ntokens = 70004
    model = SummarizationClassifier(ntokens, len(set([0, 1])),
                                    **config["model"])
    model.to(opts.device)

    clf_criterion = nn.CrossEntropyLoss()
    lm_criterion = nn.CrossEntropyLoss(ignore_index=0)

    embed_parameters = filter(lambda p: p.requires_grad,
                              model.embed.parameters())
    bottom_parameters = filter(
        lambda p: p.requires_grad,
        chain(model.bottom_rnn.parameters(), model.vocab.parameters()))
    if config["model"]["has_att"]:
        top_parameters = filter(
            lambda p: p.requires_grad,
            chain(model.top_rnn.parameters(), model.attention.parameters(),
                  model.classes.parameters()))
    else:
        top_parameters = filter(
            lambda p: p.requires_grad,
            chain(model.top_rnn.parameters(), model.classes.parameters()))

    embed_optimizer = optim.ASGD(embed_parameters, lr=0.0001)
    rnn_optimizer = optim.ASGD(bottom_parameters)
    top_optimizer = Adam(top_parameters, lr=config["top_lr"])
    ####################################################################
    # Training Pipeline
    ####################################################################

    # Trainer: responsible for managing the training process
    trainer = SumClfTrainer(model,
                            None,
                            None, (lm_criterion, clf_criterion),
                            [embed_optimizer, rnn_optimizer, top_optimizer],
                            config,
                            opts.device,
                            valid_loader_train_set=None,
                            unfreeze_embed=config["unfreeze_embed"],
                            unfreeze_rnn=config["unfreeze_rnn"],
                            test_loader=None)

    ####################################################################
    # Resume Training from a previous checkpoint
    ####################################################################
    if transfer:
        print("Transferring Encoder weights ...")
        dict_pattern_rename(checkpoint["model"], {
            "encoder": "bottom_rnn",
            "decoder": "vocab"
        })
        load_state_dict_subset(model, checkpoint["model"])
    print(model)

    _vocab = trainer.load_checkpoint(name=checkpoint_name, path=None)
    test_set = SUMDataset(X_test,
                          posts_test,
                          y_test,
                          seq_len=config['data']['seq_len'],
                          post_len=config['data']['post_len'],
                          preprocess=preprocessor,
                          vocab=_vocab)
    test_lengths = [len(x) for x in test_set.data]
    test_sampler = SortedSampler(test_lengths)

    # test_loader = DataLoader(test_set, sampler=test_sampler,
    #                          batch_size=config["batch_size"],
    #                          num_workers=opts.cores, collate_fn=SumCollate())

    test_loader = DataLoader(test_set,
                             sampler=test_sampler,
                             batch_size=config["batch_size"],
                             num_workers=0,
                             collate_fn=SumCollate())

    trainer.test_loader = test_loader

    _, labels_array, predicted = trainer.test_epoch()

    pids_dic = {}
    if human_summaries is None:
        for x, y, sent, z in zip(y_test, predicted, X_test, pids):
            if z in pids_dic:
                pids_dic[z].append([x, y, sent])
            else:
                pids_dic[z] = [[x, y, sent]]
    else:
        for x, y, sent, z, h_summary in zip(y_test, predicted, X_test, pids,
                                            human_summaries):
            if z in pids_dic:
                pids_dic[z].append([x, y, sent, h_summary])
            else:
                pids_dic[z] = [[x, y, sent, h_summary]]

    # import os
    # if not os.path.exists('{}/ref_abs'.format(output_dir)):
    #     os.mkdir('{}/ref_abs'.format(output_dir))
    # if not os.path.exists('{}/dec'.format(output_dir)):
    #     os.mkdir('{}/dec'.format(output_dir))

    file_index = 0
    all_summaries = []
    for elem_key in pids_dic:
        current_summary = ''
        for pair in pids_dic[elem_key]:
            if pair[1] == 1:
                current_summary += pair[2] + '\n'

        all_summaries.append(current_summary)

    return all_summaries
Пример #3
0
def sent_clf(dataset, config, opts, transfer=False):
    from logger.experiment import Experiment

    opts.name = config["name"]
    X_train, y_train, _, X_val, y_val, _ = dataset
    vocab = None
    if transfer:
        opts.transfer = config["pretrained_lm"]
        checkpoint = load_checkpoint(opts.transfer)
        config["vocab"].update(checkpoint["config"]["vocab"])
        dict_pattern_rename(checkpoint["config"]["model"],
                            {"rnn_": "bottom_rnn_"})
        config["model"].update(checkpoint["config"]["model"])
        vocab = checkpoint["vocab"]

    ####################################################################
    # Load Preprocessed Datasets
    ####################################################################
    if config["preprocessor"] == "twitter":
        preprocessor = twitter_preprocessor()
    else:
        preprocessor = None

    print("Building training dataset...")
    train_set = ClfDataset(X_train,
                           y_train,
                           vocab=vocab,
                           preprocess=preprocessor,
                           vocab_size=config["vocab"]["size"],
                           seq_len=config["data"]["seq_len"])

    print("Building validation dataset...")
    val_set = ClfDataset(X_val,
                         y_val,
                         seq_len=train_set.seq_len,
                         preprocess=preprocessor,
                         vocab=train_set.vocab)

    src_lengths = [len(x) for x in train_set.data]
    val_lengths = [len(x) for x in val_set.data]

    # select sampler & dataloader
    train_sampler = BucketBatchSampler(src_lengths, config["batch_size"], True)
    val_sampler = SortedSampler(val_lengths)
    val_sampler_train = SortedSampler(src_lengths)

    train_loader = DataLoader(train_set,
                              batch_sampler=train_sampler,
                              num_workers=opts.cores,
                              collate_fn=ClfCollate())
    val_loader = DataLoader(val_set,
                            sampler=val_sampler,
                            batch_size=config["batch_size"],
                            num_workers=opts.cores,
                            collate_fn=ClfCollate())
    val_loader_train_dataset = DataLoader(train_set,
                                          sampler=val_sampler_train,
                                          batch_size=config["batch_size"],
                                          num_workers=opts.cores,
                                          collate_fn=ClfCollate())
    ####################################################################
    # Model
    ####################################################################
    ntokens = len(train_set.vocab)
    model = Classifier(ntokens, len(set(train_set.labels)), **config["model"])
    model.to(opts.device)

    clf_criterion = nn.CrossEntropyLoss()
    lm_criterion = nn.CrossEntropyLoss(ignore_index=0)

    embed_parameters = filter(lambda p: p.requires_grad,
                              model.embed.parameters())
    bottom_parameters = filter(
        lambda p: p.requires_grad,
        chain(model.bottom_rnn.parameters(), model.vocab.parameters()))
    if config["model"]["has_att"]:
        top_parameters = filter(
            lambda p: p.requires_grad,
            chain(model.top_rnn.parameters(), model.attention.parameters(),
                  model.classes.parameters()))
    else:
        top_parameters = filter(
            lambda p: p.requires_grad,
            chain(model.top_rnn.parameters(), model.classes.parameters()))

    embed_optimizer = optim.ASGD(embed_parameters, lr=0.0001)
    rnn_optimizer = optim.ASGD(bottom_parameters)
    top_optimizer = Adam(top_parameters, lr=config["top_lr"])
    ####################################################################
    # Training Pipeline
    ####################################################################

    # Trainer: responsible for managing the training process
    trainer = SentClfTrainer(model,
                             train_loader,
                             val_loader, (lm_criterion, clf_criterion),
                             [embed_optimizer, rnn_optimizer, top_optimizer],
                             config,
                             opts.device,
                             valid_loader_train_set=val_loader_train_dataset,
                             unfreeze_embed=config["unfreeze_embed"],
                             unfreeze_rnn=config["unfreeze_rnn"])

    ####################################################################
    # Experiment: logging and visualizing the training process
    ####################################################################

    # exp = Experiment(opts.name, config, src_dirs=opts.source,
    #                  output_dir=EXP_DIR)
    # exp.add_metric("ep_loss_lm", "line", "epoch loss lm",
    #                ["TRAIN", "VAL"])
    # exp.add_metric("ep_loss_cls", "line", "epoch loss class",
    #                ["TRAIN", "VAL"])
    # exp.add_metric("ep_f1", "line", "epoch f1", ["TRAIN", "VAL"])
    # exp.add_metric("ep_acc", "line", "epoch accuracy", ["TRAIN", "VAL"])
    #
    # exp.add_value("epoch", title="epoch summary")
    # exp.add_value("progress", title="training progress")

    ep_loss_lm = [10000, 10000]
    ep_loss_cls = [10000, 10000]
    ep_f1 = [0, 0]
    ep_acc = [0, 0]
    e_log = 0
    progress = 0
    ####################################################################
    # Resume Training from a previous checkpoint
    ####################################################################
    if transfer:
        print("Transferring Encoder weights ...")
        dict_pattern_rename(checkpoint["model"], {
            "encoder": "bottom_rnn",
            "decoder": "vocab"
        })
        load_state_dict_subset(model, checkpoint["model"])
    print(model)

    ####################################################################
    # Training Loop
    ####################################################################
    best_loss = None
    early_stopping = EarlyStopping("min", config["patience"])

    for epoch in range(0, config["epochs"]):

        train_loss = trainer.train_epoch()
        val_loss, y, y_pred = trainer.eval_epoch(val_set=True)
        _, y_train, y_pred_train = trainer.eval_epoch(train_set=True)
        # exp.update_metric("ep_loss_lm", train_loss[0], "TRAIN")
        ep_loss_lm[0] = train_loss[0]
        # exp.update_metric("ep_loss_lm", val_loss[0], "VAL")
        ep_loss_lm[1] = val_loss[0]
        # exp.update_metric("ep_loss_cls", train_loss[1], "TRAIN")
        # exp.update_metric("ep_loss_cls", val_loss[1], "VAL")
        ep_loss_cls[0] = train_loss[1]
        ep_loss_cls[1] = val_loss[1]

        # exp.update_metric("ep_f1", f1_macro(y_train, y_pred_train),
        #                   "TRAIN")
        ep_f1[0] = f1_macro(y_train, y_pred_train)
        # exp.update_metric("ep_f1", f1_macro(y, y_pred), "VAL")
        ep_f1[1] = f1_macro(y, y_pred)

        # exp.update_metric("ep_acc", acc(y_train, y_pred_train), "TRAIN")
        # exp.update_metric("ep_acc", acc(y, y_pred), "VAL")

        ep_acc[0] = acc(y_train, y_pred_train)
        ep_acc[1] = acc(y, y_pred)

        # print('Train lm Loss : {}\nVal lm Loss : {}\nTrain cls Loss : {}\nVal cls Loss : {}\n Train f1 : {}\nVal f1 : {}\nTrain acc : {}\n Val acc : {}'.format(
        #     ep_loss_lm[0], ep_loss_lm[1], ep_loss_cls[0], ep_loss_cls[1], ep_f1[0], ep_f1[1], ep_acc[0], ep_acc[1]
        # ))
        # epoch_log = exp.log_metrics(["ep_loss_lm", "ep_loss_cls","ep_f1", "ep_acc"])
        epoch_log = 'Train lm Loss : {}\nVal lm Loss : {}\nTrain cls Loss : {}\nVal cls Loss : {}\n Train f1 : {}\nVal f1 : {}\nTrain acc : {}\n Val acc : {}'.format(
            ep_loss_lm[0], ep_loss_lm[1], ep_loss_cls[0], ep_loss_cls[1],
            ep_f1[0], ep_f1[1], ep_acc[0], ep_acc[1])
        print(epoch_log)
        # exp.update_value("epoch", epoch_log)
        e_log = epoch_log
        # print('')
        # Save the model if the val loss is the best we've seen so far.
        # if not best_loss or val_loss[1] < best_loss:
        #     best_loss = val_loss[1]
        #     trainer.best_acc = acc(y, y_pred)
        #     trainer.best_f1 = f1_macro(y, y_pred)
        #     trainer.checkpoint(name=opts.name, timestamp=True)
        best_loss = val_loss[1]
        trainer.best_acc = acc(y, y_pred)
        trainer.best_f1 = f1_macro(y, y_pred)
        trainer.checkpoint(name=opts.name, tags=str(epoch))

        # if early_stopping.stop(val_loss[1]):
        #     print("Early Stopping (according to classification loss)....")
        #     break

        print("\n" * 2)

    return best_loss, trainer.best_acc, trainer.best_f1
def clf_features_baseline_runner(yaml,
                                 word2idx,
                                 idx2word,
                                 weights,
                                 cluster=False):
    if cluster is False:
        from logger.experiment import Experiment

    # torch.manual_seed(0)
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False

    opts, config = train_options(yaml)
    ####################################################################
    # Data Loading and Preprocessing
    ####################################################################
    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset(
        config["data"]["dataset"])

    # load word embeddings
    # if config["data"]["embeddings"] == "wiki.en.vec":
    #     word2idx, idx2word, weights = load_word_vectors_from_fasttext(
    #         os.path.join(EMB_DIR, config["data"]["embeddings"]),
    #         config["data"]["embeddings_dim"])
    # else:
    #     word2idx, idx2word, weights = load_word_vectors(
    #         os.path.join(EMB_DIR, config["data"]["embeddings"]),
    #         config["data"]["embeddings_dim"])

    ####################################################################
    # Linguistic Features Loading and Selection
    ####################################################################
    # Any features/lexicon should be in the form of a dictionary
    # For example: lex = {'word':[0., 1., ..., 0.]}

    # load affect features
    print("Loading linguistic features...")
    # todo: streamline feature loading pipeline
    features, feat_length = load_features(config["data"]["features"])
    # assert ... same len

    # build dataset
    print("Building training dataset...")
    train_set = ClfDataset(X_train,
                           y_train,
                           word2idx,
                           feat_length=feat_length,
                           features_dict=features)

    print("Building validation dataset...")
    val_set = ClfDataset(X_test,
                         y_test,
                         word2idx,
                         features_dict=features,
                         feat_length=feat_length)

    test_set = ClfDataset(X_test,
                          y_test,
                          word2idx,
                          features_dict=features,
                          feat_length=feat_length)
    train_set.truncate(500)
    val_set.truncate(100)

    src_lengths = [len(x) for x in train_set.data]
    val_lengths = [len(x) for x in val_set.data]
    test_lengths = [len(x) for x in test_set.data]

    # select sampler & dataloader
    train_sampler = BucketBatchSampler(src_lengths, config["batch_size"], True)
    val_sampler = SortedSampler(val_lengths)
    val_sampler_train = SortedSampler(src_lengths)
    test_sampler = SortedSampler(test_lengths)

    train_loader = DataLoader(train_set,
                              batch_sampler=train_sampler,
                              num_workers=opts.cores,
                              collate_fn=ClfCollate_withFeatures())
    val_loader = DataLoader(val_set,
                            sampler=val_sampler,
                            batch_size=config["batch_size"],
                            num_workers=opts.cores,
                            collate_fn=ClfCollate_withFeatures())
    val_loader_train_dataset = DataLoader(train_set,
                                          sampler=val_sampler_train,
                                          batch_size=config["batch_size"],
                                          num_workers=opts.cores,
                                          collate_fn=ClfCollate_withFeatures())
    test_loader = DataLoader(test_set,
                             sampler=test_sampler,
                             batch_size=config["batch_size"],
                             num_workers=opts.cores,
                             collate_fn=ClfCollate_withFeatures())

    ####################################################################
    # Model
    ####################################################################
    model = BaselineConcClassifier(ntokens=weights.shape[0],
                                   nclasses=len(set(train_set.labels)),
                                   feat_size=feat_length,
                                   **config["model"])
    model.word_embedding.embedding.weight = nn.Parameter(
        torch.from_numpy(weights), requires_grad=False)
    model.to(opts.device)
    print(model)

    ####################################################################
    # Count total parameters of model
    ####################################################################
    total_params = sum(p.numel() for p in model.parameters())
    total_trainable_params = sum(p.numel() for p in model.parameters()
                                 if p.requires_grad)

    print("Total Params:", number_h(total_params))
    print("Total Trainable Params:", number_h(total_trainable_params))

    if config["class_weights"]:
        class_weights = class_weigths(train_set.labels, to_pytorch=True)
        class_weights = class_weights.to(opts.device)

        criterion = nn.CrossEntropyLoss(weight=class_weights)
    else:
        criterion = nn.CrossEntropyLoss()

    clf_parameters = filter(lambda p: p.requires_grad, model.parameters())
    clf_optimizer = Adam(clf_parameters, weight_decay=1e-5)

    ####################################################################
    # Training Pipeline
    ####################################################################
    _outputs = []

    def batch_callback(i_batch, losses, batch_outputs):
        _outputs.append(batch_outputs)

        if trainer.step % config["log_interval"] == 0:
            outputs = list(zip(*_outputs))
            _losses = numpy.array(losses[-config["log_interval"]:]).mean(0)
            exp.update_metric("clf-loss", _losses)
            _y_hat = torch.cat(outputs[0]).max(-1)[1].cpu().data.numpy()
            _y = torch.cat(outputs[1]).cpu().data.numpy()
            f1 = f1_score(_y, _y_hat, average='macro')
            exp.update_metric("f1-train", f1)

            losses_log = exp.log_metrics(["clf-loss", 'f1-train'])
            exp.update_value("progress",
                             trainer.progress_log + "\n" + losses_log)

            # clean lines and move cursor back up N lines
            print("\n\033[K" + losses_log)
            print("\033[F" * (len(losses_log.split("\n")) + 2))

            _outputs.clear()

    # Trainer: responsible for managing the training process
    trainer = ClfTrainer_withFeatures(
        model=model,
        train_loader=train_loader,
        valid_loader=val_loader,
        valid_loader_train_set=val_loader_train_dataset,
        test_loader=test_loader,
        criterion=criterion,
        optimizers=clf_optimizer,
        config=config,
        device=opts.device,
        batch_end_callbacks=None)

    ####################################################################
    # Experiment: logging and visualizing the training process
    ####################################################################
    if cluster is False:
        exp = Experiment(opts.name,
                         config,
                         src_dirs=opts.source,
                         output_dir=EXP_DIR)

        exp.add_metric("ep_loss", "line", "epoch loss", ["TRAIN", "VAL"])
        exp.add_metric("ep_f1", "line", "epoch f1", ["TRAIN", "VAL"])
        exp.add_metric("ep_acc", "line", "epoch accuracy", ["TRAIN", "VAL"])
        exp.add_metric("ep_pre", "line", "epoch precision", ["TRAIN", "VAL"])
        exp.add_metric("ep_rec", "line", "epoch recall", ["TRAIN", "VAL"])

        exp.add_value("epoch", title="epoch summary", vis_type="text")
        exp.add_value("progress", title="training progress", vis_type="text")

    ####################################################################
    # Training Loop
    ####################################################################
    best_loss = None
    early_stopping = Early_stopping("min", config["patience"])

    for epoch in range(config["epochs"]):
        train_loss = trainer.train_epoch()
        val_loss, y, y_pred = trainer.eval_epoch(val_set=True)
        _, y_train, y_pred_train = trainer.eval_epoch(train_set=True)

        # Calculate accuracy and f1-macro on the evaluation set
        if cluster is False:
            exp.update_metric("ep_loss", train_loss.item(), "TRAIN")
            exp.update_metric("ep_loss", val_loss.item(), "VAL")

            exp.update_metric("ep_f1", f1_macro(y_train, y_pred_train),
                              "TRAIN")
            exp.update_metric("ep_f1", f1_macro(y, y_pred), "VAL")

            exp.update_metric("ep_acc", acc(y_train, y_pred_train), "TRAIN")
            exp.update_metric("ep_acc", acc(y, y_pred), "VAL")

            exp.update_metric("ep_pre", precision_macro(y_train, y_pred_train),
                              "TRAIN")
            exp.update_metric("ep_pre", precision_macro(y, y_pred), "VAL")

            exp.update_metric("ep_rec", recall_macro(y_train, y_pred_train),
                              "TRAIN")
            exp.update_metric("ep_rec", recall_macro(y, y_pred), "VAL")

            print()
            epoch_log = exp.log_metrics(
                ["ep_loss", "ep_f1", "ep_acc", "ep_pre", "ep_rec"])
            print(epoch_log)
            exp.update_value("epoch", epoch_log)

            exp.save()
        else:
            print("epoch: {}, train loss: {}, val loss: {}, f1: {}".format(
                epoch, train_loss.item(), val_loss.item(), f1_macro(y,
                                                                    y_pred)))

        # Save the model if the validation loss is the best we've seen so far.
        if not best_loss or val_loss < best_loss:
            best_loss = val_loss
            trainer.best_val_loss = best_loss
            trainer.acc = acc(y, y_pred)
            trainer.f1 = f1_macro(y, y_pred)
            trainer.precision = precision_macro(y, y_pred)
            trainer.recall = recall_macro(y, y_pred)

            trainer.checkpoint(name=opts.name, verbose=False)

        if early_stopping.stop(val_loss):
            print("Early Stopping...")
            break

        print("\n")

    # return trainer.best_val_loss, trainer.acc, trainer.f1, trainer.precision, trainer.recall
    #################
    # Test
    #################
    _, y_test_, y_test_predicted = trainer.eval_epoch(test_set=True)
    f1_test = f1_macro(y_test_, y_test_predicted)
    acc_test = acc(y_test_, y_test_predicted)
    print("#" * 33)
    print("F1 for test set: {}".format(f1_test))
    print("Accuracy for test set: {}".format(acc_test))
    print("#" * 33)

    return trainer.best_val_loss, trainer.acc, trainer.f1, trainer.precision, trainer.recall, f1_test, acc_test
Пример #5
0
def sent_clf_no_aux(dataset, config, opts, transfer=False):
    from logger.experiment import Experiment

    opts.name = config["name"]
    X_train, y_train, X_val, y_val = dataset
    vocab = None
    if transfer:
        opts.transfer = config["pretrained_lm"]
        checkpoint = load_checkpoint(opts.transfer)
        config["vocab"].update(checkpoint["config"]["vocab"])
        dict_pattern_rename(checkpoint["config"]["model"],
                            {"rnn_": "bottom_rnn_"})
        config["model"].update(checkpoint["config"]["model"])
        vocab = checkpoint["vocab"]

    ####################################################################
    # Data Loading and Preprocessing
    ####################################################################
    if config["preprocessor"] == "twitter":
        preprocessor = twitter_preprocessor()
    else:
        preprocessor = None

    print("Building training dataset...")
    train_set = ClfDataset(X_train,
                           y_train,
                           vocab=vocab,
                           preprocess=preprocessor,
                           vocab_size=config["vocab"]["size"],
                           seq_len=config["data"]["seq_len"])

    print("Building validation dataset...")
    val_set = ClfDataset(X_val,
                         y_val,
                         seq_len=train_set.seq_len,
                         preprocess=preprocessor,
                         vocab=train_set.vocab)

    src_lengths = [len(x) for x in train_set.data]
    val_lengths = [len(x) for x in val_set.data]

    # select sampler & dataloader
    train_sampler = BucketBatchSampler(src_lengths, config["batch_size"], True)
    val_sampler = SortedSampler(val_lengths)
    val_sampler_train = SortedSampler(src_lengths)

    train_loader = DataLoader(train_set,
                              batch_sampler=train_sampler,
                              num_workers=opts.cores,
                              collate_fn=ClfCollate())
    val_loader = DataLoader(val_set,
                            sampler=val_sampler,
                            batch_size=config["batch_size"],
                            num_workers=opts.cores,
                            collate_fn=ClfCollate())
    val_loader_train_dataset = DataLoader(train_set,
                                          sampler=val_sampler_train,
                                          batch_size=config["batch_size"],
                                          num_workers=opts.cores,
                                          collate_fn=ClfCollate())
    ####################################################################
    # Model
    ####################################################################
    ntokens = len(train_set.vocab)
    model = NaiveClassifier(ntokens,
                            len(set(train_set.labels)),
                            attention=config["model"]["has_att"],
                            **config["model"])
    model.to(opts.device)

    criterion = nn.CrossEntropyLoss()

    if config["gu"]:

        embed_parameters = filter(lambda p: p.requires_grad,
                                  model.embed.parameters())
        bottom_parameters = filter(lambda p: p.requires_grad,
                                   chain(model.bottom_rnn.parameters()))
        if config["model"]["has_att"]:
            top_parameters = filter(
                lambda p: p.requires_grad,
                chain(model.attention.parameters(),
                      model.classes.parameters()))
        else:
            top_parameters = filter(lambda p: p.requires_grad,
                                    model.classes.parameters())

        embed_optimizer = Adam(embed_parameters)
        rnn_optimizer = Adam(bottom_parameters)
        top_optimizer = Adam(top_parameters)

        # Trainer: responsible for managing the training process
        trainer = SentClfNoAuxTrainer(
            model,
            train_loader,
            val_loader,
            criterion, [embed_optimizer, rnn_optimizer, top_optimizer],
            config,
            opts.device,
            valid_loader_train_set=val_loader_train_dataset,
            unfreeze_embed=config["unfreeze_embed"],
            unfreeze_rnn=config["unfreeze_rnn"])
    else:
        parameters = filter(lambda p: p.requires_grad, model.parameters())

        optimizer = optim.Adam(parameters, lr=config["top_lr"])
        # Trainer: responsible for managing the training process
        trainer = SentClfNoAuxTrainer(
            model,
            train_loader,
            val_loader,
            criterion, [optimizer],
            config,
            opts.device,
            valid_loader_train_set=val_loader_train_dataset)

    ####################################################################
    # Experiment: logging and visualizing the training process
    ####################################################################
    exp = Experiment(opts.name,
                     config,
                     src_dirs=opts.source,
                     output_dir=EXP_DIR)
    exp.add_metric("ep_loss", "line", "epoch loss class", ["TRAIN", "VAL"])
    exp.add_metric("ep_f1", "line", "epoch f1", ["TRAIN", "VAL"])
    exp.add_metric("ep_acc", "line", "epoch accuracy", ["TRAIN", "VAL"])

    exp.add_value("epoch", title="epoch summary")
    exp.add_value("progress", title="training progress")

    ####################################################################
    # Resume Training from a previous checkpoint
    ####################################################################
    if transfer:
        print("Transferring Encoder weights ...")
        dict_pattern_rename(checkpoint["model"], {"encoder": "bottom_rnn"})
        load_state_dict_subset(model, checkpoint["model"])

    print(model)

    ####################################################################
    # Training Loop
    ####################################################################
    best_loss = None
    early_stopping = EarlyStopping("min", config["patience"])

    for epoch in range(1, config["epochs"] + 1):
        train_loss = trainer.train_epoch()
        val_loss, y, y_pred = trainer.eval_epoch(val_set=True)
        _, y_train, y_pred_train = trainer.eval_epoch(train_set=True)
        # Calculate accuracy and f1-macro on the evaluation set
        exp.update_metric("ep_loss", train_loss.item(), "TRAIN")
        exp.update_metric("ep_loss", val_loss.item(), "VAL")
        exp.update_metric("ep_f1", f1_macro(y_train, y_pred_train), "TRAIN")
        exp.update_metric("ep_f1", f1_macro(y, y_pred), "VAL")
        exp.update_metric("ep_acc", acc(y_train, y_pred_train), "TRAIN")
        exp.update_metric("ep_acc", acc(y, y_pred), "VAL")

        print()
        epoch_log = exp.log_metrics(["ep_loss", "ep_f1", "ep_acc"])
        print(epoch_log)
        exp.update_value("epoch", epoch_log)

        ###############################################################
        # Unfreezing the model after X epochs
        ###############################################################
        # Save the model if the val loss is the best we've seen so far.
        if not best_loss or val_loss < best_loss:
            best_loss = val_loss
            trainer.best_acc = acc(y, y_pred)
            trainer.best_f1 = f1_macro(y, y_pred)
            trainer.checkpoint(name=opts.name)

        if early_stopping.stop(val_loss):
            print("Early Stopping (according to cls loss)....")
            break

        print("\n" * 2)

    return best_loss, trainer.best_acc, trainer.best_f1
Пример #6
0

print("Building training dataset...")
train_set = LMDataset(config["data"]["train_path"], preprocess=preprocessor,
                      vocab=vocab, vocab_size=config["vocab"]["size"],
                      seq_len=config["data"]["seq_len"])

print("Building validation dataset...")
val_set = LMDataset(config["data"]["val_path"], preprocess=preprocessor,
                    seq_len=train_set.seq_len, vocab=train_set.vocab)

src_lengths = [len(x) for x in train_set.data]
val_lengths = [len(x) for x in val_set.data]
train_sampler = BucketBatchSampler(src_lengths, config["batch_size"],
                                   True)
val_sampler = SortedSampler(val_lengths)

train_loader = DataLoader(train_set, batch_sampler=train_sampler,
                          num_workers=opts.cores, collate_fn=LMCollate())
val_loader = DataLoader(val_set, sampler=val_sampler,
                        batch_size=config["batch_size"],
                        num_workers=opts.cores, collate_fn=LMCollate())

####################################################################
# Model
####################################################################
ntokens = len(train_set.vocab)
model = LangModel(ntokens, **config["model"]).to(opts.device)

loss_function = nn.CrossEntropyLoss(ignore_index=0)
parameters = filter(lambda p: p.requires_grad, model.parameters())