posteriors = torch.cat(posteriors, dim=0)
    predicted = numpy.argmax(posteriors, 1)
    labels = numpy.array(torch.cat(labels, dim=0))
    posteriors_one_list = [
        posteriors.cpu().numpy()[i] for i in range(0, posteriors.shape[0])
    ]
    return labels, predicted, posteriors_one_list, attentions, texts


####################################################################
# Test
####################################################################
psych_yamls = os.path.join(YAML_PATH, 'psychexp')
yaml = os.path.join(psych_yamls, "baseline.yaml")

opts, config = train_options(yaml)
device = opts.device
X_train, y_train, X_test, y_test = load_dataset(config["data"]["dataset"],
                                                test=True)

# load word embeddings
if config["data"]["embeddings"] == "wiki.en.vec":
    word2idx, idx2word, weights = load_word_vectors_from_fasttext(
        os.path.join(EMB_DIR, config["data"]["embeddings"]),
        config["data"]["embeddings_dim"])
else:
    word2idx, idx2word, weights = load_word_vectors(
        os.path.join(EMB_DIR, config["data"]["embeddings"]),
        config["data"]["embeddings_dim"])

checkpoint_name = "Psych_exp_baseline"
예제 #2
0
sys.path.append('./')
from models.baseline_trainer import BaselineTrainer
from modules.data.collates import MovieCollate
from modules.data.datasets import TRIPOD
from modules.models import TAM
from sys_config import MODEL_CNF_DIR, DATA_DIR
from utils.generic import number_h
from utils.opts import train_options

####################################################################
# SETTINGS
####################################################################
from utils.training import class_weigths

default_config = os.path.join(MODEL_CNF_DIR, "teacher.yaml")
opts, config = train_options(default_config)

assert config["data"]["encoder"] in ["USE"]

with open(
        os.path.join(
            DATA_DIR,
            "train_TRIPOD_{}.pickle".format(config["data"]["encoder"])),
        "rb") as f:
    train_data = pickle.load(f)

with open(
        os.path.join(DATA_DIR, "test_TRIPOD_{}.pickle".format(
            config["data"]["encoder"])), "rb") as f:
    test_data = pickle.load(f)
예제 #3
0
                    help="add an auxiliary LM loss to the transferred model"
                         "or simply transfer a LM to a classifier"
                         " and fine-tune")
parser.add_argument("-o", "--output_dir", default='./output/forum/',
                    help="The output Directory")
parser.add_argument("-j", "--job", default='Train',
                    help="Train or Test")
parser.add_argument("-tc", "--test_checkpoint_name", default='scv2_aux_ft_gu_last',
                    help="checkpoint to use for testing")

args = parser.parse_args()
input_config = args.input
transfer = args.transfer
aux_loss = args.aux_loss
device_to_use = args.device_to_use
opts, config = train_options(input_config, parser, device_to_use)

if args.job == 'Train':
    test = False
else:
    test = True

output_dir = args.output_dir#'./output/forum/'

test_checkpoint_name = args.test_checkpoint_name

dataset = load_dataset(config, test=test)

# if aux_loss:
if test is True:
    print('Testing..............')
def clf_features_baseline_runner(yaml,
                                 word2idx,
                                 idx2word,
                                 weights,
                                 cluster=False):
    if cluster is False:
        from logger.experiment import Experiment

    # torch.manual_seed(0)
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False

    opts, config = train_options(yaml)
    ####################################################################
    # Data Loading and Preprocessing
    ####################################################################
    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset(
        config["data"]["dataset"])

    # load word embeddings
    # if config["data"]["embeddings"] == "wiki.en.vec":
    #     word2idx, idx2word, weights = load_word_vectors_from_fasttext(
    #         os.path.join(EMB_DIR, config["data"]["embeddings"]),
    #         config["data"]["embeddings_dim"])
    # else:
    #     word2idx, idx2word, weights = load_word_vectors(
    #         os.path.join(EMB_DIR, config["data"]["embeddings"]),
    #         config["data"]["embeddings_dim"])

    ####################################################################
    # Linguistic Features Loading and Selection
    ####################################################################
    # Any features/lexicon should be in the form of a dictionary
    # For example: lex = {'word':[0., 1., ..., 0.]}

    # load affect features
    print("Loading linguistic features...")
    # todo: streamline feature loading pipeline
    features, feat_length = load_features(config["data"]["features"])
    # assert ... same len

    # build dataset
    print("Building training dataset...")
    train_set = ClfDataset(X_train,
                           y_train,
                           word2idx,
                           feat_length=feat_length,
                           features_dict=features)

    print("Building validation dataset...")
    val_set = ClfDataset(X_test,
                         y_test,
                         word2idx,
                         features_dict=features,
                         feat_length=feat_length)

    test_set = ClfDataset(X_test,
                          y_test,
                          word2idx,
                          features_dict=features,
                          feat_length=feat_length)
    train_set.truncate(500)
    val_set.truncate(100)

    src_lengths = [len(x) for x in train_set.data]
    val_lengths = [len(x) for x in val_set.data]
    test_lengths = [len(x) for x in test_set.data]

    # select sampler & dataloader
    train_sampler = BucketBatchSampler(src_lengths, config["batch_size"], True)
    val_sampler = SortedSampler(val_lengths)
    val_sampler_train = SortedSampler(src_lengths)
    test_sampler = SortedSampler(test_lengths)

    train_loader = DataLoader(train_set,
                              batch_sampler=train_sampler,
                              num_workers=opts.cores,
                              collate_fn=ClfCollate_withFeatures())
    val_loader = DataLoader(val_set,
                            sampler=val_sampler,
                            batch_size=config["batch_size"],
                            num_workers=opts.cores,
                            collate_fn=ClfCollate_withFeatures())
    val_loader_train_dataset = DataLoader(train_set,
                                          sampler=val_sampler_train,
                                          batch_size=config["batch_size"],
                                          num_workers=opts.cores,
                                          collate_fn=ClfCollate_withFeatures())
    test_loader = DataLoader(test_set,
                             sampler=test_sampler,
                             batch_size=config["batch_size"],
                             num_workers=opts.cores,
                             collate_fn=ClfCollate_withFeatures())

    ####################################################################
    # Model
    ####################################################################
    model = BaselineConcClassifier(ntokens=weights.shape[0],
                                   nclasses=len(set(train_set.labels)),
                                   feat_size=feat_length,
                                   **config["model"])
    model.word_embedding.embedding.weight = nn.Parameter(
        torch.from_numpy(weights), requires_grad=False)
    model.to(opts.device)
    print(model)

    ####################################################################
    # Count total parameters of model
    ####################################################################
    total_params = sum(p.numel() for p in model.parameters())
    total_trainable_params = sum(p.numel() for p in model.parameters()
                                 if p.requires_grad)

    print("Total Params:", number_h(total_params))
    print("Total Trainable Params:", number_h(total_trainable_params))

    if config["class_weights"]:
        class_weights = class_weigths(train_set.labels, to_pytorch=True)
        class_weights = class_weights.to(opts.device)

        criterion = nn.CrossEntropyLoss(weight=class_weights)
    else:
        criterion = nn.CrossEntropyLoss()

    clf_parameters = filter(lambda p: p.requires_grad, model.parameters())
    clf_optimizer = Adam(clf_parameters, weight_decay=1e-5)

    ####################################################################
    # Training Pipeline
    ####################################################################
    _outputs = []

    def batch_callback(i_batch, losses, batch_outputs):
        _outputs.append(batch_outputs)

        if trainer.step % config["log_interval"] == 0:
            outputs = list(zip(*_outputs))
            _losses = numpy.array(losses[-config["log_interval"]:]).mean(0)
            exp.update_metric("clf-loss", _losses)
            _y_hat = torch.cat(outputs[0]).max(-1)[1].cpu().data.numpy()
            _y = torch.cat(outputs[1]).cpu().data.numpy()
            f1 = f1_score(_y, _y_hat, average='macro')
            exp.update_metric("f1-train", f1)

            losses_log = exp.log_metrics(["clf-loss", 'f1-train'])
            exp.update_value("progress",
                             trainer.progress_log + "\n" + losses_log)

            # clean lines and move cursor back up N lines
            print("\n\033[K" + losses_log)
            print("\033[F" * (len(losses_log.split("\n")) + 2))

            _outputs.clear()

    # Trainer: responsible for managing the training process
    trainer = ClfTrainer_withFeatures(
        model=model,
        train_loader=train_loader,
        valid_loader=val_loader,
        valid_loader_train_set=val_loader_train_dataset,
        test_loader=test_loader,
        criterion=criterion,
        optimizers=clf_optimizer,
        config=config,
        device=opts.device,
        batch_end_callbacks=None)

    ####################################################################
    # Experiment: logging and visualizing the training process
    ####################################################################
    if cluster is False:
        exp = Experiment(opts.name,
                         config,
                         src_dirs=opts.source,
                         output_dir=EXP_DIR)

        exp.add_metric("ep_loss", "line", "epoch loss", ["TRAIN", "VAL"])
        exp.add_metric("ep_f1", "line", "epoch f1", ["TRAIN", "VAL"])
        exp.add_metric("ep_acc", "line", "epoch accuracy", ["TRAIN", "VAL"])
        exp.add_metric("ep_pre", "line", "epoch precision", ["TRAIN", "VAL"])
        exp.add_metric("ep_rec", "line", "epoch recall", ["TRAIN", "VAL"])

        exp.add_value("epoch", title="epoch summary", vis_type="text")
        exp.add_value("progress", title="training progress", vis_type="text")

    ####################################################################
    # Training Loop
    ####################################################################
    best_loss = None
    early_stopping = Early_stopping("min", config["patience"])

    for epoch in range(config["epochs"]):
        train_loss = trainer.train_epoch()
        val_loss, y, y_pred = trainer.eval_epoch(val_set=True)
        _, y_train, y_pred_train = trainer.eval_epoch(train_set=True)

        # Calculate accuracy and f1-macro on the evaluation set
        if cluster is False:
            exp.update_metric("ep_loss", train_loss.item(), "TRAIN")
            exp.update_metric("ep_loss", val_loss.item(), "VAL")

            exp.update_metric("ep_f1", f1_macro(y_train, y_pred_train),
                              "TRAIN")
            exp.update_metric("ep_f1", f1_macro(y, y_pred), "VAL")

            exp.update_metric("ep_acc", acc(y_train, y_pred_train), "TRAIN")
            exp.update_metric("ep_acc", acc(y, y_pred), "VAL")

            exp.update_metric("ep_pre", precision_macro(y_train, y_pred_train),
                              "TRAIN")
            exp.update_metric("ep_pre", precision_macro(y, y_pred), "VAL")

            exp.update_metric("ep_rec", recall_macro(y_train, y_pred_train),
                              "TRAIN")
            exp.update_metric("ep_rec", recall_macro(y, y_pred), "VAL")

            print()
            epoch_log = exp.log_metrics(
                ["ep_loss", "ep_f1", "ep_acc", "ep_pre", "ep_rec"])
            print(epoch_log)
            exp.update_value("epoch", epoch_log)

            exp.save()
        else:
            print("epoch: {}, train loss: {}, val loss: {}, f1: {}".format(
                epoch, train_loss.item(), val_loss.item(), f1_macro(y,
                                                                    y_pred)))

        # Save the model if the validation loss is the best we've seen so far.
        if not best_loss or val_loss < best_loss:
            best_loss = val_loss
            trainer.best_val_loss = best_loss
            trainer.acc = acc(y, y_pred)
            trainer.f1 = f1_macro(y, y_pred)
            trainer.precision = precision_macro(y, y_pred)
            trainer.recall = recall_macro(y, y_pred)

            trainer.checkpoint(name=opts.name, verbose=False)

        if early_stopping.stop(val_loss):
            print("Early Stopping...")
            break

        print("\n")

    # return trainer.best_val_loss, trainer.acc, trainer.f1, trainer.precision, trainer.recall
    #################
    # Test
    #################
    _, y_test_, y_test_predicted = trainer.eval_epoch(test_set=True)
    f1_test = f1_macro(y_test_, y_test_predicted)
    acc_test = acc(y_test_, y_test_predicted)
    print("#" * 33)
    print("F1 for test set: {}".format(f1_test))
    print("Accuracy for test set: {}".format(acc_test))
    print("#" * 33)

    return trainer.best_val_loss, trainer.acc, trainer.f1, trainer.precision, trainer.recall, f1_test, acc_test
예제 #5
0
from models.sent_lm_trainer import LMTrainer
from modules.data.collates import LMCollate
from modules.data.datasets import SentenceLMDataset
from modules.data.samplers import SortedSampler, BucketBatchSampler
from modules.data.vocab import Vocab
from modules.modules import SeqReader
from mylogger.experiment import Experiment
from sys_config import EXP_DIR, MODEL_CNF_DIR
from utils.generic import number_h
from utils.opts import train_options
from utils.training import load_checkpoint

####################################################################
# SETTINGS
####################################################################
opts, config = train_options()

####################################################################
# Data Loading and Preprocessing
####################################################################

vocab = None

if config["vocab"]["vocab_path"] is not None:
    vocab_path = config["vocab"]["vocab_path"]
    print(f"Loading vocab from '{vocab_path}'...")
    vocab = Vocab()
    vocab.from_file(vocab_path)

if opts.cp_vocab is not None:
    print(f"Loading vocab from checkpoint '{opts.cp_vocab}'...")
예제 #6
0
from modules.modules import LangModel
from sys_config import EXP_DIR
from utils.datasets import LMDataset, LMCollate, BucketBatchSampler, \
    SortedSampler

parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", required=False,
                    default='lm_20m_word.yaml',
                    help="config file of language model")
args = parser.parse_args()
input_config = args.input
####################################################################
# SETTINGS
####################################################################

opts, config = train_options(input_config, parser)

from logger.experiment import Experiment
####################################################################
# Data Loading and Preprocessing
####################################################################

vocab = None
opts.name = config["name"]
preprocessor = None


print("Building training dataset...")
train_set = LMDataset(config["data"]["train_path"], preprocess=preprocessor,
                      vocab=vocab, vocab_size=config["vocab"]["size"],
                      seq_len=config["data"]["seq_len"])