Example #1
def get_datasets_for(dataset_to_test, offset=(0, 0, 0)):
    input_shape = (100, 110, 120)
    output_shape = (40, 30, 80)
    # Integer division keeps the border sizes usable as slice indices
    borders = tuple([(in_ - out_) // 2 for (in_, out_) in zip(input_shape, output_shape)])
    input_slices = tuple([slice(x, x + l) for x, l in zip(offset, input_shape)])
    output_slices = tuple([slice(x + b, x + b + l) for x, b, l in zip(offset, borders, output_shape)])
    numpy_dataset = get_numpy_dataset(dataset_to_test, input_slices, output_slices, True)
    data_loader = DataLoader(1, [dataset_to_test], input_shape, output_shape)
    data_loader.start_refreshing_shared_dataset(0, dataset_index=0, offset=offset)
    dataset, index_of_shared_dataset = data_loader.get_dataset()
    return dataset, numpy_dataset
Example #2
def get_datasets_for(dataset_to_test, offset=(0, 0, 0)):
    input_shape = (100, 110, 120)
    output_shape = (40, 30, 80)
    # Integer division keeps the border sizes usable as slice indices
    borders = tuple([(in_ - out_) // 2
                     for (in_, out_) in zip(input_shape, output_shape)])
    input_slices = tuple(
        [slice(x, x + l) for x, l in zip(offset, input_shape)])
    output_slices = tuple([
        slice(x + b, x + b + l)
        for x, b, l in zip(offset, borders, output_shape)
    ])
    numpy_dataset = get_numpy_dataset(dataset_to_test, input_slices,
                                      output_slices, True)
    data_loader = DataLoader(1, [dataset_to_test], input_shape,
                             output_shape)
    data_loader.start_refreshing_shared_dataset(0,
                                                dataset_index=0,
                                                offset=offset)
    dataset, index_of_shared_dataset = data_loader.get_dataset()
    return dataset, numpy_dataset
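For concreteness, the border and output-slice arithmetic above works out as follows; this is a small illustrative check (not part of the original example), assuming offset=(0, 0, 0):

# Illustrative check of the border/slice arithmetic (assumes offset=(0, 0, 0)).
input_shape = (100, 110, 120)
output_shape = (40, 30, 80)
borders = tuple((i - o) // 2 for i, o in zip(input_shape, output_shape))
print(borders)  # (30, 40, 20)
output_slices = tuple(slice(x + b, x + b + l)
                      for x, b, l in zip((0, 0, 0), borders, output_shape))
print(output_slices)  # (slice(30, 70, None), slice(40, 70, None), slice(20, 100, None))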
Example #3
def classify_subjects_parallel(sub, feature_spaces, model, cv):
    """ Helper function to parallelize analysis across subjects. """
    scores, preds, coefs = [], [], dict()
    for fs in feature_spaces:
        
        if not isinstance(fs, (tuple, list)):
            fs = (fs,)
        
        fs_name = '+'.join(fs)

        dl = DataLoader(sub=sub, log_level=30)
        dl.load_y(strategy_doubles='hard')
        dl.load_X(feature_set=fs, n_comp=100)
        X, y = dl.return_Xy()

        preds_, scores_, coefs_, model_ = cross_val_predict_and_score(
            estimator=model,
            X=X, y=y,
            cv=cv,
            scoring=tjur_score,
            between_sub=False,
            soft=True
        )
        joblib.dump(model_, f'models/sub-{sub}_analysis-within_split-train_fs-{fs_name}_model.jl')

        dl.log.warning(f"sub-{sub} scores: {np.round(scores_, 2)} (fs = {fs_name})")
        scores_df = pd.DataFrame(scores_, columns=['score'])
        scores_df['feature_set'] = fs_name
        scores_df['emotion'] = dl.le.classes_
        scores_df['sub'] = sub
        scores.append(scores_df)

        for i in range(len(preds_)):
            preds_[i]['feature_set'] = fs_name
            preds_[i]['sub'] = sub
            preds_[i]['rep'] = i
        
        preds.append(pd.concat(preds_, axis=0))

        coefs_df = pd.DataFrame(data=coefs_, columns=X.columns)
        coefs_df['feature_set'] = fs_name
        coefs_df['emotion'] = dl.le.classes_
        coefs_df['sub'] = sub
        coefs[fs_name] = coefs_df

    scores = pd.concat(scores, axis=0)
    preds = pd.concat(preds, axis=0)
    return preds, scores, coefs
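A minimal usage sketch for running this helper across subjects with joblib.Parallel; the subject list matches the one used elsewhere in these examples, while the feature-space names, model, and cv object are placeholders (assumptions, not taken from the original code):

# Sketch only: feature_spaces, model, and cv below are placeholder assumptions.
import pandas as pd
from joblib import Parallel, delayed
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold

subs = [str(s).zfill(2) for s in range(1, 14) if s != 11]
feature_spaces = ['expression', ('expression', 'pose')]  # hypothetical feature-set names
model = LogisticRegression(max_iter=1000)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2)

results = Parallel(n_jobs=4)(
    delayed(classify_subjects_parallel)(sub, feature_spaces, model, cv)
    for sub in subs
)
preds = pd.concat([r[0] for r in results], axis=0)
scores = pd.concat([r[1] for r in results], axis=0)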
Example #4
def classify_fs_parallel(subs, fs, model, cv):
    """ Helper function to parallelize analysis across FSs. """

    if not isinstance(fs, (tuple, list)):
        fs = (fs, )

    fs_name = '+'.join(fs)

    X, y = [], []
    for sub in subs:
        dl = DataLoader(sub=sub, log_level=30)
        dl.load_y(strategy_doubles='hard')
        dl.load_X(feature_set=fs, n_comp=100)
        this_X, this_y = dl.return_Xy()
        X.append(this_X)
        y.append(this_y)

    X = pd.concat(X, axis=0)
    y = pd.concat(y, axis=0)

    preds_, scores_, coefs_, model_ = cross_val_predict_and_score(
        estimator=model, X=X, y=y, cv=cv, scoring=tjur_score, soft=True)

    joblib.dump(
        model_,
        f'models/sub-{sub}_analysis-between_split-train_fs-{fs_name}_model.jl')

    dl.log.warning(
        f"sub-{sub} scores: {np.round(scores_, 2)} (fs = {fs_name})")
    scores = pd.DataFrame(scores_, columns=['score'])
    scores['feature_set'] = fs_name
    scores['emotion'] = dl.le.classes_
    scores['sub'] = sub

    for i in range(len(preds_)):
        preds_[i]['feature_set'] = fs_name
        preds_[i]['rep'] = i

    preds = pd.concat(preds_, axis=0)

    coefs = pd.DataFrame(data=coefs_, columns=X.columns)
    coefs['feature_set'] = fs_name
    coefs['emotion'] = dl.le.classes_

    return preds, scores, coefs
Example #5
from data_io import DataLoader

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--base_dir",
                    default="/fs/project/PAS1315/group1_chime2_data")
args = parser.parse_args()

train_loader = DataLoader(
    base_dir=args.base_dir,
    in_frame_file=
    "data-spectrogram/train_si84_delta_noisy_global_normalized/feats.scp.mod",
    out_frame_file=
    "data-spectrogram/train_si84_clean_global_normalized/feats.scp.mod",
    batch_size=1024,
    buffer_size=10,
    context=5,
    out_frame_count=1,
    shuffle=True,
)

test_loader = DataLoader(
    base_dir=args.base_dir,
    in_frame_file=
    "data-spectrogram/dev_dt_05_delta_noisy_global_normalized/feats.scp.mod",
    out_frame_file=
    "data-spectrogram/dev_dt_05_clean_global_normalized/feats.scp.mod",
    batch_size=1024,
    buffer_size=10,
    context=5,
    out_frame_count=1,
    shuffle=False,
)
Example #6
import argparse

from data_io import DataLoader

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_dir", default="/fs/project/PAS1315/group1_chime2_data")
    parser.add_argument("--units", default=2048)
    parser.add_argument("--layers", default=2)
    parser.add_argument("--dropout", default=0.3)
    parser.add_argument("--batch_size", default=1024)
    parser.add_argument("--buffer_size", default=10)
    parser.add_argument("--context", default=5)
    args = parser.parse_args()

    train_loader = DataLoader(
        base_dir = args.base_dir,
        in_frame_file = "data-spectrogram/train_si84_delta_noisy_global_normalized/feats.scp.mod",
        out_frame_file = "data-spectrogram/train_si84_clean_global_normalized/feats.scp.mod",
        batch_size = args.batch_size,
        buffer_size = args.buffer_size,
        context = args.context,
        out_frame_count = 1,
        shuffle = True,
    )

    dev_loader = DataLoader(
        base_dir = args.base_dir,
        in_frame_file = "data-spectrogram/dev_dt_05_delta_noisy_global_normalized/feats.scp.mod",
        out_frame_file = "data-spectrogram/dev_dt_05_clean_global_normalized/feats.scp.mod",
        batch_size = args.batch_size,
        buffer_size = args.buffer_size,
        context = args.context,
        out_frame_count = 1,
        shuffle = False,
    )
Example #7
import sys

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

sys.path.append('src')
from data_io import DataLoader
from metrics import tjur_score

ohe = OneHotEncoder(categories='auto', sparse=False)
ohe.fit(np.arange(6)[:, np.newaxis])

scores_all = []
for api in ['google', 'azure']:
    df = pd.read_csv(f'data/api-{api}_emoratings.tsv', sep='\t', index_col=0)
    subs = [str(s).zfill(2) for s in range(1, 14) if s != 11]
    scores = np.zeros((len(subs), 6))
    for i, sub in enumerate(subs):
        dl = DataLoader(sub=sub, log_level=30)
        dl.load_y(strategy_doubles='hard')
        y_api = df.loc[dl.y.index].values
        y_true = ohe.transform(dl.y.values[:, np.newaxis])
        scores[i, :] = tjur_score(y_true, y_api, average=None)

    scores = pd.DataFrame(scores, columns=dl.le.classes_,
                          index=subs).reset_index()
    scores = pd.melt(scores,
                     id_vars='index',
                     value_name='score',
                     var_name='emotion')
    scores = scores.rename({'index': 'sub'}, axis=1)
    scores['api'] = api
    scores_all.append(scores)
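A short follow-up sketch (not part of the original snippet) showing how the per-API score frames collected in scores_all might be combined and summarized:

# Illustration only: concatenate the long-format frames and average scores per API and emotion.
scores_all = pd.concat(scores_all, axis=0)
print(scores_all.groupby(['api', 'emotion'])['score'].mean())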
Example #8
import sys
import os.path as op

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score, recall_score

sys.path.append(op.abspath(op.dirname(op.dirname(__file__))))
from data_io import DataLoader
from noise_ceiling import compute_noise_ceiling
from metrics import brier_score, tjur_score


subs = [str(s).zfill(2) for s in range(1, 14) if s != 11]
ceilings = np.zeros((len(subs), 6))
y_all = []
for i, sub in enumerate(subs):
    dl = DataLoader(sub=sub, log_level=30)
    y_doubles = dl.load_y(return_doubles=True)
    ceilings[i, :] = compute_noise_ceiling(y_doubles, soft=True, scoring=tjur_score)
    dl.log.warning(f"Ceiling sub-{sub}: {ceilings[i, :]}")

    # Note to self: between-subject NC only works with 'hard' labels,
    # otherwise you need to deal with two sources of "doubles"/inconsistency
    dl.load_y(return_doubles=False, strategy_doubles='hard')
    y_all.append(dl.y)

# Ceilings per subject
ceilings = pd.DataFrame(ceilings, columns=dl.le.classes_, index=subs)

# Ceiling across subjects
y = pd.concat(y_all, axis=0)
pd.get_dummies(y).to_csv('results/y_all.tsv', sep='\t')
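For reference, pd.get_dummies one-hot encodes the concatenated label series before it is written out; a tiny illustration with made-up labels:

# Illustration only: one indicator column per label value ('anger', 'joy').
y_demo = pd.Series(['anger', 'joy', 'anger'])
print(pd.get_dummies(y_demo))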
Example #9
from data_io import DataLoader
import torch
from torch import nn
from torch.autograd import Variable
from torch.nn import functional as F
from torch.optim import Adam
from feedforward import *
from constants import FEATURE_LENGTH, MODEL_CHOICE, CUDA, INNER_EMB_SIZE, NUM_EPOCHS, log_constants

log_constants()

train_loader = DataLoader(
    base_dir        = "/fs/project/PAS1315/group1_chime2_data",
    in_frame_file   = "data-spectrogram/train_si84_delta_noisy/feats.scp",
    out_frame_file  = "data-fbank/train_si84_clean/feats.scp",
    batch_size      = 128,
    buffer_size     = 10,
    context         = 5,
    out_frame_count = 1,
    shuffle         = True)

dev_loader = DataLoader(
    base_dir        = "/fs/project/PAS1315/group1_chime2_data",
    in_frame_file   = "data-spectrogram/dev_dt_05_delta_noisy/feats.scp.mod",
    out_frame_file  = "data-fbank/dev_dt_05_clean/feats.scp.mod",
    batch_size      = 128,
    buffer_size     = 10,
    context         = 5,
    out_frame_count = 1,
    shuffle         = False)
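A minimal training-loop sketch built on these loaders; the flattened input/output sizes and the loader's batch-iteration method (batchify) are assumptions, since the snippet itself only constructs the loaders:

# Sketch only: in_dim, out_dim and train_loader.batchify() are assumptions, not from the snippet.
in_dim, out_dim = 2827, 40   # hypothetical flattened noisy-context size and clean fbank size
model = nn.Sequential(
    nn.Linear(in_dim, INNER_EMB_SIZE),
    nn.ReLU(),
    nn.Linear(INNER_EMB_SIZE, out_dim),
)
if CUDA:
    model = model.cuda()
optimizer = Adam(model.parameters(), lr=1e-3)

for epoch in range(NUM_EPOCHS):
    for noisy, clean in train_loader.batchify():   # hypothetical iteration API
        noisy = torch.as_tensor(noisy, dtype=torch.float32)
        clean = torch.as_tensor(clean, dtype=torch.float32)
        if CUDA:
            noisy, clean = noisy.cuda(), clean.cuda()
        optimizer.zero_grad()
        loss = F.mse_loss(model(noisy), clean)
        loss.backward()
        optimizer.step()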
Example #10
def run_training():
    """ Define our model and train it """

    # Create models if they've been pretrained, or we're training them
    load_generator = a.generator_pretrain is not None
    train_generator = a.generator_checkpoints is not None

    load_teacher = a.teacher_pretrain is not None

    load_student = a.student_pretrain is not None
    train_student = a.student_checkpoints is not None

    models = {}

    with tf.Graph().as_default():

        # Define our generator model
        if load_generator or train_generator:
            with tf.variable_scope('generator'):
                #noisy_inputs = tf.placeholder(tf.float32, [None, a.channels, None, a.input_featdim], name='noisy')
                noisy_inputs = tf.placeholder(tf.float32, [1, 1, None, a.input_featdim], name='noisy')

                # Keep only the generator output types that appear in the configured loss weights
                output_type = a.loss_weight.keys() & ['fidelity', 'masking', 'map-as-mask-mimic']

                if a.generator_model == 'resnet':
                    generator = ResNet(
                        inputs      = noisy_inputs,
                        output_dim  = a.output_featdim,
                        output_type = output_type,
                        fc_nodes    = a.gunits,
                        fc_layers   = a.glayers,
                        filters     = a.gfilters,
                        dropout     = a.dropout,
                        framewise   = True,
                        #addin       = True,
                    )
                elif a.generator_model == 'dnn':
                    from dnn import DNN
                    generator = DNN(
                        inputs     = noisy_inputs,
                        output_dim = a.output_featdim,
                        output_type = output_type,
                    )
            generator_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='generator')
            load_generator_vars = [var for var in generator_vars if '_scale' not in var.op.name and '_shift' not in var.op.name]
            generator_loader = tf.train.Saver(load_generator_vars)
            generator_saver = tf.train.Saver(generator_vars)
            models['generator'] = {'model': generator, 'train': train_generator, 'vars': generator_vars}

        if load_teacher:
            with tf.variable_scope('teacher'):
                #clean_inputs = tf.placeholder(tf.float32, [None, a.channels, None, a.output_featdim], name='clean')
                clean_inputs = tf.placeholder(tf.float32, [1, 1, None, a.output_featdim], name='clean')
                teacher = ResNet(
                    inputs     = clean_inputs,
                    output_dim = a.senones,
                    fc_nodes   = a.tunits,
                    fc_layers  = a.tlayers,
                    filters    = a.tfilters,
                    dropout    = 0,
                    framewise  = a.framewise_mimic,
                    #conv_1d    = True,
                )
            teacher_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='teacher')
            teacher_saver = tf.train.Saver({'mimic' + var.op.name[7:]: var for var in teacher_vars})
            models['teacher'] = {'model': teacher, 'train': False, 'vars': teacher_vars}

        # Define critic for generating outputs
        if load_student or train_student:
            if load_generator or train_generator:
                inputs = generator.outputs
            else:
                #inputs = tf.placeholder(tf.float32, [None, a.channels, None, a.input_featdim], name='clean')
                inputs = tf.placeholder(tf.float32, [1, 1, None, a.input_featdim], name='clean')

            with tf.variable_scope('mimic'):
                if a.student_model == 'resnet':
                    student =  ResNet(
                        inputs     = inputs,
                        output_dim = a.senones,
                        fc_nodes   = a.sunits,
                        fc_layers  = a.slayers,
                        filters    = a.sfilters,
                        dropout    = a.dropout,
                        framewise  = a.framewise_mimic,
                    )
                elif a.student_model == 'lstm':
                    from lstm import BiLSTM
                    student = BiLSTM(
                        inputs = inputs,
                        output_shape = [-1, 1, a.characters],
                    )
            student_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='mimic')
            student_saver = tf.train.Saver(student_vars)
            models['student'] = {'model': student, 'train': train_student, 'vars': student_vars}

        flists = []
        for flist in [
            ('clean', 'json', a.clean_flist),
            ('noisy', 'json', a.noisy_flist),
            ('noise', 'json', a.noise_flist),
            ('numpy', 'json', a.numpy_flist),
            ('clean', 'scp', a.clean_scp),
            ('noisy', 'scp', a.noisy_scp),
            ('senone', 'txt', a.senone_file),
            ('trans', 'txt', a.trans_file),
        ]:
            if flist[-1] is not None:
                flists.append(flist)

        for loss_type in ['masking', 'map-as-mask-mimic', 'fidelity']:
            if loss_type in a.loss_weight and a.loss_weight[loss_type] == 0:
                del a.loss_weight[loss_type]

        # Create loader for train data
        train_loader = DataLoader(
            base_dir    = a.base_directory,
            flists      = flists,
            stage       = 'tr',
            shuffle     = True,
            channels    = a.channels,
            compute_irm = 'masking' in a.loss_weight,
            logify      = a.logify,
        )

        # Create loader
        dev_loader = DataLoader(
            base_dir    = a.base_directory,
            flists      = flists,
            stage       = 'dt',
            shuffle     = False,
            channels    = a.channels,
            compute_irm = 'masking' in a.loss_weight,
            logify      = a.logify,
        )

        trainer = Trainer(
            models      = models,
            learn_rate  = a.learn_rate,
            lr_decay    = a.lr_decay,
            loss_weight = a.loss_weight,
            batch_size  = a.batch_size,
        )

        # Begin session
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        # Load critic weights, generator weights and initialize trainer weights
        #if a.generator_pretrain and a.model_file:
        #    generator_saver.restore(sess, os.path.join(a.generator_pretrain, a.model_file))

        if a.generator_pretrain:
            sess.run(tf.variables_initializer(generator.scale_vars))
            generator_loader.restore(sess, tf.train.latest_checkpoint(a.generator_pretrain))
        elif train_generator:
            sess.run(tf.variables_initializer(generator_vars))

        # Load teacher
        if a.teacher_pretrain:
            #ckpt = tf.train.latest_checkpoint(a.teacher_pretrain)
            #from tensorflow.python.tools.inspect_checkpoint import print_tensors_in_checkpoint_file
            #print_tensors_in_checkpoint_file(ckpt, all_tensors=False, tensor_name='', all_tensor_names=True)
            teacher_saver.restore(sess, tf.train.latest_checkpoint(a.teacher_pretrain))
        
        # Load student
        if a.student_pretrain:
            if a.student_file:
                student_saver.restore(sess, os.path.join(a.student_pretrain, a.student_file))
            else:
                student_saver.restore(sess, tf.train.latest_checkpoint(a.student_pretrain))
        elif train_student:
            sess.run(tf.variables_initializer(student_vars))

        # Perform training
        min_loss = float('inf')
        for epoch in range(1, 200):
            print('Epoch %d' % epoch)

            # Run train ops
            losses, duration = trainer.run_ops(sess, train_loader, training = True, epoch = epoch)
            for loss in a.loss_weight:
                print('{} loss: {:.6f}'.format(loss, losses[loss]))
            print('Train loss: %.6f (%.3f sec)' % (losses['average'], duration))

            # Run eval ops
            losses, duration = trainer.run_ops(sess, dev_loader, training = False)
            eval_loss = losses['average']
            for loss in a.loss_weight:
                print('{} loss: {:.6f}'.format(loss, losses[loss]))
            print('Eval loss: %.6f (%.3f sec)\n' % (eval_loss, duration))

            if 'cer' in losses:
                eval_loss = losses['cer']

            # Save if we've got the best loss so far
            if eval_loss < min_loss:
                min_loss = eval_loss
                if a.generator_checkpoints:
                    save_file = os.path.join(a.generator_checkpoints, "model-{0:.4f}.ckpt".format(eval_loss))
                    save_path = generator_saver.save(sess, save_file, global_step = epoch)

                if a.student_checkpoints:
                    save_file = os.path.join(a.student_checkpoints, "model-{0:.4f}.ckpt".format(eval_loss))
                    save_path = student_saver.save(sess, save_file, global_step = epoch)
Example #11
def classify_subjects_parallel(sub, subs, feature_spaces, model, cv):
    """ Helper function to parallelize analysis across subjects. """
    scores, coefs = [], dict()
    for i, fs in enumerate(feature_spaces):

        if not isinstance(fs, (tuple, list)):
            fs = (fs, )

        fs_name = '+'.join(fs)

        dl = DataLoader(sub=sub, log_level=30)
        dl.load_y(strategy_doubles='hard')
        dl.load_X(feature_set=fs, n_comp=100)
        X_val, y_val = dl.return_Xy()

        other_X, other_y = [], []
        other_subs = [s for s in subs if s != sub]
        for other_sub in other_subs:
            dl = DataLoader(sub=other_sub, log_level=30)
            dl.load_y(strategy_doubles='hard')
            dl.load_X(feature_set=fs, n_comp=100)
            this_X, this_y = dl.return_Xy()
            other_X.append(this_X)
            other_y.append(this_y)

        X = pd.concat(other_X, axis=0)
        y = pd.concat(other_y, axis=0)

        scores_, coefs_, model_ = cross_val_predict_and_score(
            estimator=model,
            X=X,
            y=y,
            cv=cv,
            scoring=roc_auc_score_per_class,
            X_val=X_val,
            y_val=y_val,
            per_class=True,
            return_model=True)
        joblib.dump(model_,
                    f'models/sub-{sub}_type-between_fs-{fs_name}_model.jl')

        dl.log.warning(
            f"sub-{sub} scores: {np.round(scores_, 2)} (fs = {fs_name})")
        scores_df = pd.DataFrame(scores_, columns=['score'])
        scores_df['feature_set'] = fs_name
        scores_df['emotion'] = dl.le.classes_
        scores_df['sub'] = sub
        scores.append(scores_df)

        coefs_df = pd.DataFrame(data=coefs_, columns=X.columns)
        coefs_df['feature_set'] = fs_name
        coefs_df['emotion'] = dl.le.classes_
        coefs_df['sub'] = sub
        coefs[fs_name] = coefs_df

    scores_df = pd.concat(scores, axis=0)
    return scores_df, coefs
Example #12
    parser.add_argument("--context", default=5, type=int)
    parser.add_argument("--frequencies", default=257, type=int)

    # Training params
    parser.add_argument("--batch_size", default=256, type=int)
    parser.add_argument("--buffer_size", default=40, type=int)
    parser.add_argument("--learn_rate", default=0.0001, type=float)
    parser.add_argument("--lr_decay", default=0.95, type=float)
    args = parser.parse_args()

    # Training data dataloader
    train_loader = DataLoader(
        base_dir=args.base_dir,
        in_frame_file=args.noisy_train_file,
        out_frame_file=args.clean_train_file,
        batch_size=args.batch_size,
        buffer_size=args.buffer_size,
        context=args.context,
        out_frame_count=1,
        shuffle=True,
    )

    # Development set dataloader
    dev_loader = DataLoader(
        base_dir=args.base_dir,
        in_frame_file=args.noisy_dev_file,
        out_frame_file=args.clean_dev_file,
        batch_size=args.batch_size,
        buffer_size=args.buffer_size,
        context=args.context,
        out_frame_count=1,
        shuffle=False,
    )
Example #13
def run_training(a):
    """ Define our model and train it """

    # Create directory for saving models
    if not os.path.isdir(a.save_dir):
        os.makedirs(a.save_dir)

    with tf.Graph().as_default():
        shape = (None, 2*a.context + 1, a.frequencies)
        frame_placeholder = tf.placeholder(tf.float32, shape=shape, name="frame_placeholder")

        # Define our critic model
        with tf.variable_scope('critic'):
            critic = Critic(
                inputs      = frame_placeholder,
                output_size = a.senones,
                filters     = a.filters,
                fc_layers   = a.fc_layers,
                fc_nodes    = a.fc_nodes,
                dropout     = a.dropout,
            )

        # Create loader for train data
        train_loader = DataLoader(
            base_dir    = a.base_dir,
            frame_file  = a.clean_train,
            senone_file = a.senone_train,
            batch_size  = a.batch_size,
            buffer_size = a.buffer_size,
            context     = a.context,
            out_frames  = 1,
            shuffle     = True,
        )

        # Create loader for test data
        dev_loader = DataLoader(
            base_dir    = a.base_dir,
            frame_file  = a.clean_dev,
            senone_file = a.senone_dev,
            batch_size  = a.batch_size,
            buffer_size = a.buffer_size,
            context     = a.context,
            out_frames  = 1,
            shuffle     = False,
        )

        # Class for training
        with tf.variable_scope('trainer'):
            trainer = Trainer(critic, learn_rate=a.learn_rate, lr_decay=a.lr_decay, max_global_norm=a.max_global_norm, alpha=0.1)

        # Save all variables
        saver = tf.train.Saver()

        # Begin session
        init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
        sess = tf.Session()
        sess.run(init)

        # Perform training
        min_loss = float('inf')
        for epoch in range(1, 200):
            print('Epoch %d' % epoch)

            loss, duration = trainer.run_ops(sess, train_loader, training = True)
            train_loss = loss['avg_loss']
            print ('\nTrain loss: %.6f (%.3f sec)' % (train_loss, duration))

            loss, duration = trainer.run_ops(sess, dev_loader, training = False)
            eval_loss = loss['avg_loss']
            print('\nEval loss: %.6f (%.3f sec)' % (eval_loss, duration))

            if eval_loss < min_loss:
                min_loss = eval_loss
                save_file = os.path.join(a.save_dir, "model-%.4f.ckpt" % eval_loss)
                save_path = saver.save(sess, save_file, global_step=epoch)