'dropout': 0
    },
    'att': {
        'in_size': 2048,
        'hidden_size': 128,
        'heads': 1
    }
}

# Paths to the pretrained caption-model checkpoint and the CREMA-D feature file.
# NOTE(review): relative paths — assumes the script runs from its own directory.
trained_loc = "../visually_grounded_model/results/caption_model.30"

data_loc = "../prep_data_emotion/cremad/prep_data/cremad_features.h5"

#%%

# Build the full audio encoder from the config dict defined above.
full_net = audio_rnn_encoder(audio_config)

# Load the pretrained weights on CPU so no GPU is needed for inspection.
cap_state = torch.load(trained_loc, map_location=torch.device('cpu'))

# Print the parameter names found in the checkpoint as a quick sanity check.
for layer in cap_state:
    print(layer)

full_net.load_state_dict(cap_state)

#%%
# Attention part removed

# Same encoder without the attention layers; the checkpoint is reloaded —
# presumably so non-matching attention weights can be filtered out before
# loading (TODO confirm: the filtering step is not visible in this chunk).
three_layer_net = audio_rnn_sublayers(audio_config)

cap_state = torch.load(trained_loc, map_location=torch.device('cpu'))
# --- Example 2 ---
# flickr doesn't need to be split at the root node
def iterate_data(h5_file):
    """Yield every node directly under the root of an open h5 file."""
    yield from h5_file.root
# Collect every root-level node of the h5 file into a list so it can be split.
f_nodes = [node for node in iterate_data(data_file)] 

# split the database into train, test and validation sets. default settings use the json file
# with the karpathy split
train, test, val = split_data_flickr(f_nodes, args.split_loc)

############################### Neural network setup #################################################

# network modules
img_net = img_encoder(image_config)
cap_net = audio_rnn_encoder(audio_config)

# Adam optimiser. I found SGD to work terribly and could not find appropriate parameter settings for it.
# NOTE(review): the base lr is 1 because the LambdaLR cyclic scheduler below
# multiplies this base rate by its lambda, so the lambda's output IS the lr.
optimizer = torch.optim.Adam(list(img_net.parameters())+list(cap_net.parameters()), 1)

#plateau_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'min', factor = 0.9, patience = 100, 
#                                                   threshold = 0.0001, min_lr = 1e-8, cooldown = 100)

#step_scheduler = lr_scheduler.StepLR(optimizer, 1000, gamma=0.1, last_epoch=-1)

def create_cyclic_scheduler(max_lr, min_lr, stepsize):
    """Create a cosine cyclic learning-rate scheduler for the global optimizer.

    The lambda cycles the lr between min_lr and max_lr: the cosine argument
    runs over [pi, 3*pi) per cycle of `stepsize` iterations, its [-1, 1]
    output is normalised to [0, 1] and then scaled between min and max lr.
    The optimizer's base lr must be 1 since LambdaLR multiplies base lr by
    the lambda's output.

    Args:
        max_lr: peak learning rate of the cycle.
        min_lr: floor learning rate of the cycle.
        stepsize: number of iterations in one full cosine cycle.

    Returns:
        torch.optim.lr_scheduler.LambdaLR wrapping the module-level optimizer.
    """
    lr_lambda = lambda iteration: (max_lr - min_lr)*(0.5 * (np.cos(np.pi * (1 + (3 - 1) / stepsize * iteration)) + 1))+min_lr
    cyclic_scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda, last_epoch=-1)
    # BUG FIX: the scheduler was created but never returned, so callers got
    # None and the created scheduler was unreachable.
    return cyclic_scheduler
# --- Example 3 ---
def _image_config(out_size):
    """Image-encoder config: one linear layer mapping 2048-d image features
    (ResNet-sized input — TODO confirm upstream feature extractor) to the
    given embedding size, followed by normalisation."""
    return {
        'linear': {
            'in_size': 2048,
            'out_size': out_size
        },
        'norm': True
    }


def create_encoders(preset_name):
    """Build a matched (image encoder, caption encoder) pair for a preset.

    The image encoder's output size is derived from the audio encoder's
    config so that both nets embed into the same space.

    Args:
        preset_name: one of 'rnn', 'rnn_VQ', 'conv_VQ' or 'conv'.

    Returns:
        (img_net, cap_net) tuple of encoder modules.

    Raises:
        ValueError: if preset_name is not one of the known presets.
        (The original fell through to an UnboundLocalError instead.)
    """
    if preset_name == 'rnn':
        # create config dictionaries with all the parameters for your encoders
        audio_config = {
            'conv': {
                'in_channels': 39,
                'out_channels': 64,
                'kernel_size': 6,
                'stride': 2,
                'padding': 0,
                'bias': False
            },
            'rnn': {
                'input_size': [64],
                'hidden_size': [1024],
                'n_layers': [4],
                'batch_first': True,
                'bidirectional': True,
                'dropout': 0,
                'max_len': 1024
            },
            'att': {
                'in_size': 2048,
                'hidden_size': 128,
                'heads': 1
            },
            'VQ': {
                'n_layers': 0,
                'n_embs': [],
                'emb_dim': []
            },
            'app_order': [0]
        }
        # required image-encoder output size: a bidirectional RNN doubles the
        # hidden size (2 ** True == 2) and multi-head attention concatenates,
        # multiplying it by the number of heads
        out_size = audio_config['rnn']['hidden_size'][-1] * 2 ** \
                   audio_config['rnn']['bidirectional'] * audio_config['att']['heads']
        img_net = img_encoder(_image_config(out_size))
        cap_net = audio_rnn_encoder(audio_config)

    elif preset_name == 'rnn_VQ':
        # same as 'rnn' but with a vector-quantisation layer between two RNN
        # stacks (app_order [0, 1, 0] = rnn, VQ, rnn)
        audio_config = {
            'conv': {
                'in_channels': 39,
                'out_channels': 64,
                'kernel_size': 6,
                'stride': 2,
                'padding': 0,
                'bias': False
            },
            'rnn': {
                'input_size': [64, 2048],
                'hidden_size': [1024, 1024],
                'n_layers': [1, 3],
                'batch_first': True,
                'bidirectional': True,
                'dropout': 0,
                'max_len': 1024
            },
            'att': {
                'in_size': 2048,
                'hidden_size': 128,
                'heads': 1
            },
            'VQ': {
                'n_layers': 1,
                'n_embs': [64],
                'emb_dim': [2048]
            },
            'app_order': [0, 1, 0],
        }
        # see 'rnn' preset for the derivation of out_size
        out_size = audio_config['rnn']['hidden_size'][-1] * 2 ** \
                   audio_config['rnn']['bidirectional'] * audio_config['att']['heads']
        img_net = img_encoder(_image_config(out_size))
        cap_net = audio_rnn_encoder(audio_config)

    elif preset_name == 'conv_VQ':
        # fully convolutional caption encoder with VQ layers interleaved
        # between conv layers (app_order [0, 1, 0, 1, 0, 0])
        audio_config = {
            'conv_init': {
                'in_channels': 39,
                'out_channels': 128,
                'kernel_size': 1,
                'stride': 1,
                'padding': 0
            },
            'conv': {
                'in_channels': [128, 128, 256, 512],
                'out_channels': [128, 256, 512, 1024],
                'kernel_size': [9, 9, 9, 9],
                'stride': [2, 2, 2, 2],
                'n_layers': 4
            },
            'att': {
                'in_size': 1024,
                'hidden_size': 128,
                'heads': 1
            },
            'VQ': {
                'n_layers': 2,
                'n_embs': [1024, 1024],
                'emb_dim': [128, 256]
            },
            'max_len': 1024,
            'app_order': [0, 1, 0, 1, 0, 0]
        }
        # the conv encoder's output size is simply its last conv layer's channels
        out_size = audio_config['conv']['out_channels'][-1]
        img_net = img_encoder(_image_config(out_size))
        cap_net = conv_VQ_encoder(audio_config)

    elif preset_name == 'conv':
        # same conv stack as 'conv_VQ' but with no VQ layers
        audio_config = {
            'conv_init': {
                'in_channels': 39,
                'out_channels': 128,
                'kernel_size': 1,
                'stride': 1,
                'padding': 0
            },
            'conv': {
                'in_channels': [128, 128, 256, 512],
                'out_channels': [128, 256, 512, 1024],
                'kernel_size': [9, 9, 9, 9],
                'stride': [2, 2, 2, 2],
                'n_layers': 4
            },
            'att': {
                'in_size': 1024,
                'hidden_size': 128,
                'heads': 1
            },
            'VQ': {
                'n_layers': 0,
                'n_embs': [],
                'emb_dim': []
            },
            'max_len': 1024,
            'app_order': [0, 0, 0, 0]
        }
        # the conv encoder's output size is simply its last conv layer's channels
        out_size = audio_config['conv']['out_channels'][-1]
        img_net = img_encoder(_image_config(out_size))
        cap_net = conv_VQ_encoder(audio_config)

    else:
        # BUG FIX: an unknown preset previously reached the return statement
        # with img_net/cap_net unbound, raising an opaque UnboundLocalError.
        raise ValueError(
            "unknown preset_name: {!r}; expected one of "
            "'rnn', 'rnn_VQ', 'conv_VQ', 'conv'".format(preset_name))

    return (img_net, cap_net)