'dropout': 0 }, 'att': { 'in_size': 2048, 'hidden_size': 128, 'heads': 1 } }

# Path of the trained checkpoint that is passed to torch.load below.
trained_loc = "../visually_grounded_model/results/caption_model.30"
# Path of an .h5 feature file (presumably the CREMA-D features - confirm).
# It is not read anywhere in the visible part of this script.
data_loc = "../prep_data_emotion/cremad/prep_data/cremad_features.h5"
#%%
# Build the full audio encoder from audio_config and restore its parameters
# from the checkpoint. map_location forces every tensor onto the CPU.
full_net = audio_rnn_encoder(audio_config)
cap_state = torch.load(trained_loc, map_location=torch.device('cpu'))

# Print every entry the loaded object yields when iterated (for a state dict
# these are the parameter names), as a quick look at what the checkpoint holds.
for layer in cap_state:
    print(layer)

full_net.load_state_dict(cap_state)
#%%
#Attention part removed
# NOTE(review): going by the note above, audio_rnn_sublayers is the encoder
# without its attention part - confirm against its definition.
three_layer_net = audio_rnn_sublayers(audio_config)
# The same checkpoint is read from disk a second time, replacing cap_state;
# how it is used afterwards lies beyond the visible part of the script.
cap_state = torch.load(trained_loc, map_location=torch.device('cpu'))
# The flickr database does not need to be split at the root node.
def iterate_data(h5_file):
    """Yield, one by one, the items obtained by iterating h5_file.root."""
    yield from h5_file.root


f_nodes = list(iterate_data(data_file))

# Split the database into train, test and validation sets. The default
# settings use the json file with the Karpathy split.
train, test, val = split_data_flickr(f_nodes, args.split_loc)

############################### Neural network setup #################################################
# network modules
img_net = img_encoder(image_config)
cap_net = audio_rnn_encoder(audio_config)

# Adam optimiser over the parameters of both networks. I found SGD to work
# terribly and could not find appropriate parameter settings for it.
# NOTE(review): the base learning rate is 1, presumably so that the factor
# produced by a LambdaLR schedule is itself the effective learning rate.
optimizer = torch.optim.Adam(
    [*img_net.parameters(), *cap_net.parameters()],
    lr=1,
)

# Alternative schedulers, currently disabled:
#plateau_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'min', factor = 0.9, patience = 100,
#                                                   threshold = 0.0001, min_lr = 1e-8, cooldown = 100)
#step_scheduler = lr_scheduler.StepLR(optimizer, 1000, gamma=0.1, last_epoch=-1)

def create_cyclic_scheduler(max_lr, min_lr, stepsize):
    # The lambda cycles the learning rate between min_lr and max_lr with a
    # cosine. The cosine argument runs from 1*pi to 3*pi over `stepsize`
    # iterations, so the cosine goes -1 -> 1 -> -1; adding 1 and halving
    # normalises that to 0..1, which is then scaled into [min_lr, max_lr].
    lr_lambda = lambda iteration: (max_lr - min_lr)*(0.5 * (np.cos(np.pi * (1 + (3 - 1) / stepsize * iteration)) + 1))+min_lr
    cyclic_scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda, last_epoch=-1)
# Preset names understood by create_encoders.
_ENCODER_PRESETS = ('rnn', 'rnn_VQ', 'conv_VQ', 'conv')


def _rnn_audio_config(use_vq):
    """Return the audio config dict for the 'rnn' (use_vq False) or 'rnn_VQ' preset."""
    if use_vq:
        # Two recurrent stacks (1 + 3 layers) and a single VQ layer.
        # NOTE(review): app_order presumably interleaves them (0 = rnn stack,
        # 1 = VQ layer) - confirm against audio_rnn_encoder.
        input_size, hidden_size, n_layers = [64, 2048], [1024, 1024], [1, 3]
        vq_config = {'n_layers': 1, 'n_embs': [64], 'emb_dim': [2048]}
        app_order = [0, 1, 0]
    else:
        # One recurrent stack of 4 layers, no VQ layers.
        input_size, hidden_size, n_layers = [64], [1024], [4]
        vq_config = {'n_layers': 0, 'n_embs': [], 'emb_dim': []}
        app_order = [0]

    return {
        'conv': {
            'in_channels': 39, 'out_channels': 64, 'kernel_size': 6,
            'stride': 2, 'padding': 0, 'bias': False,
        },
        'rnn': {
            'input_size': input_size, 'hidden_size': hidden_size,
            'n_layers': n_layers, 'batch_first': True, 'bidirectional': True,
            'dropout': 0, 'max_len': 1024,
        },
        'att': {'in_size': 2048, 'hidden_size': 128, 'heads': 1},
        'VQ': vq_config,
        'app_order': app_order,
    }


def _conv_audio_config(use_vq):
    """Return the audio config dict for the 'conv' (use_vq False) or 'conv_VQ' preset."""
    if use_vq:
        # NOTE(review): app_order presumably mixes conv layers (0) with VQ
        # layers (1) - confirm against conv_VQ_encoder.
        vq_config = {'n_layers': 2, 'n_embs': [1024, 1024], 'emb_dim': [128, 256]}
        app_order = [0, 1, 0, 1, 0, 0]
    else:
        vq_config = {'n_layers': 0, 'n_embs': [], 'emb_dim': []}
        app_order = [0, 0, 0, 0]

    return {
        'conv_init': {
            'in_channels': 39, 'out_channels': 128, 'kernel_size': 1,
            'stride': 1, 'padding': 0,
        },
        'conv': {
            'in_channels': [128, 128, 256, 512],
            'out_channels': [128, 256, 512, 1024],
            'kernel_size': [9, 9, 9, 9],
            'stride': [2, 2, 2, 2],
            'n_layers': 4,
        },
        'att': {'in_size': 1024, 'hidden_size': 128, 'heads': 1},
        'VQ': vq_config,
        'max_len': 1024,
        'app_order': app_order,
    }


def create_encoders(preset_name):
    """Build the image encoder and caption (audio) encoder for a named preset.

    Args:
        preset_name: one of 'rnn', 'rnn_VQ', 'conv_VQ' or 'conv'.

    Returns:
        A tuple (img_net, cap_net). img_net is an img_encoder whose linear
        output size matches the output size of the caption encoder; cap_net
        is an audio_rnn_encoder for the 'rnn*' presets and a conv_VQ_encoder
        for the 'conv*' presets.

    Raises:
        ValueError: if preset_name is not a known preset. Previously this
            surfaced as an UnboundLocalError at the return statement.
    """
    # Reject unknown presets before any network is constructed.
    if preset_name not in _ENCODER_PRESETS:
        raise ValueError(
            'unknown encoder preset {!r}; expected one of {}'.format(
                preset_name, ', '.join(_ENCODER_PRESETS)
            )
        )

    if preset_name in ('rnn', 'rnn_VQ'):
        audio_config = _rnn_audio_config(use_vq=(preset_name == 'rnn_VQ'))
        # Required image encoder output size: last hidden size, doubled when
        # the rnn is bidirectional (True counts as 1 in the exponent), times
        # the number of attention heads.
        out_size = (
            audio_config['rnn']['hidden_size'][-1]
            * 2 ** audio_config['rnn']['bidirectional']
            * audio_config['att']['heads']
        )
        cap_encoder = audio_rnn_encoder
    else:
        audio_config = _conv_audio_config(use_vq=(preset_name == 'conv_VQ'))
        # Required image encoder output size is the channel count of the
        # last convolutional layer.
        out_size = audio_config['conv']['out_channels'][-1]
        cap_encoder = conv_VQ_encoder

    image_config = {
        'linear': {'in_size': 2048, 'out_size': out_size},
        'norm': True,
    }

    # The image encoder is constructed first, as before, so that any random
    # parameter initialisation happens in the same order.
    img_net = img_encoder(image_config)
    cap_net = cap_encoder(audio_config)
    return (img_net, cap_net)