def train_(args, model, opt, latent_loss_weight, criterion, loader, epochs,
           inf_iterator_test, logger, iteration):
    for epoch in range(epochs):
        for i, (audio, name) in enumerate(loader):
            audio = audio.cuda()
            audio = (audio * 25 + 50) / 50  # normalize mels into a small positive range
            time_step = audio.size(2)

            # Augmentation: split the time axis into chunks of `factor` frames,
            # shuffle the chunk order, and time-stretch each chunk by a random
            # factor in [0.5, 2].
            factor = 32
            audio_shuffle = [None] * (time_step // factor)
            nums = list(range(time_step // factor))
            random.shuffle(nums)
            for n in nums:
                sf = random.uniform(0.5, 2)
                audio_shuffle[n] = F.interpolate(
                    audio[..., factor * n:factor * (n + 1)],
                    scale_factor=sf, mode='nearest')
            audio = torch.cat(audio_shuffle, dim=2)
            audio = audio[..., :audio.size(2) // 16 * 16]  # trim to a multiple of 16

            # Three-level pyramid: halve the time axis, then keep the lower
            # half of the mel bins at each level.
            audio_middle = F.interpolate(audio, scale_factor=1 / 2)
            audio_middle = audio_middle[:, :audio_middle.size(1) // 2, :]
            audio_low = F.interpolate(audio_middle, scale_factor=1 / 2)
            audio_low = audio_low[:, :audio_low.size(1) // 2, :]
            audio_list = [audio_low, audio_middle, audio]

            out, out_conversion, enc_content, latent_loss = model(audio, name)
            recon_loss = sum(criterion(out[num], audio_list[num]) for num in range(3))
            latent_loss = latent_loss.mean()

            OptimStep([(model, opt,
                        recon_loss + latent_loss_weight * latent_loss, False)], 3)

            if i % 50 == 0:
                logger.log_training(iteration=iteration,
                                    loss_recon=recon_loss,
                                    latent_loss=latent_loss)

                model.eval()
                audio, name = next(inf_iterator_test)
                audio = audio.cuda()
                audio = (audio * 25 + 50) / 50
                out, out_conversion, enc_content, latent_loss = model(audio, name)

                a = torch.stack([audio[0], out[-1][0], out_conversion[-1][0]], dim=0)
                a = (a * 50 - 50) / 25  # undo the normalization above
                a = vocoder.inverse(a)
                a = a.detach().cpu().numpy()

                logger.log_validation(
                    iteration=iteration,
                    mel_ori=("image", plot_spectrogram_to_numpy(), audio[0]),
                    mel_recon=("image", plot_spectrogram_to_numpy(), out[-1][0]),
                    mel_conversion=("image", plot_spectrogram_to_numpy(), out_conversion[-1][0]),
                    audio_ori=("audio", 22050, a[0]),
                    audio_recon=("audio", 22050, a[1]),
                    audio_conversion=("audio", 22050, a[2]),
                )
                logger.close()
                save_checkpoint(
                    model, opt, iteration,
                    f'checkpoint/{args.model}_n{args.n_embed}_ch{args.channel}_{args.trainer}/gen')
                model.train()

            iteration += 1
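# --- Sketch of a missing helper (assumption, not original code) -------------
# OptimStep is imported from elsewhere and never defined in this file. From
# the call sites, each tuple looks like (module, optimizer, loss, retain_graph)
# and the trailing argument like a gradient-clipping norm; this minimal
# version is written under those assumptions.
import torch

def OptimStep(groups, max_grad_norm):
    for module, optimizer, loss, retain_graph in groups:
        optimizer.zero_grad()
        loss.backward(retain_graph=retain_graph)  # keep graph if a later group reuses it
        torch.nn.utils.clip_grad_norm_(module.parameters(), max_grad_norm)
        optimizer.step()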
def train_(args, model, opt, latent_loss_weight, criterion, loader, epochs,
           inf_iterator_test, logger, iteration):
    # Per-bin mel statistics precomputed over the VCTK corpus.
    vctk_mean = torch.tensor(
        np.load("/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel3/mean.npy")
    ).unsqueeze(0).unsqueeze(2).cuda()
    vctk_std = torch.tensor(
        np.load("/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel3/std.npy")
    ).unsqueeze(0).unsqueeze(2).cuda()

    for epoch in range(epochs):
        for i, audio in enumerate(loader):
            audio = audio.cuda()
            audio = (audio - vctk_mean) / vctk_std

            # Chunk-shuffle augmentation (see the first variant above).
            factor = 32
            time_step = audio.size(2)
            audio_shuffle = [None] * (time_step // factor)
            nums = list(range(time_step // factor))
            random.shuffle(nums)
            for n in nums:
                sf = random.uniform(0.5, 2)
                audio_shuffle[n] = F.interpolate(
                    audio[..., factor * n:factor * (n + 1)],
                    scale_factor=sf, mode='nearest')
            audio = torch.cat(audio_shuffle, dim=2)
            audio = audio[..., :audio.size(2) // 32 * 32]

            audio_middle = F.interpolate(audio, scale_factor=1 / 2)
            audio_middle = audio_middle[:, :audio_middle.size(1) // 2, :]
            audio_low = F.interpolate(audio_middle, scale_factor=1 / 2)
            audio_low = audio_low[:, :audio_low.size(1) // 2, :]
            audio_list = [audio_low, audio_middle, audio]

            out, out_conversion, enc_content, spk, latent_loss, idx = model(audio)
            recon_loss = sum(criterion(out[num], audio_list[num]) for num in range(3))
            latent_loss = latent_loss.mean()

            OptimStep([(model, opt,
                        recon_loss + latent_loss_weight * latent_loss, False)], 3)

            if i % 200 == 0:
                logger.log_training(iteration=iteration,
                                    loss_recon=recon_loss,
                                    latent_loss=latent_loss)

                model.eval()
                audio = next(inf_iterator_test)
                audio = audio.cuda()
                audio = (audio - vctk_mean) / vctk_std
                out, out_conversion, enc_content, spk, latent_loss, idx = model(audio)

                # Denormalize before plotting and vocoding.
                audio = audio * vctk_std + vctk_mean
                out[-1] = out[-1] * vctk_std + vctk_mean
                out_conversion[-1] = out_conversion[-1] * vctk_std + vctk_mean

                a = torch.stack([audio[0], audio[idx[0]],
                                 out[-1][0], out_conversion[-1][0]], dim=0)
                a = vocoder.inverse(a)
                a = a.detach().cpu().numpy()

                logger.log_validation(
                    iteration=iteration,
                    mel_ori=("image", plot_spectrogram_to_numpy(), audio[0]),
                    mel_target=("image", plot_spectrogram_to_numpy(), audio[idx[0]]),
                    mel_recon=("image", plot_spectrogram_to_numpy(), out[-1][0]),
                    mel_conversion=("image", plot_spectrogram_to_numpy(), out_conversion[-1][0]),
                    mel_recon_middle=("image", plot_spectrogram_to_numpy(), out[-2][0]),
                    mel_conversion_middle=("image", plot_spectrogram_to_numpy(), out_conversion[-2][0]),
                    mel_recon_low=("image", plot_spectrogram_to_numpy(), out[-3][0]),
                    mel_conversion_low=("image", plot_spectrogram_to_numpy(), out_conversion[-3][0]),
                    audio_ori=("audio", 22050, a[0]),
                    audio_target=("audio", 22050, a[1]),
                    audio_recon=("audio", 22050, a[2]),
                    audio_conversion=("audio", 22050, a[3]),
                )
                logger.close()
                save_checkpoint(
                    model, opt, iteration,
                    f'checkpoint/{args.model}_n{args.n_embed}_ch{args.channel}_{args.trainer}/gen')
                model.train()

            iteration += 1
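# --- Sketch of missing helpers (assumption, not original code) --------------
# save_checkpoint / load_checkpoint for the generator scripts are not shown.
# A plausible minimal pair matching the call signatures used in this file:
import os
import torch

def save_checkpoint(model, optimizer, iteration, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    torch.save({'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'iteration': iteration}, path)

def load_checkpoint(path, model, optimizer):
    ckpt = torch.load(path, map_location='cpu')
    model.load_state_dict(ckpt['model'])
    optimizer.load_state_dict(ckpt['optimizer'])
    return model, optimizer, ckpt['iteration']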
optimize_model(policy_net, batch_log_prob, batch_rewards,
               optimizer, GAMMA, device=device)

# Clear the batch of trajectories
batch_log_prob = []
batch_rewards = []

# Reset flags
if not render_each_episode:
    finished_rendering_this_epoch = False

# Record stats
training_info["epoch mean durations"].append(sum(epoch_durations) / batch_size)
training_info["epoch mean rewards"].append(sum(epoch_rewards) / batch_size)
if (i_epoch + 1) % num_avg_epoch == 0:
    training_info["past %d epochs mean reward" % num_avg_epoch] = \
        (sum(training_info["epoch mean rewards"][-num_avg_epoch:]) / num_avg_epoch) \
        if len(training_info["epoch mean rewards"]) >= num_avg_epoch else 0

# Plot stats
plot_durations(training_info["epoch mean rewards"])

# Update counter
i_epoch += 1

# Every save_ckpt_interval epochs, save a checkpoint keyed by the current
# epoch counter.
if i_epoch % save_ckpt_interval == 0:
    save_checkpoint(ckpt_dir, policy_net, optimizer, i_epoch,
                    learning_rate=learning_rate, **training_info)
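# --- Sketch of a missing helper (assumption, not original code) -------------
# optimize_model is not defined in this excerpt. Given the
# (policy_net, batch_log_prob, batch_rewards, optimizer, GAMMA) arguments, a
# REINFORCE-style update over a batch of trajectories would look like this
# (policy_net is kept only to match the call signature):
import torch

def optimize_model(policy_net, batch_log_prob, batch_rewards, optimizer, gamma, device='cpu'):
    losses = []
    for log_probs, rewards in zip(batch_log_prob, batch_rewards):
        # Discounted returns-to-go for one trajectory
        returns, G = [], 0.0
        for r in reversed(rewards):
            G = r + gamma * G
            returns.insert(0, G)
        returns = torch.tensor(returns, device=device)
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)  # normalize as a baseline
        losses.append((-torch.stack(log_probs) * returns).sum())
    loss = torch.stack(losses).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()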
def train_(args, model, opt, latent_loss_weight, criterion, loader, epochs,
           inf_iterator_test, logger, iteration):
    factor = 32
    for epoch in range(epochs):
        for i, audio in enumerate(loader):
            audio = audio.cuda()
            time_step = audio.size(2)

            # Chunk-shuffle augmentation; here the shuffled copy is kept as a
            # separate conversion target instead of replacing the input.
            audio_shuffle = [None] * (time_step // factor)
            nums = list(range(time_step // factor))
            random.shuffle(nums)
            for n in nums:
                sf = random.uniform(0.5, 2)
                audio_shuffle[n] = F.interpolate(
                    audio[..., factor * n:factor * (n + 1)],
                    scale_factor=sf, mode='nearest')
            audio_shuffle = torch.cat(audio_shuffle, dim=2)
            audio = F.interpolate(audio, scale_factor=audio_shuffle.size(2) / time_step)
            audio = audio[..., :audio.size(2) // 16 * 16]
            audio_shuffle = audio_shuffle[..., :audio_shuffle.size(2) // 16 * 16]

            out, out_conversion, enc_content, spk, latent_loss, idx = model(audio, audio_shuffle)
            recon_loss = criterion(out, audio)  # + criterion(out_conversion, audio_shuffle)
            latent_loss = latent_loss.mean()

            OptimStep([(model, opt,
                        recon_loss + latent_loss_weight * latent_loss, False)], 3)

            if i % 50 == 0:
                logger.log_training(iteration=iteration,
                                    loss_recon=recon_loss,
                                    latent_loss=latent_loss)

            if i % 200 == 0:
                model.eval()
                audio = next(inf_iterator_test)
                audio = audio.cuda()

                # Rebuild the shuffle for the test batch rather than reusing
                # the chunk order and length of the last training batch.
                time_step = audio.size(2)
                audio_shuffle = [None] * (time_step // factor)
                nums = list(range(time_step // factor))
                random.shuffle(nums)
                for n in nums:
                    sf = random.uniform(0.5, 1.5)
                    audio_shuffle[n] = F.interpolate(
                        audio[..., factor * n:factor * (n + 1)],
                        scale_factor=sf, mode='nearest')
                audio_shuffle = torch.cat(audio_shuffle, dim=2)
                audio = F.interpolate(audio, scale_factor=audio_shuffle.size(2) / time_step)
                audio = audio[..., :audio.size(2) // 16 * 16]
                audio_shuffle = audio_shuffle[..., :audio_shuffle.size(2) // 16 * 16]

                out, out_conversion, enc_content, spk, latent_loss, idx = model(audio, audio_shuffle)

                a = torch.stack([audio[0], audio_shuffle[idx[0]],
                                 out[0], out_conversion[0]], dim=0)
                a = vocoder.inverse(a)
                a = a.detach().cpu().numpy()

                logger.log_validation(
                    iteration=iteration,
                    mel_ori=("image", plot_spectrogram_to_numpy(), audio[0]),
                    mel_target=("image", plot_spectrogram_to_numpy(), audio_shuffle[idx[0]]),
                    mel_recon=("image", plot_spectrogram_to_numpy(), out[0]),
                    mel_conversion=("image", plot_spectrogram_to_numpy(), out_conversion[0]),
                    audio_ori=("audio", 22050, a[0]),
                    audio_target=("audio", 22050, a[1]),
                    audio_recon=("audio", 22050, a[2]),
                    audio_conversion=("audio", 22050, a[3]),
                )
                logger.close()
                save_checkpoint(
                    model, opt, iteration,
                    f'checkpoint/{args.model}_n{args.n_embed}_ch{args.channel}_{args.trainer}/gen')
                model.train()

            iteration += 1
training_info["max reward achieved"]) print("Max TD loss recorded: %f" % training_info["max TD loss recorded"]) print("Max episode loss recorded: %f" % training_info["max episode loss recorded"]) print("Past 100 episodes avg reward: %f \n\n" % training_info["past 100 episodes mean reward"]) # Check if the problem is solved # CartPole standard: average reward for the past 100 episode above 195 if training_info["past 100 episodes mean reward"] > 195: print("\n\n\t Problem Solved !!!\n\n\n") break i_episode += 1 # Update the target network, copying all weights and biases in DQN if i_episode % target_update == 0: target_net.load_state_dict(policy_net.state_dict()) # Every save_ckpt_interval, save a checkpoint according to current i_episode. # Note that we use i_episode + 1 if (i_episode + 1) % save_ckpt_interval == 0: save_checkpoint(ckpt_dir, policy_net, target_net, optimizer, i_episode + 1, learning_rate=learning_rate, **training_info)
def train_(args, model, opt, latent_loss_weight, criterion, loader, epochs,
           inf_iterator_test, logger, iteration, inf_iterator_enc):
    dis = NetD(80).cuda()
    opt_dis = optim.Adam(dis.parameters())
    opt_dec = optim.Adam(model.dec.parameters())

    # Per-bin mel statistics for LJSpeech and VCTK. Note that the LJSpeech
    # statistics are overridden with the VCTK ones below, so both corpora are
    # normalized identically.
    lj_mean = torch.tensor(
        np.load("/home/ericwudayi/nas189/homes/ericwudayi/LJSpeech/mean.npy")
    ).unsqueeze(0).unsqueeze(2).cuda()
    lj_std = torch.tensor(
        np.load("/home/ericwudayi/nas189/homes/ericwudayi/LJSpeech/std.npy")
    ).unsqueeze(0).unsqueeze(2).cuda()
    vctk_mean = torch.tensor(
        np.load("/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel3/mean.npy")
    ).unsqueeze(0).unsqueeze(2).cuda()
    vctk_std = torch.tensor(
        np.load("/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel3/std.npy")
    ).unsqueeze(0).unsqueeze(2).cuda()
    lj_mean = vctk_mean
    lj_std = vctk_std

    if args.load_checkpoint:
        dis, opt_dis, iteration = load_checkpoint(
            f'checkpoint/{args.model}_n{args.n_embed}_ch{args.channel}_{args.trainer}/dis',
            dis, opt_dis)

    for epoch in range(epochs):
        for i, audio in enumerate(loader):
            audio = audio.cuda()
            audio = (audio - lj_mean) / lj_std

            # Chunk-shuffle augmentation (see the first variant above).
            factor = 32
            time_step = audio.size(2)
            audio_shuffle = [None] * (time_step // factor)
            nums = list(range(time_step // factor))
            random.shuffle(nums)
            for n in nums:
                sf = random.uniform(0.5, 2)
                audio_shuffle[n] = F.interpolate(
                    audio[..., factor * n:factor * (n + 1)],
                    scale_factor=sf, mode='nearest')
            audio = torch.cat(audio_shuffle, dim=2)
            audio = audio[..., :audio.size(2) // 32 * 32]

            audio_middle = F.interpolate(audio, scale_factor=1 / 2)
            audio_middle = audio_middle[:, :audio_middle.size(1) // 2, :]
            audio_low = F.interpolate(audio_middle, scale_factor=1 / 2)
            audio_low = audio_low[:, :audio_low.size(1) // 2, :]
            audio_list = [audio_low, audio_middle, audio]

            out, latent_loss, index_list = model(audio)
            recon_loss = sum(criterion(out[num], audio_list[num]) for num in range(3))
            latent_loss = latent_loss.mean()

            model.zero_grad()

            # Pass a batch from the second (VCTK) iterator through the model
            # so its codes are decoded alongside the LJSpeech batch; the batch
            # sizes and lengths are matched first.
            audio_enc = next(inf_iterator_enc)
            audio_enc = audio_enc.cuda()
            audio_enc = (audio_enc - vctk_mean) / vctk_std
            if audio_enc.size(0) > audio.size(0):
                audio_enc = audio_enc[:audio.size(0)]
            else:
                audio = audio[:audio_enc.size(0)]
            audio_enc = F.interpolate(audio_enc,
                                      scale_factor=audio.size(2) / audio_enc.size(2))
            out_code, latent_loss_enc, index_list = model(audio_enc)

            # An adversarial branch (NetD/GANLOSS with a BEGAN recorder) was
            # tried here but is disabled; only the reconstruction and latent
            # losses are optimized.
            OptimStep([(model, opt,
                        recon_loss + latent_loss_weight * latent_loss, False)], 3)
            model.zero_grad()

            if iteration % 5 == 0:
                logger.log_training(iteration=iteration,
                                    loss_recon=recon_loss,
                                    latent_loss=latent_loss)

            if iteration % 200 == 0:
                model.eval()
                a = torch.stack([audio[0], out[-1][0], out_code[-1][0], audio_enc[0]], dim=0)
                a = a * lj_std + lj_mean
                # Re-map the VCTK sample from LJ statistics to VCTK statistics.
                a[3] = (a[3] - lj_mean) / lj_std * vctk_std + vctk_mean
                image = a
                a = vocoder.inverse(a)
                a = a.detach().cpu().numpy()

                logger.log_validation(
                    iteration=iteration,
                    mel_ori=("image", plot_spectrogram_to_numpy(), image[0]),
                    mel_recon=("image", plot_spectrogram_to_numpy(), image[1]),
                    mel_code=("image", plot_spectrogram_to_numpy(), image[2]),
                    mel_target=("image", plot_spectrogram_to_numpy(), image[3]),
                    audio_ori=("audio", 22050, a[0]),
                    audio_recon=("audio", 22050, a[1]),
                    audio_code=("audio", 22050, a[2]),
                    audio_enc=("audio", 22050, a[3]),
                )
                save_checkpoint(
                    model, opt, iteration,
                    f'checkpoint/{args.model}_n{args.n_embed}_ch{args.channel}_{args.trainer}/gen')
                save_checkpoint(
                    dis, opt_dis, iteration,
                    f'checkpoint/{args.model}_n{args.n_embed}_ch{args.channel}_{args.trainer}/dis')
                model.train()
                logger.close()

            iteration += 1
def train_(args, model, opt, latent_loss_weight, criterion, loader, epochs,
           inf_iterator_test, logger, iteration):
    for epoch in range(epochs):
        for i, (audio, pitch) in enumerate(loader):
            audio = audio.cuda().float()
            pitch = pitch.cuda().float()
            audio = (audio * 25 + 50) / 50

            # Normalize pitch: frames below 20 Hz count as silence; subtract
            # the mean over voiced frames, rescale, and pin silent frames to 0.
            pitch_non_sil = pitch > 20
            pitch_sil = pitch < 20
            pitch_mean_non_sil = torch.sum(pitch * pitch_non_sil) / torch.sum(pitch_non_sil)
            pitch -= pitch_mean_non_sil
            pitch = (pitch + 20) / 50
            pitch[pitch_sil] = 0.0
            pitch = pitch.unsqueeze(1)

            # Resolution pyramids for both the mel input and the pitch contour.
            audio_middle = F.interpolate(audio, scale_factor=1 / 2)
            audio_middle = audio_middle[:, :audio_middle.size(1) // 2, :]
            pitch_middle = F.interpolate(pitch, scale_factor=1 / 2)
            audio_low = F.interpolate(audio_middle, scale_factor=1 / 2)
            audio_low = audio_low[:, :audio_low.size(1) // 2, :]
            pitch_low = F.interpolate(pitch_middle, scale_factor=1 / 2)

            audio_list = [audio_low, audio_middle, audio]
            pitch_list = [pitch, pitch_middle, pitch_low]

            out, out_conversion, enc_content, spk, latent_loss, idx = model(audio, pitch_list)
            recon_loss = sum(criterion(out[num], audio_list[num]) for num in range(3))
            latent_loss = latent_loss.mean()

            OptimStep([(model, opt,
                        recon_loss + latent_loss_weight * latent_loss, False)], 3)

            if i % 100 == 0:
                logger.log_training(iteration=iteration,
                                    loss_recon=recon_loss,
                                    latent_loss=latent_loss)

                model.eval()
                audio, pitch = next(inf_iterator_test)
                audio = audio.cuda().float()
                pitch = pitch.cuda().float()
                audio = (audio * 25 + 50) / 50

                pitch_non_sil = pitch > 20
                pitch_sil = pitch < 20
                pitch_mean_non_sil = torch.sum(pitch * pitch_non_sil) / torch.sum(pitch_non_sil)
                pitch -= pitch_mean_non_sil
                pitch = (pitch + 20) / 50
                pitch[pitch_sil] = 0.0
                pitch = pitch.unsqueeze(1)
                pitch_middle = F.interpolate(pitch, scale_factor=1 / 2)
                pitch_low = F.interpolate(pitch_middle, scale_factor=1 / 2)
                pitch_list = [pitch, pitch_middle, pitch_low]

                out, out_conversion, enc_content, spk, latent_loss, idx = model(audio, pitch_list)

                a = torch.stack([audio[0], audio[idx[0]],
                                 out[-1][0], out_conversion[-1][0]], dim=0)
                a = (a * 50 - 50) / 25
                a = vocoder.inverse(a)
                a = a.detach().cpu().numpy()

                logger.log_validation(
                    iteration=iteration,
                    mel_ori=("image", plot_spectrogram_to_numpy(), audio[0]),
                    mel_target=("image", plot_spectrogram_to_numpy(), audio[idx[0]]),
                    mel_recon=("image", plot_spectrogram_to_numpy(), out[-1][0]),
                    mel_conversion=("image", plot_spectrogram_to_numpy(), out_conversion[-1][0]),
                    mel_recon_middle=("image", plot_spectrogram_to_numpy(), out[-2][0]),
                    mel_conversion_middle=("image", plot_spectrogram_to_numpy(), out_conversion[-2][0]),
                    mel_recon_low=("image", plot_spectrogram_to_numpy(), out[-3][0]),
                    mel_conversion_low=("image", plot_spectrogram_to_numpy(), out_conversion[-3][0]),
                    audio_ori=("audio", 22050, a[0]),
                    audio_target=("audio", 22050, a[1]),
                    audio_recon=("audio", 22050, a[2]),
                    audio_conversion=("audio", 22050, a[3]),
                )
                logger.close()
                save_checkpoint(
                    model, opt, iteration,
                    f'checkpoint/{args.model}_n{args.n_embed}_ch{args.channel}_{args.trainer}/gen')
                model.train()

            iteration += 1
if hp.loss == 'BEGAN':
    loss_gan, loss_dis, real_dloss, fake_dloss = BEGANLoss(
        dis_high, singing, fake_singing, k)
    loss_cycle = criterion(speech_2x, fake_speech).mean()
    OptimStep([(m, opt, loss_gan + 0.2 * loss_cycle, True),
               (dis_high, opt_dis, loss_dis, False)], 3)
    k, convergence = recorder(real_dloss, fake_dloss, update_k=True)

if iteration % 5 == 0:
    if hp.loss == "BEGAN":
        logger.log_training(iteration=iteration,
                            loss_gan=loss_gan, loss_dis=loss_dis,
                            loss_cycle=loss_cycle, k=k,
                            convergence=convergence)

if iteration % 50 == 0:
    save_checkpoint(m, opt, iteration,
                    f'checkpoint/{args.checkpoint_path}/gen')
    save_checkpoint(dis_high, opt_dis, iteration,
                    f'checkpoint/{args.checkpoint_path}/dis')

    idx = random.randint(0, fake_singing.size(0) - 1)
    # Denormalization with mean/std around the inverse calls is disabled here.
    real_audio = melblock.inverse(singing).detach().cpu().numpy()
    fake_audio = melblock.inverse(fake_singing).detach().cpu().numpy()
    real_speech_audio = vocoder_speech.inverse(speech).detach().cpu().numpy()

    # Note: the logger accepts only image, audio, and scalar entries.
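# --- Sketch of the logger contract (assumption, not original code) ----------
# As the note above says, the logger accepts only image, audio, and scalar
# entries; each kwarg at the call sites is a (kind, aux, payload) tuple. A
# TensorBoard-style dispatch consistent with that usage, where `aux` is a
# plotting callable for images and a sample rate for audio:
def log_validation(logger, iteration, **kwargs):
    for name, (kind, aux, payload) in kwargs.items():
        if kind == "image":
            logger.add_image(name, aux(payload), iteration)   # aux: mel -> image array
        elif kind == "audio":
            logger.add_audio(name, payload, iteration, sample_rate=aux)
        else:
            logger.add_scalar(name, payload, iteration)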
def main():
    # Directories of the image data
    data_dir = in_arg.data_dir
    train_dir = data_dir + '/train'
    valid_dir = data_dir + '/valid'
    test_dir = data_dir + '/test'

    # Base transforms shared by the validation and testing sets
    data_transforms = transforms.Compose([
        transforms.Resize(255),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Transforms for the training set, using data augmentation
    train_transforms = transforms.Compose([
        transforms.RandomRotation(30),
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    valid_transforms = data_transforms
    test_transforms = data_transforms

    # Load the datasets with ImageFolder
    image_datasets = datasets.ImageFolder(data_dir, transform=data_transforms)
    train_datasets = datasets.ImageFolder(train_dir, transform=train_transforms)
    valid_datasets = datasets.ImageFolder(valid_dir, transform=valid_transforms)
    test_datasets = datasets.ImageFolder(test_dir, transform=test_transforms)

    # DataLoaders with a batch size of 64
    dataloaders = torch.utils.data.DataLoader(image_datasets, batch_size=64, shuffle=True)
    trainloaders = torch.utils.data.DataLoader(train_datasets, batch_size=64, shuffle=True)
    validloaders = torch.utils.data.DataLoader(valid_datasets, batch_size=64, shuffle=True)
    testloaders = torch.utils.data.DataLoader(test_datasets, batch_size=64, shuffle=True)

    # Two pretrained backbones to choose from
    vgg13 = models.vgg13(pretrained=True)
    vgg16 = models.vgg16(pretrained=True)
    models_dict = {'vgg13': vgg13, 'vgg16': vgg16}
    model = models_dict[in_arg.arch]

    # Freeze the feature extractor so its weights are not updated during training
    for param in model.parameters():
        param.requires_grad = False

    # New classifier head with a single hidden layer
    classifier = nn.Sequential(OrderedDict([
        ('fc1', nn.Linear(25088, in_arg.hidden_units)),
        ('relu', nn.ReLU()),
        ('dou', nn.Dropout(p=0.2)),
        ('fc2', nn.Linear(in_arg.hidden_units, 102)),
        ('output', nn.LogSoftmax(dim=1))
    ]))
    model.classifier = classifier

    # Since the output is LogSoftmax, use the negative log-likelihood loss;
    # only the classifier parameters are optimized.
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.classifier.parameters(), lr=in_arg.learning_rate)
    epochs = in_arg.epochs

    # Use the GPU if available and requested
    device = torch.device("cuda" if torch.cuda.is_available() and in_arg.gpu else "cpu")

    for epoch in range(epochs):
        # Train, then validate without tracking gradients
        train_loss, train_accuracy = train_test(0, model, criterion, optimizer,
                                                trainloaders, device, in_arg.gpu)
        with torch.no_grad():
            valid_loss, valid_accuracy = train_test(1, model, criterion, optimizer,
                                                    validloaders, device, in_arg.gpu)

        print("Epoch: {}/{}\n".format(epoch + 1, epochs))
        print("Training Loss: {}\n".format(train_loss))
        print("Validation Loss: {}\n".format(valid_loss))
        print("Validation Accuracy: {}\n".format(valid_accuracy))

    model.class_to_idx = train_datasets.class_to_idx
    save_checkpoint(model, optimizer, in_arg)
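# --- Sketch of a missing helper (assumption, not original code) -------------
# train_test(flag, model, criterion, optimizer, loader, device, gpu) is not
# defined here. A version consistent with its use above, where flag 0 trains
# for one epoch and flag 1 only evaluates (gpu is kept to match the signature):
import torch

def train_test(flag, model, criterion, optimizer, loader, device, gpu):
    model.to(device)
    model.train() if flag == 0 else model.eval()
    total_loss, correct, count = 0.0, 0, 0
    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)
        output = model(images)            # log-probabilities (LogSoftmax head)
        loss = criterion(output, labels)
        if flag == 0:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        total_loss += loss.item()
        correct += (output.argmax(dim=1) == labels).sum().item()
        count += labels.size(0)
    return total_loss / len(loader), correct / count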
def train_(args, model, opt, latent_loss_weight, criterion, loader, epochs,
           inf_iterator_test, logger, iteration):
    # `mean` and `std` are assumed to be module-level mel statistics here.
    for epoch in range(epochs):
        mse_sum = 0
        mse_n = 0
        for i, audio in enumerate(loader):
            audio = audio.cuda()
            audio = (audio - mean) / std / 3

            out, out_conversion, enc_content, spk, latent_loss, idx = model(audio)
            recon_loss = criterion(out, audio)
            latent_loss = latent_loss.mean()

            OptimStep([(model, opt,
                        recon_loss + latent_loss_weight * latent_loss, False)], 3)

            mse_sum += recon_loss.item() * audio.shape[0]
            mse_n += audio.shape[0]

            if i % 5 == 0:
                logger.log_training(iteration=iteration,
                                    loss_recon=recon_loss,
                                    latent_loss=latent_loss)

            if i % 200 == 0:
                model.eval()
                audio = next(inf_iterator_test)
                audio = audio.cuda()
                audio = (audio - mean) / std / 3
                out, out_conversion, enc_content, spk, latent_loss, idx = model(audio)

                a = torch.stack([audio[0], audio[idx[0]], out[0], out_conversion[0]], dim=0)
                a = a * std * 3 + mean  # undo the normalization
                a = vocoder.inverse(a)
                a = a.detach().cpu().numpy()

                logger.log_validation(
                    iteration=iteration,
                    mel_ori=("image", plot_spectrogram_to_numpy(), audio[0]),
                    mel_target=("image", plot_spectrogram_to_numpy(), audio[idx[0]]),
                    mel_recon=("image", plot_spectrogram_to_numpy(), out[0]),
                    mel_conversion=("image", plot_spectrogram_to_numpy(), out_conversion[0]),
                    audio_ori=("audio", 22050, a[0]),
                    audio_target=("audio", 22050, a[1]),
                    audio_recon=("audio", 22050, a[2]),
                    audio_conversion=("audio", 22050, a[3]),
                )
                logger.close()
                save_checkpoint(
                    model, opt, iteration,
                    f'checkpoint/{args.model}_n{args.n_embed}_ch{args.channel}_{args.trainer}/gen')
                model.train()

            iteration += 1
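# --- Note on a module-level global (assumption) ------------------------------
# `vocoder` is defined outside these functions. The .inverse(mel) call and the
# 22050 Hz sample rate are consistent with a MelGAN-style mel-to-waveform
# model, e.g.:
#
#     vocoder = torch.hub.load('descriptinc/melgan-neurips', 'load_melgan')
#     waveform = vocoder.inverse(mel)   # (B, 80, T) -> (B, T * hop_length)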
training_info["past %d epochs mean reward" % (num_avg_epoch)] = \ (sum(training_info["epoch mean rewards"][-num_avg_epoch:]) / num_avg_epoch) \ if len(training_info["epoch mean rewards"]) >= num_avg_epoch else 0 # Print stats print("\n\n============= Epoch: %d =============" % (i_epoch + 1)) print("epoch mean durations: %f" % (epoch_durations[-1])) print("epoch mean rewards: %f" % (epoch_rewards[-1])) print("Max reward achieved: %f" % training_info["max reward achieved"]) print("value net loss: %f" % value_net_mse) # Plot stats if plot: plot_durations(training_info["epoch mean rewards"], training_info["value net loss"]) # Update counter i_epoch += 1 # Every save_ckpt_interval, save a checkpoint according to current i_episode. if i_epoch % save_ckpt_interval == 0: save_checkpoint(ckpt_dir, policy_net, value_net, policynet_optimizer, valuenet_optimizer, i_epoch, policy_lr=policy_lr, valuenet_lr=valuenet_lr, **training_info)
def train_(args, model, opt, latent_loss_weight, criterion, loader, epochs,
           inf_iterator_test, logger, iteration):
    dis = NetD(80).cuda()
    opt_dis = optim.Adam(dis.parameters())

    # BEGAN bookkeeping: k balances the generator and discriminator terms.
    gamma = 1.0
    lambda_k = 0.01
    init_k = 0.0
    recorder = BEGANRecorder(lambda_k, init_k, gamma)
    k = recorder.k.item()

    opt_dec = optim.Adam(model.dec.parameters())

    for epoch in range(epochs):
        for i, audio in enumerate(loader):
            audio = audio.cuda()
            audio = (audio * 25 + 50) / 50

            # Chunk-shuffle augmentation (see the first variant above).
            factor = 32
            time_step = audio.size(2)
            audio_shuffle = [None] * (time_step // factor)
            nums = list(range(time_step // factor))
            random.shuffle(nums)
            for n in nums:
                sf = random.uniform(0.5, 2)
                audio_shuffle[n] = F.interpolate(
                    audio[..., factor * n:factor * (n + 1)],
                    scale_factor=sf, mode='nearest')
            audio = torch.cat(audio_shuffle, dim=2)
            audio = audio[..., :audio.size(2) // 16 * 16]

            audio_middle = F.interpolate(audio, scale_factor=1 / 2)
            audio_middle = audio_middle[:, :audio_middle.size(1) // 2, :]
            audio_low = F.interpolate(audio_middle, scale_factor=1 / 2)
            audio_low = audio_low[:, :audio_low.size(1) // 2, :]
            audio_list = [audio_low, audio_middle, audio]

            out, latent_loss, index_list = model(audio)
            recon_loss = sum(criterion(out[num], audio_list[num]) for num in range(3))
            latent_loss = latent_loss.mean()

            #################################
            #     BEGAN TRAINING PHASE      #
            #################################
            model.zero_grad()

            # Permute each code sequence across the batch and decode it, so
            # the discriminator sees decodings of shuffled codes as fakes.
            index_list_ = []
            for l in index_list:
                idx = torch.randperm(l.size(0))
                index_list_ += [l[idx]]
            out_code = model.index_to_decode(index_list_)

            loss_gan, loss_dis, real_dloss, fake_dloss = BEGANLoss(
                dis, audio, out_code[-1], k)
            OptimStep([(model, opt, recon_loss + latent_loss_weight * latent_loss, True),
                       (model.dec, opt_dec, 0.2 * loss_gan, True),
                       (dis, opt_dis, loss_dis, False)], 3)
            k, convergence = recorder(real_dloss, fake_dloss, update_k=True)

            iteration += 1
            model.zero_grad()

            if i % 5 == 0:
                logger.log_training(iteration=iteration,
                                    loss_gan=loss_gan, loss_dis=loss_dis,
                                    loss_recon=recon_loss, latent_loss=latent_loss,
                                    k=k, convergence=convergence)

            if i % 50 == 0:
                model.eval()
                a = torch.stack([audio[0], out[-1][0], out_code[-1][0]], dim=0)
                a = (a * 50 - 50) / 25
                a = vocoder.inverse(a)
                a = a.detach().cpu().numpy()

                logger.log_validation(
                    iteration=iteration,
                    mel_ori=("image", plot_spectrogram_to_numpy(), audio[0]),
                    mel_recon=("image", plot_spectrogram_to_numpy(), out[-1][0]),
                    mel_code=("image", plot_spectrogram_to_numpy(), out_code[-1][0]),
                    audio_ori=("audio", 22050, a[0]),
                    audio_recon=("audio", 22050, a[1]),
                    audio_code=("audio", 22050, a[2]),
                )
                save_checkpoint(
                    model, opt, iteration,
                    f'checkpoint/{args.model}_n{args.n_embed}_ch{args.channel}_{args.trainer}/gen')
                save_checkpoint(
                    dis, opt_dis, iteration,
                    f'checkpoint/{args.model}_n{args.n_embed}_ch{args.channel}_{args.trainer}/dis')
                model.train()
                logger.close()
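# --- Sketch of missing helpers (assumption, not original code) ---------------
# BEGANLoss and BEGANRecorder come from elsewhere. The standard BEGAN
# formulation they appear to implement: the discriminator is an autoencoder
# whose per-sample "loss" is its reconstruction error, k balances the two
# terms, and convergence measures training progress. A minimal version under
# those assumptions (a full implementation would detach the generator output
# in the discriminator term):
import torch
import torch.nn as nn

def BEGANLoss(dis, real, fake, k):
    real_dloss = (dis(real) - real).abs().mean()
    fake_dloss = (dis(fake) - fake).abs().mean()
    loss_dis = real_dloss - k * fake_dloss   # discriminator objective
    loss_gan = fake_dloss                    # generator objective
    return loss_gan, loss_dis, real_dloss.detach(), fake_dloss.detach()

class BEGANRecorder(nn.Module):
    def __init__(self, lambda_k, init_k, gamma):
        super().__init__()
        self.lambda_k = lambda_k
        self.gamma = gamma
        self.k = nn.Parameter(torch.tensor(init_k), requires_grad=False)

    def forward(self, real_dloss, fake_dloss, update_k=False):
        # Proportional control: push k so the discriminator keeps the ratio
        # fake/real near gamma; convergence is the BEGAN global measure.
        diff = self.gamma * real_dloss - fake_dloss
        convergence = (real_dloss + diff.abs()).item()
        if update_k:
            self.k.data = (self.k.data + self.lambda_k * diff).clamp_(0, 1)
        return self.k.item(), convergence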