def train(self):
    """Train the generator and the discriminator adversarially.

    For every batch yielded by ``self.data_loader``:

    1. The discriminator is updated with the sum of three MSE losses:
       generated images (target 0), the matching real images (smoothed
       target) and real images that do not correspond to the audio
       (target 0).
    2. The generator is updated with the sum of an adversarial MSE term,
       a feature-matching term on the discriminator's intermediate
       activations, an L1 term between generated and real images, and a
       cross-entropy term on the generator's classification scores.

    After each epoch the losses and scores of the last batch are logged,
    and a checkpoint is saved on every epoch whose index is a multiple of
    10 (epoch 0 included).
    """
    # Loss modules hold no per-batch state, so they are built once here
    # rather than on every iteration.
    criterion = nn.MSELoss()
    l2_loss = nn.MSELoss()
    l1_loss = nn.L1Loss()
    softmax_criterion = nn.CrossEntropyLoss()

    print('Training...')
    for epoch in range(self.num_epochs):
        for sample in self.data_loader:
            # Each sample is a dictionary; pull out the entries used below.
            right_images = sample['face']
            onehot = sample['onehot']
            raw_wav = sample['audio']
            wrong_images = sample['wrong_face']

            # Integer identity label for each audio clip, derived from the
            # one-hot encoding (target for the cross-entropy term).
            id_labels = from_onehot_to_int(onehot)

            # Wrap the inputs as Variables and move them to the GPU.
            right_images = Variable(right_images.float()).cuda()
            raw_wav = Variable(raw_wav.float()).cuda()
            wrong_images = Variable(wrong_images.float()).cuda()
            id_labels = Variable(id_labels).cuda()

            # One target per sample in the batch for the discriminator loss.
            batch_size = right_images.size(0)
            real_labels = torch.ones(batch_size)
            fake_labels = torch.zeros(batch_size)

            # One-sided label smoothing: keeps the discriminator from
            # overpowering the generator by penalising over-confident
            # "real" predictions. Per the original author's note, the
            # smoothed real targets end up at 0.9.
            smoothed_real_labels = torch.FloatTensor(
                Utils.smooth_label(real_labels.numpy(), -0.1))

            real_labels = Variable(real_labels).cuda()
            smoothed_real_labels = Variable(smoothed_real_labels).cuda()
            fake_labels = Variable(fake_labels).cuda()

            # ======= #
            # TRAIN D #
            # ======= #
            self.discriminator.zero_grad()

            # The generator is fed only with the raw waveform.
            fake_images, z_vector, _ = self.generator(raw_wav)
            # Cut the graph at the generator outputs: any generator
            # gradients produced by d_loss.backward() would be discarded
            # by self.generator.zero_grad() below, so back-propagating
            # through the generator here only costs time and memory.
            fake_images = fake_images.detach()
            z_vector = z_vector.detach()

            # D receives the images plus the z vector, which it needs for
            # the concatenation in its last hidden layer.
            outputs, _ = self.discriminator(fake_images, z_vector)
            fake_score = outputs  # kept for logging
            fake_loss = criterion(outputs, fake_labels)

            # Real images that match the audio.
            outputs, _ = self.discriminator(right_images, z_vector)
            real_score = outputs  # kept for logging
            real_loss = criterion(outputs, smoothed_real_labels)

            # Real images that do NOT correspond to the audio.
            outputs, _ = self.discriminator(wrong_images, z_vector)
            wrong_loss = criterion(outputs, fake_labels)
            wrong_score = outputs  # kept for logging

            d_loss = real_loss + fake_loss + wrong_loss
            d_loss.backward()
            self.optimD.step()

            # ======= #
            # TRAIN G #
            # ======= #
            self.generator.zero_grad()

            fake_images, z_vector, softmax_scores = self.generator(raw_wav)

            # Intermediate discriminator activations are kept for the
            # feature-matching term.
            outputs, activation_fake = self.discriminator(
                fake_images, z_vector)
            _, activation_real = self.discriminator(right_images, z_vector)

            # Average the activations over the batch dimension.
            activation_fake = torch.mean(activation_fake, 0)
            activation_real = torch.mean(activation_real, 0)

            # Generator loss:
            #   1. adversarial MSE against the (unsmoothed) real targets;
            #   2. feature matching between real and generated activation
            #      statistics (the real side is detached so it acts as a
            #      fixed target);
            #   3. L1 distance between generated and real images, which
            #      ties the embedding directly to pixel values;
            #   4. cross-entropy of the classification scores produced
            #      after the embedding.
            # Original author's note: softmax_loss starts around 2 and
            # g_loss around 20, hence the scaling of the softmax term
            # (presumably via self.softmax_coef -- confirm).
            softmax_loss = softmax_criterion(softmax_scores, id_labels)
            g_loss = (
                criterion(outputs, real_labels)
                + self.l2_coef * l2_loss(activation_fake,
                                         activation_real.detach())
                + self.l1_coef * l1_loss(fake_images, right_images)
                + self.softmax_coef * softmax_loss
            )

            g_loss.backward()
            self.optimG.step()

        # Store the info in the logger at each epoch (values come from the
        # last processed batch).
        self.logger.log_iteration_gan(epoch, d_loss, g_loss, real_score,
                                      fake_score, wrong_score)

        # Store the parameters every 10 epochs.
        if epoch % 10 == 0:
            Utils.save_checkpoint(self.discriminator, self.generator,
                                  self.checkpoints_path, self.save_path,
                                  epoch)
def train(self):
    """Run the adversarial training loop for generator and discriminator.

    Each batch from ``self.data_loader`` triggers one discriminator update
    followed by one generator update:

    * Discriminator loss: MSE on generated images (target 0), on the
      matching real images (smoothed real target) and on the mismatched
      real images from ``sample['wrong_face']`` (target 0).
    * Generator loss: adversarial MSE, plus ``self.l2_coef`` times an MSE
      between batch-averaged discriminator activations for fake and real
      images, plus ``self.l1_coef`` times the L1 distance between fake and
      real images, plus ``self.softmax_coef`` times a cross-entropy loss
      on the generator's classification scores.

    At the end of every epoch the last batch's losses and scores are sent
    to ``self.logger``; a checkpoint is saved whenever ``epoch % 10 == 0``.
    """
    # Built once: these loss modules are loop-invariant.
    criterion = nn.MSELoss()
    l2_loss = nn.MSELoss()
    l1_loss = nn.L1Loss()
    softmax_criterion = nn.CrossEntropyLoss()

    print('Training...')
    for epoch in range(self.num_epochs):
        for sample in self.data_loader:
            right_images = sample['face']
            onehot = sample['onehot']
            raw_wav = sample['audio']
            wrong_images = sample['wrong_face']

            # Integer class labels derived from the one-hot encoding.
            id_labels = from_onehot_to_int(onehot)

            # Move the tensors used by the networks and losses to the GPU.
            right_images = Variable(right_images.float()).cuda()
            raw_wav = Variable(raw_wav.float()).cuda()
            wrong_images = Variable(wrong_images.float()).cuda()
            id_labels = Variable(id_labels).cuda()

            # Per-sample targets for the discriminator outputs.
            batch_size = right_images.size(0)
            real_labels = torch.ones(batch_size)
            fake_labels = torch.zeros(batch_size)
            # so smooth_real_labels will now be 0.9
            smoothed_real_labels = torch.FloatTensor(
                Utils.smooth_label(real_labels.numpy(), -0.1))

            real_labels = Variable(real_labels).cuda()
            smoothed_real_labels = Variable(smoothed_real_labels).cuda()
            fake_labels = Variable(fake_labels).cuda()

            # ---- discriminator update ----
            self.discriminator.zero_grad()

            fake_images, z_vector, _ = self.generator(raw_wav)
            # Detach the generator outputs: gradients flowing back into
            # the generator from d_loss would be wiped by
            # self.generator.zero_grad() before the generator step, so
            # computing them is wasted work.
            fake_images = fake_images.detach()
            z_vector = z_vector.detach()

            outputs, _ = self.discriminator(fake_images, z_vector)
            fake_score = outputs  # logged later
            fake_loss = criterion(outputs, fake_labels)

            outputs, _ = self.discriminator(right_images, z_vector)
            real_score = outputs  # logged later
            real_loss = criterion(outputs, smoothed_real_labels)

            outputs, _ = self.discriminator(wrong_images, z_vector)
            wrong_loss = criterion(outputs, fake_labels)
            wrong_score = outputs  # logged later

            d_loss = real_loss + fake_loss + wrong_loss
            d_loss.backward()
            self.optimD.step()

            # ---- generator update ----
            self.generator.zero_grad()

            fake_images, z_vector, softmax_scores = self.generator(raw_wav)
            outputs, activation_fake = self.discriminator(
                fake_images, z_vector)
            _, activation_real = self.discriminator(right_images, z_vector)

            # Batch-averaged activations for the feature-matching term.
            activation_fake = torch.mean(activation_fake, 0)
            activation_real = torch.mean(activation_real, 0)

            softmax_loss = softmax_criterion(softmax_scores, id_labels)
            # The real activations are detached so they serve as a fixed
            # target and no gradient flows through that branch.
            g_loss = (
                criterion(outputs, real_labels)
                + self.l2_coef * l2_loss(activation_fake,
                                         activation_real.detach())
                + self.l1_coef * l1_loss(fake_images, right_images)
                + self.softmax_coef * softmax_loss
            )

            g_loss.backward()
            self.optimG.step()

        # Per-epoch logging of the last batch's losses and scores.
        self.logger.log_iteration_gan(epoch, d_loss, g_loss, real_score,
                                      fake_score, wrong_score)

        # Periodic checkpoint (every 10th epoch, starting with epoch 0).
        if epoch % 10 == 0:
            Utils.save_checkpoint(self.discriminator, self.generator,
                                  self.checkpoints_path, self.save_path,
                                  epoch)