def test(self):
    self.mfcc_encoder.eval()
    self.lip_feature_encoder.eval()
    self.input_label = Variable(self.input_label, volatile=True)
    self.audios_dis = Variable(self.audio_pred_data, volatile=True)
    self.video_dis = Variable(self.video_pred_data, volatile=True)
    # compute the sequence ID embeddings
    self.audio_embeddings_dis = self.mfcc_encoder.forward(
        self.audios_dis).view(-1, 256 * self.opt.pred_length)
    self.lip_embeddings_dis = self.lip_feature_encoder.forward(
        self.video_dis).view(-1, 256 * self.opt.pred_length)
    # loss between audio and lip embeddings
    self.lip_embedding_norm = embedding_utils.l2_norm(self.lip_embeddings_dis)
    self.audio_embedding_norm = embedding_utils.l2_norm(self.audio_embeddings_dis)
    self.lip_embeddings_buffer = Variable(self.lip_embedding_norm.data)
    self.EmbeddingL2 = self.L2Contrastive.forward(
        self.audio_embedding_norm, self.lip_embeddings_buffer)
    # classification
    self.audio_pred = self.model_fusion.forward(self.audio_embeddings_dis)
    self.audio_acc = self.compute_acc(self.audio_pred)
    self.image_pred = self.model_fusion.forward(self.lip_embeddings_dis)
    self.image_acc = self.compute_acc(self.image_pred)
    self.output = self.audio_pred + self.image_pred
    self.final_acc = self.compute_acc(self.output)
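# The test() method above relies on a compute_acc helper that is not shown in this
# excerpt. Purely as an illustrative sketch of what such a word-classification
# accuracy check might look like (hypothetical; the actual helper may differ):
def compute_acc(self, pred):
    # argmax over the class scores, compared against the ground-truth labels
    _, pred_label = pred.data.max(1)
    correct = (pred_label == self.input_label.data).sum()
    return float(correct) / pred.size(0)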
def forward(self):
    self.input_label = Variable(self.input_label)
    self.real_A = Variable(self.input_A)  # a single image
    B_start = random.randint(
        0, self.opt.pred_length - self.opt.sequence_length)  # random integer in [0, 12 - 6]
    self.audios_dis = Variable(self.audio_pred_data)
    self.video_dis = Variable(self.video_pred_data)
    # real_videos are the frames used for training generation
    self.real_videos = Variable(
        self.video_pred_data[:, B_start:B_start + self.opt.sequence_length, :, :, :]
        .contiguous())  # extract 6 frames
    self.audios = Variable(
        self.audio_pred_data[:, B_start:B_start + self.opt.sequence_length, :, :, :]
        .contiguous())  # extract the 6 corresponding audio segments
    self.video_send_to_disfc = Variable(self.input_video_dis)  # video fed to the discriminator
    self.mask = Variable(
        self.Tensor(self.opt.batchSize,
                    self.opt.sequence_length * self.opt.image_channel_size,
                    self.opt.image_size,
                    self.opt.image_size).fill_(0))  # (16, 6*3, 256, 256), filled with zeros
    self.mask[:, :, 170:234, 64:192] = 1  # fill the image region [170:234, 64:192] with 1
    self.mask_ones = Variable(
        self.Tensor(self.opt.batchSize,
                    self.opt.image_channel_size,
                    self.opt.image_size,
                    self.opt.image_size).fill_(1))  # (16, 3, 256, 256), filled with ones
    self.mask_ones[:, :, 170:234, 64:192] = 0  # fill the image region [170:234, 64:192] with 0
    self.mfcc_encoder.train()
    self.lip_feature_encoder.train()
    # compute the ID embeddings
    self.real_A_id_embedding = self.ID_encoder.forward(self.real_A)
    # compute the sequence ID embeddings
    if self.opt.disfc_length == 12:
        self.sequence_id_embedding = self.ID_encoder.forward(self.video_dis)
    else:
        self.sequence_id_embedding = self.ID_encoder.forward(self.video_send_to_disfc)
    self.sequence_id_embedding = self.sequence_id_embedding[4].view(
        -1, self.opt.disfc_length * 64, 64, 64)
    # extract the lip feature
    # self.audio_embedding = self.mfcc_encoder.forward(self.audio_A)
    self.audio_embeddings_dis = self.mfcc_encoder.forward(
        self.audios_dis)  # extract audio features, shape (-1, 12, 256)
    self.lip_embeddings_dis = self.lip_feature_encoder.forward(
        self.video_dis)  # extract lip (video) features, shape (-1, 12, 256)
    self.audio_embeddings = self.audio_embeddings_dis[
        :, B_start:B_start + self.opt.sequence_length].contiguous()
    self.lip_embeddings = self.lip_embeddings_dis[
        :, B_start:B_start + self.opt.sequence_length].contiguous()
    # loss between audio and lip embeddings: the distance between the audio and
    # video features, i.e. the contrastive loss Lc
    self.lip_embedding_norm = embedding_utils.l2_norm(
        self.lip_embeddings_dis.view(-1, 256 * self.opt.pred_length))  # (-1, 256*12)
    self.audio_embedding_norm = embedding_utils.l2_norm(
        self.audio_embeddings_dis.view(-1, 256 * self.opt.pred_length))  # (-1, 256*12)
    self.lip_embeddings_buffer = Variable(self.lip_embedding_norm.data)
    self.EmbeddingL2 = self.L2Contrastive.forward(
        self.lip_embeddings_buffer, self.audio_embedding_norm)
    # generate fake images
    self.sequence_generation()
    # single (batch_size * 6 * 2, 3, 256, 256)
    self.fakes = torch.cat(
        (self.audio_gen_fakes_batch, self.image_gen_fakes_batch),
        0)  # torch.cat((A, B), 0) concatenates A and B along dim 0
    self.real_one = self.real_videos.view(-1, self.opt.image_channel_size,
                                          self.opt.image_size,
                                          self.opt.image_size)
    self.reals = torch.cat((self.real_one, self.real_one), 0)
    self.audio_reals = torch.cat(
        (self.audios.view(-1, 1, self.opt.mfcc_length, self.opt.mfcc_width),
         self.audios.view(-1, 1, self.opt.mfcc_length, self.opt.mfcc_width)), 0)
    # sequence (-1, 6 * 3, 256, 256)
    self.fakes_sequence = self.fakes.view(
        -1, self.opt.image_channel_size * self.opt.sequence_length,
        self.opt.image_size, self.opt.image_size)
    self.real_one_sequence = self.real_videos.view(
        -1, self.opt.image_channel_size * self.opt.sequence_length,
        self.opt.image_size, self.opt.image_size)
    self.reals_sequence = self.reals.view(
        -1, self.opt.image_channel_size * self.opt.sequence_length,
        self.opt.image_size, self.opt.image_size)
    self.audio_reals_sequence = self.audio_reals.view(
        -1, self.opt.sequence_length, self.opt.mfcc_length, self.opt.mfcc_width)
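# Both forward() and test() normalise the embeddings with embedding_utils.l2_norm
# before the contrastive loss. That utility is not part of this excerpt; a minimal
# sketch of a row-wise L2 normalisation over a (batch, feature) input follows
# (hypothetical implementation, the real embedding_utils may differ):
def l2_norm(x, eps=1e-10):
    # scale every row to unit L2 norm
    norm = x.pow(2).sum(1, keepdim=True).sqrt()
    return x / (norm + eps)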
def forward(self):
    self.input_label = Variable(self.input_label)
    self.real_A = Variable(self.input_A)
    B_start = random.randint(
        0, self.opt.pred_length - self.opt.sequence_length)
    self.audios_dis = Variable(self.audio_pred_data)
    self.video_dis = Variable(self.video_pred_data)
    # real_videos are the frames used for training generation
    self.real_videos = Variable(
        self.video_pred_data[:, B_start:B_start + self.opt.sequence_length, :, :, :]
        .contiguous())
    self.audios = Variable(
        self.audio_pred_data[:, B_start:B_start + self.opt.sequence_length, :, :, :]
        .contiguous())
    self.video_send_to_disfc = Variable(self.input_video_dis)
    self.mask = Variable(
        self.Tensor(self.opt.batchSize,
                    self.opt.sequence_length * self.opt.image_channel_size,
                    self.opt.image_size,
                    self.opt.image_size).fill_(0))
    self.mask[:, :, 170:234, 64:192] = 1
    self.mask_ones = Variable(
        self.Tensor(self.opt.batchSize,
                    self.opt.image_channel_size,
                    self.opt.image_size,
                    self.opt.image_size).fill_(1))
    self.mask_ones[:, :, 170:234, 64:192] = 0
    self.mfcc_encoder.train()
    self.lip_feature_encoder.train()
    # compute the ID embeddings
    self.real_A_id_embedding = self.ID_encoder.forward(self.real_A)
    # compute the sequence ID embeddings
    if self.opt.disfc_length == 12:
        self.sequence_id_embedding = self.ID_encoder.forward(self.video_dis)
    else:
        self.sequence_id_embedding = self.ID_encoder.forward(self.video_send_to_disfc)
    self.sequence_id_embedding = self.sequence_id_embedding[4].view(
        -1, self.opt.disfc_length * 64, 64, 64)
    # extract the lip feature
    # self.audio_embedding = self.mfcc_encoder.forward(self.audio_A)
    self.audio_embeddings_dis = self.mfcc_encoder.forward(self.audios_dis)
    self.lip_embeddings_dis = self.lip_feature_encoder.forward(self.video_dis)
    self.audio_embeddings = self.audio_embeddings_dis[
        :, B_start:B_start + self.opt.sequence_length].contiguous()
    self.lip_embeddings = self.lip_embeddings_dis[
        :, B_start:B_start + self.opt.sequence_length].contiguous()
    # loss between audio and lip embeddings
    self.lip_embedding_norm = embedding_utils.l2_norm(
        self.lip_embeddings_dis.view(-1, 256 * self.opt.pred_length))
    self.audio_embedding_norm = embedding_utils.l2_norm(
        self.audio_embeddings_dis.view(-1, 256 * self.opt.pred_length))
    self.lip_embeddings_buffer = Variable(self.lip_embedding_norm.data)
    self.EmbeddingL2 = self.L2Contrastive.forward(
        self.lip_embeddings_buffer, self.audio_embedding_norm)
    # generate fake images
    self.sequence_generation()
    # single
    self.fakes = torch.cat(
        (self.audio_gen_fakes_batch, self.image_gen_fakes_batch), 0)
    self.real_one = self.real_videos.view(-1, self.opt.image_channel_size,
                                          self.opt.image_size,
                                          self.opt.image_size)
    self.reals = torch.cat((self.real_one, self.real_one), 0)
    self.audio_reals = torch.cat(
        (self.audios.view(-1, 1, self.opt.mfcc_length, self.opt.mfcc_width),
         self.audios.view(-1, 1, self.opt.mfcc_length, self.opt.mfcc_width)), 0)
    # sequence
    self.fakes_sequence = self.fakes.view(
        -1, self.opt.image_channel_size * self.opt.sequence_length,
        self.opt.image_size, self.opt.image_size)
    self.real_one_sequence = self.real_videos.view(
        -1, self.opt.image_channel_size * self.opt.sequence_length,
        self.opt.image_size, self.opt.image_size)
    self.reals_sequence = self.reals.view(
        -1, self.opt.image_channel_size * self.opt.sequence_length,
        self.opt.image_size, self.opt.image_size)
    self.audio_reals_sequence = self.audio_reals.view(
        -1, self.opt.sequence_length, self.opt.mfcc_length, self.opt.mfcc_width)
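# The L2Contrastive criterion used above is likewise not defined in this excerpt.
# As a hedged sketch only (hypothetical class name and formulation, written against
# modern PyTorch; the repository's actual loss may differ): embeddings at the same
# batch index are treated as a matched lip/audio pair and pulled together, while
# mismatched pairs are pushed apart up to a margin.
import torch
import torch.nn as nn

class L2ContrastiveLossSketch(nn.Module):
    def __init__(self, margin=0.5):
        super(L2ContrastiveLossSketch, self).__init__()
        self.margin = margin

    def forward(self, lip_emb, audio_emb):
        # both inputs: (batch, feature) rows, assumed to be L2-normalised
        dist = (lip_emb.unsqueeze(1) - audio_emb.unsqueeze(0)).pow(2).sum(2)  # (batch, batch)
        pos = torch.eye(dist.size(0), dtype=dist.dtype, device=dist.device)
        # pull matched (diagonal) pairs together, push mismatched pairs apart up to the margin
        pos_loss = (dist * pos).sum() / pos.sum()
        neg_loss = (torch.clamp(self.margin - dist, min=0) * (1 - pos)).sum() / (1 - pos).sum()
        return pos_loss + neg_loss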