    def test(self):
        self.mfcc_encoder.eval()
        self.lip_feature_encoder.eval()
        self.input_label = Variable(self.input_label, volatile=True)
        self.audios_dis = Variable(self.audio_pred_data, volatile=True)
        self.video_dis = Variable(self.video_pred_data, volatile=True)
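        # volatile=True is the pre-0.4 PyTorch idiom for inference: it skips
        # autograd graph construction (newer code would use torch.no_grad()).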

        # compute the sequence ID embeddings

        self.audio_embeddings_dis = self.mfcc_encoder.forward(
            self.audios_dis).view(-1, 256 * self.opt.pred_length)
        self.lip_embeddings_dis = self.lip_feature_encoder.forward(
            self.video_dis).view(-1, 256 * self.opt.pred_length)

        # loss between audio and lip embeddings
        self.lip_embedding_norm = embedding_utils.l2_norm(
            self.lip_embeddings_dis)
        self.audio_embedding_norm = embedding_utils.l2_norm(
            self.audio_embeddings_dis)
        self.lip_embeddings_buffer = Variable(self.lip_embedding_norm.data)
        self.EmbeddingL2 = self.L2Contrastive.forward(
            self.audio_embedding_norm, self.lip_embeddings_buffer)
        # classification
        self.audio_pred = self.model_fusion.forward(self.audio_embeddings_dis)
        self.audio_acc = self.compute_acc(self.audio_pred)
        self.image_pred = self.model_fusion.forward(self.lip_embeddings_dis)
        self.image_acc = self.compute_acc(self.image_pred)
        # late fusion: sum the audio and image predictions before scoring
        self.output = self.audio_pred + self.image_pred
        self.final_acc = self.compute_acc(self.output)
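
    # Note: compute_acc is defined elsewhere in this class; a plausible
    # (purely hypothetical) implementation, assuming `pred` holds per-class
    # scores and self.input_label holds integer class indices, would be:
    #     _, idx = torch.max(pred.data, 1)
    #     return (idx == self.input_label.data).float().mean()
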
    def forward(self):

        self.input_label = Variable(self.input_label)
        self.real_A = Variable(self.input_A)  # a single reference image
        B_start = random.randint(0, self.opt.pred_length -
                                 self.opt.sequence_length)  # random start in [0, pred_length - sequence_length], e.g. [0, 12 - 6]
        self.audios_dis = Variable(self.audio_pred_data)
        self.video_dis = Variable(self.video_pred_data)
        # real_videos are the frames actually used for training the generator
        self.real_videos = Variable(
            self.video_pred_data[:, B_start:B_start +
                                 self.opt.sequence_length, :, :, :]
            .contiguous())  # crop sequence_length (e.g. 6) consecutive frames
        self.audios = Variable(
            self.audio_pred_data[:, B_start:B_start +
                                 self.opt.sequence_length, :, :, :]
            .contiguous())  # the matching 6 audio (MFCC) segments
        self.video_send_to_disfc = Variable(self.input_video_dis)  # video fed to the discriminator
        self.mask = Variable(
            self.Tensor(
                self.opt.batchSize,
                self.opt.sequence_length * self.opt.image_channel_size,
                self.opt.image_size,
                self.opt.image_size).fill_(0))  # (16, 6*3, 256, 256), zero-filled
        self.mask[:, :, 170:234, 64:192] = 1  # mouth region [170:234, 64:192] set to 1
        self.mask_ones = Variable(
            self.Tensor(self.opt.batchSize, self.opt.image_channel_size,
                        self.opt.image_size,
                        self.opt.image_size).fill_(1))  # (16, 3, 256, 256), one-filled
        self.mask_ones[:, :, 170:234, 64:192] = 0  # complementary mask: mouth region zeroed out
        self.mfcc_encoder.train()
        self.lip_feature_encoder.train()

        # compute the ID embeddings
        self.real_A_id_embedding = self.ID_encoder.forward(self.real_A)

        # compute the sequence ID embeddings
        if self.opt.disfc_length == 12:
            self.sequence_id_embedding = self.ID_encoder.forward(
                self.video_dis)
        else:
            self.sequence_id_embedding = self.ID_encoder.forward(
                self.video_send_to_disfc)
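        # ID_encoder apparently returns a list of intermediate feature maps;
        # index 4 below selects one of them (an inference from this usage).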
        self.sequence_id_embedding = self.sequence_id_embedding[4].view(
            -1, self.opt.disfc_length * 64, 64, 64)

        # extract the lip feature

        # self.audio_embedding = self.mfcc_encoder.forward(self.audio_A)
        self.audio_embeddings_dis = self.mfcc_encoder.forward(
            self.audios_dis)  # audio features, shape (-1, 12, 256)
        self.lip_embeddings_dis = self.lip_feature_encoder.forward(
            self.video_dis)  # lip (video) features, shape (-1, 12, 256)
        self.audio_embeddings = self.audio_embeddings_dis[
            :, B_start:B_start + self.opt.sequence_length].contiguous()
        self.lip_embeddings = self.lip_embeddings_dis[
            :, B_start:B_start + self.opt.sequence_length].contiguous()

        # loss between audio and lip embeddings: the distance between the
        # audio and video features, i.e. the contrastive loss Lc
        self.lip_embedding_norm = embedding_utils.l2_norm(
            self.lip_embeddings_dis.view(-1, 256 *
                                         self.opt.pred_length))  # (-1,256*12)
        self.audio_embedding_norm = embedding_utils.l2_norm(
            self.audio_embeddings_dis.view(
                -1, 256 * self.opt.pred_length))  # (-1,256*12)
        self.lip_embeddings_buffer = Variable(self.lip_embedding_norm.data)
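        # Variable(.data) detaches the normalized lip embeddings from the
        # graph, so the contrastive loss below only updates the audio branch.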
        self.EmbeddingL2 = self.L2Contrastive.forward(
            self.lip_embeddings_buffer, self.audio_embedding_norm)
        # generate fake images

        self.sequence_generation()
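        # sequence_generation() (defined elsewhere) is expected to populate
        # self.audio_gen_fakes_batch and self.image_gen_fakes_batch used below.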

        # single frames: (batch_size * 6 * 2, 3, 256, 256)
        self.fakes = torch.cat(
            (self.audio_gen_fakes_batch, self.image_gen_fakes_batch),
            0)  # concatenate the audio-driven and image-driven fakes along dim 0
        self.real_one = self.real_videos.view(-1, self.opt.image_channel_size,
                                              self.opt.image_size,
                                              self.opt.image_size)
        self.reals = torch.cat((self.real_one, self.real_one), 0)  # duplicated to align with the two fake streams
        self.audio_reals = torch.cat(
            (self.audios.view(-1, 1, self.opt.mfcc_length,
                              self.opt.mfcc_width),
             self.audios.view(-1, 1, self.opt.mfcc_length,
                              self.opt.mfcc_width)), 0)

        # sequence views: (-1, 6 * 3, 256, 256)
        self.fakes_sequence = self.fakes.view(
            -1, self.opt.image_channel_size * (self.opt.sequence_length),
            self.opt.image_size, self.opt.image_size)
        self.real_one_sequence = self.real_videos.view(
            -1, self.opt.image_channel_size * (self.opt.sequence_length),
            self.opt.image_size, self.opt.image_size)
        self.reals_sequence = self.reals.view(
            -1, self.opt.image_channel_size * self.opt.sequence_length,
            self.opt.image_size, self.opt.image_size)
        self.audio_reals_sequence = self.audio_reals.view(
            -1, self.opt.sequence_length, self.opt.mfcc_length,
            self.opt.mfcc_width)
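

# For context, a minimal sketch of the two embedding utilities used above.
# This is an assumption, not the project's actual code: embedding_utils.l2_norm
# and self.L2Contrastive are defined elsewhere and may differ. The sketch
# assumes row-wise L2 normalization and a margin-based contrastive loss over
# matched audio/lip pairs; all names below are hypothetical.

import torch


def l2_norm_sketch(x, eps=1e-10):
    # Scale each row of a 2-D tensor (N, D) to unit L2 norm.
    norm = torch.sqrt(torch.sum(x * x, dim=1, keepdim=True) + eps)
    return x / norm


def l2_contrastive_sketch(audio, lip, margin=0.5):
    # audio, lip: (N, D) row-normalized embeddings; row i of each tensor is a
    # matched pair. Build the full N x N matrix of squared L2 distances:
    dist = (audio.unsqueeze(1) - lip.unsqueeze(0)).pow(2).sum(-1)
    n = audio.size(0)
    pos = torch.diag(dist)                         # matched-pair distances
    off = 1.0 - torch.eye(n, device=dist.device)   # mask for mismatched pairs
    neg = torch.clamp(margin - dist, min=0.0) * off
    # Pull matched pairs together, push mismatched pairs beyond the margin.
    return pos.mean() + neg.sum() / (n * max(n - 1, 1))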