Example #1
def animate(args):

    # get context
    ctx = get_extension_context(args.context)
    nn.set_default_context(ctx)
    logger.setLevel(logging.ERROR)  # to suppress minor messages

    if not args.config:
        assert not args.params, "a pretrained weights file was given, but the corresponding config file was not. Please provide both."
        download_provided_file(
            "https://nnabla.org/pretrained-models/nnabla-examples/GANs/first-order-model/voxceleb_trained_info.yaml"
        )
        args.config = 'voxceleb_trained_info.yaml'

        download_provided_file(
            "https://nnabla.org/pretrained-models/nnabla-examples/GANs/first-order-model/pretrained_fomm_params.h5"
        )

    config = read_yaml(args.config)

    dataset_params = config.dataset_params
    model_params = config.model_params

    if args.detailed:
        vis_params = config.visualizer_params
        visualizer = Visualizer(**vis_params)

    if not args.params:
        assert "log_dir" in config, "no log_dir found in config, so pretrained parameters cannot be located."
        param_file = os.path.join(config.log_dir, config.saved_parameters)
    else:
        param_file = args.params
    print(f"Loading {param_file} for image animation...")
    nn.load_parameters(param_file)

    bs, h, w, c = [1] + dataset_params.frame_shape
    source = nn.Variable((bs, c, h, w))
    driving_initial = nn.Variable((bs, c, h, w))
    driving = nn.Variable((bs, c, h, w))

    filename = args.driving

    # read the driving video
    driving_video = read_video(
        filename, dataset_params.frame_shape)  # (#frames, h, w, 3)
    driving_video = np.transpose(driving_video,
                                 (0, 3, 1, 2))  # (#frames, 3, h, w)

    source_img = imread(args.source, channel_first=True,
                        size=(256, 256)) / 255.
    source_img = source_img[:3]

    source.d = np.expand_dims(source_img, 0)
    driving_initial.d = driving_video[0][:3]

    with nn.parameter_scope("kp_detector"):
        kp_source = detect_keypoint(source,
                                    **model_params.kp_detector_params,
                                    **model_params.common_params,
                                    test=True,
                                    comm=False)
        persistent_all(kp_source)

    with nn.parameter_scope("kp_detector"):
        kp_driving_initial = detect_keypoint(driving_initial,
                                             **model_params.kp_detector_params,
                                             **model_params.common_params,
                                             test=True,
                                             comm=False)
        persistent_all(kp_driving_initial)

    with nn.parameter_scope("kp_detector"):
        kp_driving = detect_keypoint(driving,
                                     **model_params.kp_detector_params,
                                     **model_params.common_params,
                                     test=True,
                                     comm=False)
        persistent_all(kp_driving)

    if args.adapt_movement_scale:
        nn.forward_all([
            kp_source["value"], kp_source["jacobian"],
            kp_driving_initial["value"], kp_driving_initial["jacobian"]
        ])
        source_area = ConvexHull(kp_source['value'].d[0]).volume
        driving_area = ConvexHull(kp_driving_initial['value'].d[0]).volume
        adapt_movement_scale = np.sqrt(source_area) / np.sqrt(driving_area)
    else:
        adapt_movement_scale = 1

    kp_norm = adjust_kp(kp_source=unlink_all(kp_source),
                        kp_driving=kp_driving,
                        kp_driving_initial=unlink_all(kp_driving_initial),
                        adapt_movement_scale=adapt_movement_scale,
                        use_relative_movement=args.unuse_relative_movement,
                        use_relative_jacobian=args.unuse_relative_jacobian)
    persistent_all(kp_norm)

    with nn.parameter_scope("generator"):
        generated = occlusion_aware_generator(source,
                                              kp_source=unlink_all(kp_source),
                                              kp_driving=kp_norm,
                                              **model_params.generator_params,
                                              **model_params.common_params,
                                              test=True,
                                              comm=False)

    if not args.full and 'sparse_deformed' in generated:
        del generated['sparse_deformed']  # remove needless info

    persistent_all(generated)

    generated['kp_driving'] = kp_driving
    generated['kp_source'] = kp_source
    generated['kp_norm'] = kp_norm

    # generated contains these values:
    # 'mask': Variable((bs, num_kp+1, h/4, w/4)) when scale_factor=0.25
    # 'sparse_deformed': Variable((bs, num_kp+1, num_channel, h/4, w/4))
    # 'occlusion_map': Variable((bs, 1, h/4, w/4))
    # 'deformed': Variable((bs, c, h, w))
    # 'prediction': Variable((bs, c, h, w))

    mode = "arbitrary"
    if "log_dir" in config:
        result_dir = os.path.join(args.out_dir,
                                  os.path.basename(config.log_dir), mode)
    else:
        result_dir = os.path.join(args.out_dir, "test_result", mode)

    # create an empty directory to save generated results
    _ = nm.Monitor(result_dir)

    # load the header image.
    header = imread("imgs/header_combined.png", channel_first=True)
    generated_images = list()

    # compute these in advance and reuse
    nn.forward_all([kp_source["value"], kp_source["jacobian"]],
                   clear_buffer=True)
    nn.forward_all(
        [kp_driving_initial["value"], kp_driving_initial["jacobian"]],
        clear_buffer=True)

    num_of_driving_frames = driving_video.shape[0]

    for frame_idx in tqdm(range(num_of_driving_frames)):
        driving.d = driving_video[frame_idx][:3]
        nn.forward_all([generated["prediction"], generated["deformed"]],
                       clear_buffer=True)

        if args.detailed:
            # visualize source w/kp, driving w/kp, deformed source, generated w/kp, generated image, occlusion map
            visualization = visualizer.visualize(source=source.d,
                                                 driving=driving.d,
                                                 out=generated)
            if args.full:
                visualization = reshape_result(visualization)  # (H, W, C)
            combined_image = visualization.transpose(2, 0, 1)  # (C, H, W)

        elif args.only_generated:
            combined_image = np.clip(generated["prediction"].d[0], 0.0, 1.0)
            combined_image = (255 * combined_image).astype(
                np.uint8)  # (C, H, W)

        else:
            # visualize source, driving, and generated image
            driving_fake = np.concatenate([
                np.clip(driving.d[0], 0.0, 1.0),
                np.clip(generated["prediction"].d[0], 0.0, 1.0)
            ],
                                          axis=2)
            header_source = np.concatenate([
                np.clip(header / 255., 0.0, 1.0),
                np.clip(source.d[0], 0.0, 1.0)
            ],
                                           axis=2)
            combined_image = np.concatenate([header_source, driving_fake],
                                            axis=1)
            combined_image = (255 * combined_image).astype(np.uint8)

        generated_images.append(combined_image)

    # once each video is generated, save it.
    output_filename = f"{os.path.splitext(os.path.basename(filename))[0]}.mp4"
    output_filename = f"{os.path.basename(args.source)}_by_{output_filename}"
    output_filename = output_filename.replace("#", "_")
    if args.output_png:
        monitor_vis = nm.MonitorImage(output_filename,
                                      nm.Monitor(result_dir),
                                      interval=1,
                                      num_images=1,
                                      normalize_method=lambda x: x)
        for frame_idx, img in enumerate(generated_images):
            monitor_vis.add(frame_idx, img)
    else:
        generated_images = [img.transpose(1, 2, 0) for img in generated_images]
        # you might need to change ffmpeg_params according to your environment.
        mimsave(os.path.join(result_dir, output_filename),
                generated_images,
                fps=args.fps,
                ffmpeg_params=[
                    "-pix_fmt", "yuv420p", "-vcodec", "libx264", "-f", "mp4",
                    "-q", "0"
                ])

    return
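A minimal sketch of driving animate without the repository's own CLI; the attribute names are taken from the function body above, while every value shown is a placeholder:

from types import SimpleNamespace

# hypothetical invocation; all values are placeholders
args = SimpleNamespace(
    context="cudnn",              # device context string for get_extension_context
    config=None,                  # None triggers the pretrained-config download above
    params=None,                  # None makes the code look up config.log_dir
    driving="driving.mp4",        # path to the driving video
    source="source.png",          # path to the source image
    out_dir="result",
    detailed=False,
    full=False,
    only_generated=False,
    adapt_movement_scale=True,
    unuse_relative_movement=True,    # passed through as use_relative_movement
    unuse_relative_jacobian=True,    # passed through as use_relative_jacobian
    output_png=False,
    fps=10,
)
animate(args)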
Example #2
    Logger.load_cpk(opt.checkpoint,
                    generator=generator,
                    kp_detector=kp_detector,
                    use_cpu=False)

    vis = Visualizer()

    # generator = DataParallelWithCallback(generator)
    # kp_detector = DataParallelWithCallback(kp_detector)

    generator.eval()
    kp_detector.eval()

    with torch.no_grad():
        driving_video = VideoToTensor()(read_video(
            opt.driving_video, opt.image_shape + (3, )))['video']
        source_image = VideoToTensor()(read_video(
            opt.source_image, opt.image_shape + (3, )))['video'][:, :1]
        print(source_image.shape)

        driving_video = torch.from_numpy(driving_video).unsqueeze(0)
        source_image = torch.from_numpy(source_image).unsqueeze(0)

        out = transfer_one(generator, kp_detector, source_image, driving_video,
                           config['transfer_params'])
        '''
        # Pickle the output dictionary
        f = open('keypoints.pkl', 'wb')
        pickle.dump(out, f)
        f.close()
        '''
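The commented-out pickling above works, but it leaves the file handle open if pickle.dump raises; a context manager is the safer equivalent. A minimal sketch, using the same 'keypoints.pkl' name as the comment:

import pickle

# same effect as the commented-out block, but the file is closed
# even if pickle.dump raises
with open('keypoints.pkl', 'wb') as f:
    pickle.dump(out, f)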
Example #3
    def __getitem__(self, idx):
        # returns the source video at the given index
        # together with a matching target video
        if self.is_train and self.id_sampling:
            name_source = self.source_videos[idx]
            path_source = np.random.choice(
                glob.glob(os.path.join(self.source_dir,
                                       name_source + '*.mp4')))
            name_target = self.target_videos[idx % len(self.target_videos)]
            path_target = np.random.choice(
                glob.glob(os.path.join(self.target_dir,
                                       name_target + '*.mp4')))
        else:
            name_source = self.source_videos[idx]
            path_source = os.path.join(self.source_dir, name_source)
            name_target = self.target_videos[idx % len(self.target_videos)]
            path_target = os.path.join(self.target_dir, name_target)

        video_src_name = os.path.basename(path_source)
        video_tar_name = os.path.basename(path_target)

        # handle source
        # in this case, path is a directory containing one png per frame
        if self.is_train and os.path.isdir(path_source):
            frames = os.listdir(path_source)
            num_frames = len(frames)
            frame_idx = np.sort(
                np.random.choice(num_frames, replace=True, size=2))
            source_array = [
                img_as_float32(
                    io.imread(os.path.join(path_source, frames[i])))
                for i in frame_idx
            ]
        else:
            # read the video
            source_array = read_video(path_source,
                                      frame_shape=self.frame_shape)
            num_frames = len(source_array)
            # in training mode, sample two random (sorted) frame indices; otherwise keep all frames in order
            frame_idx = np.sort(
                np.random.choice(
                    num_frames, replace=True,
                    size=2)) if self.is_train else range(num_frames)
            source_array = source_array[frame_idx]
        # handle target
        # in this case, path is a directory containing one png per frame
        if self.is_train and os.path.isdir(path_target):
            frames = os.listdir(path_target)
            num_frames = len(frames)
            frame_idx = np.sort(
                np.random.choice(num_frames, replace=True, size=2))
            target_array = [
                img_as_float32(
                    io.imread(os.path.join(path_target, frames[i])))
                for i in frame_idx
            ]
        else:
            # read the video
            target_array = read_video(path_target,
                                      frame_shape=self.frame_shape)
            num_frames = len(target_array)
            # in training mode, sample two random (sorted) frame indices; otherwise keep all frames in order
            frame_idx = np.sort(
                np.random.choice(
                    num_frames, replace=True,
                    size=2)) if self.is_train else range(num_frames)
            target_array = target_array[frame_idx]

        if self.transform is not None:
            source_array = self.transform(source_array)
            target_array = self.transform(target_array)

        out = {}
        # build the output dictionary
        if self.is_train:
            # only the two sampled frames are used, as source and driving
            # note: channel is moved to the first dimension on output
            s_source = np.array(source_array[0], dtype='float32')
            s_driving = np.array(source_array[1], dtype='float32')
            t_source = np.array(target_array[0], dtype='float32')
            t_driving = np.array(target_array[1], dtype='float32')

            out['driving'] = s_driving.transpose((2, 0, 1))
            out['source'] = s_source.transpose((2, 0, 1))
            out['t_driving'] = t_driving.transpose((2, 0, 1))
            out['t_source'] = t_source.transpose((2, 0, 1))
        else:
            video = np.array(source_array, dtype='float32')
            out['video'] = video.transpose((3, 0, 1, 2))
            video = np.array(target_array, dtype='float32')
            out['t_video'] = video.transpose((3, 0, 1, 2))

        out['name'] = video_src_name
        out['t_name'] = video_tar_name

        return out
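A minimal sketch of consuming this dataset during training with a PyTorch DataLoader; the class name PairedVideoDataset and its constructor arguments are hypothetical stand-ins for however the class above is actually built, but the batch keys come straight from __getitem__:

from torch.utils.data import DataLoader

# hypothetical construction; the real class name and arguments may differ
dataset = PairedVideoDataset(source_dir='data/source',
                             target_dir='data/target',
                             is_train=True)
loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2)

batch = next(iter(loader))
# each entry is (batch, channel, height, width), channel-first as noted above
print(batch['source'].shape, batch['driving'].shape)
print(batch['t_source'].shape, batch['t_driving'].shape)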
Example #4
def reconstruct(args):

    # get context
    ctx = get_extension_context(args.context)
    nn.set_default_context(ctx)
    logger.setLevel(logging.ERROR)  # to suppress minor messages

    config = read_yaml(args.config)

    dataset_params = config.dataset_params
    model_params = config.model_params

    if args.detailed:
        vis_params = config.visualizer_params
        visualizer = Visualizer(**vis_params)

    if not args.params:
        assert "log_dir" in config, "no log_dir found in config, so pretrained parameters cannot be located."
        param_file = os.path.join(
            config.log_dir, config.saved_parameters)
    else:
        param_file = args.params
    nn.load_parameters(param_file)

    bs, h, w, c = [1] + dataset_params.frame_shape
    source = nn.Variable((bs, c, h, w))
    driving_initial = nn.Variable((bs, c, h, w))
    driving = nn.Variable((bs, c, h, w))

    with nn.parameter_scope("kp_detector"):
        kp_source = detect_keypoint(source,
                                    **model_params.kp_detector_params,
                                    **model_params.common_params,
                                    test=True, comm=False)
        persistent_all(kp_source)

    with nn.parameter_scope("kp_detector"):
        kp_driving = detect_keypoint(driving,
                                     **model_params.kp_detector_params,
                                     **model_params.common_params,
                                     test=True, comm=False)
        persistent_all(kp_driving)

    with nn.parameter_scope("generator"):
        generated = occlusion_aware_generator(source,
                                              kp_source=unlink_all(kp_source),
                                              kp_driving=kp_driving,
                                              **model_params.generator_params,
                                              **model_params.common_params,
                                              test=True, comm=False)

    if not args.full and 'sparse_deformed' in generated:
        del generated['sparse_deformed']  # remove needless info

    persistent_all(generated)

    generated['kp_driving'] = kp_driving
    generated['kp_source'] = kp_source

    # generated contains these values:
    # 'mask': Variable((bs, num_kp+1, h/4, w/4)) when scale_factor=0.25
    # 'sparse_deformed': Variable((bs, num_kp+1, num_channel, h/4, w/4))
    # 'occlusion_map': Variable((bs, 1, h/4, w/4))
    # 'deformed': Variable((bs, c, h, w))
    # 'prediction': Variable((bs, c, h, w))

    mode = "reconstruction"
    if "log_dir" in config:
        result_dir = os.path.join(args.out_dir, os.path.basename(config.log_dir), mode)
    else:
        result_dir = os.path.join(args.out_dir, "test_result", mode)

    # create an empty directory to save generated results
    _ = nm.Monitor(result_dir)
    if args.eval:
        os.makedirs(os.path.join(result_dir, "png"), exist_ok=True)

    # load the header image.
    header = imread("imgs/header_combined.png", channel_first=True)

    filenames = sorted(glob.glob(os.path.join(
        dataset_params.root_dir, "test", "*")))
    recon_loss_list = list()

    for filename in tqdm(filenames):
        # repeated until all the test data has been processed
        driving_video = read_video(
            filename, dataset_params.frame_shape)  # (#frames, h, w, 3)
        driving_video = np.transpose(
            driving_video, (0, 3, 1, 2))  # (#frames, 3, h, w)

        generated_images = list()
        source_img = driving_video[0]

        source.d = np.expand_dims(source_img, 0)
        driving_initial.d = driving_video[0]

        # compute these in advance and reuse
        nn.forward_all(
            [kp_source["value"], kp_source["jacobian"]], clear_buffer=True)

        num_of_driving_frames = driving_video.shape[0]

        for frame_idx in tqdm(range(num_of_driving_frames)):
            driving.d = driving_video[frame_idx]
            nn.forward_all([generated["prediction"],
                            generated["deformed"]], clear_buffer=True)

            if args.detailed:
                # visualize source w/kp, driving w/kp, deformed source, generated w/kp, generated image, occlusion map
                visualization = visualizer.visualize(
                    source=source.d, driving=driving.d, out=generated)
                if args.full:
                    visualization = reshape_result(visualization)  # (H, W, C)
                combined_image = visualization.transpose(2, 0, 1)  # (C, H, W)

            elif args.only_generated:
                combined_image = np.clip(
                    generated["prediction"].d[0], 0.0, 1.0)
                combined_image = (
                    255*combined_image).astype(np.uint8)  # (C, H, W)

            else:
                # visualize source, driving, and generated image
                driving_fake = np.concatenate([np.clip(driving.d[0], 0.0, 1.0),
                                               np.clip(generated["prediction"].d[0], 0.0, 1.0)], axis=2)
                header_source = np.concatenate([np.clip(header / 255., 0.0, 1.0),
                                                np.clip(source.d[0], 0.0, 1.0)], axis=2)
                combined_image = np.concatenate(
                    [header_source, driving_fake], axis=1)
                combined_image = (255*combined_image).astype(np.uint8)

            generated_images.append(combined_image)
            # compute L1 distance per frame.
            recon_loss_list.append(
                np.mean(np.abs(generated["prediction"].d[0] - driving.d[0])))

        # post process only for reconstruction evaluation.
        if args.eval:
            # crop generated images region only.
            if args.only_generated:
                eval_images = generated_images
            elif args.full:
                eval_images = [img[:, :h, 4*w:5*w] for img in generated_images]
            elif args.detailed:
                assert generated_images[0].shape == (c, h, 5*w)
                eval_images = [img[:, :, 3*w:4*w] for img in generated_images]
            else:
                eval_images = [img[:, h:, w:] for img in generated_images]
            # place them horizontally and save for evaluation.
            image_for_eval = np.concatenate(
                eval_images, axis=2).transpose(1, 2, 0)
            imsave(os.path.join(result_dir, "png", f"{os.path.basename(filename)}.png"),
                   image_for_eval)

        # once each video is generated, save it.
        output_filename = f"{os.path.splitext(os.path.basename(filename))[0]}.mp4"
        if args.output_png:
            monitor_vis = nm.MonitorImage(output_filename, nm.Monitor(result_dir),
                                          interval=1, num_images=1,
                                          normalize_method=lambda x: x)
            for frame_idx, img in enumerate(generated_images):
                monitor_vis.add(frame_idx, img)
        else:
            generated_images = [img.transpose(1, 2, 0) for img in generated_images]
            # you might need to change ffmpeg_params according to your environment.
            mimsave(os.path.join(result_dir, output_filename), generated_images,
                    fps=args.fps,
                    ffmpeg_params=["-pix_fmt", "yuv420p",
                                   "-vcodec", "libx264",
                                   "-f", "mp4",
                                   "-q", "0"])
    print(f"Reconstruction loss: {np.mean(recon_loss_list)}")

    return
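The per-frame metric appended to recon_loss_list above is simply the mean absolute error between a generated frame and its driving frame; as a standalone helper (a sketch; the function name is hypothetical):

import numpy as np

def frame_l1(prediction, driving):
    """Mean absolute error between one generated frame and the corresponding
    driving frame, both (c, h, w) float arrays in [0, 1]."""
    return np.mean(np.abs(prediction - driving))

# the reported reconstruction loss is the average of this value
# over every frame of every test video: np.mean(recon_loss_list)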
Example #5
    '''
    Extract the pose points for the first frame of each GIF. This allows for an alignment based
    on only the first frame.
    
    TODO: Extend this to extract poses from the driving video to then
    obtain poses at each frame for alignment. 

    '''
    with torch.no_grad():
        
        # This dictionary stores the initial pose for each of the GIFs
        poses_dict = {}

        for img_name in tqdm(os.listdir(opt.driving_directory)):
            
            path_name = os.path.join(opt.driving_directory, img_name)
            driving_video = VideoToTensor()(read_video(path_name, opt.image_shape + (3,)))['video']
           
            driving_video = torch.from_numpy(driving_video).unsqueeze(0)

            cat_dict = lambda l, dim: {k: torch.cat([v[k] for v in l], dim=dim) for k in l[0]}
            d = driving_video.shape[2] 
            
            kp_driving = cat_dict([kp_detector(driving_video[:,:,i:(i+1)]) for i in range(d)], dim=1) 

            poses_dict[img_name] = kp_driving 
        
        # Dump the poses dict; a context manager ensures the file is closed
        with open('./driving_video_poses/{}_poses.pkl'.format(opt.name), 'wb') as f:
            pickle.dump(poses_dict, f)
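The dumped dictionary can be restored later with the matching load call; a minimal sketch, assuming the same opt.name used above:

import pickle

with open('./driving_video_poses/{}_poses.pkl'.format(opt.name), 'rb') as f:
    poses_dict = pickle.load(f)  # {gif_name: keypoints for every frame}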


    '''
    Extract the pose points for the first frame of each GIF. This allows for an alignment based
    on only the first frame.
    
    TODO: Extend this to extract poses from the driving video to then
    obtain poses at each frame for alignment. 

    '''
    with torch.no_grad():
        
        # This dictionary stores the initial pose for each of the GIFs
        poses_dict = {}

        for img_name in tqdm(os.listdir(opt.source_directory)):
            
            path_name = os.path.join(opt.source_directory, img_name)
            source_image = VideoToTensor()(read_video(path_name, 
                opt.image_shape + (3,)))['video'][:, :1]
           
            print(source_image.shape) 
            source_image = torch.from_numpy(source_image).unsqueeze(0)

            # Extract the mean of the keypoints
            mean = kp_detector(source_image)['mean'].data.cpu().numpy()
            # Map from normalized [-1, 1] coordinates to 128x128 pixel coordinates
            key_points = 128 * (mean + 1) / 2
            
            poses_dict[img_name[:-4]] = key_points 

            if opt.visualize:
                # img = vis.visualize_initial_pose(source_image, mean) 
                
                img = plt.imread(path_name)
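The transformation applied to the keypoint means above maps the detector's normalized [-1, 1] coordinates into pixel coordinates of a 128x128 image; as a general helper (a sketch; the function name is hypothetical, the size 128 comes from the code above):

import numpy as np

def to_pixel_coords(normalized_kp, image_size=128):
    """Map keypoints from normalized [-1, 1] coordinates to
    [0, image_size] pixel coordinates."""
    return image_size * (normalized_kp + 1) / 2

# e.g. a keypoint at (0.0, 0.0) lands at the image center, (64.0, 64.0)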