Example 1
def get_demo_vibe_model():
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')
    return model
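
A hedged usage sketch for the helper above: the input layout (1, seqlen, 3, 224, 224) and the keys of the returned dict are inferred from the later examples, not stated by this one.

import torch

model = get_demo_vibe_model()
device = next(model.parameters()).device

with torch.no_grad():
    # one 16-frame clip of normalized 224x224 crops (assumed input layout)
    dummy = torch.zeros(1, 16, 3, 224, 224, device=device)
    output = model(dummy)[-1]
    # per the later examples, 'theta' packs camera (3) + pose (72) + betas (10)
    print(output['theta'].shape, output['verts'].shape, output['kp_3d'].shape)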
Example 2
def get_put(model_1_ready, queue_img, queue_dettra, total_time):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    mot = MPT(device=device, batch_size=1, output_format='dict')
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')
    model_1_ready.value += 1
    total_time.value = time.time()
    i = 0
    while True:
        try:
            total_time = time.time()
            frame_orig = queue_img.get(True, 50)
            # With block=True: if the queue is empty,
            #   - timeout=None (the default) waits indefinitely;
            #   - a numeric timeout waits that many seconds, then raises queue.Empty.
            # With block=False: an empty queue raises queue.Empty immediately.
            i += 1
            print('dettra22222', i)
            vibe_results = detect_track_vibe(frame_orig, mot, model, device)
            queue_dettra.put((frame_orig, vibe_results))
            total_time = time.time() - total_time
            print(f'Detection+Tracking FPS : {1 / total_time:.2f}.')
        except QueueEmpty:
            break
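
A minimal wiring sketch for running this worker in its own process, assuming a separate producer fills queue_img elsewhere in the original project; the Value typecodes follow how model_1_ready and total_time are used above.

import multiprocessing as mp

if __name__ == '__main__':
    model_1_ready = mp.Value('i', 0)      # incremented once the model is loaded
    total_time = mp.Value('d', 0.0)       # set to time.time() by the worker
    queue_img = mp.Queue(maxsize=32)      # frames pushed by a capture process (not shown)
    queue_dettra = mp.Queue(maxsize=32)   # (frame, vibe_results) tuples consumed downstream

    worker = mp.Process(target=get_put,
                        args=(model_1_ready, queue_img, queue_dettra, total_time))
    worker.start()
    worker.join()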
Example 3
def main(args):
    torch.cuda.set_device(args.gpu_id)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    print(f'Loading video list {args.video_list}')
    video_list = [l.strip() for l in open(args.video_list, 'r').readlines()]
    if len(video_list) < 1:
        print('No files were found in video list')
        return

    print('Loading VIBE model')
    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load VIBE pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    num_videos = len(video_list)
    print(f'Processing {num_videos} videos.')
    for video_idx, video_file in enumerate(video_list, start=1):
        if not osp.isfile(video_file):
            print(f'Input video \"{video_file}\" does not exist! Moving on to next file.')
            continue

        filename = osp.splitext(osp.basename(video_file))[0]
        output_path = osp.join(args.output_folder, filename)
        os.makedirs(output_path, exist_ok=True)

        image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True)

        print(f'[{video_idx}/{num_videos}] Processing {num_frames} frames')
        orig_height, orig_width = img_shape[:2]

        # ========= Run tracking ========= #
        bbox_scale = 1.1
        if args.tracking_method == 'pose':
            if not osp.isabs(video_file):
                video_file = osp.join(os.getcwd(), video_file)
            tracking_results = run_posetracker(video_file, staf_folder=args.staf_dir, display=args.display)
        else:
            # run multi object tracker
            mot = MPT(
                device=device,
                batch_size=args.tracker_batch_size,
                display=args.display,
                detector_type=args.detector,
                output_format='dict',
                yolo_img_size=args.yolo_img_size,
            )
            tracking_results = mot(image_folder)

        # remove tracklets if num_frames is less than MIN_NUM_FRAMES
        for person_id in list(tracking_results.keys()):
            if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
                del tracking_results[person_id]

        # ========= Run VIBE on each person ========= #
        print(f'Running VIBE on each tracklet...')
        vibe_results = {}
        for person_id in tqdm(list(tracking_results.keys())):
            bboxes = joints2d = None

            if args.tracking_method == 'bbox':
                bboxes = tracking_results[person_id]['bbox']
            elif args.tracking_method == 'pose':
                joints2d = tracking_results[person_id]['joints2d']

            frames = tracking_results[person_id]['frames']

            dataset = Inference(
                image_folder=image_folder,
                frames=frames,
                bboxes=bboxes,
                joints2d=joints2d,
                scale=bbox_scale,
            )

            bboxes = dataset.bboxes
            frames = dataset.frames
            has_keypoints = True if joints2d is not None else False

            dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=16)

            with torch.no_grad():

                pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []

                for batch in dataloader:
                    if has_keypoints:
                        batch, nj2d = batch
                        norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                    batch = batch.unsqueeze(0)
                    batch = batch.to(device)

                    batch_size, seqlen = batch.shape[:2]
                    output = model(batch)[-1]
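                    # output['theta'] packs per-frame camera (3), SMPL pose (72)
                    # and shape betas (10); the slices below split them apart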

                    pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1))
                    pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3))
                    pred_pose.append(output['theta'][:,:,3:75].reshape(batch_size * seqlen, -1))
                    pred_betas.append(output['theta'][:, :,75:].reshape(batch_size * seqlen, -1))
                    pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3))


                pred_cam = torch.cat(pred_cam, dim=0)
                pred_verts = torch.cat(pred_verts, dim=0)
                pred_pose = torch.cat(pred_pose, dim=0)
                pred_betas = torch.cat(pred_betas, dim=0)
                pred_joints3d = torch.cat(pred_joints3d, dim=0)

                del batch

            # ========= [Optional] run Temporal SMPLify to refine the results ========= #
            if args.run_smplify and args.tracking_method == 'pose':
                norm_joints2d = np.concatenate(norm_joints2d, axis=0)
                norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
                norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

                # Run Temporal SMPLify
                update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
                new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                    pred_rotmat=pred_pose,
                    pred_betas=pred_betas,
                    pred_cam=pred_cam,
                    j2d=norm_joints2d,
                    device=device,
                    batch_size=norm_joints2d.shape[0],
                    pose2aa=False,
                )

                # update the parameters after refinement
                print(f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}')
                pred_verts = pred_verts.cpu()
                pred_cam = pred_cam.cpu()
                pred_pose = pred_pose.cpu()
                pred_betas = pred_betas.cpu()
                pred_joints3d = pred_joints3d.cpu()
                pred_verts[update] = new_opt_vertices[update]
                pred_cam[update] = new_opt_cam[update]
                pred_pose[update] = new_opt_pose[update]
                pred_betas[update] = new_opt_betas[update]
                pred_joints3d[update] = new_opt_joints3d[update]

            elif args.run_smplify and args.tracking_method == 'bbox':
                print('[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!')
                print('[WARNING] Continuing without running Temporal SMPLify!..')

            # ========= Save results to a pickle file ========= #
            pred_cam = pred_cam.cpu().numpy()
            pred_verts = pred_verts.cpu().numpy()
            pred_pose = pred_pose.cpu().numpy()
            pred_betas = pred_betas.cpu().numpy()
            pred_joints3d = pred_joints3d.cpu().numpy()

            orig_cam = convert_crop_cam_to_orig_img(
                cam=pred_cam,
                bbox=bboxes,
                img_width=orig_width,
                img_height=orig_height
            )

            output_dict = {
                'pred_cam': pred_cam,
                'orig_cam': orig_cam,
                'verts': pred_verts,
                'pose': pred_pose,
                'betas': pred_betas,
                'joints3d': pred_joints3d,
                'joints2d': joints2d,
                'bboxes': bboxes,
                'frame_ids': frames,
            }

            vibe_results[person_id] = output_dict

        # Clean up the temporary folder

        # Save the outputs to a joblib pkl file; it can be loaded later with joblib.load(pkl_path)
        output_pkl_path = osp.join(args.output_folder, f'{filename}.pkl')
        print(f'Saving output results to \"{output_pkl_path}\".')
        joblib.dump(vibe_results, output_pkl_path)
        if not args.no_render:
            # ========= Render results as a single video ========= #
            renderer = Renderer(resolution=(orig_width, orig_height), orig_img=True, wireframe=args.wireframe)

            output_img_folder = f'{image_folder}_output'
            os.makedirs(output_img_folder, exist_ok=True)

            print(f'Rendering output video, writing frames to {output_img_folder}')

            # prepare results for rendering
            frame_results = prepare_rendering_results(vibe_results, num_frames)
            mesh_color = {k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0) for k in vibe_results.keys()}

            image_file_names = sorted([
                os.path.join(image_folder, x)
                for x in os.listdir(image_folder)
                if x.endswith('.png') or x.endswith('.jpg')
            ])

            for frame_idx in tqdm(range(len(image_file_names))):
                img_fname = image_file_names[frame_idx]
                img = cv2.imread(img_fname)

                if args.sideview:
                    side_img = np.zeros_like(img)

                for person_id, person_data in frame_results[frame_idx].items():
                    frame_verts = person_data['verts']
                    frame_cam = person_data['cam']

                    mc = mesh_color[person_id]

                    mesh_filename = None

                    if args.save_obj:
                        mesh_folder = os.path.join(output_path, 'meshes', f'{person_id:04d}')
                        os.makedirs(mesh_folder, exist_ok=True)
                        mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj')

                    img = renderer.render(
                        img,
                        frame_verts,
                        cam=frame_cam,
                        color=mc,
                        mesh_filename=mesh_filename,
                    )

                    if args.sideview:
                        side_img = renderer.render(
                            side_img,
                            frame_verts,
                            cam=frame_cam,
                            color=mc,
                            angle=270,
                            axis=[0,1,0],
                        )

                if args.sideview:
                    img = np.concatenate([img, side_img], axis=1)

                cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img)

                if args.display:
                    cv2.imshow('Video', img)
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        break

            if args.display:
                cv2.destroyAllWindows()

            # ========= Save rendered video ========= #
            vid_name = os.path.basename(video_file)
            save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4'
            save_name = os.path.join(output_path, save_name)
            print(f'Saving result video to {save_name}')
            images_to_video(img_folder=output_img_folder, output_vid_file=save_name)
            
            shutil.rmtree(output_img_folder)

    # Clean-up after processing
    del model
    
    
    shutil.rmtree(image_folder)

    print('================= END =================')
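
A quick sketch of inspecting one of the saved pkl files; as noted above they are loaded with joblib.load, and the file path below is only illustrative.

import joblib

results = joblib.load('output/my_video.pkl')  # hypothetical path
for person_id, person in results.items():
    print(person_id, person['pose'].shape, person['betas'].shape, person['frame_ids'].shape)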
Example 4
def main(args):
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    video_file = 'test.mp4'
    output_folder = 'output/'
    output_path = os.path.join(
        output_folder,
        os.path.basename(video_file).replace('.mp4', ''))
    os.makedirs(output_path, exist_ok=True)
    print(f'The output path is: {output_path}')

    image_folder, num_frames, img_shape = generative()

    print(
        f'image_folder is: {image_folder}, num_frames is: {num_frames}, img_shape is: {img_shape}'
    )

    orig_height, orig_width = img_shape[:2]
    total_time = time.time()
    # ========= Run tracking ========= #
    #  tracking_method = bbox
    bbox_scale = 1.1
    # run multi object tracker
    mot = MPT(
        device=device,
        batch_size=12,  # 12
        display=False,  # true
        detector_type='yolo',  # yolo
        output_format='dict',
        yolo_img_size=416,  #416 * 416
    )
    tracking_results = mot(image_folder)
    print(f'output the result of tracking->{tracking_results}')
    for person_id in list(tracking_results.keys()):
        if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
            del tracking_results[person_id]
    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)
    # ========= Load pretrained weights ========= #
    # load the pretrained model
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)

    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    # ========= Run VIBE on each person ========= #
    print(f'Running VIBE on each tracklet...')
    vibe_time = time.time()
    vibe_results = {}
    for person_id in tqdm(list(tracking_results.keys())):
        bboxes = joints2d = None
        # bboxes = n X 4
        bboxes = tracking_results[person_id]['bbox']
        ## frames n X 1
        frames = tracking_results[person_id]['frames']
        # Inference builds a normalized crop dataset with overall shape T x 224 x 224 x 3
        dataset = Inference(
            image_folder=image_folder,
            frames=frames,
            bboxes=bboxes,
            joints2d=joints2d,  # none
            scale=bbox_scale,
        )
        bboxes = dataset.bboxes
        frames = dataset.frames
        has_keypoints = True if joints2d is not None else False
        dataloader = DataLoader(dataset, batch_size=64, num_workers=16)

        with torch.no_grad():
            pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []
            for batch in dataloader:
                if has_keypoints:
                    batch, nj2d = batch
                    norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                batch = batch.unsqueeze(0)  # add a dim at dim=0: 1 x T x 224 x 224 x 3
                batch = batch.to(device)
                batch_size, seqlen = batch.shape[:2]  # seqlen is the length of the image sequence

                output = model(batch)[-1]
                pred_cam.append(output['theta'][:, :, :3].reshape(
                    batch_size * seqlen, -1))  # 1*T X 3
                pred_verts.append(output['verts'].reshape(
                    batch_size * seqlen, -1, 3))
                pred_pose.append(output['theta'][:, :, 3:75].reshape(
                    batch_size * seqlen, -1))  # 1*T X 72
                pred_betas.append(output['theta'][:, :, 75:].reshape(
                    batch_size * seqlen, -1))  # 1*T X 10
                pred_joints3d.append(output['kp_3d'].reshape(
                    batch_size * seqlen, -1, 3))  #

            pred_cam = torch.cat(pred_cam, dim=0)  #(T, 3)
            pred_verts = torch.cat(pred_verts, dim=0)  #(T, ,3)
            pred_pose = torch.cat(pred_pose, dim=0)  #(T, 72)
            pred_betas = torch.cat(pred_betas, dim=0)  #(T, 10)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)  #(T, ,3)
            del batch
        # ========= Save results to a pickle file ========= #
        # convert tensors to numpy arrays
        pred_cam = pred_cam.cpu().numpy()  #(T, 3)
        pred_verts = pred_verts.cpu().numpy()  #(T, ,3)
        pred_pose = pred_pose.cpu().numpy()  #(T, 72)
        pred_betas = pred_betas.cpu().numpy()  #(T, 10)
        pred_joints3d = pred_joints3d.cpu().numpy()  #(T, ,3)
        # result shape: T x 4
        orig_cam = convert_crop_cam_to_orig_img(
            cam=pred_cam,  # predicted cam, T x 3
            bbox=bboxes,  # detection bboxes, T x 4
            img_width=orig_width,  # original image width
            img_height=orig_height  # original image height
        )

        output_dict = {
            'pred_cam': pred_cam,
            'orig_cam': orig_cam,
            'verts': pred_verts,
            'pose': pred_pose,
            'betas': pred_betas,
            'joints3d': pred_joints3d,
            'joints2d': joints2d,
            'bboxes': bboxes,
            'frame_ids': frames,
        }
        vibe_results[person_id] = output_dict
    del model

    end = time.time()
    fps = num_frames / (end - vibe_time)
    print(f'VIBE FPS: {fps:.2f}')
    total_time = time.time() - total_time
    print(
        f'Total time spent: {total_time:.2f} seconds (including model loading time).'
    )
    print(
        f'Total FPS (including model loading time): {num_frames / total_time:.2f}.'
    )
    print(
        f'Saving output results to \"{os.path.join(output_path, "vibe_output.pkl")}\".'
    )

    joblib.dump(vibe_results, os.path.join(output_path, "vibe_output.pkl"))

    if not args.no_render:
        # ========= Render results as a single video ========= #
        # resolution is the original image width and height
        # a ready-made utility; just pass the correct arguments
        renderer = Renderer(resolution=(orig_width, orig_height),
                            orig_img=True,
                            wireframe=args.wireframe)

        output_img_folder = f'{image_folder}_output'
        os.makedirs(output_img_folder, exist_ok=True)

        print(f'Rendering output video, writing frames to {output_img_folder}')

        frame_results = prepare_rendering_results(vibe_results, num_frames)
        mesh_color = {
            k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0)
            for k in vibe_results.keys()
        }

        image_file_names = sorted([
            os.path.join(image_folder, x) for x in os.listdir(image_folder)
            if x.endswith('.png') or x.endswith('.jpg')
        ])

        for frame_idx in tqdm(range(len(image_file_names))):
            img_fname = image_file_names[frame_idx]
            # read the frame image
            img = cv2.imread(img_fname)

            # blank canvas for the optional side view
            if args.sideview:
                side_img = np.zeros_like(img)

            for person_id, person_data in frame_results[frame_idx].items():
                frame_verts = person_data['verts']
                frame_cam = person_data['cam']
                mc = mesh_color[person_id]

                # 3D mesh output path
                mesh_folder = os.path.join(output_path, 'meshes',
                                           f'{person_id:04d}')
                os.makedirs(mesh_folder, exist_ok=True)
                mesh_filename = os.path.join(mesh_folder,
                                             f'{frame_idx:06d}.obj')

                # renderer.render returns the frame with the mesh overlaid
                img = renderer.render(
                    img,
                    frame_verts,
                    cam=frame_cam,
                    color=mc,
                    mesh_filename=mesh_filename,
                )

                if args.sideview:
                    side_img = renderer.render(
                        side_img,
                        frame_verts,
                        cam=frame_cam,
                        color=mc,
                        angle=270,
                        axis=[0, 1, 0],
                    )

            if args.sideview:
                img = np.concatenate([img, side_img], axis=1)

            cv2.imwrite(
                os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img)

            if args.display:
                cv2.imshow('Video', img)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

        if args.display:
            cv2.destroyAllWindows()

    print('================= END =================')
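
Note that this variant writes rendered frames to output_img_folder but never assembles them into a video. A hedged follow-up using the same images_to_video helper already imported by the surrounding scripts might look like this; the paths are illustrative.

import os

output_img_folder = 'output/test_output'                       # folder of rendered .png frames
save_name = os.path.join('output/test', 'test_vibe_result.mp4')
images_to_video(img_folder=output_img_folder, output_vid_file=save_name)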
Example 5
def main(args):
    if args.device == 'cpu':
        device = torch.device('cpu')
        print('Running on CPU')
    else:
        device = torch.device('cuda')
        print('Running on GPU')

    if args.vid_file:
        video_file = args.vid_file
        if not os.path.isfile(video_file):
            exit(f'Input video \"{video_file}\" does not exist!')
    else:
        image_file = args.img_file
        if not os.path.isfile(image_file):
            exit(f'Input image \"{image_file}\" does not exist!')

    output_path = os.path.join(
        args.output_folder,
        os.path.basename(video_file).replace('.mp4', ''))
    # output_path = os.path.join(args.output_folder, os.path.basename(video_file).split('.')[0])
    os.makedirs(output_path, exist_ok=True)

    image_folder, num_frames, img_shape = video_to_images(video_file,
                                                          return_info=True)

    print(f'Input video number of frames {num_frames}')
    orig_height, orig_width = img_shape[:2]

    total_time = time.time()

    # resize video if too big
    # ffmpeg -i input.avi -filter:v scale=720:-1 -c:a copy output.mkv

    # ========= Run tracking ========= #
    bbox_scale = 1.1
    if args.tracking_method == 'pose':
        if not os.path.isabs(video_file):
            video_file = os.path.join(os.getcwd(), video_file)
        tracking_results = run_posetracker(video_file,
                                           staf_folder=args.staf_dir,
                                           display=args.display)
    else:
        # run multi object tracker
        mot = MPT(
            device=device,
            batch_size=args.tracker_batch_size,
            display=args.display,
            detector_type=args.detector,
            output_format='dict',
            yolo_img_size=args.yolo_img_size,
        )
        tracking_results = mot(image_folder)

    # import pdb; pdb.set_trace()  # leftover debug breakpoint (the original never actually called set_trace)

    # remove tracklets if num_frames is less than MIN_NUM_FRAMES
    for person_id in list(tracking_results.keys()):
        if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
            del tracking_results[person_id]

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=True)
    ckpt = torch.load(pretrained_file)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    # ========= Run VIBE on each person ========= #
    print(f'Running VIBE on each tracklet...')
    vibe_time = time.time()
    vibe_results = {}
    for person_id in tqdm(list(tracking_results.keys())):
        bboxes = joints2d = None

        if args.tracking_method == 'bbox':
            bboxes = tracking_results[person_id]['bbox']
        elif args.tracking_method == 'pose':
            joints2d = tracking_results[person_id]['joints2d']

        frames = tracking_results[person_id]['frames']

        dataset = Inference(
            image_folder=image_folder,
            frames=frames,
            bboxes=bboxes,
            joints2d=joints2d,
            scale=bbox_scale,
        )

        bboxes = dataset.bboxes
        frames = dataset.frames
        has_keypoints = True if joints2d is not None else False

        dataloader = DataLoader(dataset,
                                batch_size=args.vibe_batch_size,
                                num_workers=16)

        with torch.no_grad():
            pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []

            for batch in dataloader:
                if has_keypoints:
                    batch, nj2d = batch
                    norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                batch = batch.unsqueeze(0)
                batch = batch.to(device)

                batch_size, seqlen = batch.shape[:2]
                output = model(batch)[-1]

                pred_cam.append(output['theta'][:, :, :3].reshape(
                    batch_size * seqlen, -1))
                pred_verts.append(output['verts'].reshape(
                    batch_size * seqlen, -1, 3))
                pred_pose.append(output['theta'][:, :, 3:75].reshape(
                    batch_size * seqlen, -1))
                pred_betas.append(output['theta'][:, :, 75:].reshape(
                    batch_size * seqlen, -1))
                pred_joints3d.append(output['kp_3d'].reshape(
                    batch_size * seqlen, -1, 3))

            pred_cam = torch.cat(pred_cam, dim=0)
            pred_verts = torch.cat(pred_verts, dim=0)
            pred_pose = torch.cat(pred_pose, dim=0)
            pred_betas = torch.cat(pred_betas, dim=0)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)

            del batch

        # ========= [Optional] run Temporal SMPLify to refine the results ========= #
        if args.run_smplify and args.tracking_method == 'pose':
            norm_joints2d = np.concatenate(norm_joints2d, axis=0)
            norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
            norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

            # Run Temporal SMPLify
            update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
            new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                pred_rotmat=pred_pose,
                pred_betas=pred_betas,
                pred_cam=pred_cam,
                j2d=norm_joints2d,
                device=device,
                batch_size=norm_joints2d.shape[0],
                pose2aa=False,
            )

            # update the parameters after refinement
            print(
                f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}'
            )
            pred_verts = pred_verts.cpu()
            pred_cam = pred_cam.cpu()
            pred_pose = pred_pose.cpu()
            pred_betas = pred_betas.cpu()
            pred_joints3d = pred_joints3d.cpu()
            pred_verts[update] = new_opt_vertices[update]
            pred_cam[update] = new_opt_cam[update]
            pred_pose[update] = new_opt_pose[update]
            pred_betas[update] = new_opt_betas[update]
            pred_joints3d[update] = new_opt_joints3d[update]

        elif args.run_smplify and args.tracking_method == 'bbox':
            print(
                '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!'
            )
            print('[WARNING] Continuing without running Temporal SMPLify!..')

        # ========= Save results to a pickle file ========= #
        pred_cam = pred_cam.cpu().numpy()
        pred_verts = pred_verts.cpu().numpy()
        pred_pose = pred_pose.cpu().numpy()
        pred_betas = pred_betas.cpu().numpy()
        pred_joints3d = pred_joints3d.cpu().numpy()

        orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam,
                                                bbox=bboxes,
                                                img_width=orig_width,
                                                img_height=orig_height)

        output_dict = {
            'pred_cam': pred_cam,
            'orig_cam': orig_cam,
            'verts': pred_verts,
            'pose': pred_pose,
            'betas': pred_betas,
            'joints3d': pred_joints3d,
            'joints2d': joints2d,
            'bboxes': bboxes,
            'frame_ids': frames,
        }

        vibe_results[person_id] = output_dict

    del model

    end = time.time()
    fps = num_frames / (end - vibe_time)

    print(f'VIBE FPS: {fps:.2f}')
    total_time = time.time() - total_time
    print(
        f'Total time spent: {total_time:.2f} seconds (including model loading time).'
    )
    print(
        f'Total FPS (including model loading time): {num_frames / total_time:.2f}.'
    )

    print(
        f'Saving output results to \"{os.path.join(output_path, "vibe_output.pkl")}\".'
    )

    joblib.dump(vibe_results, os.path.join(output_path, "vibe_output.pkl"))

    if not args.no_render:
        # ========= Render results as a single video ========= #
        renderer = Renderer(resolution=(orig_width, orig_height),
                            orig_img=True,
                            wireframe=args.wireframe)

        output_img_folder = f'{image_folder}_images'
        os.makedirs(output_img_folder, exist_ok=True)

        print(f'Rendering output video, writing frames to {output_img_folder}')

        output_pose_folder = f'{image_folder}_poses'
        os.makedirs(output_pose_folder, exist_ok=True)

        print(f'Saving poses to {output_pose_folder}')

        # prepare results for rendering

        # save the first 25 joints of tracklet 1 (assumes a person with id 1 was tracked)
        poses_file = f'{os.path.basename(video_file)}_poses.npy'
        np.save(poses_file, vibe_results[1]['joints3d'][:, :25, :])
        print(f'Saving numpy poses file to {poses_file}')

        frame_results = prepare_rendering_results(
            vibe_results,
            num_frames)  # returns a list of dicts (one dict for each person)
        mesh_color = {
            k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0)
            for k in vibe_results.keys()
        }

        image_file_names = sorted([
            os.path.join(image_folder, x) for x in os.listdir(image_folder)
            if x.endswith('.png') or x.endswith('.jpg')
        ])

        for frame_idx in tqdm(range(len(image_file_names))):
            img_fname = image_file_names[frame_idx]
            img = cv2.imread(img_fname)

            if args.sideview:
                side_img = np.zeros_like(img)

            for person_id, person_data in frame_results[frame_idx].items():
                frame_verts = person_data['verts']
                frame_cam = person_data['cam']
                frame_pose = person_data['joints3d'][:25]

                mc = mesh_color[person_id]

                mesh_filename = None

                if args.save_obj:
                    mesh_folder = os.path.join(output_path, 'meshes',
                                               f'{person_id:04d}')
                    os.makedirs(mesh_folder, exist_ok=True)
                    mesh_filename = os.path.join(mesh_folder,
                                                 f'{frame_idx:06d}.obj')

                # bgr image (opencv format)
                img = renderer.render(
                    img,
                    frame_verts,
                    cam=frame_cam,
                    color=mc,
                    mesh_filename=mesh_filename,
                )

                # import pdb; pdb.set_trace()
                # Create a 3D projection and save as img
                # pose is mirrored
                # plot_skeleton(output_pose_folder, frame_idx, frame_pose)

                if args.sideview:
                    side_img = renderer.render(
                        side_img,
                        frame_verts,
                        cam=frame_cam,
                        color=mc,
                        angle=270,
                        axis=[0, 1, 0],
                    )

            if args.sideview:
                img = np.concatenate([img, side_img], axis=1)

            # concatenate pose img with this image before writing
            cv2.imwrite(
                os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img)

            if args.display:
                cv2.imshow('Video', img)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

        if args.display:
            cv2.destroyAllWindows()

        # ========= Save rendered video ========= #
        vid_name = os.path.basename(video_file)
        save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4'
        save_name = os.path.join(output_path, save_name)
        print(f'Saving result video to {save_name}')
        images_to_video(img_folder=output_img_folder,
                        output_vid_file=save_name)
        # shutil.rmtree(output_img_folder)

    shutil.rmtree(image_folder)
    print('================= END =================')
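
A hedged check of the _poses.npy file saved above; its shape follows from the joints3d slice that was written (num_frames x 25 x 3), and the filename below is illustrative.

import numpy as np

poses = np.load('input.mp4_poses.npy')  # hypothetical name built from os.path.basename(video_file)
print(poses.shape)                      # expected: (num_frames, 25, 3)
print(poses[0, 0])                      # first joint of the first frame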
Example 6
def main(args):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    video_file = args.vid_file

    # ========= [Optional] download the youtube video ========= #
    if video_file.startswith('https://www.youtube.com'):
        print(f'Downloading YouTube video \"{video_file}\"')
        video_file = download_youtube_clip(video_file, '/tmp')

        if video_file is None:
            exit('Youtube url is not valid!')

        print(f'YouTube Video has been downloaded to {video_file}...')

    if not os.path.isfile(video_file):
        exit(f'Input video \"{video_file}\" does not exist!')

    output_path = os.path.join(args.output_folder, os.path.basename(video_file).replace('.mp4', ''))
    os.makedirs(output_path, exist_ok=True)

    image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True)

    print(f'Input video number of frames {num_frames}')
    orig_height, orig_width = img_shape[:2]

    total_time = time.time()

    # ========= Run tracking ========= #
    bbox_scale = 1.1
    if args.tracking_method == 'pose':
        if not os.path.isabs(video_file):
            video_file = os.path.join(os.getcwd(), video_file)
        tracking_results = run_posetracker(video_file, staf_folder=args.staf_dir, display=args.display)
    else:
        # run multi object tracker
        mot = MPT(
            device=device,
            batch_size=args.tracker_batch_size,
            display=args.display,
            detector_type=args.detector,
            output_format='dict',
            yolo_img_size=args.yolo_img_size,
        )
        tracking_results = mot(image_folder)

    # remove tracklets if num_frames is less than MIN_NUM_FRAMES
    for person_id in list(tracking_results.keys()):
        if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
            del tracking_results[person_id]

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    # ========= Run VIBE on each person ========= #
    print(f'Running VIBE on each tracklet...')
    vibe_time = time.time()
    vibe_results = {}
    for person_id in tqdm(list(tracking_results.keys())):
        bboxes = joints2d = None

        if args.tracking_method == 'bbox':
            bboxes = tracking_results[person_id]['bbox']
        elif args.tracking_method == 'pose':
            joints2d = tracking_results[person_id]['joints2d']

        frames = tracking_results[person_id]['frames']

        dataset = Inference(
            image_folder=image_folder,
            frames=frames,
            bboxes=bboxes,
            joints2d=joints2d,
            scale=bbox_scale,
        )

        bboxes = dataset.bboxes
        frames = dataset.frames
        has_keypoints = True if joints2d is not None else False

        dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=16)

        with torch.no_grad():

            pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, smpl_joints2d, norm_joints2d = [], [], [], [], [], [], []

            for batch in dataloader:
                if has_keypoints:
                    batch, nj2d = batch
                    norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                batch = batch.unsqueeze(0)
                batch = batch.to(device)

                batch_size, seqlen = batch.shape[:2]
                output = model(batch)[-1]

                pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1))
                pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3))
                pred_pose.append(output['theta'][:,:,3:75].reshape(batch_size * seqlen, -1))
                pred_betas.append(output['theta'][:, :,75:].reshape(batch_size * seqlen, -1))
                pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3))
                smpl_joints2d.append(output['kp_2d'].reshape(batch_size * seqlen, -1, 2))


            pred_cam = torch.cat(pred_cam, dim=0)
            pred_verts = torch.cat(pred_verts, dim=0)
            pred_pose = torch.cat(pred_pose, dim=0)
            pred_betas = torch.cat(pred_betas, dim=0)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)
            smpl_joints2d = torch.cat(smpl_joints2d, dim=0)
            del batch

        # ========= [Optional] run Temporal SMPLify to refine the results ========= #
        if args.run_smplify and args.tracking_method == 'pose':
            norm_joints2d = np.concatenate(norm_joints2d, axis=0)
            norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
            norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

            # Run Temporal SMPLify
            update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
            new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                pred_rotmat=pred_pose,
                pred_betas=pred_betas,
                pred_cam=pred_cam,
                j2d=norm_joints2d,
                device=device,
                batch_size=norm_joints2d.shape[0],
                pose2aa=False,
            )
Example 7
def main(args):

    # check GPU availability
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    # get input video
    video_file = args.input_video

    # save hand info
    save_hand_csv = args.save_hand_csv

    # ========= [Optional] download the youtube video ========= #
    if video_file.startswith('https://www.youtube.com'):
        print(f'Downloading YouTube video \"{video_file}\"')
        video_file = download_youtube_clip(video_file, '/tmp')
        if video_file is None:
            exit('Youtube url is not valid!')
        print(f'YouTube Video has been downloaded to {video_file}...')

    # check video existence
    if not os.path.isfile(video_file):
        exit(f'Input video \"{video_file}\" does not exist!')

    # set output files
    output_path = args.output_folder
    image_folder, num_frames, img_shape = video_to_images(video_file, \
                    "/tmp/" + output_path.split("/")[-1], return_info=True)
    print(f'Input video number of frames {num_frames}')
    orig_height, orig_width = img_shape[:2]

    # get the frame rate (frames per second) of the input video
    video = cv2.VideoCapture(video_file)
    fps = video.get(cv2.CAP_PROP_FPS)

    # ========= Run tracking ========= #
    bbox_scale = 1.1

    # run multi object tracker
    mot = MPT(
        device=device,
        batch_size=args.tracker_batch_size,
        detector_type=args.detector,
        output_format='dict',
        yolo_img_size=args.yolo_img_size,
    )
    tracking_results = mot(image_folder)

    # keep only the tracklet that appears in the most frames
    largest_num_frames = 0
    largest_person = None
    for person_id in list(tracking_results.keys()):
        num_frames = tracking_results[person_id]['frames'].shape[0]
        if num_frames <= largest_num_frames:
            del tracking_results[person_id]
        else:
            largest_num_frames = tracking_results[person_id]['frames'].shape[0]
            if largest_person is not None:
                del tracking_results[largest_person]
            largest_person = person_id

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    # ========= Run VIBE on each person ========= #
    print(f'Running VIBE on each tracklet...')
    vibe_results = {}

    for person_id in tqdm(list(tracking_results.keys())):
        bboxes = tracking_results[person_id]['bbox']
        frames = tracking_results[person_id]['frames']
        np.save(output_path + "/frames", frames)

        # inference data of each person
        dataset = Inference(
            image_folder=image_folder,
            frames=frames,
            bboxes=bboxes,
            scale=bbox_scale,
        )
        bboxes = dataset.bboxes
        frames = dataset.frames

        # load data
        dataloader = DataLoader(dataset,
                                batch_size=args.vibe_batch_size,
                                num_workers=16)

        # extract data
        with torch.no_grad():
            pred_cam, pred_verts, pred_pose = [], [], []
            for batch in dataloader:
                batch = batch.unsqueeze(0)
                batch = batch.to(device)
                batch_size, seqlen = batch.shape[:2]
                output = model(batch)[-1]
                pred_cam.append(output['theta'][:, :, :3].reshape(
                    batch_size * seqlen, -1))
                pred_verts.append(output['verts'].reshape(
                    batch_size * seqlen, -1, 3))
                pred_pose.append(output['theta'][:, :, 3:75].reshape(
                    batch_size * seqlen, -1))
            pred_cam = torch.cat(pred_cam, dim=0)
            pred_verts = torch.cat(pred_verts, dim=0)
            pred_pose = torch.cat(pred_pose, dim=0)
            del batch

        # ========= Save results to a pickle file ========= #
        pred_cam = pred_cam.cpu().numpy()
        pred_verts = pred_verts.cpu().numpy()
        pred_pose = pred_pose.cpu().numpy()

        # get camera pose
        orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam,
                                                bbox=bboxes,
                                                img_width=orig_width,
                                                img_height=orig_height)

        # get result information
        output_dict = {
            'pred_cam': pred_cam,
            'orig_cam': orig_cam,
            'verts': pred_verts,
            'pose': pred_pose,
            'bboxes': bboxes,
            'frame_ids': frames,
        }
        np.savetxt(output_path + "/pred_cam.csv", pred_cam, delimiter=",")
        np.savetxt(output_path + "/orig_cam.csv", orig_cam, delimiter=",")
        vibe_results[person_id] = output_dict
    del model

    frame_results = prepare_rendering_results(vibe_results, len(frames))
    np.save(output_path + "/frame_results", frame_results)
    np.save(output_path + "/image_folder", image_folder)
    np.save(output_path + "/orig_width", orig_width)
    np.save(output_path + "/orig_height", orig_height)
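
Reloading the arrays saved above: frame_results is a Python list of dicts, so np.load needs allow_pickle=True; np.save appends the .npy suffix, and the 'output/' prefix below is illustrative (it stands in for args.output_folder).

import numpy as np

frame_results = np.load('output/frame_results.npy', allow_pickle=True)
orig_width = int(np.load('output/orig_width.npy'))
orig_height = int(np.load('output/orig_height.npy'))
print(len(frame_results), orig_width, orig_height)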
Example 8
    output_folder = args.output_folder
    ped_results = {}
    frames_ped_veh = {}
    ped, veh = mot(image_folder, output_folder=output_folder)

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    for person_id in tqdm(sorted(list(ped.keys()))):
        bboxes = joints2d = None

        bboxes = ped[person_id]['bbox']
        frames = ped[person_id]['frames']

        for frame_one in frames:
            if frame_one not in frames_ped_veh:
Example 9
def main(args):
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    pose_labels = {
        'bridge': 1,
        'childs': 2,
        'downwarddog': 3,
        'mountain': 4,
        'plank': 5,
        'seatedforwardbend': 6,
        'tree': 7,
        'trianglepose': 8,
        'warrior1': 9,
        'warrior2': 10
    }
    dir_path = '/home/ubuntu/PoseEstimation/VIBE/InputData/input_test_set/'
    output_folder = '/home/ubuntu/PoseEstimation/VIBE/OutputData/test_set/'

    joints3D_csv = open('output_joints3d_dog.csv', 'a')
    pose_csv = open('output_pose.csv_dog', 'a')

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    video_file = '/home/ubuntu/PoseEstimation/VIBE/DogVideo.mp4'
    video_label = pose_labels['bridge']
    if not os.path.isfile(video_file):
        exit(f'Input video \"{video_file}\" does not exist!')

    image_folder, num_frames, img_shape = video_to_images(video_file,
                                                          return_info=True)

    print(f'Input video number of frames {num_frames}')
    orig_height, orig_width = img_shape[:2]

    total_time = time.time()

    # ========= Run tracking ========= #
    bbox_scale = 1.1
    if args.tracking_method == 'pose':
        if not os.path.isabs(video_file):
            video_file = os.path.join(os.getcwd(), video_file)
        tracking_results = run_posetracker(video_file,
                                           staf_folder=args.staf_dir,
                                           display=args.display)
    else:
        # run multi object tracker
        mot = MPT(
            device=device,
            batch_size=args.tracker_batch_size,
            display=args.display,
            detector_type=args.detector,
            output_format='dict',
            yolo_img_size=args.yolo_img_size,
        )
        tracking_results = mot(image_folder)

    # remove tracklets if num_frames is less than MIN_NUM_FRAMES
    for person_id in list(tracking_results.keys()):
        if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
            del tracking_results[person_id]

    # ========= Run VIBE on each person ========= #
    print(f'Running VIBE on each tracklet...')
    vibe_time = time.time()
    vibe_results = {}
    for person_id in tqdm(list(tracking_results.keys())):
        bboxes = joints2d = None

        if args.tracking_method == 'bbox':
            bboxes = tracking_results[person_id]['bbox']
        elif args.tracking_method == 'pose':
            joints2d = tracking_results[person_id]['joints2d']

        frames = tracking_results[person_id]['frames']

        dataset = Inference(
            image_folder=image_folder,
            frames=frames,
            bboxes=bboxes,
            joints2d=joints2d,
            scale=bbox_scale,
        )

        bboxes = dataset.bboxes
        frames = dataset.frames
        has_keypoints = True if joints2d is not None else False

        dataloader = DataLoader(dataset,
                                batch_size=args.vibe_batch_size,
                                num_workers=16)

        with torch.no_grad():

            pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []

            for batch in dataloader:
                if has_keypoints:
                    batch, nj2d = batch
                    norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                batch = batch.unsqueeze(0)
                batch = batch.to(device)

                batch_size, seqlen = batch.shape[:2]
                output = model(batch)[-1]

                pred_cam.append(output['theta'][:, :, :3].reshape(
                    batch_size * seqlen, -1))
                pred_verts.append(output['verts'].reshape(
                    batch_size * seqlen, -1, 3))
                pred_pose.append(output['theta'][:, :, 3:75].reshape(
                    batch_size * seqlen, -1))
                pred_betas.append(output['theta'][:, :, 75:].reshape(
                    batch_size * seqlen, -1))
                pred_joints3d.append(output['kp_3d'].reshape(
                    batch_size * seqlen, -1, 3))

            pred_cam = torch.cat(pred_cam, dim=0)
            pred_verts = torch.cat(pred_verts, dim=0)
            pred_pose = torch.cat(pred_pose, dim=0)
            pred_betas = torch.cat(pred_betas, dim=0)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)

            del batch

        # ========= [Optional] run Temporal SMPLify to refine the results ========= #
        if args.run_smplify and args.tracking_method == 'pose':
            norm_joints2d = np.concatenate(norm_joints2d, axis=0)
            norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
            norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

            # Run Temporal SMPLify
            update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
            new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                pred_rotmat=pred_pose,
                pred_betas=pred_betas,
                pred_cam=pred_cam,
                j2d=norm_joints2d,
                device=device,
                batch_size=norm_joints2d.shape[0],
                pose2aa=False,
            )

            # update the parameters after refinement
            print(
                f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}'
            )
            pred_verts = pred_verts.cpu()
            pred_cam = pred_cam.cpu()
            pred_pose = pred_pose.cpu()
            pred_betas = pred_betas.cpu()
            pred_joints3d = pred_joints3d.cpu()
            pred_verts[update] = new_opt_vertices[update]
            pred_cam[update] = new_opt_cam[update]
            pred_pose[update] = new_opt_pose[update]
            pred_betas[update] = new_opt_betas[update]
            pred_joints3d[update] = new_opt_joints3d[update]

        elif args.run_smplify and args.tracking_method == 'bbox':
            print(
                '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!'
            )
            print('[WARNING] Continuing without running Temporal SMPLify!..')

        # ========= Save results to a pickle file ========= #
        pred_cam = pred_cam.cpu().numpy()
        pred_verts = pred_verts.cpu().numpy()
        pred_pose = pred_pose.cpu().numpy()
        pred_betas = pred_betas.cpu().numpy()
        pred_joints3d = pred_joints3d.cpu().numpy()

        # Runs 1 Euro Filter to smooth out the results
        if args.smooth:
            min_cutoff = args.smooth_min_cutoff  # 0.004
            beta = args.smooth_beta  # 1.5
            print(
                f'Running smoothing on person {person_id}, min_cutoff: {min_cutoff}, beta: {beta}'
            )
            pred_verts, pred_pose, pred_joints3d = smooth_pose(
                pred_pose, pred_betas, min_cutoff=min_cutoff, beta=beta)

        orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam,
                                                bbox=bboxes,
                                                img_width=orig_width,
                                                img_height=orig_height)

        output_dict = {
            'pred_cam': pred_cam,
            'orig_cam': orig_cam,
            'verts': pred_verts,
            'pose': pred_pose,
            'betas': pred_betas,
            'joints3d': pred_joints3d,
            'joints2d': joints2d,
            'bboxes': bboxes,
            'frame_ids': frames,
        }

        for i in range(len(output_dict['joints3d'])):
            if (i % 5 == 0):
                flat_arr = output_dict['joints3d'][i].flatten()
                len_N = len(flat_arr)
                np.savetxt(joints3D_csv, [np.append(flat_arr, [video_label])],
                           delimiter=',',
                           fmt=' '.join(['%f'] * len_N + ['%i']))

        for i in range(len(output_dict['pose'])):
            if (i % 5 == 0):
                pose_arr = output_dict['pose'][i].flatten()
                len_M = len(pose_arr)
                np.savetxt(pose_csv, [np.append(pose_arr, [video_label])],
                           delimiter=',',
                           fmt=' '.join(['%f'] * len_M + ['%i']))

    end = time.time()
    fps = num_frames / (end - vibe_time)

    print(f'VIBE FPS: {fps:.2f}')
    total_time = time.time() - total_time
    print(
        f'Total time spent: {total_time:.2f} seconds (including model loading time).'
    )
    print(
        f'Total FPS (including model loading time): {num_frames / total_time:.2f}.'
    )
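
When reading these CSVs back, note that np.savetxt ignores the delimiter=',' argument once fmt is given as a single multi-format string, so the rows above end up space-separated. A hedged loading sketch, assuming every row has the same number of joints:

import numpy as np

data = np.loadtxt('output_joints3d_dog.csv')       # whitespace-delimited rows
features, labels = data[:, :-1], data[:, -1].astype(int)
print(features.shape, labels[:5])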
Example 10
def main(args):
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    SEQ_LENGTH = args.sequence_length
    MIN_NUM_FRAMES = 1  # Don't change this
    TRACKER_BATCH_SIZE = MIN_NUM_FRAMES
    images_to_eval = []
    yolo_img_size = args.yolo_img_size

    image_folder = 'live_rendered_images'
    output_path = args.output_folder
    os.makedirs(image_folder, exist_ok=True)
    os.makedirs(output_path, exist_ok=True)
    os.makedirs('live_imgs', exist_ok=True)

    model = VIBE_Demo(seqlen=SEQ_LENGTH,
                      n_layers=2,
                      hidden_size=1024,
                      add_linear=True,
                      use_residual=True,
                      live_inference=True).to(device)

    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    mot = MPT(
        device=device,
        batch_size=TRACKER_BATCH_SIZE,
        display=False,
        detector_type=args.detector,
        output_format='dict',
        yolo_img_size=yolo_img_size,
    )

    # An asynchronous camera wrapper that keeps reading cv2 frames in the background while the model runs
    cap = AsyncCamera(0, display=args.live_display)

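    # Enlarge detected bounding boxes by 10% before cropping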
    bbox_scale = 1.1

    i = 0
    bbox_lis, frame_lis, images_lis, joints2d_lis = [], [], [], []
    pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []

    while (True):
        # If q is pressed cap.stop will turn True
        if (cap.stop):
            break

        ret, captured_frames = cap.read()
        if (not ret):
            continue
        if (len(captured_frames) < MIN_NUM_FRAMES):
            continue

        images = get_images_from_captures(captured_frames, MIN_NUM_FRAMES)

        cap.del_frame_lis()

        orig_height, orig_width = images[0].shape[:2]
        orig_dim = (orig_height, orig_width)

        saveToDir(images)
        if args.tracking_method == 'pose':
            images_to_video('./live_imgs', './live_imgs/pose_video.mp4')
            tracking_results = run_posetracker('live_imgs/pose_video.mp4',
                                               staf_folder=args.staf_dir,
                                               display=args.display)
        else:
            tracking_results = mot('./live_imgs')

        if args.live_display:
            cap.set_display_image(images[-1])

        if (len(tracking_results.keys()) == 0):
            print('Unable to detect any person')

        for image in images:
            images_lis.append(image)

        if len(tracking_results.keys()) != 0:

            person_id = list(tracking_results.keys())[0]  # track only the first detected person
            print(person_id)
            frames = tracking_results[person_id]['frames']

            bboxes, joints2d = None, None

            if args.tracking_method == 'pose':
                joints2d = tracking_results[person_id]['joints2d']
                if (joints2d_lis == []):
                    joints2d_lis = joints2d
                else:
                    joints2d_lis = np.vstack([joints2d_lis, joints2d])
            else:
                bboxes = tracking_results[person_id]['bbox']
                if (bbox_lis == []):
                    bbox_lis = bboxes
                else:
                    bbox_lis = np.vstack([bbox_lis, bboxes])

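            # Shift the tracker's per-capture frame indices to absolute frame
            # numbers for the running sequence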
            for x in (1 + i + frames - MIN_NUM_FRAMES):
                frame_lis.append(x)

            dataset = LiveInference(
                images=images_lis[-SEQ_LENGTH:],
                frames=frame_lis[-SEQ_LENGTH:],
                bboxes=bbox_lis[-SEQ_LENGTH:],
                joints2d=joints2d_lis[-SEQ_LENGTH:]
                if joints2d is not None else None,
                scale=bbox_scale,
            )

            bboxes = dataset.bboxes

            if args.tracking_method == 'pose':
                if (bbox_lis == []):
                    bbox_lis = bboxes
                else:
                    bbox_lis = np.vstack([bbox_lis, bboxes[-1:]])

            cap.set_bounding_box(bbox_lis[-1])

            has_keypoints = True if joints2d is not None else False
            norm_joints2d = []

            with torch.no_grad():

                # Fetch dataset items manually; a DataLoader adds too much overhead for only a few inputs
                tup = [
                    dataset.__getitem__(x) for x in range(dataset.__len__())
                ]

                if has_keypoints:
                    for j, batch in enumerate(tup):
                        tup[j], nj2d = batch
                        norm_joints2d.append(nj2d[:21, :].reshape(-1, 21, 3))

                for j, x in enumerate(tup):
                    tup[j] = x.unsqueeze(0)

                tup = tuple(tup)
                batch = torch.cat((tup), 0)

                batch = batch.unsqueeze(0)
                batch = batch.to(device)

                batch_size, seqlen = batch.shape[:2]

                # Send only the latest image to the HMR backbone for faster inference
                output = model(batch[:, -1:, :, :, :])[-1]

                pred_cam.append(
                    output['theta'][:, -MIN_NUM_FRAMES:, :3].reshape(
                        batch_size * MIN_NUM_FRAMES, -1))
                pred_verts.append(
                    output['verts'][:, -MIN_NUM_FRAMES:, ].reshape(
                        batch_size * MIN_NUM_FRAMES, -1, 3))
                pred_pose.append(
                    output['theta'][:, -MIN_NUM_FRAMES:, ][:, :, 3:75].reshape(
                        batch_size * MIN_NUM_FRAMES, -1))
                pred_betas.append(
                    output['theta'][:, -MIN_NUM_FRAMES:, ][:, :, 75:].reshape(
                        batch_size * MIN_NUM_FRAMES, -1))
                pred_joints3d.append(
                    output['kp_3d'][:, -MIN_NUM_FRAMES:, ].reshape(
                        batch_size * MIN_NUM_FRAMES, -1, 3))

                del batch

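            # Refine only the newest MIN_NUM_FRAMES predictions; temporal_simplify is
            # assumed here to wrap VIBE's Temporal SMPLify runner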
            pred_verts[-MIN_NUM_FRAMES:], pred_cam[
                -MIN_NUM_FRAMES:], pred_pose[-MIN_NUM_FRAMES:], pred_betas[
                    -MIN_NUM_FRAMES:], pred_joints3d[
                        -MIN_NUM_FRAMES:], norm_joints2d[
                            -MIN_NUM_FRAMES:] = temporal_simplify(
                                pred_verts[-MIN_NUM_FRAMES:],
                                pred_cam[-MIN_NUM_FRAMES:],
                                pred_pose[-MIN_NUM_FRAMES:],
                                pred_betas[-MIN_NUM_FRAMES:],
                                pred_joints3d[-MIN_NUM_FRAMES:],
                                norm_joints2d[-MIN_NUM_FRAMES:], device, args)

            get_vibe_results(
                pred_cam[-MIN_NUM_FRAMES:], pred_verts[-MIN_NUM_FRAMES:],
                pred_pose[-MIN_NUM_FRAMES:], pred_betas[-MIN_NUM_FRAMES:],
                pred_joints3d[-MIN_NUM_FRAMES:],
                joints2d_lis[-MIN_NUM_FRAMES:], bbox_lis[-MIN_NUM_FRAMES:],
                frame_lis[-MIN_NUM_FRAMES], orig_dim, 0)

        images = []
        i = i + 1

        if (i == args.max_frames):
            break

    del model

    vibe_results = get_vibe_results(pred_cam, pred_verts, pred_pose,
                                    pred_betas, pred_joints3d, joints2d_lis,
                                    bbox_lis, frame_lis, orig_dim, 0)

    if not args.no_render:
        for i, image in enumerate(images_lis):
            cv2.imwrite(f'{image_folder}/{(i):06d}.jpg', image)
        print(frame_lis)
        render(orig_dim, frame_lis, vibe_results, image_folder, output_path,
               len(images_lis), args)

    shutil.rmtree('live_imgs')
    print('================= END =================')
Exemplo n.º 11
0
def main(args):

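    # NOTE: this training snippet assumes `model`, `device`, `loss` and
    # `gen_optimizer` are defined elsewhere in the surrounding script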
    total_time = time.time()

    # ========= Run tracking ========= #
    bbox_scale = 1.1

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file, map_location=torch.device('cuda'))
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    # ========= Run VIBE on each person ========= #
    # print(f'Running VIBE on each tracklet...')
    # vibe_time = time.time()
    # vibe_results = {}
    for person_id in tqdm(list([1])):
        bboxes = joints2d = None

        # if args.tracking_method == 'bbox':
        #     bboxes = tracking_results[person_id]['bbox']
        # elif args.tracking_method == 'pose':
        #     joints2d = tracking_results[person_id]['joints2d']

        # frames = tracking_results[person_id]['frames']
        # print('Frame shape---===',frames)

        image_folder = '/home/ubuntu/gyrusWork/DATA'
        dataset = customDataset3D(
            image_folder=image_folder,
            scale=bbox_scale,
        )

        bboxes = dataset.bboxes
        frames = dataset.frames
        has_keypoints = True  # if joints2d is not None else False

        dataloader = DataLoader(dataset, batch_size=1, num_workers=8)

        epoch = 0
        epochs = 10

        start = time.time()

        summary_string = ''

        #bar = Bar(f'Epoch {epoch + 1}/{epochs}', fill='#', max=10)

        for i in range(epochs):
            epoch = i
            dataloader = DataLoader(dataset, batch_size=1, num_workers=8)

            pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []

            for batch in dataloader:
                GT = {}
                if has_keypoints:
                    batch, nj2d, pose, bbox, raw_imgCrop, org_img, KP3d, cam_t = batch
                    #print('batch shape',batch.shape)
                    # img = Image.fromarray(np.asarray(batch[0]).reshape((224,224,3)), 'RGB')
                    # img.save(f"/home/ubuntu/gyrus/3D_pose/myimg{epoch}.jpg")
                    # time.sleep(5)
                    #norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                batch = batch.unsqueeze(0)
                batch = batch.to(device)

                batch_size, seqlen = batch.shape[:2]
                output = model(batch)[-1]

                # pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1))
                # pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3))
                # pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3))

                # pred_pose.append(output['theta'][:,:,3:75].reshape(batch_size * seqlen, -1))
                # pred_betas.append(output['theta'][:, :,75:].reshape(batch_size * seqlen, -1))

                pred_cam = output['theta'][:, :, :3].reshape(
                    batch_size * seqlen, -1)
                pred_verts = output['verts'].reshape(batch_size * seqlen, -1,
                                                     3)
                pred_joints3d = output['kp_3d'].reshape(
                    batch_size * seqlen, -1, 3)

                #print('type----',type(pred_cam))

                # pred_cam = torch.cat(pred_cam, dim=0)
                # pred_verts = torch.cat(pred_verts, dim=0)
                # pred_pose = torch.cat(pred_pose, dim=0)
                # pred_betas = torch.cat(pred_betas, dim=0)
                # pred_joints3d = torch.cat(pred_joints3d, dim=0)

                pred_cam = pred_cam.cpu().detach().numpy()

                #print('gen output----------------',output['kp_3d'].shape)
                # print('gen output----------------',output['kp_2d'].shape)
                # print('gen output----------------',output['theta'].shape)

                w_smpl = torch.ones(seqlen).float()
                w_3d = torch.ones(seqlen).float()
                GT['w_smpl'] = w_smpl
                GT['w_3d'] = w_3d
                GT['kp_2d'] = nj2d
                GT['theta'] = pose
                GT['kp_3d'] = KP3d

                # kp2dX = {}

                #print('in trainCustom,line,408, output[2d ]shape',type(output['kp_2d'].cpu()),output['kp_2d'].cpu().shape )

                # not orig image, convert to crop size
                output['kp_2d'] = convert_crop_coords_to_orig_img(
                    bbox=bbox,
                    keypoints=output['kp_2d'].cpu(),
                    crop_size=224,
                )

                for key in output:
                    output[key] = output[key].to(torch.device("cuda"))

                for key in GT:
                    GT[key] = GT[key].to(torch.device("cuda"))

                #GT = GT.to(torch.device("cuda"))

                #print('in trainCustom,line,408, output[2d ]shape',type(output['kp_2d']),output['kp_2d'].shape )

                # print('img height width---',org_img[0,:,:,0].numpy().shape)

                # orig_height, orig_width = org_img[0,:,:,0].numpy().shape

                # print('Type check',type(pred_cam),type(bbox),type(orig_height))

                # orig_cam = convert_crop_cam_to_orig_img(
                #         cam=pred_cam,
                #         bbox=bbox.numpy(),
                #         img_width=orig_width,
                #         img_height=orig_height
                #     )

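                # Convert the weak-perspective camera [s, tx, ty] to a camera translation,
                # assuming a 5000 px focal length and a 224 px crop (SPIN/VIBE convention)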
                pred_camera = torch.from_numpy(pred_cam)
                pred_cam_t = torch.stack([
                    pred_camera[:, 1], pred_camera[:, 2], 2 * 5000. /
                    (224. * pred_camera[:, 0] + 1e-9)
                ],
                                         dim=-1)

                #print('**************************',GT['kp_2d'][0].shape)

                # cv_keypoints =[]
                # cv_keypoints1 =[]
                # for x,y in output['kp_2d'].cpu()[0,0,6:11].tolist():
                #     cv_keypoints.append(cv2.KeyPoint(x, y,10))
                # for x,y in GT['kp_2d'][0,6:11].tolist():
                #     cv_keypoints1.append(cv2.KeyPoint(x, y,10))

                # raw_img = raw_imgCrop[0].numpy() #.permute(1, 2, 0).numpy()
                # #print('raw image shape===',raw_img.shape)
                # #raw_img = org_img[0].numpy()
                # #print('org_imgimage shape===',raw_img.shape)
                # cv2.drawKeypoints(raw_img, cv_keypoints, raw_img, color=(255,0,0))
                # cv2.drawKeypoints(raw_img, cv_keypoints1, raw_img, color=(0,0,0))
                # cv2.imwrite(f"/home/ubuntu/gyrusWork/myimg{epoch}.jpg",raw_img)

                ### 3D visualization

                #print('debug%%%%',output['kp_3d'].shape,type(output['kp_3d'].cpu().detach().numpy()))

                #skel = np.zeros((32,3))

                #tmpx = pred_verts.cpu().detach().numpy()
                tmpx = pred_joints3d.cpu().detach().numpy()

                #print('debug output[verts] SHAPE',tmpx.shape,type(tmpx))

                # skel[USE_DIMS] = tmpx[0]#[H36M_IDS]
                skel = tmpx[0]

                #visualize(skel.reshape(-1), skel, raw_imgCrop[0].numpy(),t=pred_cam_t.numpy())

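                # Compute the generator loss against the 3D ground truth only
                # (no in-the-wild 2D dataset is supplied here)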
                target_2d = False
                target_3d = GT
                gen_loss = loss(
                    generator_outputs=output,
                    data_2d=target_2d,
                    data_3d=target_3d,
                )

                print("LOSS--------", gen_loss)
                print('GT[kp_3d] shape', GT['kp_3d'].shape)
                print('ord 3d key pt==', GT['kp_3d'][0, 6:11],
                      '   pred 3d key pt==', output['kp_3d'][0, 0, 6:11])

                # <======= Backprop the generator
                gen_optimizer.zero_grad()
                gen_loss.backward()
                gen_optimizer.step()
            del batch
Exemplo n.º 12
0
def main(args):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    video_file = args.vid_file

    # ========= [Optional] download the youtube video ========= #
    if video_file.startswith('https://www.youtube.com'):
        print(f'Downloading YouTube video \"{video_file}\"')
        video_file = download_youtube_clip(video_file, '/tmp')

        if video_file is None:
            exit('YouTube URL is not valid!')

        print(f'YouTube Video has been downloaded to {video_file}...')

    if not os.path.isfile(video_file):
        exit(f'Input video \"{video_file}\" does not exist!')

    output_path = os.path.join(args.output_folder, os.path.basename(video_file).replace('.mp4', ''))
    os.makedirs(output_path, exist_ok=True)

    image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True)

    print(f'Input video number of frames {num_frames}')
    orig_height, orig_width = img_shape[:2]

    total_time = time.time()

    # ========= Run tracking ========= #
    bbox_scale = 1.1
    if args.tracking_method == 'pose':
        if not os.path.isabs(video_file):
            video_file = os.path.join(os.getcwd(), video_file)
        tracking_results = run_posetracker(video_file, staf_folder=args.staf_dir, display=args.display)
    else:
        # run multi object tracker
        mot = MPT(
            device=device,
            batch_size=args.tracker_batch_size,
            display=args.display,
            detector_type=args.detector,
            output_format='dict',
            yolo_img_size=args.yolo_img_size,
        )
        tracking_results = mot(image_folder)

    # remove tracklets if num_frames is less than MIN_NUM_FRAMES
    for person_id in list(tracking_results.keys()):
        if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
            del tracking_results[person_id]

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    # ========= Run VIBE on each person ========= #
    print(f'Running VIBE on each tracklet...')
    vibe_time = time.time()
    vibe_results = {}
    for person_id in tqdm(list(tracking_results.keys())):
        bboxes = joints2d = None     

        if args.tracking_method == 'bbox':
            bboxes = tracking_results[person_id]['bbox']
        elif args.tracking_method == 'pose':
            joints2d = tracking_results[person_id]['joints2d']

        frames = tracking_results[person_id]['frames']

        dataset = Inference(
            image_folder=image_folder,
            frames=frames,
            bboxes=bboxes,
            joints2d=joints2d,
            scale=bbox_scale,
        )

        bboxes = dataset.bboxes
        frames = dataset.frames
        has_keypoints = True if joints2d is not None else False

        # reduce the number of workers if you encounter the error: "DLL load failed: The paging file is too small for this operation to complete"
        dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=8)

        with torch.no_grad():

            pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []

            for batch in dataloader:
                if has_keypoints:
                    batch, nj2d = batch
                    norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                batch = batch.unsqueeze(0)
                batch = batch.to(device)

                batch_size, seqlen = batch.shape[:2]
                output = model(batch)[-1]

                pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1))
                pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3))
                pred_pose.append(output['theta'][:,:,3:75].reshape(batch_size * seqlen, -1))
                pred_betas.append(output['theta'][:, :,75:].reshape(batch_size * seqlen, -1))
                pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3))


            pred_cam = torch.cat(pred_cam, dim=0)
            pred_verts = torch.cat(pred_verts, dim=0)
            pred_pose = torch.cat(pred_pose, dim=0)
            pred_betas = torch.cat(pred_betas, dim=0)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)

            del batch

        # ========= [Optional] run Temporal SMPLify to refine the results ========= #
        if args.run_smplify and args.tracking_method == 'pose':
            norm_joints2d = np.concatenate(norm_joints2d, axis=0)
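            # Convert the STAF keypoints from the pose tracker into the SPIN joint
            # format expected by Temporal SMPLify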
            norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
            norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

            # Run Temporal SMPLify
            update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
            new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                pred_rotmat=pred_pose,
                pred_betas=pred_betas,
                pred_cam=pred_cam,
                j2d=norm_joints2d,
                device=device,
                batch_size=norm_joints2d.shape[0],
                pose2aa=False,
            )

            # update the parameters after refinement
            print(f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}')
            pred_verts = pred_verts.cpu()
            pred_cam = pred_cam.cpu()
            pred_pose = pred_pose.cpu()
            pred_betas = pred_betas.cpu()
            pred_joints3d = pred_joints3d.cpu()
            pred_verts[update] = new_opt_vertices[update]
            pred_cam[update] = new_opt_cam[update]
            pred_pose[update] = new_opt_pose[update]
            pred_betas[update] = new_opt_betas[update]
            pred_joints3d[update] = new_opt_joints3d[update]

        elif args.run_smplify and args.tracking_method == 'bbox':
            print('[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!')
            print('[WARNING] Continuing without running Temporal SMPLify!..')

        # ========= Save results to a pickle file ========= #
        pred_cam = pred_cam.cpu().numpy()
        pred_verts = pred_verts.cpu().numpy()
        pred_pose = pred_pose.cpu().numpy()
        pred_betas = pred_betas.cpu().numpy()
        pred_joints3d = pred_joints3d.cpu().numpy()

        orig_cam = convert_crop_cam_to_orig_img(
            cam=pred_cam,
            bbox=bboxes,
            img_width=orig_width,
            img_height=orig_height
        )

        output_dict = {
            'pred_cam': pred_cam,
            'orig_cam': orig_cam,
            'verts': pred_verts,
            'pose': pred_pose,
            'betas': pred_betas,
            'joints3d': pred_joints3d,
            'joints2d': joints2d,
            'bboxes': bboxes,
            'frame_ids': frames,
        }

        vibe_results[person_id] = output_dict

    del model

    end = time.time()
    fps = num_frames / (end - vibe_time)

    print(f'VIBE FPS: {fps:.2f}')
    total_time = time.time() - total_time
    print(f'Total time spent: {total_time:.2f} seconds (including model loading time).')
    print(f'Total FPS (including model loading time): {num_frames / total_time:.2f}.')

    print(f'Saving output results to \"{os.path.join(output_path, "vibe_output.pkl")}\".')

    joblib.dump(vibe_results, os.path.join(output_path, "vibe_output.pkl"))

    if not args.no_render:
        # ========= Render results as a single video ========= #
        renderer = Renderer(resolution=(orig_width, orig_height), orig_img=True, wireframe=args.wireframe)

        output_img_folder = f'{image_folder}_output'
        os.makedirs(output_img_folder, exist_ok=True)

        print(f'Rendering output video, writing frames to {output_img_folder}')

        # prepare results for rendering
        frame_results = prepare_rendering_results(vibe_results, num_frames)
        mesh_color = {k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0) for k in vibe_results.keys()}

        image_file_names = sorted([
            os.path.join(image_folder, x)
            for x in os.listdir(image_folder)
            if x.endswith('.png') or x.endswith('.jpg')
        ])

        for frame_idx in tqdm(range(len(image_file_names))):
            img_fname = image_file_names[frame_idx]
            img = cv2.imread(img_fname)

            if args.sideview:
                side_img = np.zeros_like(img)

            for person_id, person_data in frame_results[frame_idx].items():
                frame_verts = person_data['verts']
                frame_cam = person_data['cam']

                mc = mesh_color[person_id]

                mesh_filename = None

                if args.save_obj:
                    mesh_folder = os.path.join(output_path, 'meshes', f'{person_id:04d}')
                    os.makedirs(mesh_folder, exist_ok=True)
                    mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj')

                img = renderer.render(
                    img,
                    frame_verts,
                    cam=frame_cam,
                    color=mc,
                    mesh_filename=mesh_filename,
                )

                if args.sideview:
                    side_img = renderer.render(
                        side_img,
                        frame_verts,
                        cam=frame_cam,
                        color=mc,
                        angle=270,
                        axis=[0,1,0],
                    )

            if args.sideview:
                img = np.concatenate([img, side_img], axis=1)

            font = cv2.FONT_HERSHEY_SIMPLEX
            x, y = 10, 20  # text position (top-left corner)
            cv2.putText(img, str(frame_idx), (x, y), font, 0.55, (0, 255, 0), 1)
            cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img)

            if args.display:
                cv2.imshow('Video', img)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

        if args.display:
            cv2.destroyAllWindows()

        # ========= Save rendered video ========= #
        vid_name = os.path.basename(video_file)
        save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4'
        save_name = os.path.join(output_path, save_name)
        print(f'Saving result video to {save_name}')
        images_to_video(img_folder=output_img_folder, output_vid_file=save_name)
        shutil.rmtree(output_img_folder)

    shutil.rmtree(image_folder)

    # generate and save the joints csv file for animating avatars later
    output = joblib.load(os.path.join(output_path, "vibe_output.pkl"))
    for i in output.keys():
        print('Track ids:', i, end='\n\n')

    num_ppl = len(output.keys())

    print('VIBE output file content:', end='\n\n')

    vid_name = os.path.basename(video_file)
    vibe_result_folder = output_path
    # output the pose result as csv
    # format: v_personId_numFrames
    pose_filename_list = []

    for i in output.keys():
        pose_filename = os.path.join(
            vibe_result_folder,
            f"{vid_name}_{i}_{output[i]['pose'].shape[0]}.csv")
        pose_filename_list.append(pose_filename)
        field_names = [str(idx) for idx in range(73)]  # 72 pose params + frame_id at column 0

        with open(pose_filename, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(field_names)
            for frame_id in range(len(output[i]['pose'])):
                output_data = [output[i]['frame_ids'][frame_id]]
                output_data.extend(output[i]['pose'][frame_id])
                writer.writerow(output_data)


    print('================= END =================')
Exemplo n.º 13
0
def main(args):
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

    map_vals = {
        'bridge': 1,
        'childs': 2,
        'downwarddog': 3,
        'mountain': 4,
        'plank': 5,
        'seatedforwardbend': 6,
        'tree': 7,
        'trianglepose': 8,
        'warrior1': 9,
        'warrior2': 10
    }

    inverse_map = {
        1: 'bridge',
        2: 'childs',
        3: 'downwarddog',
        4: 'mountain',
        5: 'plank',
        6: 'seatedforwardbend',
        7: 'tree',
        8: 'trianglepose',
        9: 'warrior1',
        10: 'warrior2'
    }

    video_file = args.vid_file
    # ========= [Optional] download the youtube video ========= #
    if video_file.startswith('https://www.youtube.com'):
        print(f'Downloading YouTube video \"{video_file}\"')
        video_file = download_youtube_clip(video_file, '/tmp')

        if video_file is None:
            exit('YouTube URL is not valid!')

        print(f'YouTube Video has been downloaded to {video_file}...')

    if not os.path.isfile(video_file):
        exit(f'Input video \"{video_file}\" does not exist!')

    dir_path = '/home/ubuntu/PoseEstimation/VIBE/InputData/input_test_set/'
    output_folder = '/home/ubuntu/PoseEstimation/VIBE/OutputData/'

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load Classification Model ========= #
    classification_model = pickle.load(
        open('view_classification_model.pkl', 'rb'))

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)
    #print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    #print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    image_folder, num_frames, img_shape = video_to_images(video_file,
                                                          return_info=True)

    print(f'Input video number of frames {num_frames}')
    orig_height, orig_width = img_shape[:2]

    total_time = time.time()

    # ========= Run tracking ========= #
    bbox_scale = 1.1
    if args.tracking_method == 'pose':
        if not os.path.isabs(video_file):
            video_file = os.path.join(os.getcwd(), video_file)
        tracking_results = run_posetracker(video_file,
                                           staf_folder=args.staf_dir,
                                           display=args.display)
    else:
        # run multi object tracker
        mot = MPT(
            device=device,
            batch_size=args.tracker_batch_size,
            display=args.display,
            detector_type=args.detector,
            output_format='dict',
            yolo_img_size=args.yolo_img_size,
        )
        tracking_results = mot(image_folder)

    # remove tracklets if num_frames is less than MIN_NUM_FRAMES
    for person_id in list(tracking_results.keys()):
        if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
            del tracking_results[person_id]

    # ========= Run VIBE on each person ========= #
    #print(f'Running VIBE on each tracklet...')
    vibe_time = time.time()
    vibe_results = {}
    for person_id in list(tracking_results.keys()):
        bboxes = joints2d = None

        if args.tracking_method == 'bbox':
            bboxes = tracking_results[person_id]['bbox']
        elif args.tracking_method == 'pose':
            joints2d = tracking_results[person_id]['joints2d']

        frames = tracking_results[person_id]['frames']

        dataset = Inference(
            image_folder=image_folder,
            frames=frames,
            bboxes=bboxes,
            joints2d=joints2d,
            scale=bbox_scale,
        )

        bboxes = dataset.bboxes
        frames = dataset.frames
        has_keypoints = True if joints2d is not None else False

        dataloader = DataLoader(dataset,
                                batch_size=args.vibe_batch_size,
                                num_workers=16)

        with torch.no_grad():

            pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []

            for batch in dataloader:
                if has_keypoints:
                    batch, nj2d = batch
                    norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                batch = batch.unsqueeze(0)
                batch = batch.to(device)

                batch_size, seqlen = batch.shape[:2]
                output = model(batch)[-1]

                pred_cam.append(output['theta'][:, :, :3].reshape(
                    batch_size * seqlen, -1))
                pred_verts.append(output['verts'].reshape(
                    batch_size * seqlen, -1, 3))
                pred_pose.append(output['theta'][:, :, 3:75].reshape(
                    batch_size * seqlen, -1))
                pred_betas.append(output['theta'][:, :, 75:].reshape(
                    batch_size * seqlen, -1))
                pred_joints3d.append(output['kp_3d'].reshape(
                    batch_size * seqlen, -1, 3))

            pred_cam = torch.cat(pred_cam, dim=0)
            pred_verts = torch.cat(pred_verts, dim=0)
            pred_pose = torch.cat(pred_pose, dim=0)
            pred_betas = torch.cat(pred_betas, dim=0)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)

            del batch

        # ========= [Optional] run Temporal SMPLify to refine the results ========= #
        if args.run_smplify and args.tracking_method == 'pose':
            norm_joints2d = np.concatenate(norm_joints2d, axis=0)
            norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
            norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

            # Run Temporal SMPLify
            update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
            new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                pred_rotmat=pred_pose,
                pred_betas=pred_betas,
                pred_cam=pred_cam,
                j2d=norm_joints2d,
                device=device,
                batch_size=norm_joints2d.shape[0],
                pose2aa=False,
            )

            # update the parameters after refinement
            print(
                f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}'
            )
            pred_verts = pred_verts.cpu()
            pred_cam = pred_cam.cpu()
            pred_pose = pred_pose.cpu()
            pred_betas = pred_betas.cpu()
            pred_joints3d = pred_joints3d.cpu()
            pred_verts[update] = new_opt_vertices[update]
            pred_cam[update] = new_opt_cam[update]
            pred_pose[update] = new_opt_pose[update]
            pred_betas[update] = new_opt_betas[update]
            pred_joints3d[update] = new_opt_joints3d[update]

        elif args.run_smplify and args.tracking_method == 'bbox':
            print(
                '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!'
            )
            print('[WARNING] Continuing without running Temporal SMPLify!..')

        # ========= Save results to a pickle file ========= #
        pred_cam = pred_cam.cpu().numpy()
        pred_verts = pred_verts.cpu().numpy()
        pred_pose = pred_pose.cpu().numpy()
        pred_betas = pred_betas.cpu().numpy()
        pred_joints3d = pred_joints3d.cpu().numpy()

        # Runs 1 Euro Filter to smooth out the results
        if args.smooth:
            min_cutoff = args.smooth_min_cutoff  # 0.004
            beta = args.smooth_beta  # 1.5
            print(
                f'Running smoothing on person {person_id}, min_cutoff: {min_cutoff}, beta: {beta}'
            )
            pred_verts, pred_pose, pred_joints3d = smooth_pose(
                pred_pose, pred_betas, min_cutoff=min_cutoff, beta=beta)

        orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam,
                                                bbox=bboxes,
                                                img_width=orig_width,
                                                img_height=orig_height)

        output_dict = {
            'pred_cam': pred_cam,
            'orig_cam': orig_cam,
            'verts': pred_verts,
            'pose': pred_pose,
            'betas': pred_betas,
            'joints3d': pred_joints3d,
            'joints2d': joints2d,
            'bboxes': bboxes,
            'frame_ids': frames,
        }
        # ========= Extract 3D joint feature for each frame ========= #
        list_val = []
        for i in range(len(output_dict['joints3d'])):
            list_val.append(output_dict['joints3d'][i].flatten().reshape(
                1, -1))

        input_df = pd.DataFrame(np.concatenate(list_val))
        input_df = input_df.round(2)
        predicted_classes = classification_model.predict_classes(input_df)
        output_df = pd.DataFrame(predicted_classes)
        # ========= Printing all possible poses detected for the video ========= #
        total_frames = len(output_df)
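        # The reported "probability" is the percentage of frames classified as each pose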
        print(
            '\nPrinting probabilities for yoga poses predicted in different frames.'
        )
        for i, v in output_df.value_counts().items():
            val = round((v / total_frames) * 100, 2)
            print('Probability of the yoga pose being ' +
                  inverse_map[i[0]].capitalize() + " is: " + str(val))
        print('\nThe yoga pose in the given video is: ' +
              inverse_map[output_df[0].value_counts().idxmax()].capitalize())
Exemplo n.º 14
0
def main(args):
    torch.cuda.set_device(args.gpu_id)
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    print(f'Loading video list {args.video_list}')
    video_list = [l.strip() for l in open(args.video_list, 'r').readlines()]
    if len(video_list) < 1:
        print('No files were found in video list')
        return

    print('Loading VIBE model')
    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load VIBE pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    num_videos = len(video_list)
    print(f'Processing {num_videos} videos.')
    for video_idx, video_file in enumerate(video_list, start=1):
        if not osp.isfile(video_file):
            print(
                f'Input video \"{video_file}\" does not exist! Moving on to next file.'
            )
            continue

        filename = osp.splitext(osp.basename(video_file))[0]
        output_path = osp.join(args.output_folder, filename)
        os.makedirs(output_path, exist_ok=True)

        image_folder, num_frames, img_shape = video_to_images(video_file,
                                                              return_info=True)

        print(f'[{video_idx}/{num_videos}] Processing {num_frames} frames')
        orig_height, orig_width = img_shape[:2]

        # ========= Run tracking ========= #
        bbox_scale = 1.1
        if args.tracking_method == 'pose':
            if not osp.isabs(video_file):
                video_file = osp.join(os.getcwd(), video_file)
            tracking_results = run_posetracker(video_file,
                                               staf_folder=args.staf_dir,
                                               display=args.display)
        else:
            # run multi object tracker
            mot = MPT(
                device=device,
                batch_size=args.tracker_batch_size,
                display=args.display,
                detector_type=args.detector,
                output_format='dict',
                yolo_img_size=args.yolo_img_size,
            )
            tracking_results = mot(image_folder)

        # remove tracklets if num_frames is less than MIN_NUM_FRAMES
        for person_id in list(tracking_results.keys()):
            if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
                del tracking_results[person_id]

        # ========= Run VIBE on each person ========= #
        print(f'Running VIBE on each tracklet...')
        vibe_results = {}
        for person_id in tqdm(list(tracking_results.keys())):
            bboxes = joints2d = None

            if args.tracking_method == 'bbox':
                bboxes = tracking_results[person_id]['bbox']
            elif args.tracking_method == 'pose':
                joints2d = tracking_results[person_id]['joints2d']

            frames = tracking_results[person_id]['frames']

            dataset = Inference(
                image_folder=image_folder,
                frames=frames,
                bboxes=bboxes,
                joints2d=joints2d,
                scale=bbox_scale,
            )

            bboxes = dataset.bboxes
            frames = dataset.frames
            has_keypoints = True if joints2d is not None else False

            dataloader = DataLoader(dataset,
                                    batch_size=args.vibe_batch_size,
                                    num_workers=16)

            with torch.no_grad():

                pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []

                for batch in dataloader:
                    if has_keypoints:
                        batch, nj2d = batch
                        norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                    batch = batch.unsqueeze(0)
                    batch = batch.to(device)

                    batch_size, seqlen = batch.shape[:2]
                    output = model(batch)[-1]

                    pred_cam.append(output['theta'][:, :, :3].reshape(
                        batch_size * seqlen, -1))
                    pred_verts.append(output['verts'].reshape(
                        batch_size * seqlen, -1, 3))
                    pred_pose.append(output['theta'][:, :, 3:75].reshape(
                        batch_size * seqlen, -1))
                    pred_betas.append(output['theta'][:, :, 75:].reshape(
                        batch_size * seqlen, -1))
                    pred_joints3d.append(output['kp_3d'].reshape(
                        batch_size * seqlen, -1, 3))

                pred_cam = torch.cat(pred_cam, dim=0)
                pred_verts = torch.cat(pred_verts, dim=0)
                pred_pose = torch.cat(pred_pose, dim=0)
                pred_betas = torch.cat(pred_betas, dim=0)
                pred_joints3d = torch.cat(pred_joints3d, dim=0)

                del batch

            # ========= [Optional] run Temporal SMPLify to refine the results ========= #
            if args.run_smplify and args.tracking_method == 'pose':
                norm_joints2d = np.concatenate(norm_joints2d, axis=0)
                norm_joints2d = convert_kps(norm_joints2d,
                                            src='staf',
                                            dst='spin')
                norm_joints2d = torch.from_numpy(norm_joints2d).float().to(
                    device)

                # Run Temporal SMPLify
                update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
                new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                    pred_rotmat=pred_pose,
                    pred_betas=pred_betas,
                    pred_cam=pred_cam,
                    j2d=norm_joints2d,
                    device=device,
                    batch_size=norm_joints2d.shape[0],
                    pose2aa=False,
                )

                # update the parameters after refinement
                print(
                    f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}'
                )
                pred_verts = pred_verts.cpu()
                pred_cam = pred_cam.cpu()
                pred_pose = pred_pose.cpu()
                pred_betas = pred_betas.cpu()
                pred_joints3d = pred_joints3d.cpu()
                pred_verts[update] = new_opt_vertices[update]
                pred_cam[update] = new_opt_cam[update]
                pred_pose[update] = new_opt_pose[update]
                pred_betas[update] = new_opt_betas[update]
                pred_joints3d[update] = new_opt_joints3d[update]

            elif args.run_smplify and args.tracking_method == 'bbox':
                print(
                    '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!'
                )
                print(
                    '[WARNING] Continuing without running Temporal SMPLify!..')

            # ========= Save results to a pickle file ========= #
            pred_cam = pred_cam.cpu().numpy()
            pred_verts = pred_verts.cpu().numpy()
            pred_pose = pred_pose.cpu().numpy()
            pred_betas = pred_betas.cpu().numpy()
            pred_joints3d = pred_joints3d.cpu().numpy()

            orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam,
                                                    bbox=bboxes,
                                                    img_width=orig_width,
                                                    img_height=orig_height)

            output_dict = {
                'pred_cam': pred_cam,
                'orig_cam': orig_cam,
                'verts': pred_verts,
                'pose': pred_pose,
                'betas': pred_betas,
                'joints3d': pred_joints3d,
                'joints2d': joints2d,
                'bboxes': bboxes,
                'frame_ids': frames,
            }

            vibe_results[person_id] = output_dict

        # Clean up the temporary image folder
        shutil.rmtree(image_folder)

        # Save the outputs to a joblib pkl file; it can be loaded later via joblib.load(pkl_path)
        output_pkl_path = osp.join(args.output_folder, f'{filename}.pkl')
        print(f'Saving output results to \"{output_pkl_path}\".')
        joblib.dump(vibe_results, output_pkl_path)

    # Clean-up after processing
    del model

    print('================= END =================')
Exemplo n.º 15
0
def run_vibe(video_file, args):
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    # Make output dirs
    output_path = os.path.join(
        args.output_folder, os.path.basename(video_file).replace('.mp4', ''))
    os.makedirs(output_path, exist_ok=True)

    # Convert video to images
    image_folder, num_frames, img_shape = video_to_images(
        video_file, return_info=True)

    print(f'Input video number of frames {num_frames}')
    orig_height, orig_width = img_shape[:2]

    total_time = time.time()

    # ========= Run tracking ========= #
    if not os.path.isabs(video_file):
        video_file = os.path.join(os.getcwd(), video_file)

    tracking_results = run_posetracker(
        video_file, staf_folder=args.staf_dir, display=args.display, smoothen=args.smoothen, smoothen_method=args.smoothen_method)

    # remove tracklets if num_frames is less than MIN_NUM_FRAMES
    for person_id in list(tracking_results.keys()):
        if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
            del tracking_results[person_id]

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    # ========= Run VIBE on each person ========= #
    print(f'Running VIBE on each tracklet...')
    vibe_time = time.time()
    vibe_results = {}
    for person_id in tqdm(list(tracking_results.keys())):

        joints2d = tracking_results[person_id]['joints2d']
        frames = tracking_results[person_id]['frames']

        dataset = Inference(
            image_folder=image_folder,
            frames=frames,
            bboxes=None,
            joints2d=joints2d
        )

        bboxes = dataset.bboxes
        frames = dataset.frames
        has_keypoints = True if joints2d is not None else False

        dataloader = DataLoader(
            dataset, batch_size=args.vibe_batch_size, num_workers=16)

        with torch.no_grad():

            pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [
            ], [], [], [], [], []

            for batch in dataloader:
                if has_keypoints:
                    batch, nj2d = batch
                    norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                batch = batch.unsqueeze(0)
                batch = batch.to(device)

                batch_size, seqlen = batch.shape[:2]
                output = model(batch)[-1]

                pred_cam.append(output['theta'][:, :, :3].reshape(
                    batch_size * seqlen, -1))
                pred_verts.append(output['verts'].reshape(
                    batch_size * seqlen, -1, 3))
                pred_pose.append(output['theta'][:, :, 3:75].reshape(
                    batch_size * seqlen, -1))
                pred_betas.append(output['theta'][:, :, 75:].reshape(
                    batch_size * seqlen, -1))
                pred_joints3d.append(output['kp_3d'].reshape(
                    batch_size * seqlen, -1, 3))

            pred_cam = torch.cat(pred_cam, dim=0)
            pred_verts = torch.cat(pred_verts, dim=0)
            pred_pose = torch.cat(pred_pose, dim=0)
            pred_betas = torch.cat(pred_betas, dim=0)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)

            del batch

        # ========= [Optional] run Temporal SMPLify to refine the results ========= #
        norm_joints2d = np.concatenate(norm_joints2d, axis=0)
        norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
        norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

        # Run Temporal SMPLify
        update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
            new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                pred_rotmat=pred_pose,
                pred_betas=pred_betas,
                pred_cam=pred_cam,
                j2d=norm_joints2d,
                device=device,
                batch_size=norm_joints2d.shape[0],
                pose2aa=False,
            )

        # update the parameters after refinement
        print(
            f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}')
        pred_verts = pred_verts.cpu()
        pred_cam = pred_cam.cpu()
        pred_pose = pred_pose.cpu()
        pred_betas = pred_betas.cpu()
        pred_joints3d = pred_joints3d.cpu()
        pred_verts[update] = new_opt_vertices[update]
        pred_cam[update] = new_opt_cam[update]
        pred_pose[update] = new_opt_pose[update]
        pred_betas[update] = new_opt_betas[update]
        pred_joints3d[update] = new_opt_joints3d[update]


        # ========= Save results to a pickle file ========= #
        pred_cam = pred_cam.cpu().numpy()
        pred_verts = pred_verts.cpu().numpy()
        pred_pose = pred_pose.cpu().numpy()
        pred_betas = pred_betas.cpu().numpy()
        pred_joints3d = pred_joints3d.cpu().numpy()

        orig_cam = convert_crop_cam_to_orig_img(
            cam=pred_cam,
            bbox=bboxes,
            img_width=orig_width,
            img_height=orig_height
        )

        output_dict = {
            'pred_cam': pred_cam,
            'orig_cam': orig_cam,
            'verts': pred_verts,
            'pose': pred_pose,
            'betas': pred_betas,
            'joints3d': pred_joints3d,
            'joints2d': joints2d,
            'bboxes': bboxes,
            'frame_ids': frames,
        }

        vibe_results[person_id] = output_dict

    del model

    end = time.time()
    fps = num_frames / (end - vibe_time)

    print(f'VIBE FPS: {fps:.2f}')
    total_time = time.time() - total_time
    print(
        f'Total time spent: {total_time:.2f} seconds (including model loading time).')
    print(
        f'Total FPS (including model loading time): {num_frames / total_time:.2f}.')

    print(
        f'Saving output results to \"{os.path.join(output_path, "vibe_output.pkl")}\".')

    # joblib.dump(vibe_results, os.path.join(output_path, "vibe_output.pkl"))
    for person in vibe_results.keys():
        dump_path = os.path.join(output_path, "vibe_output_%s.pkl" % person)
        os.makedirs(os.path.dirname(dump_path), exist_ok=True)
        pickle.dump(vibe_results[person], open(dump_path, 'wb'))

    # if not args.no_render:
    #     # ========= Render results as a single video ========= #
    #     renderer = Renderer(resolution=(orig_width, orig_height),
    #                         orig_img=True, wireframe=args.wireframe)

    #     output_img_folder = f'{image_folder}_output'
    #     os.makedirs(output_img_folder, exist_ok=True)

    #     print(f'Rendering output video, writing frames to {output_img_folder}')

    #     # prepare results for rendering
    #     frame_results = prepare_rendering_results(vibe_results, num_frames)
    #     mesh_color = {k: colorsys.hsv_to_rgb(
    #         np.random.rand(), 0.5, 1.0) for k in vibe_results.keys()}

    #     image_file_names = sorted([
    #         os.path.join(image_folder, x)
    #         for x in os.listdir(image_folder)
    #         if x.endswith('.png') or x.endswith('.jpg')
    #     ])

    #     for frame_idx in tqdm(range(len(image_file_names))):
    #         img_fname = image_file_names[frame_idx]
    #         img = cv2.imread(img_fname)

    #         for person_id, person_data in frame_results[frame_idx].items():
    #             frame_verts = person_data['verts']
    #             frame_cam = person_data['cam']

    #             mc = mesh_color[person_id]

    #             mesh_filename = None

    #             img = renderer.render(
    #                 img,
    #                 frame_verts,
    #                 cam=frame_cam,
    #                 color=mc,
    #                 mesh_filename=mesh_filename,
    #             )

    #         cv2.imwrite(os.path.join(output_img_folder,
    #                                  f'{frame_idx:06d}.png'), img)

    #         if args.display:
    #             cv2.imshow('Video', img)
    #             if cv2.waitKey(1) & 0xFF == ord('q'):
    #                 break

    #     if args.display:
    #         cv2.destroyAllWindows()

    #     # ========= Save rendered video ========= #
    #     vid_name = os.path.basename(video_file)
    #     save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4'
    #     save_name = os.path.join(output_path, save_name)
    #     print(f'Saving result video to {save_name}')
    #     images_to_video(img_folder=output_img_folder,
    #                     output_vid_file=save_name)
    #     shutil.rmtree(output_img_folder)

    shutil.rmtree(image_folder)
    print('================= END =================')
Exemplo n.º 16
0
def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Configure depth and color streams
    pipeline = rs.pipeline()
    config = rs.config()
    config.enable_stream(rs.stream.depth, 640, 480, rs.format.z16, 30)
    config.enable_stream(rs.stream.color, 640, 480, rs.format.bgr8, 30)

    # Start streaming
    pipeline.start(config)
    # ========= Run tracking ========= #
    bbox_scale = 1.1
    # run multi object tracker
    mot = MPT(
        device=device,
        batch_size=args.tracker_batch_size,
        display=args.display,
        detector_type=args.detector,
        output_format='dict',
        yolo_img_size=args.yolo_img_size,
    )
    # ========= Define VIBE model ========= #

    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    # ========= main loop ======================= #
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    #cap = cv2.VideoCapture('test.avi')
    #out = cv2.VideoWriter('output/test_5people.avi', fourcc, 30.0, (640, 360), True)
    #cap = cv2.VideoCapture('sample_video.mp4')
    out = cv2.VideoWriter('output/test_realsense_2.avi', fourcc, 10.0,
                          (640, 480), True)

    # load renderer
    renderer = Renderer(resolution=(640, 480),
                        orig_img=True,
                        wireframe=args.wireframe)

    i = 0
    time_acc = 0.0
    while True:
        # Capture frame-by-frame
        total_time = time.time()
        frames = pipeline.wait_for_frames()
        frame_orig = frames.get_color_frame()

        # Convert images to numpy arrays
        frame_orig = np.asanyarray(frame_orig.get_data())

        #ret, frame_orig = cap.read()
        if frame_orig is None:
            break
    #for i in range(1,300):
    #    total_time = time.time()
    #    path = os.path.join('tmp/sample_video/',f'{i:06d}.png')
    #    frame_orig = cv2.imread(path)
        orig_height, orig_width = frame_orig.shape[:2]
        frame = cv2.cvtColor(frame_orig, cv2.COLOR_BGR2RGB)
        frame = frame / 255.
        frame = frame.transpose((2, 0, 1))
        frame = torch.from_numpy(frame)
        frame = frame.unsqueeze(0)
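        # frame is now an RGB float tensor of shape (1, 3, H, W) with values in [0, 1]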
        tracking_results = mot(frame)
        # print('tracking result', tracking_results)

        #print(f'Running VIBE on each tracklet...')
        vibe_time = time.time()
        vibe_results = {}
        for person_id in list(tracking_results.keys()):
            bboxes = joints2d = None

            bboxes = tracking_results[person_id]['bbox']  # shape (1, 4)
            # print('bboxes: ', bboxes)  # same

            frames = tracking_results[person_id]['frames']
            # print(bboxes)
            dataset = Inference(frame=frame_orig,
                                bboxes=bboxes,
                                scale=bbox_scale)

            with torch.no_grad():

                pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []

                batch = dataset
                batch = batch.unsqueeze(0).unsqueeze(0)
                batch = batch.to(device)
                #print(batch.shape)
                batch_size, seqlen = batch.shape[:2]
                output = model(batch)[-1]
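                # model(batch) returns a list of prediction dicts; only the final one is used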

                pred_cam.append(output['theta'][:, :, :3].reshape(
                    batch_size * seqlen, -1))
                pred_verts.append(output['verts'].reshape(
                    batch_size * seqlen, -1, 3))
                pred_pose.append(output['theta'][:, :, 3:75].reshape(
                    batch_size * seqlen, -1))
                pred_betas.append(output['theta'][:, :, 75:].reshape(
                    batch_size * seqlen, -1))
                pred_joints3d.append(output['kp_3d'].reshape(
                    batch_size * seqlen, -1, 3))

                pred_cam = torch.cat(pred_cam, dim=0)
                pred_verts = torch.cat(pred_verts, dim=0)
                pred_pose = torch.cat(pred_pose, dim=0)
                pred_betas = torch.cat(pred_betas, dim=0)
                pred_joints3d = torch.cat(pred_joints3d, dim=0)

                del batch

            pred_cam = pred_cam.cpu().numpy()
            # print('pred_cam: ', pred_cam)  # different
            pred_verts = pred_verts.cpu().numpy()
            pred_pose = pred_pose.cpu().numpy()
            pred_betas = pred_betas.cpu().numpy()
            pred_joints3d = pred_joints3d.cpu().numpy()
            # print(pred_cam)
            orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam,
                                                    bbox=bboxes,
                                                    img_width=orig_width,
                                                    img_height=orig_height)
            #print('orig_cam',orig_cam.shape)
            output_dict = {
                'pred_cam': pred_cam,
                'orig_cam': orig_cam,
                'verts': pred_verts,
                'pose': pred_pose,
                'betas': pred_betas,
                'joints3d': pred_joints3d,
                'joints2d': joints2d,
                'bboxes': bboxes,
                'frame_ids': frames,
            }

            vibe_results[person_id] = output_dict

        #print('vibe_results orig_cam:  ',vibe_results[1]['orig_cam'])
        #print('vibe_results pose:  ', vibe_results[1]['pose'])
        end = time.time()
        fps = 1 / (end - vibe_time)

        print(f'VIBE FPS: {fps:.2f}')

        if not args.no_render:
            render_time = time.time()
            # load renderer
            #renderer = Renderer(resolution=(orig_width, orig_height), orig_img=True, wireframe=args.wireframe)
            # prepare results for rendering
            num_frames = 1
            #print('vibe_results1111: ',vibe_results)
            #vibe_results[1]['orig_cam'] = vibe_results[1]['orig_cam'][np.newaxis,:]
            #print('orig_cam:   ',vibe_results[1]['orig_cam'].shape)
            frame_results = prepare_rendering_results(vibe_results, num_frames)
            #print('frame_results',frame_results)
            img = frame_orig
            mesh_color = {
                k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0)
                for k in vibe_results.keys()
            }

            #img = frame

            if args.sideview:
                side_img = np.zeros_like(img)

            for person_id, person_data in frame_results[0].items():
                frame_verts = person_data['verts']
                frame_cam = person_data['cam']
                # print('frame_cam', frame_cam)
                mc = mesh_color[person_id]

                mesh_filename = None

                img = renderer.render(
                    img,
                    frame_verts,
                    cam=frame_cam,
                    color=mc,
                    mesh_filename=mesh_filename,
                )

            fps = 1 / (time.time() - render_time)
            print(f'RENDER FPS: {fps:.2f}')

            #img = img.numpy()
            out.write(img)

            if args.display:
                cv2.imshow('Video', img)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

        total_time = time.time() - total_time
        i += 1
        time_acc += total_time
        print(f'Frame {i}: {total_time:.2f} seconds total (detect + track + VIBE + render).')
        print(f'FPS: {1 / total_time:.2f}')

    if args.display:
        cv2.destroyAllWindows()
    # release the video writer and the RealSense pipeline before reporting the average
    out.release()
    pipeline.stop()
    print('Total average FPS: ', i / time_acc)
    # ========= Save rendered video ========= #
    print('================= END =================')
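
The real-time loop above converts each RealSense color frame by hand into the layout the tracker and VIBE expect. A minimal, self-contained sketch of that preprocessing step (the helper name preprocess_frame is illustrative and not part of VIBE):

import cv2
import numpy as np
import torch


def preprocess_frame(frame_bgr: np.ndarray) -> torch.Tensor:
    """Convert a BGR uint8 image to an RGB float tensor of shape (1, 3, H, W) in [0, 1]."""
    rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    tensor = torch.from_numpy(rgb / 255.).float()  # (H, W, 3), float in [0, 1]
    return tensor.permute(2, 0, 1).unsqueeze(0)    # (1, 3, H, W)
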
Exemplo n.º 17
0
def main(args):
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    video_file = args.vid_file

    # ========= [Optional] download the youtube video ========= #
    if video_file.startswith('https://www.youtube.com'):
        print(f'Downloading YouTube video \"{video_file}\"')
        video_file = download_youtube_clip(video_file, '/tmp')

        if video_file is None:
            exit('YouTube URL is not valid!')

        print(f'YouTube Video has been downloaded to {video_file}...')

    if not os.path.isfile(video_file):
        exit(f'Input video \"{video_file}\" does not exist!')

    output_path = os.path.join(
        args.output_folder,
        os.path.basename(video_file).replace('.mp4', ''))
    os.makedirs(output_path, exist_ok=True)

    image_folder, num_frames, img_shape = video_to_images(video_file,
                                                          return_info=True)

    print(f'Input video number of frames {num_frames}')
    orig_height, orig_width = img_shape[:2]

    total_time = time.time()

    # ========= Run tracking ========= #
    bbox_scale = 1.1
    if args.tracking_method == 'pose':
        if not os.path.isabs(video_file):
            video_file = os.path.join(os.getcwd(), video_file)
        tracking_results = run_posetracker(video_file,
                                           staf_folder=args.staf_dir,
                                           display=args.display)
    else:
        # run multi object tracker
        mot = MPT(
            device=device,
            batch_size=args.tracker_batch_size,
            display=args.display,
            detector_type=args.detector,
            output_format='dict',
            yolo_img_size=args.yolo_img_size,
        )
        tracking_results = mot(image_folder)

    # remove tracklets if num_frames is less than MIN_NUM_FRAMES
    for person_id in list(tracking_results.keys()):
        if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
            del tracking_results[person_id]

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    # ========= Run VIBE on each person ========= #
    print('Running VIBE on each tracklet...')
    vibe_time = time.time()
    vibe_results = {}
    for person_id in tqdm(list(tracking_results.keys())):
        bboxes = joints2d = None

        if args.tracking_method == 'bbox':
            bboxes = tracking_results[person_id]['bbox']
        elif args.tracking_method == 'pose':
            joints2d = tracking_results[person_id]['joints2d']

        frames = tracking_results[person_id]['frames']

        dataset = Inference(
            image_folder=image_folder,
            frames=frames,
            bboxes=bboxes,
            joints2d=joints2d,
            scale=bbox_scale,
        )

        bboxes = dataset.bboxes
        frames = dataset.frames
        has_keypoints = True if joints2d is not None else False

        dataloader = DataLoader(dataset,
                                batch_size=args.vibe_batch_size,
                                num_workers=16)

        with torch.no_grad():

            pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []

            for batch in dataloader:
                if has_keypoints:
                    batch, nj2d = batch
                    norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                batch = batch.unsqueeze(0)
                batch = batch.to(device)

                batch_size, seqlen = batch.shape[:2]
                output = model(batch)[-1]

                pred_cam.append(output['theta'][:, :, :3].reshape(
                    batch_size * seqlen, -1))
                pred_verts.append(output['verts'].reshape(
                    batch_size * seqlen, -1, 3))
                pred_pose.append(output['theta'][:, :, 3:75].reshape(
                    batch_size * seqlen, -1))
                pred_betas.append(output['theta'][:, :, 75:].reshape(
                    batch_size * seqlen, -1))
                pred_joints3d.append(output['kp_3d'].reshape(
                    batch_size * seqlen, -1, 3))

            pred_cam = torch.cat(pred_cam, dim=0)
            pred_verts = torch.cat(pred_verts, dim=0)
            pred_pose = torch.cat(pred_pose, dim=0)
            pred_betas = torch.cat(pred_betas, dim=0)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)

            del batch

        # ========= [Optional] run Temporal SMPLify to refine the results ========= #
        if args.run_smplify and args.tracking_method == 'pose':
            norm_joints2d = np.concatenate(norm_joints2d, axis=0)
            norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
            norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

            # Run Temporal SMPLify
            update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
            new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                pred_rotmat=pred_pose,
                pred_betas=pred_betas,
                pred_cam=pred_cam,
                j2d=norm_joints2d,
                device=device,
                batch_size=norm_joints2d.shape[0],
                pose2aa=False,
            )

            # update the parameters after refinement
            print(
                f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}'
            )
            pred_verts = pred_verts.cpu()
            pred_cam = pred_cam.cpu()
            pred_pose = pred_pose.cpu()
            pred_betas = pred_betas.cpu()
            pred_joints3d = pred_joints3d.cpu()
            pred_verts[update] = new_opt_vertices[update]
            pred_cam[update] = new_opt_cam[update]
            pred_pose[update] = new_opt_pose[update]
            pred_betas[update] = new_opt_betas[update]
            pred_joints3d[update] = new_opt_joints3d[update]

        elif args.run_smplify and args.tracking_method == 'bbox':
            print(
                '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!'
            )
            print('[WARNING] Continuing without running Temporal SMPLify!')

        # ========= Save results to a pickle file ========= #
        pred_cam = pred_cam.cpu().numpy()
        pred_verts = pred_verts.cpu().numpy()
        pred_pose = pred_pose.cpu().numpy()
        pred_betas = pred_betas.cpu().numpy()
        pred_joints3d = pred_joints3d.cpu().numpy()

        # Runs 1 Euro Filter to smooth out the results
        if args.smooth:
            min_cutoff = args.smooth_min_cutoff  # 0.004
            beta = args.smooth_beta  # 1.5
            print(
                f'Running smoothing on person {person_id}, min_cutoff: {min_cutoff}, beta: {beta}'
            )
            pred_verts, pred_pose, pred_joints3d = smooth_pose(
                pred_pose, pred_betas, min_cutoff=min_cutoff, beta=beta)

        orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam,
                                                bbox=bboxes,
                                                img_width=orig_width,
                                                img_height=orig_height)

        output_dict = {
            'pred_cam': pred_cam,
            'orig_cam': orig_cam,
            'verts': pred_verts,
            'pose': pred_pose,
            'betas': pred_betas,
            'joints3d': pred_joints3d,
            'joints2d': joints2d,
            'bboxes': bboxes,
            'frame_ids': frames,
        }

        vibe_results[person_id] = output_dict

    del model

    end = time.time()
    fps = num_frames / (end - vibe_time)

    print(f'VIBE FPS: {fps:.2f}')
    total_time = time.time() - total_time
    print(
        f'Total time spent: {total_time:.2f} seconds (including model loading time).'
    )
    print(
        f'Total FPS (including model loading time): {num_frames / total_time:.2f}.'
    )

    print(
        f'Saving output results to \"{os.path.join(output_path, "vibe_output.pkl")}\".'
    )

    joblib.dump(vibe_results, os.path.join(output_path, "vibe_output.pkl"))
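    # vibe_output.pkl maps person_id -> dict of per-frame arrays
    # (pred_cam, orig_cam, verts, pose, betas, joints3d, joints2d, bboxes, frame_ids)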

    if not args.no_render:
        # ========= Render results as a single video ========= #
        renderer = Renderer(resolution=(orig_width, orig_height),
                            orig_img=True,
                            wireframe=args.wireframe)

        output_img_folder = f'{image_folder}_output'
        os.makedirs(output_img_folder, exist_ok=True)

        if args.joints3dview:
            output_img_raw_folder = f'{image_folder}_raw_output'
            os.makedirs(output_img_raw_folder, exist_ok=True)

            output_img_joints3d_folder = f'{image_folder}_joints3d_output'
            os.makedirs(output_img_joints3d_folder, exist_ok=True)

            output_img_mesh_folder = f'{image_folder}_mesh_output'
            os.makedirs(output_img_mesh_folder, exist_ok=True)

            output_img_meshside_folder = f'{image_folder}_meshside_output'
            os.makedirs(output_img_meshside_folder, exist_ok=True)

            output_img_all_folder = f'{image_folder}_all_output'
            os.makedirs(output_img_all_folder, exist_ok=True)

        print(f'Rendering output video, writing frames to {output_img_folder}')

        # prepare results for rendering
        frame_results = prepare_rendering_results(vibe_results, num_frames)
        mesh_color = {
            k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0)
            for k in vibe_results.keys()
        }

        image_file_names = sorted([
            os.path.join(image_folder, x) for x in os.listdir(image_folder)
            if x.endswith('.png') or x.endswith('.jpg')
        ])

        length_image_files = len(image_file_names)
        #length_image_files = 100
        for frame_idx in tqdm(range(length_image_files)):
            img_fname = image_file_names[frame_idx]
            img = cv2.imread(img_fname)

            if args.sideview:
                side_img = np.zeros_like(img)

            if args.joints3dview:
                img_raw = img.copy()
                img_joints3d = np.zeros_like(img)
                joints3d_list = []

            for person_id, person_data in frame_results[frame_idx].items():
                frame_verts = person_data['verts']
                frame_cam = person_data['cam']
                joints3d = person_data['joints3d']
                #print('frame_verts.shape = {}\nframe_cam.shape ={}\njoints3d.shape = {}'.format(
                #   frame_verts.shape, frame_cam.shape, joints3d.shape))
                mc = mesh_color[person_id]

                if args.joints3dview:
                    joints3d_list.append(joints3d)
                #    img_joints3d = render_joints3d(joints3d, img_raw.shape)

                mesh_filename = None

                if args.save_obj:
                    mesh_folder = os.path.join(output_path, 'meshes',
                                               f'{person_id:04d}')
                    os.makedirs(mesh_folder, exist_ok=True)
                    mesh_filename = os.path.join(mesh_folder,
                                                 f'{frame_idx:06d}.obj')

                img = renderer.render(
                    img,
                    frame_verts,
                    cam=frame_cam,
                    color=mc,
                    mesh_filename=mesh_filename,
                )

                if args.sideview:
                    side_img = renderer.render(
                        side_img,
                        frame_verts,
                        cam=frame_cam,
                        color=mc,
                        angle=270,
                        axis=[0, 1, 0],
                    )

            if args.sideview:
                img_mesh = img.copy()
                img = np.concatenate([img, side_img], axis=1)

            cv2.imwrite(
                os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img)

            if args.joints3dview:
                #img_joints3d = np.zeros_like(img_raw)
                if len(joints3d_list) == 0:
                    img_joints3d = np.zeros_like(img_raw)
                else:
                    joints3d = np.concatenate(joints3d_list)
                    img_joints3d = render_joints3d(joints3d, img_raw.shape)

            if args.joints3dview:
                img_up = np.concatenate([img_raw, img_joints3d], axis=1)
                img_down = np.concatenate([img_mesh, side_img], axis=1)
                img_all = np.concatenate([img_up, img_down], axis=0)

                cv2.imwrite(
                    os.path.join(output_img_raw_folder,
                                 f'{frame_idx:06d}.png'), img_raw)
                cv2.imwrite(
                    os.path.join(output_img_joints3d_folder,
                                 f'{frame_idx:06d}.png'), img_joints3d)
                cv2.imwrite(
                    os.path.join(output_img_mesh_folder,
                                 f'{frame_idx:06d}.png'), img_mesh)
                cv2.imwrite(
                    os.path.join(output_img_meshside_folder,
                                 f'{frame_idx:06d}.png'), side_img)
                cv2.imwrite(
                    os.path.join(output_img_all_folder,
                                 f'{frame_idx:06d}.png'), img_all)

            if args.display:
                cv2.imshow('Video', img)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

        if args.display:
            cv2.destroyAllWindows()

        # ========= Save rendered video ========= #
        vid_name = os.path.basename(video_file)
        save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4'
        save_name = os.path.join(output_path, save_name)
        print(f'Saving result video to {save_name}')
        images_to_video(img_folder=output_img_folder,
                        output_vid_file=save_name)
        shutil.rmtree(output_img_folder)

        if args.joints3dview:
            '''
            save_name_raw = f'{vid_name.replace(".mp4", "")}_raw.mp4'
            save_name_raw = os.path.join(output_path, save_name_raw)
            images_to_video(img_folder=output_img_raw_folder, output_vid_file=save_name_raw)
            shutil.rmtree(output_img_raw_folder)

            save_name_joints3d = f'{vid_name.replace(".mp4", "")}_joints3d.mp4'
            save_name_joints3d = os.path.join(output_path, save_name_joints3d)
            images_to_video(img_folder=output_img_joints3d_folder, output_vid_file=save_name_joints3d)
            shutil.rmtree(output_img_joints3d_folder)

            save_name_mesh = f'{vid_name.replace(".mp4", "")}_mesh.mp4'
            save_name_mesh = os.path.join(output_path, save_name_mesh)
            images_to_video(img_folder=output_img_mesh_folder, output_vid_file=save_name_mesh)
            shutil.rmtree(output_img_mesh_folder)

            save_name_meshside = f'{vid_name.replace(".mp4", "")}_meshside.mp4'
            save_name_meshside = os.path.join(output_path, save_name_meshside)
            images_to_video(img_folder=output_img_meshside_folder, output_vid_file=save_name_meshside)
            shutil.rmtree(output_img_meshside_folder)
            '''
            save_name_all = f'{vid_name.replace(".mp4", "")}_all.mp4'
            save_name_all = os.path.join(output_path, save_name_all)
            images_to_video(img_folder=output_img_all_folder,
                            output_vid_file=save_name_all)
            shutil.rmtree(output_img_all_folder)

    shutil.rmtree(image_folder)
    print('================= END =================')
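
The example above calls convert_crop_cam_to_orig_img to lift the weak-perspective camera predicted in crop space back into original-image coordinates. A rough sketch of that mapping, assuming cam rows are (s, tx, ty) and bbox rows are (cx, cy, h, w) with h the square crop size in pixels (the helper name below is illustrative):

import numpy as np


def crop_cam_to_orig_cam(cam, bbox, img_width, img_height):
    # cam:  (N, 3) weak-perspective parameters (s, tx, ty) predicted in crop space
    # bbox: (N, 4) person boxes as (cx, cy, h, w)
    cx, cy, h = bbox[:, 0], bbox[:, 1], bbox[:, 2]
    hw, hh = img_width / 2., img_height / 2.
    sx = cam[:, 0] * (1. / (img_width / h))   # rescale the crop-space scale to image width...
    sy = cam[:, 0] * (1. / (img_height / h))  # ...and to image height
    tx = ((cx - hw) / hw / sx) + cam[:, 1]    # translate so the crop centre maps to (cx, cy)
    ty = ((cy - hh) / hh / sy) + cam[:, 2]
    return np.stack([sx, sy, tx, ty], axis=-1)  # (N, 4): [sx, sy, tx, ty]
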
Exemplo n.º 18
0
def main(args):
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        device=device,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file, map_location=device)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    total_time = time.time()
    # ========= Run VIBE on crops ========= #
    print('Running VIBE on crops...')
    vibe_time = time.time()
    image_folder = args.input_folder

    dataset = InferenceFromCrops(image_folder=image_folder)
    orig_height = orig_width = 512

    dataloader = DataLoader(dataset,
                            batch_size=args.vibe_batch_size,
                            num_workers=0)

    with torch.no_grad():

        pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []

        for batch_num, batch in enumerate(dataloader):
            print("BATCH:", batch_num)
            batch = batch.unsqueeze(0)
            batch = batch.to(device)

            batch_size, seqlen = batch.shape[:2]
            output = model(batch)[-1]

            pred_cam.append(output['theta'][:, :, :3].reshape(
                batch_size * seqlen, -1))
            pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1,
                                                      3))
            pred_pose.append(output['theta'][:, :, 3:75].reshape(
                batch_size * seqlen, -1))
            pred_betas.append(output['theta'][:, :, 75:].reshape(
                batch_size * seqlen, -1))
            pred_joints3d.append(output['kp_3d'].reshape(
                batch_size * seqlen, -1, 3))

        pred_cam = torch.cat(pred_cam, dim=0)
        pred_verts = torch.cat(pred_verts, dim=0)
        pred_pose = torch.cat(pred_pose, dim=0)
        pred_betas = torch.cat(pred_betas, dim=0)
        pred_joints3d = torch.cat(pred_joints3d, dim=0)

        del batch

    # ========= [Optional] run Temporal SMPLify to refine the results ========= #
    if args.run_smplify and args.tracking_method == 'pose':
        norm_joints2d = np.concatenate(norm_joints2d, axis=0)
        norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
        norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

        # Run Temporal SMPLify
        update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
        new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
            pred_rotmat=pred_pose,
            pred_betas=pred_betas,
            pred_cam=pred_cam,
            j2d=norm_joints2d,
            device=device,
            batch_size=norm_joints2d.shape[0],
            pose2aa=False,
        )

        # update the parameters after refinement
        print(
            f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}'
        )
        pred_verts = pred_verts.cpu()
        pred_cam = pred_cam.cpu()
        pred_pose = pred_pose.cpu()
        pred_betas = pred_betas.cpu()
        pred_joints3d = pred_joints3d.cpu()
        pred_verts[update] = new_opt_vertices[update]
        pred_cam[update] = new_opt_cam[update]
        pred_pose[update] = new_opt_pose[update]
        pred_betas[update] = new_opt_betas[update]
        pred_joints3d[update] = new_opt_joints3d[update]

    elif args.run_smplify and args.tracking_method == 'bbox':
        print(
            '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!'
        )
        print('[WARNING] Continuing without running Temporal SMPLify!')

    # ========= Save results to a pickle file ========= #
    output_path = image_folder.replace('cropped_frames', 'vibe_results')
    os.makedirs(output_path, exist_ok=True)

    pred_cam = pred_cam.cpu().numpy()
    pred_verts = pred_verts.cpu().numpy()
    pred_pose = pred_pose.cpu().numpy()
    pred_betas = pred_betas.cpu().numpy()
    pred_joints3d = pred_joints3d.cpu().numpy()

    vibe_results = {
        'pred_cam': pred_cam,
        'verts': pred_verts,
        'pose': pred_pose,
        'betas': pred_betas,
        'joints3d': pred_joints3d,
    }

    del model
    end = time.time()
    fps = len(dataset) / (end - vibe_time)

    print(f'VIBE FPS: {fps:.2f}')
    total_time = time.time() - total_time
    print(
        f'Total time spent: {total_time:.2f} seconds (including model loading time).'
    )
    print(
        f'Total FPS (including model loading time): {len(dataset) / total_time:.2f}.'
    )

    print(
        f'Saving vibe results to \"{os.path.join(output_path, "vibe_results.pkl")}\".'
    )

    with open(os.path.join(output_path, "vibe_results.pkl"), 'wb') as f_save:
        pickle.dump(vibe_results, f_save)

    if not args.no_render:
        # ========= Render results as a single video ========= #
        renderer = Renderer(resolution=(orig_width, orig_height),
                            orig_img=True,
                            wireframe=args.wireframe)

        output_img_folder = os.path.join(output_path, 'vibe_images')
        os.makedirs(output_img_folder, exist_ok=True)

        print(f'Rendering output video, writing frames to {output_img_folder}')

        image_file_names = sorted([
            os.path.join(image_folder, x) for x in os.listdir(image_folder)
            if x.endswith('.png') or x.endswith('.jpg')
        ])

        for frame_idx in tqdm(range(len(image_file_names))):
            img_fname = image_file_names[frame_idx]
            img = cv2.imread(img_fname)

            frame_verts = vibe_results['verts'][frame_idx]
            frame_cam = vibe_results['pred_cam'][frame_idx]

            mesh_filename = None

            if args.save_obj:
                mesh_folder = os.path.join(output_path, 'vibe_meshes')
                os.makedirs(mesh_folder, exist_ok=True)
                mesh_filename = os.path.join(mesh_folder,
                                             f'{frame_idx:06d}.obj')

            rend_img = renderer.render(
                img,
                frame_verts,
                cam=frame_cam,
                mesh_filename=mesh_filename,
            )

            whole_img = rend_img

            if args.sideview:
                side_img_bg = np.zeros_like(img)
                side_rend_img90 = renderer.render(
                    side_img_bg,
                    frame_verts,
                    cam=frame_cam,
                    angle=90,
                    axis=[0, 1, 0],
                )
                side_rend_img270 = renderer.render(
                    side_img_bg,
                    frame_verts,
                    cam=frame_cam,
                    angle=270,
                    axis=[0, 1, 0],
                )
                if args.reposed_render:
                    smpl = SMPL('data/vibe_data', batch_size=1)
                    zero_pose = torch.from_numpy(
                        np.zeros((1, pred_pose.shape[-1]))).float()
                    zero_pose[:, 0] = np.pi
                    pred_frame_betas = torch.from_numpy(
                        pred_betas[frame_idx][None, :]).float()
                    with torch.no_grad():
                        reposed_smpl_output = smpl(
                            betas=pred_frame_betas,
                            body_pose=zero_pose[:, 3:],
                            global_orient=zero_pose[:, :3])
                        reposed_verts = reposed_smpl_output.vertices
                        reposed_verts = reposed_verts.cpu().detach().numpy()

                    reposed_cam = np.array([0.9, 0, 0])
                    reposed_rend_img = renderer.render(side_img_bg,
                                                       reposed_verts[0],
                                                       cam=reposed_cam)
                    reposed_rend_img90 = renderer.render(side_img_bg,
                                                         reposed_verts[0],
                                                         cam=reposed_cam,
                                                         angle=90,
                                                         axis=[0, 1, 0])

                    top_row = np.concatenate(
                        [img, reposed_rend_img, reposed_rend_img90], axis=1)
                    bot_row = np.concatenate(
                        [rend_img, side_rend_img90, side_rend_img270], axis=1)
                    whole_img = np.concatenate([top_row, bot_row], axis=0)

                else:
                    top_row = np.concatenate([img, side_img_bg, side_img_bg],
                                             axis=1)
                    bot_row = np.concatenate(
                        [rend_img, side_rend_img90, side_rend_img270], axis=1)
                    whole_img = np.concatenate([top_row, bot_row], axis=0)

            # cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.png'), whole_img)
            cv2.imwrite(
                os.path.join(output_img_folder, os.path.basename(img_fname)),
                whole_img)

        # ========= Save rendered video ========= #
        save_vid_path = os.path.join(output_path, 'vibe_video.mp4')
        print(f'Saving result video to {save_vid_path}')
        images_to_video(img_folder=output_img_folder,
                        output_vid_file=save_vid_path)

    print('================= END =================')
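
The crop-based example stores a flat dictionary of NumPy arrays in vibe_results.pkl. A small sketch for loading it back and inspecting the array shapes (the path is an assumption; use the output folder printed above):

import pickle

with open('vibe_results.pkl', 'rb') as f:
    vibe_results = pickle.load(f)

for key, value in vibe_results.items():
    # e.g. 'verts' -> (num_frames, 6890, 3), 'pose' -> (num_frames, 72), 'betas' -> (num_frames, 10)
    print(key, getattr(value, 'shape', type(value)))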