Example #1
    def __init__(self,
                 ann_file,
                 img_prefix,
                 data_cfg,
                 pipeline,
                 dataset_info=None,
                 coco_style=True,
                 test_mode=False):
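        """Initialize the dataset.

        Args:
            ann_file (str): Path to the annotation file.
            img_prefix (str): Path to the directory holding the images.
            data_cfg (dict): Data configurations; keys read below include
                image_size, heatmap_size, num_joints, inference_channel,
                num_output_channels, dataset_channel and (optionally)
                use_different_joint_weights.
            pipeline (list[dict]): Configs of the processing pipeline.
            dataset_info (dict | None): Dataset meta information. Must not
                be None, otherwise a ValueError is raised.
            coco_style (bool): Whether the annotation file is COCO-style.
            test_mode (bool): Whether the dataset is built for testing.
        """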

        self.image_info = {}
        self.ann_info = {}

        self.ann_file = ann_file
        self.img_prefix = img_prefix
        self.pipeline = pipeline
        self.test_mode = test_mode

        self.ann_info['image_size'] = np.array(data_cfg['image_size'])
        self.ann_info['heatmap_size'] = np.array(data_cfg['heatmap_size'])
        self.ann_info['num_joints'] = data_cfg['num_joints']

        self.ann_info['inference_channel'] = data_cfg['inference_channel']
        self.ann_info['num_output_channels'] = data_cfg['num_output_channels']
        self.ann_info['dataset_channel'] = data_cfg['dataset_channel']

        self.ann_info['use_different_joint_weights'] = data_cfg.get(
            'use_different_joint_weights', False)

        if dataset_info is None:
            raise ValueError(
                'Check https://github.com/open-mmlab/mmpose/pull/663 '
                'for details.')

        dataset_info = DatasetInfo(dataset_info)

        assert self.ann_info['num_joints'] == dataset_info.keypoint_num
        self.ann_info['flip_pairs'] = dataset_info.flip_pairs
        self.ann_info['flip_index'] = dataset_info.flip_index
        self.ann_info['upper_body_ids'] = dataset_info.upper_body_ids
        self.ann_info['lower_body_ids'] = dataset_info.lower_body_ids
        self.ann_info['joint_weights'] = dataset_info.joint_weights
        self.ann_info['skeleton'] = dataset_info.skeleton
        self.sigmas = dataset_info.sigmas
        self.dataset_name = dataset_info.dataset_name

        if coco_style:
            self.coco = COCO(ann_file)
            if 'categories' in self.coco.dataset:
                cats = [
                    cat['name']
                    for cat in self.coco.loadCats(self.coco.getCatIds())
                ]
                self.classes = ['__background__'] + cats
                self.num_classes = len(self.classes)
                self._class_to_ind = dict(
                    zip(self.classes, range(self.num_classes)))
                self._class_to_coco_ind = dict(zip(cats,
                                                   self.coco.getCatIds()))
                self._coco_ind_to_class_ind = dict(
                    (self._class_to_coco_ind[cls], self._class_to_ind[cls])
                    for cls in self.classes[1:])
            self.img_ids = self.coco.getImgIds()
            self.num_images = len(self.img_ids)
            self.id2name, self.name2id = self._get_mapping_id_name(
                self.coco.imgs)

        self.db = []

        self.pipeline = Compose(self.pipeline)
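A minimal, hedged instantiation sketch for this constructor; the concrete subclass name and config values below are illustrative, not taken from the example:

# data_cfg mirrors the keys read in __init__ above; values are illustrative.
data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_joints=17,
    inference_channel=list(range(17)),
    num_output_channels=17,
    dataset_channel=[list(range(17))])
# dataset_info must be a dataset meta config accepted by DatasetInfo;
# passing None raises ValueError.
# ds = SomeTopDownDataset('annotations.json', 'images/', data_cfg,
#                         pipeline=[], dataset_info=dataset_info_cfg)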
Example #2
def _inference_single_pose_model(model,
                                 img_or_path,
                                 bbox,
                                 dataset,
                                 return_heatmap=False):
    """Inference a single bbox.

    num_keypoints: K

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str | np.ndarray): Image filename or loaded image.
        bbox (list | np.ndarray): Bounding boxes (with scores),
            shaped (4, ) or (5, ). (left, top, width, height, [score])
        dataset (str): Dataset name.
        return_heatmap (bool): Flag to return heatmap, default: False.

    Returns:
        ndarray[Kx3]: Predicted pose x, y, score.
        heatmap[N, K, H, W]: Model output heatmap.
    """

    cfg = model.cfg
    device = next(model.parameters()).device

    # build the data pipeline
    test_pipeline = [LoadImage()] + cfg.test_pipeline[1:]
    test_pipeline = Compose(test_pipeline)

    assert len(bbox) in [4, 5]
    center, scale = _box2cs(cfg, bbox)

    flip_pairs = None
    if dataset == 'TopDownCocoDataset' or dataset == 'TopDownOCHumanDataset':
        flip_pairs = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12],
                      [13, 14], [15, 16]]
    elif dataset == 'TopDownCocoWholeBodyDataset':
        body = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14],
                [15, 16]]
        foot = [[17, 20], [18, 21], [19, 22]]

        face = [[23, 39], [24, 38], [25, 37], [26, 36], [27, 35], [28, 34],
                [29, 33], [30, 32], [40, 49], [41, 48], [42, 47], [43, 46],
                [44, 45], [54, 58], [55, 57], [59, 68], [60, 67], [61, 66],
                [62, 65], [63, 70], [64, 69], [71, 77], [72, 76], [73, 75],
                [78, 82], [79, 81], [83, 87], [84, 86], [88, 90]]

        hand = [[91, 112], [92, 113], [93, 114], [94, 115], [95, 116],
                [96, 117], [97, 118], [98, 119], [99, 120], [100, 121],
                [101, 122], [102, 123], [103, 124], [104, 125], [105, 126],
                [106, 127], [107, 128], [108, 129], [109, 130], [110, 131],
                [111, 132]]
        flip_pairs = body + foot + face + hand
    elif dataset == 'TopDownAicDataset':
        flip_pairs = [[0, 3], [1, 4], [2, 5], [6, 9], [7, 10], [8, 11]]
    elif (dataset == 'TopDownOneHand10KDataset'
          or dataset == 'TopDownFreiHandDataset'
          or dataset == 'TopDownPanopticDataset'):
        flip_pairs = []
    else:
        raise NotImplementedError()

    # prepare data
    data = {
        'img_or_path':
        img_or_path,
        'center':
        center,
        'scale':
        scale,
        'bbox_score':
        bbox[4] if len(bbox) == 5 else 1,
        'dataset':
        dataset,
        'joints_3d':
        np.zeros((cfg.data_cfg.num_joints, 3), dtype=np.float32),
        'joints_3d_visible':
        np.zeros((cfg.data_cfg.num_joints, 3), dtype=np.float32),
        'rotation':
        0,
        'ann_info': {
            'image_size': cfg.data_cfg['image_size'],
            'num_joints': cfg.data_cfg['num_joints'],
            'flip_pairs': flip_pairs
        }
    }
    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)
    if next(model.parameters()).is_cuda:
        # scatter to specified GPU
        data = scatter(data, [device])[0]
    else:
        # just get the actual data from DataContainer
        data['img_metas'] = data['img_metas'].data[0]

    # forward the model
    with torch.no_grad():
        all_preds, _, _, heatmap = model(return_loss=False,
                                         return_heatmap=return_heatmap,
                                         img=data['img'],
                                         img_metas=data['img_metas'])
    return all_preds[0], heatmap
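A hedged usage sketch for the helper above, assuming `model` is a loaded top-down pose model whose dataset matches one of the branches handled here:

import numpy as np

# A single bbox in (left, top, width, height, score) format.
bbox = np.array([50., 80., 120., 240., 0.98])
# pose, heatmap = _inference_single_pose_model(
#     model, 'demo.jpg', bbox, dataset='TopDownCocoDataset',
#     return_heatmap=True)
# pose has shape (K, 3): x, y, score per keypoint.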
Example #3
def inference_pose_lifter_model(model,
                                pose_results_2d,
                                dataset,
                                with_track_id=True):
    """Inference 3D pose from 2D pose sequences using a pose lifter model.

    Args:
        model (nn.Module): The loaded pose lifter model
        pose_results_2d (List[List[dict]]): The 2D pose sequences stored in a
            nested list. Each element of the outer list is the 2D pose results
            of a single frame, and each element of the inner list is the 2D
            pose of one person, which contains:
                - "keypoints" (ndarray[K, 2 or 3]): x, y, [score]
                - "track_id" (int)
        dataset (str): Dataset name, e.g. 'Body3DH36MDataset'
        with_track_id: If True, the element in pose_results_2d is expected to
            contain "track_id", which will be used to gather the pose sequence
            of a person from multiple frames. Otherwise, the pose results in
            each frame are expected to have a consistent number and order of
            identities. Default is True.
    Returns:
        List[dict]: 3D pose inference results. Each element is the result of
            an instance, which contains:
            - "keypoints_3d" (ndarray[K,3]): predicted 3D keypoints
            - "keypoints" (ndarray[K, 2 or 3]): from the last frame in
                ``pose_results_2d``.
            - "track_id" (int): from the last frame in ``pose_results_2d``.
            If there is no valid instance, an empty list will be returned.
    """
    cfg = model.cfg
    test_pipeline = Compose(cfg.test_pipeline)

    flip_pairs = None
    if dataset == 'Body3DH36MDataset':
        flip_pairs = [[1, 4], [2, 5], [3, 6], [11, 14], [12, 15], [13, 16]]
    else:
        raise NotImplementedError()

    pose_sequences_2d = _collate_pose_sequence(pose_results_2d, with_track_id)

    if not pose_sequences_2d:
        return []

    batch_data = []
    for seq in pose_sequences_2d:
        pose_2d = seq['keypoints'].astype(np.float32)
        T, K, C = pose_2d.shape

        input_2d = pose_2d[..., :2]
        if C > 2:
            input_2d_visible = pose_2d[..., 2:3]
        else:
            input_2d_visible = np.ones((T, K, 1), dtype=np.float32)

        # Dummy 3D input
        # This is for compatibility with configs in mmpose<=v0.14.0, where a
        # 3D input is required to generate denormalization parameters. This
        # part will be removed in the future.
        target = np.zeros((K, 3), dtype=np.float32)
        target_visible = np.ones((K, 1), dtype=np.float32)

        # Dummy image path
        # This is for compatibility with configs in mmpose<=v0.14.0, where
        # target_image_path is required. This part will be removed in the
        # future.
        target_image_path = None

        data = {
            'input_2d': input_2d,
            'input_2d_visible': input_2d_visible,
            'target': target,
            'target_visible': target_visible,
            'target_image_path': target_image_path,
            'ann_info': {
                'num_joints': K,
                'flip_pairs': flip_pairs
            }
        }

        data = test_pipeline(data)
        batch_data.append(data)

    batch_data = collate(batch_data, samples_per_gpu=len(batch_data))
    if next(model.parameters()).is_cuda:
        device = next(model.parameters()).device
        batch_data = scatter(batch_data, target_gpus=[device.index])[0]
    else:
        batch_data = scatter(batch_data, target_gpus=[-1])[0]

    with torch.no_grad():
        result = model(input=batch_data['input'],
                       metas=batch_data['metas'],
                       return_loss=False)

    poses_3d = result['preds']
    if poses_3d.shape[-1] != 4:
        assert poses_3d.shape[-1] == 3
        dummy_score = np.ones(poses_3d.shape[:-1] + (1, ),
                              dtype=poses_3d.dtype)
        poses_3d = np.concatenate((poses_3d, dummy_score), axis=-1)
    pose_results = []
    for pose_2d, pose_3d in zip(pose_sequences_2d, poses_3d):
        pose_result = pose_2d.copy()
        pose_result['keypoints_3d'] = pose_3d
        pose_results.append(pose_result)

    return pose_results
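A hedged call sketch, assuming `model` is a loaded pose lifter and the 2D results come from an upstream 2D pose estimator (values below are illustrative):

import numpy as np

# One frame containing one tracked person; keypoints are (K, 3): x, y, score.
pose_results_2d = [[{
    'keypoints': np.random.rand(17, 3).astype(np.float32),
    'track_id': 0,
}]]
# res = inference_pose_lifter_model(model, pose_results_2d,
#                                   dataset='Body3DH36MDataset')
# res[0]['keypoints_3d'] has shape (17, 4): x, y, z plus a dummy score.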
Example #4
def inference_bottom_up_pose_model(model,
                                   img_or_path,
                                   return_heatmap=False,
                                   outputs=None):
    """Inference a single image.

    num_people: P
    num_keypoints: K
    bbox height: H
    bbox width: W

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str | np.ndarray): Image filename or loaded image.
        return_heatmap (bool) : Flag to return heatmap, default: False
        outputs (list(str) | tuple(str)) : Names of layers whose outputs
            need to be returned, default: None

    Returns:
        list[ndarray]: The predicted pose info.
            The length of the list is the number of people (P).
            Each item in the list is a ndarray, containing each person's
            pose (ndarray[Kx3]): x, y, score.
        list[dict[np.ndarray[N, K, H, W] | torch.tensor[N, K, H, W]]]:
            Output feature maps from layers specified in `outputs`.
            Includes 'heatmap' if `return_heatmap` is True.
    """
    pose_results = []
    returned_outputs = []

    cfg = model.cfg
    device = next(model.parameters()).device

    # build the data pipeline
    test_pipeline = [LoadImage()] + cfg.test_pipeline[1:]
    test_pipeline = Compose(test_pipeline)

    # prepare data
    data = {
        'img_or_path': img_or_path,
        'dataset': 'coco',
        'ann_info': {
            'image_size':
            cfg.data_cfg['image_size'],
            'num_joints':
            cfg.data_cfg['num_joints'],
            'flip_index':
            [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15],
        }
    }

    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)
    if next(model.parameters()).is_cuda:
        # scatter to specified GPU
        data = scatter(data, [device])[0]
    else:
        # just get the actual data from DataContainer
        data['img_metas'] = data['img_metas'].data[0]

    with OutputHook(model, outputs=outputs, as_tensor=False) as h:
        # forward the model
        with torch.no_grad():
            all_preds, _, _, heatmap = model(img=data['img'],
                                             img_metas=data['img_metas'],
                                             return_loss=False,
                                             return_heatmap=return_heatmap)

        if return_heatmap:
            h.layer_outputs['heatmap'] = heatmap

        returned_outputs.append(h.layer_outputs)

        for pred in all_preds:
            pose_results.append({
                'keypoints': pred[:, :3],
            })

    return pose_results, returned_outputs
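A hedged usage sketch, assuming `model` is a loaded bottom-up (multi-person) pose model built from a COCO-style config:

# poses, outputs = inference_bottom_up_pose_model(
#     model, 'demo.jpg', return_heatmap=True)
# len(poses) equals the number of detected people; each poses[i]['keypoints']
# is an ndarray of shape (K, 3) with x, y, score.
# outputs[0]['heatmap'] holds the heatmap when return_heatmap is True.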
Example #5
def test_joint_transforms():
    results = get_data_sample()

    mean = np.random.rand(16, 3).astype(np.float32)
    std = np.random.rand(16, 3).astype(np.float32) + 1e-6

    pipeline = [
        dict(type='RelativeJointRandomFlip',
             item='target',
             flip_cfg=dict(center_mode='root', center_index=0),
             visible_item='target_visible',
             flip_prob=1.,
             flip_camera=True),
        dict(type='GetRootCenteredPose',
             item='target',
             root_index=0,
             root_name='global_position',
             remove_root=True),
        dict(type='NormalizeJointCoordinate',
             item='target',
             mean=mean,
             std=std),
        dict(type='PoseSequenceToTensor', item='target'),
        dict(type='ImageCoordinateNormalization',
             item='input_2d',
             norm_camera=True),
        dict(type='CollectCameraIntrinsics'),
        dict(type='Collect',
             keys=[('input_2d', 'input'), ('target', 'output'), 'flip_pairs',
                   'intrinsics'],
             meta_name='metas',
             meta_keys=['camera_param'])
    ]

    pipeline = Compose(pipeline)
    output = pipeline(copy.deepcopy(results))

    # test transformation of target
    joints_0 = results['target']
    joints_1 = output['output'].numpy()
    # manually do transformations
    flip_pairs = output['flip_pairs']
    _joints_0_flipped = joints_0.copy()
    for _l, _r in flip_pairs:
        _joints_0_flipped[..., _l, :] = joints_0[..., _r, :]
        _joints_0_flipped[..., _r, :] = joints_0[..., _l, :]
    _joints_0_flipped[..., 0] = (
        2 * joints_0[..., 0:1, 0] - _joints_0_flipped[..., 0])
    joints_0 = _joints_0_flipped
    joints_0 = (joints_0[..., 1:, :] - joints_0[..., 0:1, :] - mean) / std
    # convert to [K*C, T]
    joints_0 = joints_0.reshape(-1)[..., None]
    np.testing.assert_array_almost_equal(joints_0, joints_1)

    # test transformation of input
    joints_0 = results['input_2d']
    joints_1 = output['input']
    # manually do transformations
    center = np.array(
        [0.5 * results['image_width'], 0.5 * results['image_height']],
        dtype=np.float32)
    scale = np.array(0.5 * results['image_width'], dtype=np.float32)
    joints_0 = (joints_0 - center) / scale
    np.testing.assert_array_almost_equal(joints_0, joints_1)

    # test transformation of camera parameters
    camera_param_0 = results['camera_param']
    camera_param_1 = output['metas'].data['camera_param']
    # manually flip and normalization
    camera_param_0['c'][0] *= -1
    camera_param_0['p'][0] *= -1
    camera_param_0['c'] = (camera_param_0['c'] -
                           np.array(center)[:, None]) / scale
    camera_param_0['f'] = camera_param_0['f'] / scale
    np.testing.assert_array_almost_equal(camera_param_0['c'],
                                         camera_param_1['c'])
    np.testing.assert_array_almost_equal(camera_param_0['f'],
                                         camera_param_1['f'])

    # test CollectCameraIntrinsics
    intrinsics_0 = np.concatenate([
        results['camera_param']['f'].reshape(2),
        results['camera_param']['c'].reshape(2),
        results['camera_param']['k'].reshape(3),
        results['camera_param']['p'].reshape(2)
    ])
    intrinsics_1 = output['intrinsics']
    np.testing.assert_array_almost_equal(intrinsics_0, intrinsics_1)

    # test load mean/std from file
    with tempfile.TemporaryDirectory() as tmpdir:
        norm_param = {'mean': mean, 'std': std}
        norm_param_file = osp.join(tmpdir, 'norm_param.pkl')
        mmcv.dump(norm_param, norm_param_file)

        pipeline = [
            dict(type='NormalizeJointCoordinate',
                 item='target',
                 norm_param_file=norm_param_file),
        ]
        pipeline = Compose(pipeline)
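The excerpt stops after building the file-based pipeline; a hedged continuation sketch of how it could be exercised, mirroring the earlier in-memory check (not necessarily the original test body):

        # output = pipeline(copy.deepcopy(results))
        # np.testing.assert_array_almost_equal(
        #     output['target'], (results['target'] - mean) / std)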
Example #6
def inference_interhand_3d_model(model,
                                 img_or_path,
                                 det_results,
                                 bbox_thr=None,
                                 format='xywh',
                                 dataset='InterHand3DDataset'):
    """Inference a single image with a list of hand bounding boxes.

    num_bboxes: N
    num_keypoints: K

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str | np.ndarray): Image filename or loaded image.
        det_results (List[dict]): The 2D bbox sequences stored in a list.
            Each element of the list is the bbox of one person, which
            contains:
                - "bbox" (ndarray[4 or 5]): The person bounding box,
                which contains 4 box coordinates (and score).
        dataset (str): Dataset name.
        format: bbox format ('xyxy' | 'xywh'). Default: 'xywh'.
            'xyxy' means (left, top, right, bottom),
            'xywh' means (left, top, width, height).

    Returns:
        List[dict]: 3D pose inference results. Each element is the result of
            an instance, which contains:
            - "keypoints_3d" (ndarray[K,3]): predicted 3D keypoints
            If there is no valid instance, an empty list will be returned.
    """

    assert format in ['xyxy', 'xywh']

    pose_results = []

    if len(det_results) == 0:
        return pose_results

    # Change for-loop preprocess each bbox to preprocess all bboxes at once.
    bboxes = np.array([box['bbox'] for box in det_results])

    # Select bboxes by score threshold
    if bbox_thr is not None:
        assert bboxes.shape[1] == 5
        valid_idx = np.where(bboxes[:, 4] > bbox_thr)[0]
        bboxes = bboxes[valid_idx]
        det_results = [det_results[i] for i in valid_idx]

    if format == 'xyxy':
        bboxes_xyxy = bboxes
        bboxes_xywh = _xyxy2xywh(bboxes)
    else:
        # format is already 'xywh'
        bboxes_xywh = bboxes
        bboxes_xyxy = _xywh2xyxy(bboxes)

    # return early if bbox_thr filtered out all bounding boxes
    if len(bboxes_xywh) == 0:
        return []

    cfg = model.cfg
    device = next(model.parameters()).device

    # build the data pipeline
    channel_order = cfg.test_pipeline[0].get('channel_order', 'rgb')
    test_pipeline = [LoadImage(channel_order=channel_order)
                     ] + cfg.test_pipeline[1:]
    test_pipeline = Compose(test_pipeline)

    assert len(bboxes[0]) in [4, 5]

    if dataset == 'InterHand3DDataset':
        flip_pairs = [[i, 21 + i] for i in range(21)]
    else:
        raise NotImplementedError()

    batch_data = []
    for bbox in bboxes:
        center, scale = _box2cs(cfg, bbox)

        # prepare data
        data = {
            'img_or_path':
            img_or_path,
            'center':
            center,
            'scale':
            scale,
            'bbox_score':
            bbox[4] if len(bbox) == 5 else 1,
            'bbox_id':
            0,  # need to be assigned if batch_size > 1
            'dataset':
            dataset,
            'joints_3d':
            np.zeros((cfg.data_cfg.num_joints, 3), dtype=np.float32),
            'joints_3d_visible':
            np.zeros((cfg.data_cfg.num_joints, 3), dtype=np.float32),
            'rotation':
            0,
            'ann_info': {
                'image_size': np.array(cfg.data_cfg['image_size']),
                'num_joints': cfg.data_cfg['num_joints'],
                'flip_pairs': flip_pairs,
                'heatmap3d_depth_bound': cfg.data_cfg['heatmap3d_depth_bound'],
                'heatmap_size_root': cfg.data_cfg['heatmap_size_root'],
                'root_depth_bound': cfg.data_cfg['root_depth_bound']
            }
        }

        data = test_pipeline(data)
        batch_data.append(data)

    batch_data = collate(batch_data, samples_per_gpu=1)

    if next(model.parameters()).is_cuda:
        # scatter does not work here; just move the image to the cuda device
        batch_data['img'] = batch_data['img'].to(device)
    # get all img_metas of each bounding box
    batch_data['img_metas'] = [
        img_metas[0] for img_metas in batch_data['img_metas'].data
    ]

    # forward the model
    with torch.no_grad():
        result = model(img=batch_data['img'],
                       img_metas=batch_data['img_metas'],
                       return_loss=False)

    poses_3d = result['preds']
    rel_root_depth = result['rel_root_depth']
    hand_type = result['hand_type']
    if poses_3d.shape[-1] != 4:
        assert poses_3d.shape[-1] == 3
        dummy_score = np.ones(poses_3d.shape[:-1] + (1, ),
                              dtype=poses_3d.dtype)
        poses_3d = np.concatenate((poses_3d, dummy_score), axis=-1)

    # add relative root depth to left hand joints
    poses_3d[:, 21:, 2] += rel_root_depth

    # set joint scores according to hand type
    poses_3d[:, :21, 3] *= hand_type[:, [0]]
    poses_3d[:, 21:, 3] *= hand_type[:, [1]]

    pose_results = []
    for pose_3d, person_res, bbox_xyxy in zip(poses_3d, det_results,
                                              bboxes_xyxy):
        pose_res = person_res.copy()
        pose_res['keypoints_3d'] = pose_3d
        pose_res['bbox'] = bbox_xyxy
        pose_results.append(pose_res)

    return pose_results
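A hedged usage sketch, assuming `model` is a loaded interhand 3D model and the hand detector reports bboxes in 'xywh' format with scores:

import numpy as np

det_results = [{'bbox': np.array([100., 120., 80., 80., 0.9])}]
# res = inference_interhand_3d_model(model, 'hands.jpg', det_results,
#                                    bbox_thr=0.5, format='xywh')
# res[0]['keypoints_3d'] has shape (42, 4): 21 right-hand + 21 left-hand
# joints, each with x, y, z and a hand-type weighted score.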
Example #7
def inference_bottom_up_pose_model(model,
                                   img_or_path,
                                   pose_nms_thr=0.9,
                                   return_heatmap=False,
                                   outputs=None):
    """Inference a single image.

    num_people: P
    num_keypoints: K
    bbox height: H
    bbox width: W

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str| np.ndarray): Image filename or loaded image.
        pose_nms_thr (float): Retain poses whose OKS overlap is below
            pose_nms_thr. Default: 0.9.
        return_heatmap (bool) : Flag to return heatmap, default: False.
        outputs (list(str) | tuple(str)) : Names of layers whose outputs
            need to be returned, default: None.

    Returns:
        list[ndarray]: The predicted pose info.
            The length of the list is the number of people (P).
            Each item in the list is a ndarray, containing each person's
            pose (ndarray[Kx3]): x, y, score.
        list[dict[np.ndarray[N, K, H, W] | torch.tensor[N, K, H, W]]]:
            Output feature maps from layers specified in `outputs`.
            Includes 'heatmap' if `return_heatmap` is True.
    """
    pose_results = []
    returned_outputs = []

    cfg = model.cfg
    device = next(model.parameters()).device

    # build the data pipeline
    channel_order = cfg.test_pipeline[0].get('channel_order', 'rgb')
    test_pipeline = [LoadImage(channel_order=channel_order)
                     ] + cfg.test_pipeline[1:]
    test_pipeline = Compose(test_pipeline)

    # prepare data
    data = {
        'img_or_path': img_or_path,
        'dataset': 'coco',
        'ann_info': {
            'image_size':
            cfg.data_cfg['image_size'],
            'num_joints':
            cfg.data_cfg['num_joints'],
            'flip_index':
            [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15],
        }
    }

    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)
    if next(model.parameters()).is_cuda:
        # scatter to specified GPU
        data = scatter(data, [device])[0]
    else:
        # just get the actual data from DataContainer
        data['img_metas'] = data['img_metas'].data[0]

    with OutputHook(model, outputs=outputs, as_tensor=False) as h:
        # forward the model
        with torch.no_grad():
            result = model(
                img=data['img'],
                img_metas=data['img_metas'],
                return_loss=False,
                return_heatmap=return_heatmap)

        if return_heatmap:
            h.layer_outputs['heatmap'] = result['output_heatmap']

        returned_outputs.append(h.layer_outputs)

        for idx, pred in enumerate(result['preds']):
            area = (np.max(pred[:, 0]) - np.min(pred[:, 0])) * (
                np.max(pred[:, 1]) - np.min(pred[:, 1]))
            pose_results.append({
                'keypoints': pred[:, :3],
                'score': result['scores'][idx],
                'area': area,
            })

        # pose nms
        keep = oks_nms(pose_results, pose_nms_thr, sigmas=None)
        pose_results = [pose_results[_keep] for _keep in keep]

    return pose_results, returned_outputs
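Compared with the earlier bottom-up variant, this one also attaches a 'score' and 'area' to each pose and filters the results with oks_nms; a hedged call sketch:

# poses, outputs = inference_bottom_up_pose_model(
#     model, 'demo.jpg', pose_nms_thr=0.9, return_heatmap=False)
# Each surviving poses[i] carries 'keypoints', 'score' and 'area'.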
Example #8
def process_model(model, dataset, person_results, img_or_path):
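    """Run a top-down pose model on the given person bounding boxes.

    Args:
        model (nn.Module): The loaded pose model.
        dataset (str): Dataset name, used to select the flip pairs.
        person_results (list[dict]): Each dict contains a "bbox"
            (ndarray[4 or 5]): (left, top, width, height, [score]).
        img_or_path (str | np.ndarray): Image filename or loaded image.

    Returns:
        tuple: Predicted poses and the model's output heatmap (the latter is
            typically None here because return_heatmap is False below).
    """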
    bboxes = np.array([box['bbox'] for box in person_results])
    cfg = model.cfg
    flip_pairs = None
    device = next(model.parameters()).device
    channel_order = cfg.test_pipeline[0].get('channel_order', 'rgb')
    test_pipeline = [LoadImage(channel_order=channel_order)
                     ] + cfg.test_pipeline[1:]
    test_pipeline = Compose(test_pipeline)
    if dataset in ('TopDownCocoDataset', 'TopDownOCHumanDataset',
                   'AnimalMacaqueDataset'):
        flip_pairs = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12],
                      [13, 14], [15, 16]]
    elif dataset == 'TopDownCocoWholeBodyDataset':
        body = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14],
                [15, 16]]
        foot = [[17, 20], [18, 21], [19, 22]]

        face = [[23, 39], [24, 38], [25, 37], [26, 36], [27, 35], [28, 34],
                [29, 33], [30, 32], [40, 49], [41, 48], [42, 47], [43, 46],
                [44, 45], [54, 58], [55, 57], [59, 68], [60, 67], [61, 66],
                [62, 65], [63, 70], [64, 69], [71, 77], [72, 76], [73, 75],
                [78, 82], [79, 81], [83, 87], [84, 86], [88, 90]]

        hand = [[91, 112], [92, 113], [93, 114], [94, 115], [95, 116],
                [96, 117], [97, 118], [98, 119], [99, 120], [100, 121],
                [101, 122], [102, 123], [103, 124], [104, 125], [105, 126],
                [106, 127], [107, 128], [108, 129], [109, 130], [110, 131],
                [111, 132]]
        flip_pairs = body + foot + face + hand
    elif dataset == 'TopDownAicDataset':
        flip_pairs = [[0, 3], [1, 4], [2, 5], [6, 9], [7, 10], [8, 11]]
    elif dataset == 'TopDownMpiiDataset':
        flip_pairs = [[0, 5], [1, 4], [2, 3], [10, 15], [11, 14], [12, 13]]
    elif dataset == 'TopDownMpiiTrbDataset':
        flip_pairs = [[0, 1], [2, 3], [4, 5], [6, 7],
                      [8, 9], [10, 11], [14, 15], [16, 22], [28, 34], [17, 23],
                      [29, 35], [18, 24], [30, 36], [19, 25], [31,
                                                               37], [20, 26],
                      [32, 38], [21, 27], [33, 39]]
    elif dataset in ('OneHand10KDataset', 'FreiHandDataset', 'PanopticDataset',
                     'InterHand2DDataset'):
        flip_pairs = []
    elif dataset == 'Face300WDataset':
        flip_pairs = [[0, 16], [1, 15], [2, 14], [3, 13], [4, 12], [5, 11],
                      [6, 10], [7, 9], [17, 26], [18, 25], [19, 24], [20, 23],
                      [21, 22], [31, 35], [32, 34], [36, 45], [37,
                                                               44], [38, 43],
                      [39, 42], [40, 47], [41, 46], [48, 54], [49,
                                                               53], [50, 52],
                      [61, 63], [60, 64], [67, 65], [58, 56], [59, 55]]
    elif dataset == 'FaceAFLWDataset':
        flip_pairs = [[0, 5], [1, 4], [2, 3], [6, 11], [7, 10], [8, 9],
                      [12, 14], [15, 17]]

    elif dataset == 'FaceCOFWDataset':
        flip_pairs = [[0, 1], [4, 6], [2, 3], [5, 7], [8, 9], [10, 11],
                      [12, 14], [16, 17], [13, 15], [18, 19], [22, 23]]

    elif dataset == 'FaceWFLWDataset':
        flip_pairs = [[0, 32], [1, 31], [2, 30], [3, 29], [4, 28], [5, 27],
                      [6, 26], [7, 25], [8, 24], [9, 23], [10, 22], [11, 21],
                      [12, 20], [13, 19], [14, 18], [15, 17], [33,
                                                               46], [34, 45],
                      [35, 44], [36, 43], [37, 42], [38, 50], [39,
                                                               49], [40, 48],
                      [41, 47], [60, 72], [61, 71], [62, 70], [63,
                                                               69], [64, 68],
                      [65, 75], [66, 74], [67, 73], [55, 59], [56,
                                                               58], [76, 82],
                      [77, 81], [78, 80], [87, 83], [86, 84], [88, 92],
                      [89, 91], [95, 93], [96, 97]]

    elif dataset == 'AnimalFlyDataset':
        flip_pairs = [[1, 2], [6, 18], [7, 19], [8, 20], [9, 21], [10, 22],
                      [11, 23], [12, 24], [13, 25], [14, 26], [15, 27],
                      [16, 28], [17, 29], [30, 31]]
    elif dataset == 'AnimalHorse10Dataset':
        flip_pairs = []

    elif dataset == 'AnimalLocustDataset':
        flip_pairs = [[5, 20], [6, 21], [7, 22], [8, 23], [9, 24], [10, 25],
                      [11, 26], [12, 27], [13, 28], [14, 29], [15, 30],
                      [16, 31], [17, 32], [18, 33], [19, 34]]

    elif dataset == 'AnimalZebraDataset':
        flip_pairs = [[3, 4], [5, 6]]

    elif dataset == 'AnimalPoseDataset':
        flip_pairs = [[0, 1], [2, 3], [8, 9], [10, 11], [12, 13], [14, 15],
                      [16, 17], [18, 19]]
    else:
        raise NotImplementedError()
    batch_data = []
    for bbox in bboxes:
        center, scale = _box2cs(cfg, bbox)
        # prepare data
        data = {
            'img_or_path':
            img_or_path,
            'center':
            center,
            'scale':
            scale,
            'bbox_score':
            bbox[4] if len(bbox) == 5 else 1,
            'bbox_id':
            0,  # need to be assigned if batch_size > 1
            'dataset':
            dataset,
            'joints_3d':
            np.zeros((cfg.data_cfg.num_joints, 3), dtype=np.float32),
            'joints_3d_visible':
            np.zeros((cfg.data_cfg.num_joints, 3), dtype=np.float32),
            'rotation':
            0,
            'ann_info': {
                'image_size': np.array(cfg.data_cfg['image_size']),
                'num_joints': cfg.data_cfg['num_joints'],
                'flip_pairs': flip_pairs
            }
        }
        data = test_pipeline(data)
        batch_data.append(data)
    batch_data = collate(batch_data, samples_per_gpu=1)
    if next(model.parameters()).is_cuda:
        # scatter does not work here; just move the image to the cuda device
        batch_data['img'] = batch_data['img'].to(device)
    # get all img_metas of each bounding box
    batch_data['img_metas'] = [
        img_metas[0] for img_metas in batch_data['img_metas'].data
    ]
    with tr.no_grad():
        result = model(
            img=batch_data['img'],
            img_metas=batch_data['img_metas'],
            return_loss=False,
            return_heatmap=False)
    return result['preds'], result['output_heatmap']
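A hedged usage sketch for process_model, assuming `model` is a loaded top-down pose model and the detector output follows the same 'bbox' convention as the other examples:

import numpy as np

person_results = [{'bbox': np.array([50., 80., 120., 240., 0.97])}]
# preds, heatmap = process_model(model, 'TopDownCocoDataset',
#                                person_results, 'demo.jpg')
# preds has shape (N, K, 3); heatmap is likely None since return_heatmap
# is hard-coded to False above.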
Example #9
def inference_mesh_model(model,
                         img_or_path,
                         det_results,
                         bbox_thr=None,
                         format='xywh',
                         dataset='MeshH36MDataset'):
    """Inference a single image with a list of bounding boxes.

    Note:
        - num_bboxes: N
        - num_keypoints: K
        - num_vertices: V
        - num_faces: F

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str | np.ndarray): Image filename or loaded image.
        det_results (list[dict]): The 2D bbox sequences stored in a list.
            Each element of the list is the bbox of one person.
            "bbox" (ndarray[4 or 5]): The person bounding box,
            which contains 4 box coordinates (and score).
        bbox_thr (float | None): Threshold for bounding boxes.
            Only bboxes with higher scores will be fed into the pose
            detector. If bbox_thr is None, all boxes will be used.
        format (str): bbox format ('xyxy' | 'xywh'). Default: 'xywh'.

            - 'xyxy' means (left, top, right, bottom),
            - 'xywh' means (left, top, width, height).
        dataset (str): Dataset name.

    Returns:
        list[dict]: 3D pose inference results. Each element \
            is the result of an instance, which contains:

            - 'bbox' (ndarray[4]): instance bounding bbox
            - 'center' (ndarray[2]): bbox center
            - 'scale' (ndarray[2]): bbox scale
            - 'keypoints_3d' (ndarray[K,3]): predicted 3D keypoints
            - 'camera' (ndarray[3]): camera parameters
            - 'vertices' (ndarray[V, 3]): predicted 3D vertices
            - 'faces' (ndarray[F, 3]): mesh faces

            If there is no valid instance, an empty list
            will be returned.
    """

    assert format in ['xyxy', 'xywh']

    pose_results = []

    if len(det_results) == 0:
        return pose_results

    # Change for-loop preprocess each bbox to preprocess all bboxes at once.
    bboxes = np.array([box['bbox'] for box in det_results])

    # Select bboxes by score threshold
    if bbox_thr is not None:
        assert bboxes.shape[1] == 5
        valid_idx = np.where(bboxes[:, 4] > bbox_thr)[0]
        bboxes = bboxes[valid_idx]
        det_results = [det_results[i] for i in valid_idx]

    if format == 'xyxy':
        bboxes_xyxy = bboxes
        bboxes_xywh = _xyxy2xywh(bboxes)
    else:
        # format is already 'xywh'
        bboxes_xywh = bboxes
        bboxes_xyxy = _xywh2xyxy(bboxes)

    # return early if bbox_thr filtered out all bounding boxes
    if len(bboxes_xywh) == 0:
        return []

    cfg = model.cfg
    device = next(model.parameters()).device

    # build the data pipeline
    channel_order = cfg.test_pipeline[0].get('channel_order', 'rgb')
    test_pipeline = [LoadImage(channel_order=channel_order)
                     ] + cfg.test_pipeline[1:]
    test_pipeline = Compose(test_pipeline)

    assert len(bboxes[0]) in [4, 5]

    if dataset == 'MeshH36MDataset':
        flip_pairs = [[0, 5], [1, 4], [2, 3], [6, 11], [7, 10], [8, 9],
                      [20, 21], [22, 23]]
    else:
        raise NotImplementedError()

    batch_data = []
    for bbox in bboxes:
        center, scale = _box2cs(cfg, bbox)

        # prepare data
        data = {
            'img_or_path':
            img_or_path,
            'center':
            center,
            'scale':
            scale,
            'rotation':
            0,
            'bbox_score':
            bbox[4] if len(bbox) == 5 else 1,
            'dataset':
            dataset,
            'joints_2d':
            np.zeros((cfg.data_cfg.num_joints, 2), dtype=np.float32),
            'joints_2d_visible':
            np.zeros((cfg.data_cfg.num_joints, 1), dtype=np.float32),
            'joints_3d':
            np.zeros((cfg.data_cfg.num_joints, 3), dtype=np.float32),
            'joints_3d_visible':
            np.zeros((cfg.data_cfg.num_joints, 3), dtype=np.float32),
            'pose':
            np.zeros(72, dtype=np.float32),
            'beta':
            np.zeros(10, dtype=np.float32),
            'has_smpl':
            0,
            'ann_info': {
                'image_size': np.array(cfg.data_cfg['image_size']),
                'num_joints': cfg.data_cfg['num_joints'],
                'flip_pairs': flip_pairs,
            }
        }

        data = test_pipeline(data)
        batch_data.append(data)

    batch_data = collate(batch_data, samples_per_gpu=1)

    if next(model.parameters()).is_cuda:
        # scatter does not work here; just move the image to the cuda device
        batch_data['img'] = batch_data['img'].to(device)
    # get all img_metas of each bounding box
    batch_data['img_metas'] = [
        img_metas[0] for img_metas in batch_data['img_metas'].data
    ]

    # forward the model
    with torch.no_grad():
        preds = model(
            img=batch_data['img'],
            img_metas=batch_data['img_metas'],
            return_loss=False,
            return_vertices=True,
            return_faces=True)

    for idx in range(len(det_results)):
        pose_res = det_results[idx].copy()
        pose_res['bbox'] = bboxes_xyxy[idx]
        pose_res['center'] = batch_data['img_metas'][idx]['center']
        pose_res['scale'] = batch_data['img_metas'][idx]['scale']
        pose_res['keypoints_3d'] = preds['keypoints_3d'][idx]
        pose_res['camera'] = preds['camera'][idx]
        pose_res['vertices'] = preds['vertices'][idx]
        pose_res['faces'] = preds['faces']
        pose_results.append(pose_res)
    return pose_results
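A hedged usage sketch, assuming `model` is a loaded human mesh recovery model configured for MeshH36MDataset:

import numpy as np

det_results = [{'bbox': np.array([40., 30., 200., 400., 0.95])}]
# res = inference_mesh_model(model, 'person.jpg', det_results,
#                            bbox_thr=0.5, format='xywh',
#                            dataset='MeshH36MDataset')
# res[0]['vertices'] has shape (V, 3) and res[0]['faces'] shape (F, 3).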
Example #10
def _inference_single_pose_model(model,
                                 img_or_path,
                                 bboxes,
                                 dataset,
                                 return_heatmap=False):
    """Inference a single bbox.

    num_bboxes: N
    num_keypoints: K

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str | np.ndarray): Image filename or loaded image.
        bboxes (list | np.ndarray): All bounding boxes (with scores),
            shaped (N, 4) or (N, 5). (left, top, width, height, [score])
            where N is number of bounding boxes.
        dataset (str): Dataset name.
        return_heatmap (bool): Flag to return heatmap, default: False.

    Returns:
        ndarray[NxKx3]: Predicted pose x, y, score.
        heatmap[N, K, H, W]: Model output heatmap.
    """

    cfg = model.cfg
    device = next(model.parameters()).device

    # build the data pipeline
    channel_order = cfg.test_pipeline[0].get('channel_order', 'rgb')
    test_pipeline = [LoadImage(channel_order=channel_order)
                     ] + cfg.test_pipeline[1:]
    test_pipeline = Compose(test_pipeline)

    assert len(bboxes[0]) in [4, 5]

    flip_pairs = None
    if dataset in ('TopDownCocoDataset', 'TopDownOCHumanDataset',
                   'AnimalMacaqueDataset'):
        flip_pairs = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12],
                      [13, 14], [15, 16]]
    elif dataset == 'TopDownCocoWholeBodyDataset':
        body = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14],
                [15, 16]]
        foot = [[17, 20], [18, 21], [19, 22]]

        face = [[23, 39], [24, 38], [25, 37], [26, 36], [27, 35], [28, 34],
                [29, 33], [30, 32], [40, 49], [41, 48], [42, 47], [43, 46],
                [44, 45], [54, 58], [55, 57], [59, 68], [60, 67], [61, 66],
                [62, 65], [63, 70], [64, 69], [71, 77], [72, 76], [73, 75],
                [78, 82], [79, 81], [83, 87], [84, 86], [88, 90]]

        hand = [[91, 112], [92, 113], [93, 114], [94, 115], [95, 116],
                [96, 117], [97, 118], [98, 119], [99, 120], [100, 121],
                [101, 122], [102, 123], [103, 124], [104, 125], [105, 126],
                [106, 127], [107, 128], [108, 129], [109, 130], [110, 131],
                [111, 132]]
        flip_pairs = body + foot + face + hand
    elif dataset == 'TopDownAicDataset':
        flip_pairs = [[0, 3], [1, 4], [2, 5], [6, 9], [7, 10], [8, 11]]
    elif dataset == 'TopDownMpiiDataset':
        flip_pairs = [[0, 5], [1, 4], [2, 3], [10, 15], [11, 14], [12, 13]]
    elif dataset == 'TopDownMpiiTrbDataset':
        flip_pairs = [[0, 1], [2, 3], [4, 5], [6, 7],
                      [8, 9], [10, 11], [14, 15], [16, 22], [28, 34], [17, 23],
                      [29, 35], [18, 24], [30, 36], [19, 25], [31,
                                                               37], [20, 26],
                      [32, 38], [21, 27], [33, 39]]
    elif dataset in ('OneHand10KDataset', 'FreiHandDataset', 'PanopticDataset',
                     'InterHand2DDataset'):
        flip_pairs = []
    elif dataset == 'Face300WDataset':
        flip_pairs = [[0, 16], [1, 15], [2, 14], [3, 13], [4, 12], [5, 11],
                      [6, 10], [7, 9], [17, 26], [18, 25], [19, 24], [20, 23],
                      [21, 22], [31, 35], [32, 34], [36, 45], [37,
                                                               44], [38, 43],
                      [39, 42], [40, 47], [41, 46], [48, 54], [49,
                                                               53], [50, 52],
                      [61, 63], [60, 64], [67, 65], [58, 56], [59, 55]]

    elif dataset == 'FaceAFLWDataset':
        flip_pairs = [[0, 5], [1, 4], [2, 3], [6, 11], [7, 10], [8, 9],
                      [12, 14], [15, 17]]

    elif dataset == 'FaceCOFWDataset':
        flip_pairs = [[0, 1], [4, 6], [2, 3], [5, 7], [8, 9], [10, 11],
                      [12, 14], [16, 17], [13, 15], [18, 19], [22, 23]]

    elif dataset == 'FaceWFLWDataset':
        flip_pairs = [[0, 32], [1, 31], [2, 30], [3, 29], [4, 28], [5, 27],
                      [6, 26], [7, 25], [8, 24], [9, 23], [10, 22], [11, 21],
                      [12, 20], [13, 19], [14, 18], [15, 17], [33,
                                                               46], [34, 45],
                      [35, 44], [36, 43], [37, 42], [38, 50], [39,
                                                               49], [40, 48],
                      [41, 47], [60, 72], [61, 71], [62, 70], [63,
                                                               69], [64, 68],
                      [65, 75], [66, 74], [67, 73], [55, 59], [56,
                                                               58], [76, 82],
                      [77, 81], [78, 80], [87, 83], [86, 84], [88, 92],
                      [89, 91], [95, 93], [96, 97]]

    elif dataset == 'AnimalFlyDataset':
        flip_pairs = [[1, 2], [6, 18], [7, 19], [8, 20], [9, 21], [10, 22],
                      [11, 23], [12, 24], [13, 25], [14, 26], [15, 27],
                      [16, 28], [17, 29], [30, 31]]
    elif dataset == 'AnimalHorse10Dataset':
        flip_pairs = []

    elif dataset == 'AnimalLocustDataset':
        flip_pairs = [[5, 20], [6, 21], [7, 22], [8, 23], [9, 24], [10, 25],
                      [11, 26], [12, 27], [13, 28], [14, 29], [15, 30],
                      [16, 31], [17, 32], [18, 33], [19, 34]]

    elif dataset == 'AnimalZebraDataset':
        flip_pairs = [[3, 4], [5, 6]]

    elif dataset == 'AnimalPoseDataset':
        flip_pairs = [[0, 1], [2, 3], [8, 9], [10, 11], [12, 13], [14, 15],
                      [16, 17], [18, 19]]
    else:
        raise NotImplementedError()

    batch_data = []
    for bbox in bboxes:
        center, scale = _box2cs(cfg, bbox)

        # prepare data
        data = {
            'img_or_path':
            img_or_path,
            'center':
            center,
            'scale':
            scale,
            'bbox_score':
            bbox[4] if len(bbox) == 5 else 1,
            'bbox_id':
            0,  # need to be assigned if batch_size > 1
            'dataset':
            dataset,
            'joints_3d':
            np.zeros((cfg.data_cfg.num_joints, 3), dtype=np.float32),
            'joints_3d_visible':
            np.zeros((cfg.data_cfg.num_joints, 3), dtype=np.float32),
            'rotation':
            0,
            'ann_info': {
                'image_size': np.array(cfg.data_cfg['image_size']),
                'num_joints': cfg.data_cfg['num_joints'],
                'flip_pairs': flip_pairs
            }
        }
        data = test_pipeline(data)
        batch_data.append(data)

    batch_data = collate(batch_data, samples_per_gpu=1)

    if next(model.parameters()).is_cuda:
        # scatter does not work here; just move the image to the cuda device
        batch_data['img'] = batch_data['img'].to(device)
    # get all img_metas of each bounding box
    batch_data['img_metas'] = [
        img_metas[0] for img_metas in batch_data['img_metas'].data
    ]

    # forward the model
    with torch.no_grad():
        result = model(
            img=batch_data['img'],
            img_metas=batch_data['img_metas'],
            return_loss=False,
            return_heatmap=return_heatmap)

    return result['preds'], result['output_heatmap']
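A hedged usage sketch for this batched variant, assuming `model` is a loaded top-down pose model; it processes several bboxes of one image in a single call:

import numpy as np

# Bounding boxes shaped (N, 5): left, top, width, height, score.
bboxes = np.array([[50., 80., 120., 240., 0.98],
                   [200., 60., 110., 230., 0.88]])
# preds, heatmap = _inference_single_pose_model(
#     model, 'demo.jpg', bboxes, dataset='TopDownCocoDataset',
#     return_heatmap=False)
# preds has shape (N, K, 3).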
Example #11
def inference_pose_lifter_model(model,
                                pose_results_2d,
                                dataset=None,
                                dataset_info=None,
                                with_track_id=True,
                                image_size=None,
                                norm_pose_2d=False):
    """Inference 3D pose from 2D pose sequences using a pose lifter model.

    Args:
        model (nn.Module): The loaded pose lifter model
        pose_results_2d (list[list[dict]]): The 2D pose sequences stored in a
            nested list. Each element of the outer list is the 2D pose results
            of a single frame, and each element of the inner list is the 2D
            pose of one person, which contains:

            - "keypoints" (ndarray[K, 2 or 3]): x, y, [score]
            - "track_id" (int)
        dataset (str): Dataset name, e.g. 'Body3DH36MDataset'
        with_track_id: If True, the element in pose_results_2d is expected to
            contain "track_id", which will be used to gather the pose sequence
            of a person from multiple frames. Otherwise, the pose results in
            each frame are expected to have a consistent number and order of
            identities. Default is True.
        image_size (tuple|list): image width, image height. If None, image size
            will not be contained in dict ``data``.
        norm_pose_2d (bool): If True, scale the bbox (along with the 2D
            pose) to the average bbox scale of the dataset, and move the bbox
            (along with the 2D pose) to the average bbox center of the dataset.

    Returns:
        list[dict]: 3D pose inference results. Each element is the result of \
            an instance, which contains:

            - "keypoints_3d" (ndarray[K, 3]): predicted 3D keypoints
            - "keypoints" (ndarray[K, 2 or 3]): from the last frame in \
                ``pose_results_2d``.
            - "track_id" (int): from the last frame in ``pose_results_2d``. \
                If there is no valid instance, an empty list will be \
                returned.
    """
    cfg = model.cfg
    test_pipeline = Compose(cfg.test_pipeline)

    if dataset_info is not None:
        flip_pairs = dataset_info.flip_pairs
        assert 'stats_info' in dataset_info._dataset_info
        bbox_center = dataset_info._dataset_info['stats_info']['bbox_center']
        bbox_scale = dataset_info._dataset_info['stats_info']['bbox_scale']
    else:
        warnings.warn(
            'dataset is deprecated. '
            'Please set `dataset_info` in the config. '
            'Check https://github.com/open-mmlab/mmpose/pull/663 for details.',
            DeprecationWarning)
        # TODO: These will be removed in the later versions.
        if dataset == 'Body3DH36MDataset':
            flip_pairs = [[1, 4], [2, 5], [3, 6], [11, 14], [12, 15], [13, 16]]
            bbox_center = np.array([[528, 427]], dtype=np.float32)
            bbox_scale = 400
        else:
            raise NotImplementedError()

    target_idx = -1 if model.causal else len(pose_results_2d) // 2
    pose_lifter_inputs = _gather_pose_lifter_inputs(pose_results_2d,
                                                    bbox_center, bbox_scale,
                                                    norm_pose_2d)
    pose_sequences_2d = _collate_pose_sequence(pose_lifter_inputs,
                                               with_track_id, target_idx)

    if not pose_sequences_2d:
        return []

    batch_data = []
    for seq in pose_sequences_2d:
        pose_2d = seq['keypoints'].astype(np.float32)
        T, K, C = pose_2d.shape

        input_2d = pose_2d[..., :2]
        if C > 2:
            input_2d_visible = pose_2d[..., 2:3]
        else:
            input_2d_visible = np.ones((T, K, 1), dtype=np.float32)

        # TODO: Will be removed in the later versions
        # Dummy 3D input
        # This is for compatibility with configs in mmpose<=v0.14.0, where a
        # 3D input is required to generate denormalization parameters. This
        # part will be removed in the future.
        target = np.zeros((K, 3), dtype=np.float32)
        target_visible = np.ones((K, 1), dtype=np.float32)

        # Dummy image path
        # This is for compatibility with configs in mmpose<=v0.14.0, where
        # target_image_path is required. This part will be removed in the
        # future.
        target_image_path = None

        data = {
            'input_2d': input_2d,
            'input_2d_visible': input_2d_visible,
            'target': target,
            'target_visible': target_visible,
            'target_image_path': target_image_path,
            'ann_info': {
                'num_joints': K,
                'flip_pairs': flip_pairs
            }
        }

        if image_size is not None:
            assert len(image_size) == 2
            data['image_width'] = image_size[0]
            data['image_height'] = image_size[1]

        data = test_pipeline(data)
        batch_data.append(data)

    batch_data = collate(batch_data, samples_per_gpu=len(batch_data))
    if next(model.parameters()).is_cuda:
        device = next(model.parameters()).device
        batch_data = scatter(batch_data, target_gpus=[device.index])[0]
    else:
        batch_data = scatter(batch_data, target_gpus=[-1])[0]

    with torch.no_grad():
        result = model(
            input=batch_data['input'],
            metas=batch_data['metas'],
            return_loss=False)

    poses_3d = result['preds']
    if poses_3d.shape[-1] != 4:
        assert poses_3d.shape[-1] == 3
        dummy_score = np.ones(
            poses_3d.shape[:-1] + (1, ), dtype=poses_3d.dtype)
        poses_3d = np.concatenate((poses_3d, dummy_score), axis=-1)
    pose_results = []
    for pose_2d, pose_3d in zip(pose_sequences_2d, poses_3d):
        pose_result = pose_2d.copy()
        pose_result['keypoints_3d'] = pose_3d
        pose_results.append(pose_result)

    return pose_results
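This later variant of the pose lifter API additionally accepts `dataset_info`, optional bbox normalization and an image size; a hedged call sketch assuming a 2D top-down stage has already produced per-frame results and `dataset_info` was built from the model config:

# res = inference_pose_lifter_model(
#     model, pose_results_2d, dataset_info=dataset_info,
#     with_track_id=True, image_size=(1920, 1080), norm_pose_2d=True)
# Each res[i] keeps the 2D fields of the target frame and adds
# 'keypoints_3d' of shape (K, 4).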
Example #12
def _inference_single_pose_model(model,
                                 imgs_or_paths,
                                 bboxes,
                                 dataset='TopDownCocoDataset',
                                 dataset_info=None,
                                 return_heatmap=False,
                                 use_multi_frames=False):
    """Inference human bounding boxes.

    Note:
        - num_frames: F
        - num_bboxes: N
        - num_keypoints: K

    Args:
        model (nn.Module): The loaded pose model.
        imgs_or_paths (list(str) | list(np.ndarray)): Image filename(s) or
            loaded image(s)
        bboxes (list | np.ndarray): All bounding boxes (with scores),
            shaped (N, 4) or (N, 5). (left, top, width, height, [score])
            where N is number of bounding boxes.
        dataset (str): Dataset name. Deprecated.
        dataset_info (DatasetInfo): A class containing all dataset info.
        return_heatmap (bool): Flag to return heatmap, default: False
        use_multi_frames (bool): Flag to use multi frames for inference

    Returns:
        ndarray[NxKx3]: Predicted pose x, y, score.
        heatmap[N, K, H, W]: Model output heatmap.
    """

    cfg = model.cfg
    device = next(model.parameters()).device
    if device.type == 'cpu':
        device = -1

    if use_multi_frames:
        assert 'frame_weight_test' in cfg.data.test.data_cfg
        # use multi frames for inference
        # the number of input frames must equal the length of
        # frame_weight_test in the config
        assert len(imgs_or_paths) == len(
            cfg.data.test.data_cfg.frame_weight_test)

    # build the data pipeline
    _test_pipeline = copy.deepcopy(cfg.test_pipeline)

    has_bbox_xywh2cs = False
    for transform in _test_pipeline:
        if transform['type'] == 'TopDownGetBboxCenterScale':
            has_bbox_xywh2cs = True
            break
    if not has_bbox_xywh2cs:
        _test_pipeline.insert(
            0, dict(type='TopDownGetBboxCenterScale', padding=1.25))
    test_pipeline = Compose(_test_pipeline)
    _pipeline_gpu_speedup(test_pipeline, next(model.parameters()).device)

    assert len(bboxes[0]) in [4, 5]

    if dataset_info is not None:
        dataset_name = dataset_info.dataset_name
        flip_pairs = dataset_info.flip_pairs
    else:
        warnings.warn(
            'dataset is deprecated. '
            'Please set `dataset_info` in the config. '
            'Check https://github.com/open-mmlab/mmpose/pull/663 for details.',
            DeprecationWarning)
        # TODO: These will be removed in the later versions.
        if dataset in ('TopDownCocoDataset', 'TopDownOCHumanDataset',
                       'AnimalMacaqueDataset'):
            flip_pairs = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12],
                          [13, 14], [15, 16]]
        elif dataset == 'TopDownCocoWholeBodyDataset':
            body = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12],
                    [13, 14], [15, 16]]
            foot = [[17, 20], [18, 21], [19, 22]]

            face = [[23, 39], [24, 38], [25, 37], [26, 36], [27, 35], [28, 34],
                    [29, 33], [30, 32], [40, 49], [41, 48], [42, 47], [43, 46],
                    [44, 45], [54, 58], [55, 57], [59, 68], [60, 67], [61, 66],
                    [62, 65], [63, 70], [64, 69], [71, 77], [72, 76], [73, 75],
                    [78, 82], [79, 81], [83, 87], [84, 86], [88, 90]]

            hand = [[91, 112], [92, 113], [93, 114], [94, 115], [95, 116],
                    [96, 117], [97, 118], [98, 119], [99, 120], [100, 121],
                    [101, 122], [102, 123], [103, 124], [104, 125], [105, 126],
                    [106, 127], [107, 128], [108, 129], [109, 130], [110, 131],
                    [111, 132]]
            flip_pairs = body + foot + face + hand
        elif dataset == 'TopDownAicDataset':
            flip_pairs = [[0, 3], [1, 4], [2, 5], [6, 9], [7, 10], [8, 11]]
        elif dataset == 'TopDownMpiiDataset':
            flip_pairs = [[0, 5], [1, 4], [2, 3], [10, 15], [11, 14], [12, 13]]
        elif dataset == 'TopDownMpiiTrbDataset':
            flip_pairs = [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11],
                          [14, 15], [16, 22], [28, 34], [17, 23], [29, 35],
                          [18, 24], [30, 36], [19, 25], [31, 37], [20, 26],
                          [32, 38], [21, 27], [33, 39]]
        elif dataset in ('OneHand10KDataset', 'FreiHandDataset',
                         'PanopticDataset', 'InterHand2DDataset'):
            flip_pairs = []
        elif dataset == 'Face300WDataset':
            flip_pairs = [[0, 16], [1, 15], [2, 14], [3, 13], [4, 12], [5, 11],
                          [6, 10], [7, 9], [17, 26], [18, 25], [19, 24],
                          [20, 23], [21, 22], [31, 35], [32, 34], [36, 45],
                          [37, 44], [38, 43], [39, 42], [40, 47], [41, 46],
                          [48, 54], [49, 53], [50, 52], [61, 63], [60, 64],
                          [67, 65], [58, 56], [59, 55]]

        elif dataset == 'FaceAFLWDataset':
            flip_pairs = [[0, 5], [1, 4], [2, 3], [6, 11], [7, 10], [8, 9],
                          [12, 14], [15, 17]]

        elif dataset == 'FaceCOFWDataset':
            flip_pairs = [[0, 1], [4, 6], [2, 3], [5, 7], [8, 9], [10, 11],
                          [12, 14], [16, 17], [13, 15], [18, 19], [22, 23]]

        elif dataset == 'FaceWFLWDataset':
            flip_pairs = [[0, 32], [1, 31], [2, 30], [3, 29], [4, 28], [5, 27],
                          [6, 26], [7, 25], [8, 24], [9, 23], [10, 22],
                          [11, 21], [12, 20], [13, 19], [14, 18], [15, 17],
                          [33, 46], [34, 45], [35, 44], [36, 43], [37, 42],
                          [38, 50], [39, 49], [40, 48], [41, 47], [60, 72],
                          [61, 71], [62, 70], [63, 69], [64, 68], [65, 75],
                          [66, 74], [67, 73], [55, 59], [56, 58], [76, 82],
                          [77, 81], [78, 80], [87, 83], [86, 84], [88, 92],
                          [89, 91], [95, 93], [96, 97]]

        elif dataset == 'AnimalFlyDataset':
            flip_pairs = [[1, 2], [6, 18], [7, 19], [8, 20], [9, 21], [10, 22],
                          [11, 23], [12, 24], [13, 25], [14, 26], [15, 27],
                          [16, 28], [17, 29], [30, 31]]
        elif dataset == 'AnimalHorse10Dataset':
            flip_pairs = []

        elif dataset == 'AnimalLocustDataset':
            flip_pairs = [[5, 20], [6, 21], [7, 22], [8, 23], [9, 24],
                          [10, 25], [11, 26], [12, 27], [13, 28], [14, 29],
                          [15, 30], [16, 31], [17, 32], [18, 33], [19, 34]]

        elif dataset == 'AnimalZebraDataset':
            flip_pairs = [[3, 4], [5, 6]]

        elif dataset == 'AnimalPoseDataset':
            flip_pairs = [[0, 1], [2, 3], [8, 9], [10, 11], [12, 13], [14, 15],
                          [16, 17], [18, 19]]
        else:
            raise NotImplementedError()
        dataset_name = dataset

    batch_data = []
    for bbox in bboxes:
        # prepare data
        data = {
            'bbox': bbox,
            'bbox_score': bbox[4] if len(bbox) == 5 else 1,
            # bbox_id needs to be assigned if batch_size > 1
            'bbox_id': 0,
            'dataset': dataset_name,
            'joints_3d': np.zeros((cfg.data_cfg.num_joints, 3),
                                  dtype=np.float32),
            'joints_3d_visible': np.zeros((cfg.data_cfg.num_joints, 3),
                                          dtype=np.float32),
            'rotation': 0,
            'ann_info': {
                'image_size': np.array(cfg.data_cfg['image_size']),
                'num_joints': cfg.data_cfg['num_joints'],
                'flip_pairs': flip_pairs
            }
        }

        if use_multi_frames:
            # weight for different frames in multi-frame inference setting
            data['frame_weight'] = cfg.data.test.data_cfg.frame_weight_test
            if isinstance(imgs_or_paths[0], np.ndarray):
                data['img'] = imgs_or_paths
            else:
                data['image_file'] = imgs_or_paths
        else:
            if isinstance(imgs_or_paths, np.ndarray):
                data['img'] = imgs_or_paths
            else:
                data['image_file'] = imgs_or_paths

        data = test_pipeline(data)
        batch_data.append(data)

    batch_data = collate(batch_data, samples_per_gpu=len(batch_data))
    batch_data = scatter(batch_data, [device])[0]

    # forward the model
    with torch.no_grad():
        result = model(img=batch_data['img'],
                       img_metas=batch_data['img_metas'],
                       return_loss=False,
                       return_heatmap=return_heatmap)

    return result['preds'], result['output_heatmap']
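
This private helper is normally reached through the public top-down API. The following is a minimal usage sketch, not code from this listing: the config, checkpoint and image paths and the bounding box are placeholders, and it assumes an mmpose 0.x install that provides init_pose_model and inference_top_down_pose_model.

# Hedged sketch: paths and the bbox below are placeholders, not values from this repo.
from mmpose.apis import init_pose_model, inference_top_down_pose_model

pose_model = init_pose_model('configs/top_down_hrnet_w32_coco_256x192.py',
                             'hrnet_w32_coco_256x192.pth',
                             device='cuda:0')
# One dict per detected person; bbox is (left, top, width, height, [score]).
person_results = [{'bbox': [50, 60, 200, 400, 0.98]}]
pose_results, returned_outputs = inference_top_down_pose_model(
    pose_model,
    'demo/persons.jpg',
    person_results,
    format='xywh',
    return_heatmap=False)
# pose_results[0]['keypoints'] is an ndarray of shape (K, 3): x, y, score.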
Example #13
def inference_bottom_up_pose_model(model,
                                   img_or_path,
                                   dataset='BottomUpCocoDataset',
                                   dataset_info=None,
                                   pose_nms_thr=0.9,
                                   return_heatmap=False,
                                   outputs=None):
    """Inference a single image with a bottom-up pose model.

    Note:
        - num_people: P
        - num_keypoints: K
        - bbox height: H
        - bbox width: W

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str | np.ndarray): Image filename or loaded image.
        dataset (str): Dataset name, e.g. 'BottomUpCocoDataset'.
            It is deprecated. Please use dataset_info instead.
        dataset_info (DatasetInfo): A class containing all dataset info.
        pose_nms_thr (float): Retain poses whose OKS overlap is below
            pose_nms_thr, default: 0.9.
        return_heatmap (bool): Flag to return heatmap, default: False.
        outputs (list[str] | tuple[str]): Names of layers whose outputs
            need to be returned, default: None.

    Returns:
        tuple:
        - pose_results (list[np.ndarray]): The predicted pose info. \
            The length of the list is the number of people (P). \
            Each item in the list is a ndarray, containing each \
            person's pose (np.ndarray[Kx3]): x, y, score.
        - returned_outputs (list[dict[np.ndarray[N, K, H, W] | \
            torch.Tensor[N, K, H, W]]]): \
            Output feature maps from layers specified in `outputs`. \
            Includes 'heatmap' if `return_heatmap` is True.
    """
    # get dataset info
    if (dataset_info is None and hasattr(model, 'cfg')
            and 'dataset_info' in model.cfg):
        dataset_info = DatasetInfo(model.cfg.dataset_info)

    if dataset_info is not None:
        dataset_name = dataset_info.dataset_name
        flip_index = dataset_info.flip_index
        sigmas = getattr(dataset_info, 'sigmas', None)
    else:
        warnings.warn(
            'dataset is deprecated. '
            'Please set `dataset_info` in the config. '
            'Check https://github.com/open-mmlab/mmpose/pull/663 for details.',
            DeprecationWarning)
        assert (dataset == 'BottomUpCocoDataset')
        dataset_name = dataset
        flip_index = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
        sigmas = None

    pose_results = []
    returned_outputs = []

    cfg = model.cfg
    device = next(model.parameters()).device
    if device.type == 'cpu':
        device = -1

    # build the data pipeline
    test_pipeline = Compose(cfg.test_pipeline)
    _pipeline_gpu_speedup(test_pipeline, next(model.parameters()).device)

    # prepare data
    data = {
        'dataset': dataset_name,
        'ann_info': {
            'image_size': np.array(cfg.data_cfg['image_size']),
            'num_joints': cfg.data_cfg['num_joints'],
            'flip_index': flip_index,
        }
    }
    if isinstance(img_or_path, np.ndarray):
        data['img'] = img_or_path
    else:
        data['image_file'] = img_or_path

    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)
    data = scatter(data, [device])[0]

    with OutputHook(model, outputs=outputs, as_tensor=False) as h:
        # forward the model
        with torch.no_grad():
            result = model(img=data['img'],
                           img_metas=data['img_metas'],
                           return_loss=False,
                           return_heatmap=return_heatmap)

        if return_heatmap:
            h.layer_outputs['heatmap'] = result['output_heatmap']

        returned_outputs.append(h.layer_outputs)

        for idx, pred in enumerate(result['preds']):
            area = (np.max(pred[:, 0]) - np.min(pred[:, 0])) * (
                np.max(pred[:, 1]) - np.min(pred[:, 1]))
            pose_results.append({
                'keypoints': pred[:, :3],
                'score': result['scores'][idx],
                'area': area,
            })

        # pose nms
        score_per_joint = cfg.model.test_cfg.get('score_per_joint', False)
        keep = oks_nms(pose_results,
                       pose_nms_thr,
                       sigmas,
                       score_per_joint=score_per_joint)
        pose_results = [pose_results[_keep] for _keep in keep]

    return pose_results, returned_outputs
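
A minimal usage sketch for the bottom-up API above; the config, checkpoint and image paths are placeholders, and the model is assumed to come from mmpose's init_pose_model.

# Hedged sketch: config, checkpoint and image paths are placeholders.
from mmpose.apis import init_pose_model, inference_bottom_up_pose_model

pose_model = init_pose_model(
    'configs/associative_embedding_hrnet_w32_coco_512x512.py',
    'hrnet_w32_coco_512x512.pth',
    device='cuda:0')
pose_results, returned_outputs = inference_bottom_up_pose_model(
    pose_model, 'demo/persons.jpg', pose_nms_thr=0.9, return_heatmap=False)
for person in pose_results:
    # each person dict carries 'keypoints' (K, 3), 'score' and 'area'
    print(person['keypoints'].shape, person['score'])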
Example #14
def _inference_single_pose_model(model, img_or_path, bbox):
    """Inference a single bbox.

    num_keypoints: K

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str | np.ndarray): Image filename or loaded image.
        bbox (list | np.ndarray): Bounding boxes (with scores),
            shaped (4, ) or (5, ). (left, top, width, height, [score])

    Returns:
        ndarray[Kx3]: Predicted pose x, y, score.
    """
    cfg = model.cfg
    device = next(model.parameters()).device

    # build the data pipeline
    test_pipeline = [LoadImage()] + cfg.test_pipeline[1:]
    test_pipeline = Compose(test_pipeline)

    assert len(bbox) in [4, 5]
    center, scale = _box2cs(cfg, bbox)
    # prepare data
    data = {
        'img_or_path': img_or_path,
        'center': center,
        'scale': scale,
        'bbox_score': bbox[4] if len(bbox) == 5 else 1,
        'dataset': 'coco',
        'joints_3d': np.zeros((cfg.data_cfg.num_joints, 3), dtype=np.float32),
        'joints_3d_visible': np.zeros((cfg.data_cfg.num_joints, 3),
                                      dtype=np.float32),
        'rotation': 0,
        'ann_info': {
            'image_size': cfg.data_cfg['image_size'],
            'num_joints': cfg.data_cfg['num_joints'],
            'flip_pairs': [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12],
                           [13, 14], [15, 16]]
        }
    }
    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)
    if next(model.parameters()).is_cuda:
        # scatter to specified GPU
        data = scatter(data, [device])[0]
    else:
        # just get the actual data from DataContainer
        data['img_metas'] = data['img_metas'].data[0]

    # forward the model
    with torch.no_grad():
        all_preds, _, _ = model(return_loss=False,
                                img=data['img'],
                                img_metas=data['img_metas'])

    return all_preds[0]
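
The _box2cs call above converts an xywh bounding box into the center/scale representation the top-down pipeline expects. Below is a sketch of the usual conversion; the helper name, the pixel_std=200 normalization and the 1.25 padding follow the common top-down convention and are assumptions, not necessarily this repo's exact _box2cs.

import numpy as np

def box_xywh_to_center_scale(box, image_size, padding=1.25, pixel_std=200.0):
    # box: (left, top, width, height); image_size: (w, h) of the model input
    x, y, w, h = box[:4]
    center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32)
    # pad the box to the input aspect ratio before scaling
    aspect_ratio = image_size[0] / image_size[1]
    if w > aspect_ratio * h:
        h = w / aspect_ratio
    elif w < aspect_ratio * h:
        w = h * aspect_ratio
    scale = np.array([w / pixel_std, h / pixel_std], dtype=np.float32) * padding
    return center, scale

center, scale = box_xywh_to_center_scale([50, 60, 200, 400], (192, 256))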
Example #15
def test_camera_projection():
    results = get_data_sample()
    pipeline_1 = [
        dict(type='CameraProjection',
             item='input_3d',
             output_name='input_3d_w',
             camera_type='SimpleCamera',
             mode='camera_to_world'),
        dict(type='CameraProjection',
             item='input_3d_w',
             output_name='input_3d_wp',
             camera_type='SimpleCamera',
             mode='world_to_pixel'),
        dict(type='CameraProjection',
             item='input_3d',
             output_name='input_3d_p',
             camera_type='SimpleCamera',
             mode='camera_to_pixel'),
        dict(type='Collect', keys=['input_3d_wp', 'input_3d_p'], meta_keys=[])
    ]
    camera_param = results['camera_param'].copy()
    camera_param['K'] = np.concatenate(
        (np.diagflat(camera_param['f']), camera_param['c']), axis=-1)
    pipeline_2 = [
        dict(type='CameraProjection',
             item='input_3d',
             output_name='input_3d_w',
             camera_type='SimpleCamera',
             camera_param=camera_param,
             mode='camera_to_world'),
        dict(type='CameraProjection',
             item='input_3d_w',
             output_name='input_3d_wp',
             camera_type='SimpleCamera',
             camera_param=camera_param,
             mode='world_to_pixel'),
        dict(type='CameraProjection',
             item='input_3d',
             output_name='input_3d_p',
             camera_type='SimpleCamera',
             camera_param=camera_param,
             mode='camera_to_pixel'),
        dict(type='CameraProjection',
             item='input_3d_w',
             output_name='input_3d_wc',
             camera_type='SimpleCamera',
             camera_param=camera_param,
             mode='world_to_camera'),
        dict(type='Collect',
             keys=['input_3d_wp', 'input_3d_p', 'input_2d'],
             meta_keys=[])
    ]

    output1 = Compose(pipeline_1)(results)
    output2 = Compose(pipeline_2)(results)

    np.testing.assert_allclose(output1['input_3d_wp'],
                               output1['input_3d_p'],
                               rtol=1e-6)

    np.testing.assert_allclose(output2['input_3d_wp'],
                               output2['input_3d_p'],
                               rtol=1e-6)

    np.testing.assert_allclose(output2['input_3d_p'],
                               output2['input_2d'],
                               rtol=1e-3,
                               atol=1e-1)

    # test invalid camera parameters
    with pytest.raises(ValueError):
        # missing intrinsic parameters
        camera_param_wo_intrinsic = camera_param.copy()
        camera_param_wo_intrinsic.pop('K')
        camera_param_wo_intrinsic.pop('f')
        camera_param_wo_intrinsic.pop('c')
        _ = Compose([
            dict(type='CameraProjection',
                 item='input_3d',
                 camera_type='SimpleCamera',
                 camera_param=camera_param_wo_intrinsic,
                 mode='camera_to_pixel')
        ])

    with pytest.raises(ValueError):
        # invalid mode
        _ = Compose([
            dict(type='CameraProjection',
                 item='input_3d',
                 camera_type='SimpleCamera',
                 camera_param=camera_param,
                 mode='dummy')
        ])

    # test camera without undistortion
    camera_param_wo_undistortion = camera_param.copy()
    camera_param_wo_undistortion.pop('k')
    camera_param_wo_undistortion.pop('p')
    _ = Compose([
        dict(type='CameraProjection',
             item='input_3d',
             camera_type='SimpleCamera',
             camera_param=camera_param_wo_undistortion,
             mode='camera_to_pixel')
    ])

    # test pixel to camera transformation
    camera = SimpleCamera(camera_param_wo_undistortion)
    kpt_camera = np.random.rand(14, 3)
    kpt_pixel = camera.camera_to_pixel(kpt_camera)
    _kpt_camera = camera.pixel_to_camera(
        np.concatenate([kpt_pixel, kpt_camera[:, [2]]], -1))
    assert_array_almost_equal(_kpt_camera, kpt_camera, decimal=4)
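
The round trip checked at the end of this test is the plain pinhole model. A self-contained sketch of that camera_to_pixel / pixel_to_camera pair is shown below; the focal lengths and principal point are made-up values and lens distortion is ignored.

import numpy as np

def camera_to_pixel(pts_cam, fx, fy, cx, cy):
    # pts_cam: (N, 3) points in the camera frame; project with the pinhole model
    z = pts_cam[:, 2:3]
    return pts_cam[:, :2] / z * np.array([fx, fy]) + np.array([cx, cy])

def pixel_to_camera(uv_z, fx, fy, cx, cy):
    # uv_z: (N, 3) pixel coordinates with the camera-frame depth as last column
    z = uv_z[:, 2:3]
    xy = (uv_z[:, :2] - np.array([cx, cy])) / np.array([fx, fy]) * z
    return np.concatenate([xy, z], axis=-1)

pts = np.random.rand(14, 3) + np.array([0, 0, 2.0])  # keep depth positive
uv = camera_to_pixel(pts, 1145.0, 1144.0, 512.0, 515.0)
back = pixel_to_camera(np.concatenate([uv, pts[:, 2:3]], -1),
                       1145.0, 1144.0, 512.0, 515.0)
assert np.allclose(back, pts, atol=1e-6)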
Example #16
def inference_bottom_up_pose_model(model, img_or_path):
    """Inference a single image.

    num_people: P
    num_keypoints: K
    bbox height: H
    bbox width: W

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str | np.ndarray): Image filename or loaded image.

    Returns:
        list[ndarray]: The predicted pose info.

            The length of the list
            is the number of people (P). Each item in the
            list is a ndarray, containing each person's
            pose (ndarray[Kx3]): x, y, score
    """
    pose_results = []
    cfg = model.cfg
    device = next(model.parameters()).device

    # build the data pipeline
    test_pipeline = [LoadImage()] + cfg.test_pipeline[1:]
    test_pipeline = Compose(test_pipeline)

    # prepare data
    data = {
        'img_or_path': img_or_path,
        'dataset': 'coco',
        'ann_info': {
            'image_size': cfg.data_cfg['image_size'],
            'num_joints': cfg.data_cfg['num_joints'],
            'flip_index':
            [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15],
        }
    }

    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)
    if next(model.parameters()).is_cuda:
        # scatter to specified GPU
        data = scatter(data, [device])[0]
    else:
        # just get the actual data from DataContainer
        data['img_metas'] = data['img_metas'].data[0]

    # forward the model
    with torch.no_grad():
        all_preds, _, _ = model(return_loss=False,
                                img=data['img'],
                                img_metas=data['img_metas'])

    for pred in all_preds:
        pose_results.append({
            'keypoints': pred[:, :3],
        })

    return pose_results
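
The flip_index above encodes how COCO keypoints swap under a horizontal flip: left/right joints trade places and the nose stays put. A small sketch of how such an index is typically applied is shown below; the helper is illustrative only, not an mmpose function.

import numpy as np

def flip_keypoints(keypoints, flip_index, img_width):
    # keypoints: (K, 3) array of x, y, score for one person
    flipped = keypoints[np.array(flip_index)].copy()  # reorder left/right joints
    flipped[:, 0] = img_width - 1 - flipped[:, 0]     # mirror the x coordinates
    return flipped

kpts = np.random.rand(17, 3) * [640, 480, 1]
flip_index = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
flipped = flip_keypoints(kpts, flip_index, img_width=640)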
Example #17
def _inference_single_pose_model(model,
                                 img_or_path_video,
                                 bbox_video,
                                 dataset,
                                 return_heatmap=False):
    """Inference a single bbox.

    num_keypoints: K

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path_video (list[str | np.ndarray]): Image filenames or loaded
            images, one per frame.
        bbox_video (list[list | np.ndarray]): One bounding box (with score)
            per frame, each shaped (4, ) or (5, ).
            (left, top, width, height, [score])
        dataset (str): Dataset name.
        return_heatmap (bool): Flag to return heatmap, default: False

    Returns:
        ndarray[NxKx3]: Predicted pose x, y, score for each frame.
        heatmap[N, K, H, W]: Model output heatmap.
    """

    cfg = model.cfg
    device = next(model.parameters()).device

    # build the data pipeline
    test_pipeline = [LoadImage()] + cfg.test_pipeline[1:]
    test_pipeline = Compose(test_pipeline)

    flip_pairs = None
    if dataset in ('TopDownCocoDataset', 'TopDownOCHumanDataset'):
        flip_pairs = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12],
                      [13, 14], [15, 16]]
    elif dataset == 'TopDownCocoWholeBodyDataset':
        body = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14],
                [15, 16]]
        foot = [[17, 20], [18, 21], [19, 22]]

        face = [[23, 39], [24, 38], [25, 37], [26, 36], [27, 35], [28, 34],
                [29, 33], [30, 32], [40, 49], [41, 48], [42, 47], [43, 46],
                [44, 45], [54, 58], [55, 57], [59, 68], [60, 67], [61, 66],
                [62, 65], [63, 70], [64, 69], [71, 77], [72, 76], [73, 75],
                [78, 82], [79, 81], [83, 87], [84, 86], [88, 90]]

        hand = [[91, 112], [92, 113], [93, 114], [94, 115], [95, 116],
                [96, 117], [97, 118], [98, 119], [99, 120], [100, 121],
                [101, 122], [102, 123], [103, 124], [104, 125], [105, 126],
                [106, 127], [107, 128], [108, 129], [109, 130], [110, 131],
                [111, 132]]
        flip_pairs = body + foot + face + hand
    elif dataset == 'TopDownAicDataset':
        flip_pairs = [[0, 3], [1, 4], [2, 5], [6, 9], [7, 10], [8, 11]]
    elif dataset == 'TopDownMpiiDataset':
        flip_pairs = [[0, 5], [1, 4], [2, 3], [10, 15], [11, 14], [12, 13]]
    elif dataset == 'TopDownMpiiTrbDataset':
        flip_pairs = [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11],
                      [14, 15], [16, 22], [28, 34], [17, 23], [29, 35],
                      [18, 24], [30, 36], [19, 25], [31, 37], [20, 26],
                      [32, 38], [21, 27], [33, 39]]
    elif dataset in ('OneHand10KDataset', 'FreiHandDataset', 'PanopticDataset',
                     'InterHand2DDataset'):
        flip_pairs = []
    elif dataset == 'Face300WDataset':
        flip_pairs = [[0, 16], [1, 15], [2, 14], [3, 13], [4, 12], [5, 11],
                      [6, 10], [7, 9], [17, 26], [18, 25], [19, 24], [20, 23],
                      [21, 22], [31, 35], [32, 34], [36, 45], [37, 44],
                      [38, 43], [39, 42], [40, 47], [41, 46], [48, 54],
                      [49, 53], [50, 52], [61, 63], [60, 64], [67, 65],
                      [58, 56], [59, 55]]

    elif dataset == 'FaceAFLWDataset':
        flip_pairs = [[0, 5], [1, 4], [2, 3], [6, 11], [7, 10], [8, 9],
                      [12, 14], [15, 17]]

    elif dataset == 'FaceCOFWDataset':
        flip_pairs = [[0, 1], [4, 6], [2, 3], [5, 7], [8, 9], [10, 11],
                      [12, 14], [16, 17], [13, 15], [18, 19], [22, 23]]

    elif dataset == 'FaceWFLWDataset':
        flip_pairs = [[0, 32], [1, 31], [2, 30], [3, 29], [4, 28], [5, 27],
                      [6, 26], [7, 25], [8, 24], [9, 23], [10, 22], [11, 21],
                      [12, 20], [13, 19], [14, 18], [15, 17], [33, 46],
                      [34, 45], [35, 44], [36, 43], [37, 42], [38, 50],
                      [39, 49], [40, 48], [41, 47], [60, 72], [61, 71],
                      [62, 70], [63, 69], [64, 68], [65, 75], [66, 74],
                      [67, 73], [55, 59], [56, 58], [76, 82], [77, 81],
                      [78, 80], [87, 83], [86, 84], [88, 92], [89, 91],
                      [95, 93], [96, 97]]
    else:
        raise NotImplementedError()

    # prepare data
    img_list = []
    img_metas_list = []
    for i in range(len(img_or_path_video)):
        bbox = bbox_video[i]
        img_or_path = img_or_path_video[i]

        assert len(bbox) in [4, 5]
        center, scale = _box2cs(cfg, bbox)

        data = {
            'img_or_path': img_or_path,
            'center': center,
            'scale': scale,
            'bbox_score': bbox[4] if len(bbox) == 5 else 1,
            'dataset': dataset,
            'joints_3d': np.zeros((cfg.data_cfg.num_joints, 3),
                                  dtype=np.float32),
            'joints_3d_visible': np.zeros((cfg.data_cfg.num_joints, 3),
                                          dtype=np.float32),
            'rotation': 0,
            'ann_info': {
                'image_size': cfg.data_cfg['image_size'],
                'num_joints': cfg.data_cfg['num_joints'],
                'flip_pairs': flip_pairs
            }
        }
        data = test_pipeline(data)
        data = collate([data], samples_per_gpu=1)
        if next(model.parameters()).is_cuda:
            # scatter to specified GPU
            data = scatter(data, [device])[0]
        else:
            # just get the actual data from DataContainer
            data['img_metas'] = data['img_metas'].data[0]
        data['img_metas'][0]['bbox_id'] = 0
        img_list.append(data['img'])
        img_metas_list += data['img_metas']

    # forward the model
    with torch.no_grad():
        result = model(img=torch.cat(img_list, dim=0),
                       img_metas=img_metas_list,
                       return_loss=False,
                       return_heatmap=return_heatmap)

    return result['preds'], result['output_heatmap']
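
A hedged calling sketch for the per-frame helper above: the frame paths and boxes are placeholders, and the model is assumed to come from mmpose's init_pose_model with a placeholder top-down config and checkpoint.

from mmpose.apis import init_pose_model

pose_model = init_pose_model('configs/top_down_hrnet_w32_coco_256x192.py',
                             'hrnet_w32_coco_256x192.pth',
                             device='cuda:0')
frames = ['frames/000001.jpg', 'frames/000002.jpg', 'frames/000003.jpg']
bboxes = [[50, 60, 200, 400, 0.9]] * len(frames)  # one xywh(+score) box per frame
preds, heatmaps = _inference_single_pose_model(
    pose_model, frames, bboxes, dataset='TopDownCocoDataset',
    return_heatmap=False)
# preds stacks the per-frame keypoints, shape (len(frames), K, 3);
# heatmaps is None unless return_heatmap=True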