Example No. 1
    def get_single_item(self, index):
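        """Return one training sample for the sequence window [start, end].

        Reads 2D keypoints and precomputed image features from the HDF5
        database, converts the keypoints from the 'insta' layout to the
        SPIN 49-joint format, normalizes them to the 224x224 crop, and
        returns the feature sequence together with the mid-frame
        keypoints repeated 3 times.
        """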
        start_index, end_index = self.vid_indices[index]

        with h5py.File(self.h5_file, 'r') as db:
            self.db = db

            kp_2d = self.get_sequence(start_index, end_index,
                                      self.db['joints2D'])
            kp_2d = convert_kps(kp_2d, src='insta', dst='spin')
            kp_2d_tensor = np.ones((self.seqlen, 49, 3), dtype=np.float16)

            input = torch.from_numpy(
                self.get_sequence(start_index, end_index,
                                  self.db['features'])).float()

            vid_name = self.get_sequence(start_index, end_index,
                                         self.db['vid_name'])
            frame_id = self.get_sequence(start_index, end_index,
                                         self.db['frame_id']).astype(str)
            instance_id = np.array(
                [v.decode('ascii') + f for v, f in zip(vid_name, frame_id)])

        for idx in range(self.seqlen):
            kp_2d[idx, :, :2] = normalize_2d_kp(kp_2d[idx, :, :2], 224)
            kp_2d_tensor[idx] = kp_2d[idx]

        repeat_num = 3
        target = {
            'features': input,
            # 2D keypoints transformed according to bbox cropping
            'kp_2d': torch.from_numpy(kp_2d_tensor).float()[self.mid_frame].repeat(repeat_num, 1, 1),
            # 'instance_id': instance_id
        }

        return target
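A minimal, self-contained sketch of how a __getitem__ that returns a dict of tensors like target above is typically consumed; ToyDataset and the tensor shapes are illustrative assumptions, not part of the example. PyTorch's default DataLoader collate stacks each dict key along a new batch dimension.

import torch
from torch.utils.data import Dataset, DataLoader

class ToyDataset(Dataset):
    def __len__(self):
        return 8

    def __getitem__(self, index):
        # same structure as `target` above: per-frame features and repeated mid-frame keypoints
        return {
            'features': torch.randn(16, 2048),  # seqlen x feature_dim (assumed sizes)
            'kp_2d': torch.randn(3, 49, 3),     # repeat_num x 49 joints x (x, y, conf)
        }

loader = DataLoader(ToyDataset(), batch_size=4)
batch = next(iter(loader))
print(batch['features'].shape)  # torch.Size([4, 16, 2048])
print(batch['kp_2d'].shape)     # torch.Size([4, 3, 49, 3])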
Example No. 2
    def get_single_item(self, index):
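        """Return one training sample for the sequence window [start, end].

        Converts the stored 2D keypoints to the SPIN format (except for
        PoseTrack, which is stored already converted), transforms them
        into the 224x224 bbox crop, normalizes them, and returns the
        precomputed features plus the mid-frame keypoints repeated
        3 times. In debug mode the cropped video frames are attached.
        """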
        start_index, end_index = self.vid_indices[index]

        kp_2d = self.get_sequence(start_index, end_index, self.db['joints2D'])
        if self.dataset_name != 'posetrack':
            kp_2d = convert_kps(kp_2d, src=self.dataset_name, dst='spin')
        kp_2d_tensor = np.ones((self.seqlen, 49, 3), dtype=np.float16)

        bbox = self.get_sequence(start_index, end_index, self.db['bbox'])

        input = torch.from_numpy(
            self.get_sequence(start_index, end_index,
                              self.db['features'])).float()

        for idx in range(self.seqlen):
            # crop image and transform 2d keypoints
            kp_2d[idx, :, :2], trans = transfrom_keypoints(
                kp_2d=kp_2d[idx, :, :2],
                center_x=bbox[idx, 0],
                center_y=bbox[idx, 1],
                width=bbox[idx, 2],
                height=bbox[idx, 3],
                patch_width=224,
                patch_height=224,
                do_augment=False,
            )

            kp_2d[idx, :, :2] = normalize_2d_kp(kp_2d[idx, :, :2], 224)
            kp_2d_tensor[idx] = kp_2d[idx]

        vid_name = self.get_sequence(start_index, end_index,
                                     self.db['vid_name'])
        frame_id = self.get_sequence(start_index, end_index,
                                     self.db['img_name']).astype(str)
        instance_id = np.array([v + f for v, f in zip(vid_name, frame_id)])

        bbox = self.get_sequence(start_index, end_index, self.db['bbox'])
        # video = torch.cat(
        #     [get_single_image_crop(image, None, bbox, scale=1.2).unsqueeze(0) for idx, (image, bbox) in
        #      enumerate(zip(frame_id, bbox))], dim=0
        # )

        repeat_num = 3
        target = {
            'features': input,
            # 2D keypoints transformed according to bbox cropping
            'kp_2d': torch.from_numpy(kp_2d_tensor).float()[self.mid_frame].repeat(repeat_num, 1, 1),
            # 'instance_id': instance_id,
        }

        if self.debug:

            vid_name = self.db['vid_name'][start_index]

            if self.dataset_name == 'pennaction':
                vid_folder = "frames"
                vid_name = vid_name.split('/')[-1].split('.')[0]
                img_id = "img_name"
            elif self.dataset_name == 'posetrack':
                vid_folder = osp.join('images', vid_name.split('/')[-2])
                vid_name = vid_name.split('/')[-1].split('.')[0]
                img_id = "img_name"
            else:
                vid_name = '_'.join(vid_name.split('_')[:-1])
                vid_folder = 'imageFiles'
                img_id = 'frame_id'
            f = osp.join(self.folder, vid_folder, vid_name)
            video_file_list = [
                osp.join(f, x) for x in sorted(os.listdir(f))
                if x.endswith('.jpg')
            ]
            frame_idxs = self.get_sequence(start_index, end_index,
                                           self.db[img_id])
            if self.dataset_name == 'pennaction' or self.dataset_name == 'posetrack':
                video = frame_idxs
            else:
                video = [video_file_list[i] for i in frame_idxs]

            video = torch.cat([
                get_single_image_crop(image, bbox).unsqueeze(0)
                for image, bbox in zip(video, bbox)
            ], dim=0)

            target['video'] = video

        return target
def read_data_train(dataset_path, set='train', debug=False):
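    """Build the Human3.6M database for the given split.

    For every sequence of the selected subjects, loads the camera and 3D
    joint annotations, swaps left/right joint indices, projects world
    coordinates to camera and pixel space, converts the joints to the
    SPIN 49-joint format, fits smoothed bounding boxes, subsamples every
    other frame (roughly 25 fps), and extracts per-frame HMR features.
    """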
    dataset = {
        'vid_name': [],
        'frame_id': [],
        'joints3D': [],
        'joints2D': [],
        'shape': [],
        'pose': [],
        'bbox': [],
        'img_name': [],
        'features': [],
    }

    # occluders = load_occluders('./data/VOC2012')

    model = spin.get_pretrained_hmr()

    if set == 'train':
        subjects = [1, 5, 6, 7, 8]
    else:
        subjects = [9, 11]
    for subject in subjects:
        annot_path = osp.join(dataset_path, 'annotations')
        # camera load
        with open(
                osp.join(annot_path,
                         'Human36M_subject' + str(subject) + '_camera.json'),
                'r') as f:
            cameras = json.load(f)
        # joint coordinate load
        with open(
                osp.join(annot_path,
                         'Human36M_subject' + str(subject) + '_joint_3d.json'),
                'r') as f:
            joints = json.load(f)
        # SMPL parameters obtained by NeuralAnnot will be released (https://arxiv.org/abs/2011.11232) after publication
        # # smpl parameter load
        # with open(osp.join(annot_path, 'Human36M_subject' + str(subject) + '_SMPL_NeuralAnnot.json'), 'r') as f:
        #     smpl_params = json.load(f)

        seq_list = sorted(glob.glob(dataset_path +
                                    f'/images/s_{subject:02d}*'))
        for seq in tqdm(seq_list):
            seq_name = seq.split('/')[-1]
            act = str(int(seq_name.split('_act_')[-1][0:2]))
            subact = str(int(seq_name.split('_subact_')[-1][0:2]))
            cam = str(int(seq_name.split('_ca_')[-1][0:2]))
            # if cam != '4':  # front camera (Table 6)
            #     continue
            print("seq name: ", seq)

            img_paths = sorted(glob.glob(seq + '/*.jpg'))
            num_frames = len(img_paths)
            if num_frames < 1:
                continue
            # camera parameter
            cam_param = cameras[cam]
            R = np.array(cam_param['R'], dtype=np.float32)
            t = np.array(cam_param['t'], dtype=np.float32)
            f = np.array(cam_param['f'], dtype=np.float32)
            c = np.array(cam_param['c'], dtype=np.float32)

            # img starts from index 1, and annot starts from index 0
            poses = np.zeros((num_frames, 72), dtype=np.float32)
            shapes = np.zeros((num_frames, 10), dtype=np.float32)
            j3ds = np.zeros((num_frames, 49, 3), dtype=np.float32)
            j2ds = np.zeros((num_frames, 49, 3), dtype=np.float32)

            for img_i in tqdm(range(num_frames)):
                # smpl_param = smpl_params[act][subact][str(img_i)][cam]
                # pose = np.array(smpl_param['pose'], dtype=np.float32)
                # shape = np.array(smpl_param['shape'], dtype=np.float32)

                joint_world = np.array(joints[act][subact][str(img_i)],
                                       dtype=np.float32)
                # match right, left
                match = [[1, 4], [2, 5], [3, 6]]
                for l, r in match:
                    joint_world[l], joint_world[r] = (
                        joint_world[r].copy(), joint_world[l].copy())
                joint_cam = world2cam(joint_world, R, t)
                joint_img = cam2pixel(joint_cam, f, c)

                j3d = convert_kps(joint_cam[None, :, :] / 1000, "h36m",
                                  "spin").reshape((-1, 3))
                j3d = j3d - j3d[39]  # 4 is the root

                joint_img[:, 2] = 1
                j2d = convert_kps(joint_img[None, :, :], "h36m",
                                  "spin").reshape((-1, 3))

                # poses[img_i] = pose
                # shapes[img_i] = shape
                j3ds[img_i] = j3d
                j2ds[img_i] = j2d
                """
                import torch
                smpl = SMPL(SMPL_MODEL_DIR, batch_size=1, create_transl=False)
    
                p = torch.from_numpy(pose).float().reshape(1,-1,3)
                s = torch.from_numpy(shape).float().reshape(1,-1)
                J_regressor = torch.from_numpy(np.load(osp.join(TCMR_DATA_DIR, 'J_regressor_h36m.npy'))).float()
                output = smpl(betas=s, body_pose=p[:, 3:], global_orient=p[:, :3])
                vertices = output.vertices
                J_regressor_batch = J_regressor[None, :].expand(vertices.shape[0], -1, -1).to(vertices.device)
                temp_j3d = torch.matmul(J_regressor_batch, vertices) * 1000
                # temp_j3d = temp_j3d - temp_j3d[:, 0, :]
                temp_j3d = temp_j3d[0, H36M_TO_J14, :]
    
                gt_j3d = joint_cam - joint_cam[0, :]
                gt_j3d = gt_j3d[H36M_TO_J14, :]
    
                print("CHECK: ", (temp_j3d-gt_j3d))
                """

            bbox_params, time_pt1, time_pt2 = get_smooth_bbox_params(
                j2ds, vis_thresh=VIS_THRESH, sigma=8)
            # bbox_params, time_pt1, time_pt2 = get_all_bbox_params(j2ds, vis_thresh=VIS_THRESH)
            """
            img = cv2.imread(img_paths[0])
            temp = draw_skeleton(img, j2ds[0], dataset='spin', unnormalize=False, thickness=2)
            cv2.imshow('img', temp)
            cv2.waitKey(0)
            cv2.destroyAllWindows()
            cv2.waitKey(1)
            """

            # process bbox_params
            c_x = bbox_params[:, 0]
            c_y = bbox_params[:, 1]
            scale = bbox_params[:, 2]

            w = h = 150. / scale
            w = h = h * 0.9  # 1.1 for h36m_train_25fps_occ_db.pt
            bbox = np.vstack([c_x, c_y, w, h]).T

            # subsample frames to 25 fps
            img_paths_array = np.array(img_paths)[time_pt1:time_pt2][::2]
            bbox = bbox[::2]

            dataset['vid_name'].append(
                np.array([f'{seq}_{subject}'] *
                         num_frames)[time_pt1:time_pt2][::2])
            dataset['frame_id'].append(
                np.arange(0, num_frames)[time_pt1:time_pt2][::2])
            dataset['joints3D'].append(j3ds[time_pt1:time_pt2][::2])
            dataset['joints2D'].append(j2ds[time_pt1:time_pt2][::2])
            dataset['shape'].append(shapes[time_pt1:time_pt2][::2])
            dataset['pose'].append(poses[time_pt1:time_pt2][::2])

            dataset['img_name'].append(img_paths_array)
            dataset['bbox'].append(bbox)

            features = extract_features(
                model,
                None,
                img_paths_array,
                bbox,
                kp_2d=j2ds[time_pt1:time_pt2][::2],
                debug=debug,
                dataset='h36m',
                scale=1.0)  # 1.2 for h36m_train_25fps_occ_db.pt

            dataset['features'].append(features)

    for k in dataset.keys():
        dataset[k] = np.concatenate(dataset[k])
        print(k, dataset[k].shape)

    return dataset
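The world2cam and cam2pixel helpers used in read_data_train above are not shown in this listing. Below is a hedged sketch of the standard extrinsic transform and pinhole projection they presumably implement; the _sketch names are illustrative, not the library's API.

import numpy as np

def world2cam_sketch(joint_world, R, t):
    # joint_world: (N, 3) world coordinates; R: (3, 3) rotation; t: (3,) translation
    # standard extrinsic transform: X_cam = R @ X_world + t
    return joint_world @ R.T + t.reshape(1, 3)

def cam2pixel_sketch(joint_cam, f, c):
    # joint_cam: (N, 3) camera coordinates; f: (2,) focal lengths; c: (2,) principal point
    # standard pinhole projection: u = f_x * X / Z + c_x, v = f_y * Y / Z + c_y
    x = joint_cam[:, 0] / joint_cam[:, 2] * f[0] + c[0]
    y = joint_cam[:, 1] / joint_cam[:, 2] * f[1] + c[1]
    z = joint_cam[:, 2]
    return np.stack([x, y, z], axis=1)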
def read_train_data(dataset_path, debug=False):
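    """Build the training database for the MPI-INF-3DHP ('mpii3d') dataset.

    Iterates over subjects S1-S8, sequences 1-2, and the selected camera
    videos, converts the 28-joint annotations to the SPIN format, derives
    bounding boxes from the 2D keypoints, starts a new video segment
    whenever a frame has joints outside the image, and extracts HMR
    features for each contiguous segment.
    """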
    h, w = 2048, 2048
    dataset = {
        'vid_name': [],
        'frame_id': [],
        'joints3D': [],
        'joints2D': [],
        'bbox': [],
        'img_name': [],
        'features': [],
    }

    # occluders = load_occluders('./data/VOC2012')

    model = spin.get_pretrained_hmr()

    # training data
    user_list = range(1, 9)
    seq_list = range(1, 3)
    vid_list = list(range(3)) + list(range(4, 9))

    # product = product(user_list, seq_list, vid_list)
    # user_i, seq_i, vid_i = product[process_id]

    for user_i in user_list:
        print("Subject: ", user_i)
        for seq_i in seq_list:
            print("seq_i: ", seq_i)
            seq_path = os.path.join(dataset_path,
                                    'S' + str(user_i),
                                    'Seq' + str(seq_i))
            # mat file with annotations
            annot_file = os.path.join(seq_path, 'annot.mat')
            annot2 = sio.loadmat(annot_file)['annot2']
            annot3 = sio.loadmat(annot_file)['annot3']
            # calibration file and camera parameters
            for j, vid_i in enumerate(vid_list):
                print("vid_i: ", vid_i)
                # image folder
                imgs_path = os.path.join(seq_path,
                                         'video_' + str(vid_i))
                # per frame
                pattern = os.path.join(imgs_path, '*.jpg')
                img_list = sorted(glob.glob(pattern))
                vid_used_frames = []
                vid_used_joints = []
                vid_used_bbox = []
                vid_segments = []
                vid_uniq_id = "subj" + str(user_i) + '_seq' + str(seq_i) + "_vid" + str(vid_i) + "_seg0"
                for i, img_i in tqdm_enumerate(img_list):

                    # for each image we store the relevant annotations
                    img_name = img_i.split('/')[-1]
                    joints_2d_raw = np.reshape(annot2[vid_i][0][i], (1, 28, 2))
                    joints_2d_raw = np.append(joints_2d_raw, np.ones((1, 28, 1)), axis=2)
                    joints_2d = convert_kps(joints_2d_raw, "mpii3d", "spin").reshape((-1, 3))

                    joints_3d_raw = np.reshape(annot3[vid_i][0][i], (1, 28, 3)) / 1000
                    joints_3d = convert_kps(joints_3d_raw, "mpii3d", "spin").reshape((-1,3))

                    bbox = get_bbox_from_kp2d(joints_2d[~np.all(joints_2d == 0, axis=1)]).reshape(4)

                    joints_3d = joints_3d - joints_3d[39]  # 4 is the root

                    # check that all joints are visible
                    x_in = np.logical_and(joints_2d[:, 0] < w, joints_2d[:, 0] >= 0)
                    y_in = np.logical_and(joints_2d[:, 1] < h, joints_2d[:, 1] >= 0)
                    ok_pts = np.logical_and(x_in, y_in)
                    if np.sum(ok_pts) < joints_2d.shape[0]:
                        vid_uniq_id = "_".join(vid_uniq_id.split("_")[:-1])+ "_seg" +\
                                          str(int(dataset['vid_name'][-1].split("_")[-1][3:])+1)
                        continue


                    visualize = False
                    if visualize == True and i > 500:
                        import matplotlib.pyplot as plt

                        frame = cv2.cvtColor(cv2.imread(img_i), cv2.COLOR_BGR2RGB)

                        for k in range(49):
                            kp = joints_2d[k]

                            frame = cv2.circle(
                                frame.copy(),
                                (int(kp[0]), int(kp[1])),
                                thickness=3,
                                color=(255, 0, 0),
                                radius=5,
                            )

                            cv2.putText(frame, f'{k}', (int(kp[0]), int(kp[1]) + 1), cv2.FONT_HERSHEY_SIMPLEX, 1.5,
                                        (0, 255, 0),
                                        thickness=3)

                        cv2.imshow('vis', frame)
                        cv2.waitKey(0)
                        cv2.destroyAllWindows()
                        cv2.waitKey(1)

                    dataset['vid_name'].append(vid_uniq_id)
                    dataset['frame_id'].append(img_name.split(".")[0])
                    dataset['img_name'].append(img_i)
                    dataset['joints2D'].append(joints_2d)
                    dataset['joints3D'].append(joints_3d)
                    dataset['bbox'].append(bbox)
                    vid_segments.append(vid_uniq_id)
                    vid_used_frames.append(img_i)
                    vid_used_joints.append(joints_2d)
                    vid_used_bbox.append(bbox)

                vid_segments = np.array(vid_segments)
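                # 'ids' holds the start index of each contiguous segment in
                # vid_segments, plus an end sentinel, so features can be
                # extracted segment by segment below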
                ids = np.zeros((len(set(vid_segments))+1))
                ids[-1] = len(vid_used_frames) + 1
                if (np.where(vid_segments[:-1] != vid_segments[1:])[0]).size != 0:
                    ids[1:-1] = (np.where(vid_segments[:-1] != vid_segments[1:])[0]) + 1

                for i in tqdm(range(len(set(vid_segments)))):
                    features = extract_features(model, None, np.array(vid_used_frames)[int(ids[i]):int(ids[i+1])],
                                                vid_used_bbox[int(ids[i]):int((ids[i+1]))],
                                                kp_2d=np.array(vid_used_joints)[int(ids[i]):int(ids[i+1])],
                                                dataset='spin', debug=False, scale=1.0)
                    dataset['features'].append(features)

    for k in dataset.keys():
        dataset[k] = np.array(dataset[k])
    dataset['features'] = np.concatenate(dataset['features'])

    return dataset
def read_test_data(dataset_path):
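    """Build the MPI-INF-3DHP test database.

    Reads annot_data.mat for each test subject (TS1-TS6), converts the
    17-joint test annotations to the SPIN format, derives bounding boxes
    from the 2D keypoints, keeps the per-frame valid flag, and extracts
    HMR features for each contiguous video segment.
    """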

    dataset = {
        'vid_name': [],
        'frame_id': [],
        'joints3D': [],
        'joints2D': [],
        'bbox': [],
        'img_name': [],
        'features': [],
        "valid_i": []
    }

    model = spin.get_pretrained_hmr()

    user_list = range(1, 7)

    for user_i in user_list:
        print('Subject', user_i)
        seq_path = os.path.join(dataset_path,
                                'mpi_inf_3dhp_test_set',
                                'TS' + str(user_i))
        # mat file with annotations
        annot_file = os.path.join(seq_path, 'annot_data.mat')
        mat_as_h5 = h5py.File(annot_file, 'r')
        annot2 = np.array(mat_as_h5['annot2'])
        annot3 = np.array(mat_as_h5['univ_annot3'])
        valid = np.array(mat_as_h5['valid_frame'])

        vid_used_frames = []
        vid_used_joints = []
        vid_used_bbox = []
        vid_segments = []
        vid_uniq_id = "subj" + str(user_i) + "_seg0"

        for frame_i, valid_i in tqdm(enumerate(valid)):
            img_i = os.path.join('mpi_inf_3dhp_test_set',
                                    'TS' + str(user_i),
                                    'imageSequence',
                                    'img_' + str(frame_i + 1).zfill(6) + '.jpg')

            joints_2d_raw = np.expand_dims(annot2[frame_i, 0, :, :], axis = 0)
            joints_2d_raw = np.append(joints_2d_raw, np.ones((1, 17, 1)), axis=2)


            joints_2d = convert_kps(joints_2d_raw, src="mpii3d_test", dst="spin").reshape((-1, 3))

            visualize = False
            if visualize == True:
                frame = cv2.cvtColor(cv2.imread(os.path.join(dataset_path, img_i)), cv2.COLOR_BGR2RGB)

                for k in range(49):
                    kp = joints_2d[k]

                    frame = cv2.circle(
                        frame.copy(),
                        (int(kp[0]), int(kp[1])),
                        thickness=3,
                        color=(255, 0, 0),
                        radius=5,
                    )

                    cv2.putText(frame, f'{k}', (int(kp[0]), int(kp[1]) + 1), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 0),
                                thickness=3)

                cv2.imshow(f'frame:{frame_i}', frame)
                cv2.waitKey(0)
                cv2.destroyAllWindows()
                cv2.waitKey(1)


            joints_3d_raw = np.reshape(annot3[frame_i, 0, :, :], (1, 17, 3)) / 1000
            joints_3d = convert_kps(joints_3d_raw, "mpii3d_test", "spin").reshape((-1, 3))
            joints_3d = joints_3d - joints_3d[39]  # subtract pelvis; zero is the root for test

            bbox = get_bbox_from_kp2d(joints_2d[~np.all(joints_2d == 0, axis=1)]).reshape(4)

            # check that all joints are visible
            img_file = os.path.join(dataset_path, img_i)
            I = cv2.imread(img_file)
            h, w, _ = I.shape
            x_in = np.logical_and(joints_2d[:, 0] < w, joints_2d[:, 0] >= 0)
            y_in = np.logical_and(joints_2d[:, 1] < h, joints_2d[:, 1] >= 0)
            ok_pts = np.logical_and(x_in, y_in)

            if np.sum(ok_pts) < joints_2d.shape[0]:
                vid_uniq_id = "_".join(vid_uniq_id.split("_")[:-1]) + "_seg" + \
                              str(int(dataset['vid_name'][-1].split("_")[-1][3:]) + 1)
                continue

            dataset['vid_name'].append(vid_uniq_id)
            dataset['frame_id'].append(img_file.split("/")[-1].split(".")[0])
            dataset['img_name'].append(img_file)
            dataset['joints2D'].append(joints_2d)
            dataset['joints3D'].append(joints_3d)
            dataset['bbox'].append(bbox)
            dataset['valid_i'].append(valid_i)

            vid_segments.append(vid_uniq_id)
            vid_used_frames.append(img_file)
            vid_used_joints.append(joints_2d)
            vid_used_bbox.append(bbox)

        vid_segments = np.array(vid_segments)
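        # 'ids' holds the start index of each contiguous segment in
        # vid_segments, plus an end sentinel, so features can be extracted
        # segment by segment below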
        ids = np.zeros((len(set(vid_segments)) + 1))
        ids[-1] = len(vid_used_frames) + 1
        if (np.where(vid_segments[:-1] != vid_segments[1:])[0]).size != 0:
            ids[1:-1] = (np.where(vid_segments[:-1] != vid_segments[1:])[0]) + 1

        for i in tqdm(range(len(set(vid_segments)))):
            features = extract_features(model, None, np.array(vid_used_frames)[int(ids[i]):int(ids[i + 1])],
                                        vid_used_bbox[int(ids[i]):int(ids[i + 1])],
                                        kp_2d=np.array(vid_used_joints)[int(ids[i]):int(ids[i + 1])],
                                        dataset='spin', debug=False, scale=1.2)  # 1.0 for mpii3d_train_scale1_db.pt
            dataset['features'].append(features)

    for k in dataset.keys():
        dataset[k] = np.array(dataset[k])
    dataset['features'] = np.concatenate(dataset['features'])

    return dataset
Example No. 6
def read_data(folder, set):
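    """Build the PoseTrack database for the given annotation split.

    Parses each PoseTrack JSON file, keeps only labeled frames, builds
    one track per person id with at least min_frame_number usable
    frames, reorders and converts the keypoints to the SPIN format,
    turns the top-left (x, y, w, h) boxes into center-based square
    boxes, and extracts HMR features for every track.
    """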
    dataset = {
        'img_name': [],
        'joints2D': [],
        'bbox': [],
        'vid_name': [],
        'features': [],
    }

    # occluders = load_occluders('./data/VOC2012')

    model = spin.get_pretrained_hmr()

    file_names = glob.glob(
        osp.join(folder, 'posetrack_data/annotations/', f'{set}/*.json'))
    file_names = sorted(file_names)
    nn_corrupted = 0
    tot_frames = 0
    min_frame_number = 8

    for fid, fname in tqdm_enumerate(file_names):
        if fname == osp.join(folder,
                             'annotations/train/021133_mpii_train.json'):
            continue

        with open(fname, 'r') as entry:
            anns = json.load(entry)
        # num_frames = anns['images'][0]['nframes']
        anns['images'] = [
            item for item in anns['images'] if item['is_labeled']
        ]
        num_frames = len(anns['images'])
        frame2imgname = dict()
        for el in anns['images']:
            frame2imgname[el['frame_id']] = el['file_name']

        num_people = -1
        for x in anns['annotations']:
            if num_people < x['track_id']:
                num_people = x['track_id']
        num_people += 1
        posetrack_joints = get_posetrack_original_kp_names()
        idxs = [
            anns['categories'][0]['keypoints'].index(h)
            for h in posetrack_joints
            if h in anns['categories'][0]['keypoints']
        ]
        for x in anns['annotations']:
            kps = np.array(x['keypoints']).reshape((17, 3))
            kps = kps[idxs, :]
            x['keypoints'] = list(kps.flatten())

        tot_frames += num_people * num_frames
        for p_id in range(num_people):

            annot_pid = [(item['keypoints'], item['bbox'], item['image_id'])
                         for item in anns['annotations']
                         if item['track_id'] == p_id
                         and not (np.count_nonzero(item['keypoints']) == 0)]

            if len(annot_pid) < min_frame_number:
                nn_corrupted += len(annot_pid)
                continue

            bbox = np.zeros((len(annot_pid), 4))
            # perm_idxs = get_perm_idxs('posetrack', 'common')
            kp_2d = np.zeros((len(annot_pid), len(annot_pid[0][0]) // 3, 3))
            img_paths = np.zeros((len(annot_pid)))

            for i, (key2djnts, bbox_p, image_id) in enumerate(annot_pid):

                if (bbox_p[2] == 0 or bbox_p[3] == 0):
                    nn_corrupted += 1
                    continue

                img_paths[i] = image_id
                key2djnts[2::3] = len(key2djnts[2::3]) * [1]

                kp_2d[i, :] = np.array(key2djnts).reshape(
                    int(len(key2djnts) / 3), 3)  # [perm_idxs, :]
                for kp_loc in kp_2d[i, :]:
                    if kp_loc[0] == 0 and kp_loc[1] == 0:
                        kp_loc[2] = 0

                x_tl = bbox_p[0]
                y_tl = bbox_p[1]
                w = bbox_p[2]
                h = bbox_p[3]
                # convert the top-left (x, y, w, h) box to a center-based box
                bbox_p[0] = x_tl + w / 2
                bbox_p[1] = y_tl + h / 2

                # make the box square using the longer side, shrunk to 0.8
                w = h = np.where(w / h > 1, w, h)
                w = h = h * 0.8
                bbox_p[2] = w
                bbox_p[3] = h
                bbox[i, :] = bbox_p

            img_paths = list(img_paths)
            img_paths = [
                osp.join(folder, frame2imgname[item]) if item != 0 else 0
                for item in img_paths
            ]

            bbx_idxs = []
            for bbx_id, bbx in enumerate(bbox):
                if np.count_nonzero(bbx) == 0:
                    bbx_idxs += [bbx_id]

            kp_2d = np.delete(kp_2d, bbx_idxs, 0)
            img_paths = np.delete(np.array(img_paths), bbx_idxs, 0)
            bbox = np.delete(bbox, np.where(~bbox.any(axis=1))[0], axis=0)

            # Convert to common 2d keypoint format
            if bbox.size == 0 or bbox.shape[0] < min_frame_number:
                nn_corrupted += 1
                continue

            kp_2d = convert_kps(kp_2d, src='posetrack', dst='spin')

            dataset['vid_name'].append(
                np.array([f'{fname}_{p_id}'] * img_paths.shape[0]))
            dataset['img_name'].append(np.array(img_paths))
            dataset['joints2D'].append(kp_2d)
            dataset['bbox'].append(np.array(bbox))

            # compute_features
            features = extract_features(
                model,
                None,
                np.array(img_paths),
                bbox,
                kp_2d=kp_2d,
                dataset='spin',
                debug=False,
            )

            assert kp_2d.shape[0] == img_paths.shape[0] == bbox.shape[0]

            dataset['features'].append(features)

    print(nn_corrupted, tot_frames)
    for k in dataset.keys():
        dataset[k] = np.array(dataset[k])

    for k in dataset.keys():
        dataset[k] = np.concatenate(dataset[k])

    for k, v in dataset.items():
        print(k, v.shape)

    return dataset
    def get_single_item(self, index):
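        """Return one sample for 3DPW, MPII-3D (MPI-INF-3DHP) or Human3.6M.

        Builds SPIN-format 2D/3D keypoints, SMPL theta (camera, pose and
        shape) and loss weights for the sequence window, transforms and
        normalizes the 2D keypoints to the 224x224 bbox crop, and returns
        the mid-frame targets (repeated 3 times for training; reduced to
        a single frame plus extra metadata for evaluation).
        """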
        start_index, end_index = self.vid_indices[index]

        is_train = self.set == 'train'

        if self.dataset_name == '3dpw':
            kp_2d = convert_kps(self.get_sequence(start_index, end_index, self.db['joints2D']), src='common', dst='spin')
            kp_3d = self.get_sequence(start_index, end_index, self.db['joints3D'])

        elif self.dataset_name == 'mpii3d':
            kp_2d = self.get_sequence(start_index, end_index, self.db['joints2D'])
            if is_train:
                kp_3d = self.get_sequence(start_index, end_index, self.db['joints3D'])

            else:
                kp_3d = convert_kps(self.get_sequence(start_index, end_index, self.db['joints3D']), src='spin', dst='mpii3d_test')
        elif self.dataset_name == 'h36m':
            kp_2d = self.get_sequence(start_index, end_index, self.db['joints2D'])
            if is_train:
                kp_3d = self.get_sequence(start_index, end_index, self.db['joints3D'])
            else:
                kp_3d = convert_kps(self.get_sequence(start_index, end_index, self.db['joints3D']), src='spin', dst='common')

        kp_2d_tensor = np.ones((self.seqlen, 49, 3), dtype=np.float16)
        if is_train:
            nj = 49
        else:
            if self.dataset_name == 'mpii3d':
                nj = 17
            else:
                nj = 14

        kp_3d_tensor = np.zeros((self.seqlen, nj, 3), dtype=np.float16)

        if self.dataset_name == '3dpw':
            pose = self.get_sequence(start_index, end_index, self.db['pose'])
            shape = self.get_sequence(start_index, end_index, self.db['shape'])

            w_smpl = torch.ones(self.seqlen).float()
            w_3d = torch.ones(self.seqlen).float()
        elif self.dataset_name == 'h36m':
            if not is_train:
                pose = np.zeros((kp_2d.shape[0], 72))
                shape = np.zeros((kp_2d.shape[0], 10))
                w_smpl = torch.zeros(self.seqlen).float()
                w_3d = torch.ones(self.seqlen).float()
            else:
                pose = self.get_sequence(start_index, end_index, self.db['pose'])
                shape = self.get_sequence(start_index, end_index, self.db['shape'])
                # SMPL parameters obtained by NeuralAnnot will be released (https://arxiv.org/abs/2011.11232) after publication
                # w_smpl = torch.ones(self.seqlen).float()
                w_smpl = torch.zeros(self.seqlen).float()
                w_3d = torch.ones(self.seqlen).float()
        elif self.dataset_name == 'mpii3d':
            pose = np.zeros((kp_2d.shape[0], 72))
            shape = np.zeros((kp_2d.shape[0], 10))
            w_smpl = torch.zeros(self.seqlen).float()
            w_3d = torch.ones(self.seqlen).float()

        bbox = self.get_sequence(start_index, end_index, self.db['bbox'])
        # img_names = self.get_sequence(start_index, end_index, self.db['img_name'])
        # video = torch.cat(
        #     [get_single_image_crop(image, None, bbox, scale=1.2).unsqueeze(0) for idx, (image, bbox) in
        #      enumerate(zip(img_names, bbox))], dim=0
        # )
        input = torch.from_numpy(self.get_sequence(start_index, end_index, self.db['features'])).float()

        theta_tensor = np.zeros((self.seqlen, 85), dtype=np.float16)

        for idx in range(self.seqlen):
            # crop image and transform 2d keypoints
            kp_2d[idx,:,:2], trans = transfrom_keypoints(
                kp_2d=kp_2d[idx,:,:2],
                center_x=bbox[idx,0],
                center_y=bbox[idx,1],
                width=bbox[idx,2],
                height=bbox[idx,3],
                patch_width=224,
                patch_height=224,
                do_augment=False,
            )

            kp_2d[idx,:,:2] = normalize_2d_kp(kp_2d[idx,:,:2], 224)

            # theta shape (85,)
            theta = np.concatenate((np.array([1., 0., 0.]), pose[idx], shape[idx]), axis=0)

            kp_2d_tensor[idx] = kp_2d[idx]
            theta_tensor[idx] = theta
            kp_3d_tensor[idx] = kp_3d[idx]

        # (N-2)xnjx3
        # accel_gt = kp_3d_tensor[:-2] - 2 * kp_3d_tensor[1:-1] + kp_3d_tensor[2:]
        # accel_gt = np.linalg.norm(accel_gt, axis=2) # (N-2)xnj

        repeat_num = 3
        target = {
            'features': input,
            'theta': torch.from_numpy(theta_tensor).float()[self.mid_frame].repeat(repeat_num, 1), # camera, pose and shape
            'kp_2d': torch.from_numpy(kp_2d_tensor).float()[self.mid_frame].repeat(repeat_num, 1, 1), # 2D keypoints transformed according to bbox cropping
            'kp_3d': torch.from_numpy(kp_3d_tensor).float()[self.mid_frame].repeat(repeat_num, 1, 1), # 3D keypoints
            'w_smpl': w_smpl[self.mid_frame].repeat(repeat_num),
            'w_3d': w_3d[self.mid_frame].repeat(repeat_num),
        }

        if self.dataset_name == 'mpii3d' and not is_train:
            target['valid'] = self.get_sequence(start_index, end_index, self.db['valid_i'])[self.mid_frame]
            target['theta'] = target['theta'][0]
            target['kp_2d'] = target['kp_2d'][0]
            target['kp_3d'] = target['kp_3d'][0]
            target['w_smpl'] = target['w_smpl'][0]
            target['w_3d'] = target['w_3d'][0]

        if self.dataset_name == 'h36m' and not is_train:
            target['valid'] = np.ones(1, dtype=np.float32)
            target['theta'] = target['theta'][0]
            target['kp_2d'] = target['kp_2d'][0]
            target['kp_3d'] = target['kp_3d'][0]
            target['w_smpl'] = target['w_smpl'][0]
            target['w_3d'] = target['w_3d'][0]

            vn = self.get_sequence(start_index, end_index, self.db['vid_name'])
            fi = self.get_sequence(start_index, end_index, self.db['frame_id'])

            target['instance_id'] = [f'{v}_{f:06d}'.split('/')[-1] for v, f in zip(vn, fi)]
            target['bbox'] = bbox[self.mid_frame]
            target['imgname'] = self.get_sequence(start_index, end_index, self.db['img_name']).tolist()

        if self.dataset_name == '3dpw' and not is_train:
            target['valid'] = np.ones(1, dtype=np.float32)
            target['theta'] = target['theta'][1]
            target['kp_2d'] = target['kp_2d'][1]
            target['kp_3d'] = target['kp_3d'][1]
            target['w_smpl'] = target['w_smpl'][1]
            target['w_3d'] = target['w_3d'][1]

            vn = self.get_sequence(start_index, end_index, self.db['vid_name'])
            fi = self.get_sequence(start_index, end_index, self.db['frame_id'])

            target['instance_id'] = [f'{v}_{f:06d}' for v,f in zip(vn,fi)]
            target['bbox'] = bbox[self.mid_frame]
            target['imgname'] = self.get_sequence(start_index, end_index, self.db['img_name']).tolist()

        if self.debug:
            if self.dataset_name == 'mpii3d':
                video = self.get_sequence(start_index, end_index, self.db['img_name'])
                # print(video)
            elif self.dataset_name == 'h36m':
                video = self.get_sequence(start_index, end_index, self.db['img_name'])
            else:
                vid_name = self.db['vid_name'][start_index]
                vid_name = '_'.join(vid_name.split('_')[:-1])
                f = osp.join(self.folder, 'imageFiles', vid_name)
                video_file_list = [osp.join(f, x) for x in sorted(os.listdir(f)) if x.endswith('.jpg')]
                frame_idxs = self.get_sequence(start_index, end_index, self.db['frame_id'])
                video = [video_file_list[i] for i in frame_idxs]

            video = torch.cat(
                [get_single_image_crop(image, bbox).unsqueeze(0) for image, bbox in zip(video, bbox)], dim=0
            )

            target['video'] = video

        return target