def get_single_item(self, index):
    start_index, end_index = self.vid_indices[index]

    with h5py.File(self.h5_file, 'r') as db:
        self.db = db

        kp_2d = self.get_sequence(start_index, end_index, self.db['joints2D'])
        kp_2d = convert_kps(kp_2d, src='insta', dst='spin')
        kp_2d_tensor = np.ones((self.seqlen, 49, 3), dtype=np.float16)

        input = torch.from_numpy(
            self.get_sequence(start_index, end_index, self.db['features'])).float()

        vid_name = self.get_sequence(start_index, end_index, self.db['vid_name'])
        frame_id = self.get_sequence(start_index, end_index, self.db['frame_id']).astype(str)
        instance_id = np.array(
            [v.decode('ascii') + f for v, f in zip(vid_name, frame_id)])

    for idx in range(self.seqlen):
        kp_2d[idx, :, :2] = normalize_2d_kp(kp_2d[idx, :, :2], 224)
        kp_2d_tensor[idx] = kp_2d[idx]

    repeat_num = 3
    target = {
        'features': input,
        'kp_2d': torch.from_numpy(kp_2d_tensor).float()[self.mid_frame].repeat(
            repeat_num, 1, 1),  # 2D keypoints transformed according to bbox cropping
        # 'instance_id': instance_id
    }
    return target
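# Usage sketch (not part of the original file): how a training loop might
# consume items produced by get_single_item, assuming it backs __getitem__
# of a torch Dataset. `InstaDataset` and its constructor are assumed names.
def _demo_insta_loader():
    from torch.utils.data import DataLoader

    dataset = InstaDataset(seqlen=16)  # hypothetical constructor
    loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)
    for batch in loader:
        # 'features': (B, seqlen, feat_dim) precomputed image features
        # 'kp_2d':    (B, 3, 49, 3) mid-frame keypoints repeated 3x
        print(batch['features'].shape, batch['kp_2d'].shape)
        break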
def get_single_item(self, index):
    start_index, end_index = self.vid_indices[index]

    kp_2d = self.get_sequence(start_index, end_index, self.db['joints2D'])
    if self.dataset_name != 'posetrack':
        kp_2d = convert_kps(kp_2d, src=self.dataset_name, dst='spin')
    kp_2d_tensor = np.ones((self.seqlen, 49, 3), dtype=np.float16)

    bbox = self.get_sequence(start_index, end_index, self.db['bbox'])

    input = torch.from_numpy(
        self.get_sequence(start_index, end_index, self.db['features'])).float()

    for idx in range(self.seqlen):
        # crop image and transform 2d keypoints
        kp_2d[idx, :, :2], trans = transfrom_keypoints(
            kp_2d=kp_2d[idx, :, :2],
            center_x=bbox[idx, 0],
            center_y=bbox[idx, 1],
            width=bbox[idx, 2],
            height=bbox[idx, 3],
            patch_width=224,
            patch_height=224,
            do_augment=False,
        )
        kp_2d[idx, :, :2] = normalize_2d_kp(kp_2d[idx, :, :2], 224)
        kp_2d_tensor[idx] = kp_2d[idx]

    vid_name = self.get_sequence(start_index, end_index, self.db['vid_name'])
    frame_id = self.get_sequence(start_index, end_index, self.db['img_name']).astype(str)
    instance_id = np.array([v + f for v, f in zip(vid_name, frame_id)])

    # video = torch.cat(
    #     [get_single_image_crop(image, None, bbox, scale=1.2).unsqueeze(0) for idx, (image, bbox) in
    #      enumerate(zip(frame_id, bbox))], dim=0
    # )

    repeat_num = 3
    target = {
        'features': input,
        'kp_2d': torch.from_numpy(kp_2d_tensor).float()[self.mid_frame].repeat(
            repeat_num, 1, 1),  # 2D keypoints transformed according to bbox cropping
        # 'instance_id': instance_id,
    }

    if self.debug:
        vid_name = self.db['vid_name'][start_index]

        if self.dataset_name == 'pennaction':
            vid_folder = 'frames'
            vid_name = vid_name.split('/')[-1].split('.')[0]
            img_id = 'img_name'
        elif self.dataset_name == 'posetrack':
            vid_folder = osp.join('images', vid_name.split('/')[-2])
            vid_name = vid_name.split('/')[-1].split('.')[0]
            img_id = 'img_name'
        else:
            vid_name = '_'.join(vid_name.split('_')[:-1])
            vid_folder = 'imageFiles'
            img_id = 'frame_id'

        f = osp.join(self.folder, vid_folder, vid_name)
        video_file_list = [
            osp.join(f, x) for x in sorted(os.listdir(f)) if x.endswith('.jpg')
        ]
        frame_idxs = self.get_sequence(start_index, end_index, self.db[img_id])

        if self.dataset_name in ('pennaction', 'posetrack'):
            video = frame_idxs
        else:
            video = [video_file_list[i] for i in frame_idxs]

        video = torch.cat([
            get_single_image_crop(image, bbox).unsqueeze(0)
            for image, bbox in zip(video, bbox)
        ], dim=0)

        target['video'] = video

    return target
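# Hedged sketch (an assumption, not the repo's definition) of the mapping
# normalize_2d_kp is expected to apply above: pixel coordinates inside a
# crop_size x crop_size patch rescaled to the range [-1, 1].
def normalize_2d_kp_sketch(kp_2d, crop_size=224):
    return 2.0 * kp_2d / crop_size - 1.0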
def read_data_train(dataset_path, set='train', debug=False):
    dataset = {
        'vid_name': [],
        'frame_id': [],
        'joints3D': [],
        'joints2D': [],
        'shape': [],
        'pose': [],
        'bbox': [],
        'img_name': [],
        'features': [],
    }

    # occluders = load_occluders('./data/VOC2012')

    model = spin.get_pretrained_hmr()

    if set == 'train':
        subjects = [1, 5, 6, 7, 8]
    else:
        subjects = [9, 11]

    for subject in subjects:
        annot_path = osp.join(dataset_path, 'annotations')

        # camera load
        with open(osp.join(annot_path, 'Human36M_subject' + str(subject) + '_camera.json'), 'r') as f:
            cameras = json.load(f)
        # joint coordinate load
        with open(osp.join(annot_path, 'Human36M_subject' + str(subject) + '_joint_3d.json'), 'r') as f:
            joints = json.load(f)

        # SMPL parameters obtained by NeuralAnnot will be released (https://arxiv.org/abs/2011.11232) after publication
        # # smpl parameter load
        # with open(osp.join(annot_path, 'Human36M_subject' + str(subject) + '_SMPL_NeuralAnnot.json'), 'r') as f:
        #     smpl_params = json.load(f)

        seq_list = sorted(glob.glob(dataset_path + f'/images/s_{subject:02d}*'))
        for seq in tqdm(seq_list):
            seq_name = seq.split('/')[-1]
            act = str(int(seq_name.split('_act_')[-1][0:2]))
            subact = str(int(seq_name.split('_subact_')[-1][0:2]))
            cam = str(int(seq_name.split('_ca_')[-1][0:2]))
            # if cam != '4':  # front camera (Table 6)
            #     continue
            print("seq name: ", seq)

            img_paths = sorted(glob.glob(seq + '/*.jpg'))
            num_frames = len(img_paths)
            if num_frames < 1:
                continue

            # camera parameters
            cam_param = cameras[cam]
            R = np.array(cam_param['R'], dtype=np.float32)
            t = np.array(cam_param['t'], dtype=np.float32)
            f = np.array(cam_param['f'], dtype=np.float32)
            c = np.array(cam_param['c'], dtype=np.float32)

            # img starts from index 1, and annot starts from index 0
            poses = np.zeros((num_frames, 72), dtype=np.float32)
            shapes = np.zeros((num_frames, 10), dtype=np.float32)
            j3ds = np.zeros((num_frames, 49, 3), dtype=np.float32)
            j2ds = np.zeros((num_frames, 49, 3), dtype=np.float32)

            for img_i in tqdm(range(num_frames)):
                # smpl_param = smpl_params[act][subact][str(img_i)][cam]
                # pose = np.array(smpl_param['pose'], dtype=np.float32)
                # shape = np.array(smpl_param['shape'], dtype=np.float32)

                joint_world = np.array(joints[act][subact][str(img_i)], dtype=np.float32)
                # swap right and left joints
                match = [[1, 4], [2, 5], [3, 6]]
                for m in match:
                    l, r = m
                    joint_world[l], joint_world[r] = joint_world[r].copy(), joint_world[l].copy()
                joint_cam = world2cam(joint_world, R, t)
                joint_img = cam2pixel(joint_cam, f, c)

                j3d = convert_kps(joint_cam[None, :, :] / 1000, "h36m", "spin").reshape((-1, 3))
                j3d = j3d - j3d[39]  # root-center: joint 39 is the pelvis in the 49-joint SPIN format

                joint_img[:, 2] = 1
                j2d = convert_kps(joint_img[None, :, :], "h36m", "spin").reshape((-1, 3))

                # poses[img_i] = pose
                # shapes[img_i] = shape
                j3ds[img_i] = j3d
                j2ds[img_i] = j2d

                """
                import torch
                smpl = SMPL(SMPL_MODEL_DIR, batch_size=1, create_transl=False)
                p = torch.from_numpy(pose).float().reshape(1, -1, 3)
                s = torch.from_numpy(shape).float().reshape(1, -1)
                J_regressor = torch.from_numpy(np.load(osp.join(TCMR_DATA_DIR, 'J_regressor_h36m.npy'))).float()
                output = smpl(betas=s, body_pose=p[:, 3:], global_orient=p[:, :3])
                vertices = output.vertices
                J_regressor_batch = J_regressor[None, :].expand(vertices.shape[0], -1, -1).to(vertices.device)
                temp_j3d = torch.matmul(J_regressor_batch, vertices) * 1000
                # temp_j3d = temp_j3d - temp_j3d[:, 0, :]
                temp_j3d = temp_j3d[0, H36M_TO_J14, :]

                gt_j3d = joint_cam - joint_cam[0, :]
                gt_j3d = gt_j3d[H36M_TO_J14, :]

                print("CHECK: ", (temp_j3d - gt_j3d))
                """

            bbox_params, time_pt1, time_pt2 = get_smooth_bbox_params(j2ds, vis_thresh=VIS_THRESH, sigma=8)
            # bbox_params, time_pt1, time_pt2 = get_all_bbox_params(j2ds, vis_thresh=VIS_THRESH)

            """
            img = cv2.imread(img_paths[0])
            temp = draw_skeleton(img, j2ds[0], dataset='spin', unnormalize=False, thickness=2)
            cv2.imshow('img', temp)
            cv2.waitKey(0)
            cv2.destroyAllWindows()
            cv2.waitKey(1)
            """

            # process bbox_params
            c_x = bbox_params[:, 0]
            c_y = bbox_params[:, 1]
            scale = bbox_params[:, 2]
            w = h = 150. / scale
            w = h = h * 0.9  # 1.1 for h36m_train_25fps_occ_db.pt
            bbox = np.vstack([c_x, c_y, w, h]).T

            # subsample frames to 25 fps
            img_paths_array = np.array(img_paths)[time_pt1:time_pt2][::2]
            bbox = bbox[::2]

            dataset['vid_name'].append(np.array([f'{seq}_{subject}'] * num_frames)[time_pt1:time_pt2][::2])
            dataset['frame_id'].append(np.arange(0, num_frames)[time_pt1:time_pt2][::2])
            dataset['joints3D'].append(j3ds[time_pt1:time_pt2][::2])
            dataset['joints2D'].append(j2ds[time_pt1:time_pt2][::2])
            dataset['shape'].append(shapes[time_pt1:time_pt2][::2])
            dataset['pose'].append(poses[time_pt1:time_pt2][::2])
            dataset['img_name'].append(img_paths_array)
            dataset['bbox'].append(bbox)

            features = extract_features(
                model, None, img_paths_array, bbox,
                kp_2d=j2ds[time_pt1:time_pt2][::2],
                debug=debug, dataset='h36m',
                scale=1.0)  # 1.2 for h36m_train_25fps_occ_db.pt
            dataset['features'].append(features)

    for k in dataset.keys():
        dataset[k] = np.concatenate(dataset[k])
        print(k, dataset[k].shape)

    return dataset
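# Usage sketch (assumed): preprocessing scripts like this one are typically
# driven from a __main__ guard and serialized with joblib. The paths and the
# output filename here are illustrative only.
if __name__ == '__main__':
    import joblib

    db = read_data_train('./data/h36m', set='train', debug=False)
    joblib.dump(db, './data/preprocessed_data/h36m_train_25fps_db.pt')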
def read_train_data(dataset_path, debug=False):
    h, w = 2048, 2048
    dataset = {
        'vid_name': [],
        'frame_id': [],
        'joints3D': [],
        'joints2D': [],
        'bbox': [],
        'img_name': [],
        'features': [],
    }

    # occluders = load_occluders('./data/VOC2012')

    model = spin.get_pretrained_hmr()

    # training data
    user_list = range(1, 9)
    seq_list = range(1, 3)
    vid_list = list(range(3)) + list(range(4, 9))

    # product = product(user_list, seq_list, vid_list)
    # user_i, seq_i, vid_i = product[process_id]

    for user_i in user_list:
        print("Subject: ", user_i)
        for seq_i in seq_list:
            print("seq_i: ", seq_i)
            seq_path = os.path.join(dataset_path, 'S' + str(user_i), 'Seq' + str(seq_i))

            # mat file with annotations
            annot_file = os.path.join(seq_path, 'annot.mat')
            annot2 = sio.loadmat(annot_file)['annot2']
            annot3 = sio.loadmat(annot_file)['annot3']

            # calibration file and camera parameters
            for j, vid_i in enumerate(vid_list):
                print("vid_i: ", vid_i)
                # image folder
                imgs_path = os.path.join(seq_path, 'video_' + str(vid_i))

                # per-frame image list
                pattern = os.path.join(imgs_path, '*.jpg')
                img_list = sorted(glob.glob(pattern))

                vid_used_frames = []
                vid_used_joints = []
                vid_used_bbox = []
                vid_segments = []
                vid_uniq_id = 'subj' + str(user_i) + '_seq' + str(seq_i) + '_vid' + str(vid_i) + '_seg0'

                for i, img_i in tqdm_enumerate(img_list):
                    # for each image we store the relevant annotations
                    img_name = img_i.split('/')[-1]

                    joints_2d_raw = np.reshape(annot2[vid_i][0][i], (1, 28, 2))
                    joints_2d_raw = np.append(joints_2d_raw, np.ones((1, 28, 1)), axis=2)
                    joints_2d = convert_kps(joints_2d_raw, "mpii3d", "spin").reshape((-1, 3))

                    joints_3d_raw = np.reshape(annot3[vid_i][0][i], (1, 28, 3)) / 1000
                    joints_3d = convert_kps(joints_3d_raw, "mpii3d", "spin").reshape((-1, 3))

                    bbox = get_bbox_from_kp2d(joints_2d[~np.all(joints_2d == 0, axis=1)]).reshape(4)

                    joints_3d = joints_3d - joints_3d[39]  # root-center: joint 39 is the pelvis in the SPIN format

                    # check that all joints are visible
                    x_in = np.logical_and(joints_2d[:, 0] < w, joints_2d[:, 0] >= 0)
                    y_in = np.logical_and(joints_2d[:, 1] < h, joints_2d[:, 1] >= 0)
                    ok_pts = np.logical_and(x_in, y_in)
                    if np.sum(ok_pts) < joints_2d.shape[0]:
                        # start a new segment whenever a frame is skipped
                        vid_uniq_id = "_".join(vid_uniq_id.split("_")[:-1]) + "_seg" + \
                                      str(int(dataset['vid_name'][-1].split("_")[-1][3:]) + 1)
                        continue

                    visualize = False
                    if visualize and i > 500:
                        import matplotlib.pyplot as plt

                        frame = cv2.cvtColor(cv2.imread(img_i), cv2.COLOR_BGR2RGB)
                        for k in range(49):
                            kp = joints_2d[k]
                            frame = cv2.circle(
                                frame.copy(),
                                (int(kp[0]), int(kp[1])),
                                thickness=3,
                                color=(255, 0, 0),
                                radius=5,
                            )
                            cv2.putText(frame, f'{k}', (int(kp[0]), int(kp[1]) + 1),
                                        cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 0), thickness=3)
                        cv2.imshow('vis', frame)
                        cv2.waitKey(0)
                        cv2.destroyAllWindows()
                        cv2.waitKey(1)

                    dataset['vid_name'].append(vid_uniq_id)
                    dataset['frame_id'].append(img_name.split(".")[0])
                    dataset['img_name'].append(img_i)
                    dataset['joints2D'].append(joints_2d)
                    dataset['joints3D'].append(joints_3d)
                    dataset['bbox'].append(bbox)
                    vid_segments.append(vid_uniq_id)
                    vid_used_frames.append(img_i)
                    vid_used_joints.append(joints_2d)
                    vid_used_bbox.append(bbox)

                vid_segments = np.array(vid_segments)
                ids = np.zeros((len(set(vid_segments)) + 1))
                ids[-1] = len(vid_used_frames) + 1
                if (np.where(vid_segments[:-1] != vid_segments[1:])[0]).size != 0:
                    ids[1:-1] = (np.where(vid_segments[:-1] != vid_segments[1:])[0]) + 1

                # extract features one contiguous segment at a time
                for i in tqdm(range(len(set(vid_segments)))):
                    features = extract_features(
                        model, None,
                        np.array(vid_used_frames)[int(ids[i]):int(ids[i + 1])],
                        vid_used_bbox[int(ids[i]):int(ids[i + 1])],
                        kp_2d=np.array(vid_used_joints)[int(ids[i]):int(ids[i + 1])],
                        dataset='spin', debug=False, scale=1.0)
                    dataset['features'].append(features)

    for k in dataset.keys():
        dataset[k] = np.array(dataset[k])
    dataset['features'] = np.concatenate(dataset['features'])

    return dataset
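# Worked sketch (illustrative) of the segment-boundary bookkeeping above:
# given per-frame segment labels, return the start index of each contiguous
# segment plus an end sentinel, so features can be extracted one unbroken
# segment at a time.
def segment_boundaries(vid_segments):
    vid_segments = np.asarray(vid_segments)
    ids = np.zeros(len(set(vid_segments)) + 1, dtype=int)
    ids[-1] = len(vid_segments)  # end sentinel
    change = np.where(vid_segments[:-1] != vid_segments[1:])[0]
    if change.size != 0:
        ids[1:-1] = change + 1  # first index of each new segment
    return ids

# e.g. segment_boundaries(['a', 'a', 'b', 'b', 'b', 'c']) -> [0, 2, 5, 6]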
def read_test_data(dataset_path):
    dataset = {
        'vid_name': [],
        'frame_id': [],
        'joints3D': [],
        'joints2D': [],
        'bbox': [],
        'img_name': [],
        'features': [],
        'valid_i': [],
    }

    model = spin.get_pretrained_hmr()

    user_list = range(1, 7)

    for user_i in user_list:
        print('Subject', user_i)
        seq_path = os.path.join(dataset_path, 'mpi_inf_3dhp_test_set', 'TS' + str(user_i))

        # mat file with annotations
        annot_file = os.path.join(seq_path, 'annot_data.mat')
        mat_as_h5 = h5py.File(annot_file, 'r')
        annot2 = np.array(mat_as_h5['annot2'])
        annot3 = np.array(mat_as_h5['univ_annot3'])
        valid = np.array(mat_as_h5['valid_frame'])

        vid_used_frames = []
        vid_used_joints = []
        vid_used_bbox = []
        vid_segments = []
        vid_uniq_id = 'subj' + str(user_i) + '_seg0'

        for frame_i, valid_i in tqdm(enumerate(valid)):
            img_i = os.path.join('mpi_inf_3dhp_test_set', 'TS' + str(user_i),
                                 'imageSequence', 'img_' + str(frame_i + 1).zfill(6) + '.jpg')

            joints_2d_raw = np.expand_dims(annot2[frame_i, 0, :, :], axis=0)
            joints_2d_raw = np.append(joints_2d_raw, np.ones((1, 17, 1)), axis=2)
            joints_2d = convert_kps(joints_2d_raw, src="mpii3d_test", dst="spin").reshape((-1, 3))

            visualize = False
            if visualize:
                frame = cv2.cvtColor(cv2.imread(os.path.join(dataset_path, img_i)), cv2.COLOR_BGR2RGB)
                for k in range(49):
                    kp = joints_2d[k]
                    frame = cv2.circle(
                        frame.copy(),
                        (int(kp[0]), int(kp[1])),
                        thickness=3,
                        color=(255, 0, 0),
                        radius=5,
                    )
                    cv2.putText(frame, f'{k}', (int(kp[0]), int(kp[1]) + 1),
                                cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 0), thickness=3)
                cv2.imshow(f'frame:{frame_i}', frame)
                cv2.waitKey(0)
                cv2.destroyAllWindows()
                cv2.waitKey(1)

            joints_3d_raw = np.reshape(annot3[frame_i, 0, :, :], (1, 17, 3)) / 1000
            joints_3d = convert_kps(joints_3d_raw, "mpii3d_test", "spin").reshape((-1, 3))
            joints_3d = joints_3d - joints_3d[39]  # subtract the pelvis (joint 39), the root joint at test time

            bbox = get_bbox_from_kp2d(joints_2d[~np.all(joints_2d == 0, axis=1)]).reshape(4)

            # check that all joints are visible
            img_file = os.path.join(dataset_path, img_i)
            I = cv2.imread(img_file)
            h, w, _ = I.shape
            x_in = np.logical_and(joints_2d[:, 0] < w, joints_2d[:, 0] >= 0)
            y_in = np.logical_and(joints_2d[:, 1] < h, joints_2d[:, 1] >= 0)
            ok_pts = np.logical_and(x_in, y_in)
            if np.sum(ok_pts) < joints_2d.shape[0]:
                # start a new segment whenever a frame is skipped
                vid_uniq_id = "_".join(vid_uniq_id.split("_")[:-1]) + "_seg" + \
                              str(int(dataset['vid_name'][-1].split("_")[-1][3:]) + 1)
                continue

            dataset['vid_name'].append(vid_uniq_id)
            dataset['frame_id'].append(img_file.split("/")[-1].split(".")[0])
            dataset['img_name'].append(img_file)
            dataset['joints2D'].append(joints_2d)
            dataset['joints3D'].append(joints_3d)
            dataset['bbox'].append(bbox)
            dataset['valid_i'].append(valid_i)

            vid_segments.append(vid_uniq_id)
            vid_used_frames.append(img_file)
            vid_used_joints.append(joints_2d)
            vid_used_bbox.append(bbox)

        vid_segments = np.array(vid_segments)
        ids = np.zeros((len(set(vid_segments)) + 1))
        ids[-1] = len(vid_used_frames) + 1
        if (np.where(vid_segments[:-1] != vid_segments[1:])[0]).size != 0:
            ids[1:-1] = (np.where(vid_segments[:-1] != vid_segments[1:])[0]) + 1

        # extract features one contiguous segment at a time
        for i in tqdm(range(len(set(vid_segments)))):
            features = extract_features(
                model, None,
                np.array(vid_used_frames)[int(ids[i]):int(ids[i + 1])],
                vid_used_bbox[int(ids[i]):int(ids[i + 1])],
                kp_2d=np.array(vid_used_joints)[int(ids[i]):int(ids[i + 1])],
                dataset='spin', debug=False,
                scale=1.2)  # 1.0 for mpii3d_train_scale1_db.pt
            dataset['features'].append(features)

    for k in dataset.keys():
        dataset[k] = np.array(dataset[k])
    dataset['features'] = np.concatenate(dataset['features'])

    return dataset
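# Minimal sketch (an assumption, not the repo's implementation) of what
# get_bbox_from_kp2d is expected to return from the visible joints: a
# square (c_x, c_y, w, h) box around the keypoints with some padding.
def bbox_from_kp2d_sketch(kp_2d, margin=1.2):
    ul = kp_2d[:, :2].min(axis=0)  # upper-left corner
    lr = kp_2d[:, :2].max(axis=0)  # lower-right corner
    center = (ul + lr) / 2.0
    side = (lr - ul).max() * margin  # square side with padding
    return np.array([center[0], center[1], side, side])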
def read_data(folder, set):
    dataset = {
        'img_name': [],
        'joints2D': [],
        'bbox': [],
        'vid_name': [],
        'features': [],
    }

    # occluders = load_occluders('./data/VOC2012')

    model = spin.get_pretrained_hmr()

    file_names = glob.glob(osp.join(folder, 'posetrack_data/annotations/', f'{set}/*.json'))
    file_names = sorted(file_names)
    nn_corrupted = 0
    tot_frames = 0
    min_frame_number = 8

    for fid, fname in tqdm_enumerate(file_names):
        if fname == osp.join(folder, 'annotations/train/021133_mpii_train.json'):
            continue

        with open(fname, 'r') as entry:
            anns = json.load(entry)
        # num_frames = anns['images'][0]['nframes']
        anns['images'] = [item for item in anns['images'] if item['is_labeled']]
        num_frames = len(anns['images'])
        frame2imgname = dict()
        for el in anns['images']:
            frame2imgname[el['frame_id']] = el['file_name']

        num_people = -1
        for x in anns['annotations']:
            if num_people < x['track_id']:
                num_people = x['track_id']
        num_people += 1

        posetrack_joints = get_posetrack_original_kp_names()
        idxs = [
            anns['categories'][0]['keypoints'].index(h)
            for h in posetrack_joints
            if h in anns['categories'][0]['keypoints']
        ]
        for x in anns['annotations']:
            kps = np.array(x['keypoints']).reshape((17, 3))
            kps = kps[idxs, :]
            x['keypoints'] = list(kps.flatten())

        tot_frames += num_people * num_frames
        for p_id in range(num_people):
            annot_pid = [
                (item['keypoints'], item['bbox'], item['image_id'])
                for item in anns['annotations']
                if item['track_id'] == p_id and not (np.count_nonzero(item['keypoints']) == 0)
            ]

            if len(annot_pid) < min_frame_number:
                nn_corrupted += len(annot_pid)
                continue

            bbox = np.zeros((len(annot_pid), 4))
            # perm_idxs = get_perm_idxs('posetrack', 'common')
            kp_2d = np.zeros((len(annot_pid), len(annot_pid[0][0]) // 3, 3))
            img_paths = np.zeros((len(annot_pid)))

            for i, (key2djnts, bbox_p, image_id) in enumerate(annot_pid):
                if bbox_p[2] == 0 or bbox_p[3] == 0:
                    nn_corrupted += 1
                    continue

                img_paths[i] = image_id
                key2djnts[2::3] = len(key2djnts[2::3]) * [1]

                kp_2d[i, :] = np.array(key2djnts).reshape(int(len(key2djnts) / 3), 3)  # [perm_idxs, :]
                for kp_loc in kp_2d[i, :]:
                    if kp_loc[0] == 0 and kp_loc[1] == 0:
                        kp_loc[2] = 0

                # convert (x_tl, y_tl, w, h) to a square box centered on the person
                x_tl = bbox_p[0]
                y_tl = bbox_p[1]
                w = bbox_p[2]
                h = bbox_p[3]
                bbox_p[0] = x_tl + w / 2
                bbox_p[1] = y_tl + h / 2
                # w = h = np.where(w / h > 1, w, h)
                w = h = h * 0.8
                bbox_p[2] = w
                bbox_p[3] = h
                bbox[i, :] = bbox_p

            img_paths = list(img_paths)
            img_paths = [
                osp.join(folder, frame2imgname[item]) if item != 0 else 0
                for item in img_paths
            ]

            bbx_idxs = []
            for bbx_id, bbx in enumerate(bbox):
                if np.count_nonzero(bbx) == 0:
                    bbx_idxs += [bbx_id]

            kp_2d = np.delete(kp_2d, bbx_idxs, 0)
            img_paths = np.delete(np.array(img_paths), bbx_idxs, 0)
            bbox = np.delete(bbox, np.where(~bbox.any(axis=1))[0], axis=0)

            # convert to the common 2d keypoint format
            if bbox.size == 0 or bbox.shape[0] < min_frame_number:
                nn_corrupted += 1
                continue
            kp_2d = convert_kps(kp_2d, src='posetrack', dst='spin')

            dataset['vid_name'].append(np.array([f'{fname}_{p_id}'] * img_paths.shape[0]))
            dataset['img_name'].append(np.array(img_paths))
            dataset['joints2D'].append(kp_2d)
            dataset['bbox'].append(np.array(bbox))

            # compute features
            features = extract_features(
                model, None,
                np.array(img_paths),
                bbox,
                kp_2d=kp_2d,
                dataset='spin',
                debug=False,
            )
            assert kp_2d.shape[0] == img_paths.shape[0] == bbox.shape[0]
            dataset['features'].append(features)

    print(nn_corrupted, tot_frames)

    for k in dataset.keys():
        dataset[k] = np.array(dataset[k])
    for k in dataset.keys():
        dataset[k] = np.concatenate(dataset[k])
    for k, v in dataset.items():
        print(k, v.shape)

    return dataset
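# Sketch of the bbox convention change performed inside the loop above:
# PoseTrack annotations store (x_tl, y_tl, w, h); the cropping code expects
# (c_x, c_y, w, h) with a square side derived from the height. The shrink
# factor mirrors the 0.8 used above.
def tlwh_to_center_square(bbox_p, shrink=0.8):
    x_tl, y_tl, w, h = bbox_p
    c_x, c_y = x_tl + w / 2, y_tl + h / 2
    side = h * shrink
    return [c_x, c_y, side, side]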
def get_single_item(self, index):
    start_index, end_index = self.vid_indices[index]

    is_train = self.set == 'train'

    if self.dataset_name == '3dpw':
        kp_2d = convert_kps(
            self.get_sequence(start_index, end_index, self.db['joints2D']),
            src='common', dst='spin')
        kp_3d = self.get_sequence(start_index, end_index, self.db['joints3D'])
    elif self.dataset_name == 'mpii3d':
        kp_2d = self.get_sequence(start_index, end_index, self.db['joints2D'])
        if is_train:
            kp_3d = self.get_sequence(start_index, end_index, self.db['joints3D'])
        else:
            kp_3d = convert_kps(
                self.get_sequence(start_index, end_index, self.db['joints3D']),
                src='spin', dst='mpii3d_test')
    elif self.dataset_name == 'h36m':
        kp_2d = self.get_sequence(start_index, end_index, self.db['joints2D'])
        if is_train:
            kp_3d = self.get_sequence(start_index, end_index, self.db['joints3D'])
        else:
            kp_3d = convert_kps(
                self.get_sequence(start_index, end_index, self.db['joints3D']),
                src='spin', dst='common')

    kp_2d_tensor = np.ones((self.seqlen, 49, 3), dtype=np.float16)
    if is_train:
        nj = 49
    else:
        nj = 17 if self.dataset_name == 'mpii3d' else 14
    kp_3d_tensor = np.zeros((self.seqlen, nj, 3), dtype=np.float16)

    if self.dataset_name == '3dpw':
        pose = self.get_sequence(start_index, end_index, self.db['pose'])
        shape = self.get_sequence(start_index, end_index, self.db['shape'])
        w_smpl = torch.ones(self.seqlen).float()
        w_3d = torch.ones(self.seqlen).float()
    elif self.dataset_name == 'h36m':
        if not is_train:
            pose = np.zeros((kp_2d.shape[0], 72))
            shape = np.zeros((kp_2d.shape[0], 10))
            w_smpl = torch.zeros(self.seqlen).float()
            w_3d = torch.ones(self.seqlen).float()
        else:
            pose = self.get_sequence(start_index, end_index, self.db['pose'])
            shape = self.get_sequence(start_index, end_index, self.db['shape'])
            # SMPL parameters obtained by NeuralAnnot will be released (https://arxiv.org/abs/2011.11232) after publication
            # w_smpl = torch.ones(self.seqlen).float()
            w_smpl = torch.zeros(self.seqlen).float()
            w_3d = torch.ones(self.seqlen).float()
    elif self.dataset_name == 'mpii3d':
        pose = np.zeros((kp_2d.shape[0], 72))
        shape = np.zeros((kp_2d.shape[0], 10))
        w_smpl = torch.zeros(self.seqlen).float()
        w_3d = torch.ones(self.seqlen).float()

    bbox = self.get_sequence(start_index, end_index, self.db['bbox'])
    # img_names = self.get_sequence(start_index, end_index, self.db['img_name'])
    # video = torch.cat(
    #     [get_single_image_crop(image, None, bbox, scale=1.2).unsqueeze(0) for idx, (image, bbox) in
    #      enumerate(zip(img_names, bbox))], dim=0
    # )

    input = torch.from_numpy(
        self.get_sequence(start_index, end_index, self.db['features'])).float()

    theta_tensor = np.zeros((self.seqlen, 85), dtype=np.float16)

    for idx in range(self.seqlen):
        # crop image and transform 2d keypoints
        kp_2d[idx, :, :2], trans = transfrom_keypoints(
            kp_2d=kp_2d[idx, :, :2],
            center_x=bbox[idx, 0],
            center_y=bbox[idx, 1],
            width=bbox[idx, 2],
            height=bbox[idx, 3],
            patch_width=224,
            patch_height=224,
            do_augment=False,
        )
        kp_2d[idx, :, :2] = normalize_2d_kp(kp_2d[idx, :, :2], 224)

        # theta shape (85,): camera (3) + pose (72) + shape (10)
        theta = np.concatenate((np.array([1., 0., 0.]), pose[idx], shape[idx]), axis=0)

        kp_2d_tensor[idx] = kp_2d[idx]
        theta_tensor[idx] = theta
        kp_3d_tensor[idx] = kp_3d[idx]

    # accel_gt = kp_3d_tensor[:-2] - 2 * kp_3d_tensor[1:-1] + kp_3d_tensor[2:]  # (N-2) x nj x 3
    # accel_gt = np.linalg.norm(accel_gt, axis=2)  # (N-2) x nj

    repeat_num = 3
    target = {
        'features': input,
        'theta': torch.from_numpy(theta_tensor).float()[self.mid_frame].repeat(repeat_num, 1),  # camera, pose and shape
        'kp_2d': torch.from_numpy(kp_2d_tensor).float()[self.mid_frame].repeat(repeat_num, 1, 1),  # 2D keypoints transformed according to bbox cropping
        'kp_3d': torch.from_numpy(kp_3d_tensor).float()[self.mid_frame].repeat(repeat_num, 1, 1),  # 3D keypoints
        'w_smpl': w_smpl[self.mid_frame].repeat(repeat_num),
        'w_3d': w_3d[self.mid_frame].repeat(repeat_num),
    }

    if self.dataset_name == 'mpii3d' and not is_train:
        target['valid'] = self.get_sequence(start_index, end_index, self.db['valid_i'])[self.mid_frame]
        target['theta'] = target['theta'][0]
        target['kp_2d'] = target['kp_2d'][0]
        target['kp_3d'] = target['kp_3d'][0]
        target['w_smpl'] = target['w_smpl'][0]
        target['w_3d'] = target['w_3d'][0]

    if self.dataset_name == 'h36m' and not is_train:
        target['valid'] = np.ones(1, dtype=np.float32)
        target['theta'] = target['theta'][0]
        target['kp_2d'] = target['kp_2d'][0]
        target['kp_3d'] = target['kp_3d'][0]
        target['w_smpl'] = target['w_smpl'][0]
        target['w_3d'] = target['w_3d'][0]

        vn = self.get_sequence(start_index, end_index, self.db['vid_name'])
        fi = self.get_sequence(start_index, end_index, self.db['frame_id'])
        target['instance_id'] = [f'{v}_{f:06d}'.split('/')[-1] for v, f in zip(vn, fi)]
        target['bbox'] = bbox[self.mid_frame]
        target['imgname'] = self.get_sequence(start_index, end_index, self.db['img_name']).tolist()

    if self.dataset_name == '3dpw' and not is_train:
        target['valid'] = np.ones(1, dtype=np.float32)
        target['theta'] = target['theta'][1]
        target['kp_2d'] = target['kp_2d'][1]
        target['kp_3d'] = target['kp_3d'][1]
        target['w_smpl'] = target['w_smpl'][1]
        target['w_3d'] = target['w_3d'][1]

        vn = self.get_sequence(start_index, end_index, self.db['vid_name'])
        fi = self.get_sequence(start_index, end_index, self.db['frame_id'])
        target['instance_id'] = [f'{v}_{f:06d}' for v, f in zip(vn, fi)]
        target['bbox'] = bbox[self.mid_frame]
        target['imgname'] = self.get_sequence(start_index, end_index, self.db['img_name']).tolist()

    if self.debug:
        if self.dataset_name == 'mpii3d':
            video = self.get_sequence(start_index, end_index, self.db['img_name'])
            # print(video)
        elif self.dataset_name == 'h36m':
            video = self.get_sequence(start_index, end_index, self.db['img_name'])
        else:
            vid_name = self.db['vid_name'][start_index]
            vid_name = '_'.join(vid_name.split('_')[:-1])
            f = osp.join(self.folder, 'imageFiles', vid_name)
            video_file_list = [osp.join(f, x) for x in sorted(os.listdir(f)) if x.endswith('.jpg')]
            frame_idxs = self.get_sequence(start_index, end_index, self.db['frame_id'])
            video = [video_file_list[i] for i in frame_idxs]

        video = torch.cat(
            [get_single_image_crop(image, bbox).unsqueeze(0) for image, bbox in zip(video, bbox)],
            dim=0
        )
        target['video'] = video

    return target
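# Runnable expansion of the commented-out acceleration lines above: the
# finite-difference acceleration of the 3D joints, as used for acceleration
# error metrics in this family of codebases. Assumes kp_3d of shape (N, nj, 3).
def accel_norm(kp_3d):
    accel = kp_3d[:-2] - 2 * kp_3d[1:-1] + kp_3d[2:]  # (N-2) x nj x 3
    return np.linalg.norm(accel, axis=2)              # (N-2) x nj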