def transform(self, points: torch.DoubleTensor):
    points = ensure_homogeneous(points, d=3)
    if len(self.shuffle_indices) > 0:
        index = torch.LongTensor(
            self.shuffle_indices).unsqueeze(-1).expand_as(points)
        points = points.gather(-2, index)
    return torch.mm(points, self.matrix.t())
def load_and_process_example(dataset, example_index, device, model):
    example = load_example(dataset, example_index)
    if model is None:
        return example

    in_var = example['input'].unsqueeze(0).to(device, torch.float32)
    out_var = model(in_var)

    pred_skel_norm = ensure_homogeneous(
        out_var.squeeze(0).to(CPU, torch.float64), d=3)
    pred_skel_denorm = dataset.denormalise_with_skeleton_height(
        pred_skel_norm, example['camera'], example['transform_opts'])
    pred_skel_image_space = example['camera'].project_cartesian(pred_skel_denorm)
    pred_skel_camera_space = dataset.untransform_skeleton(
        pred_skel_denorm, example['transform_opts'])

    return dict(
        pred_skel=dict(
            normalised=pred_skel_norm,
            camera_space=pred_skel_camera_space,
            image_space=pred_skel_image_space,
        ),
        xy_heatmaps=[hm.squeeze(0).to(CPU, torch.float32)
                     for hm in model.xy_heatmaps],
        zy_heatmaps=[hm.squeeze(0).to(CPU, torch.float32)
                     for hm in model.zy_heatmaps],
        xz_heatmaps=[hm.squeeze(0).to(CPU, torch.float32)
                     for hm in model.xz_heatmaps],
        **example,
    )
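# Typical call (sketch; `dataset` and `model` are assumed to be constructed
# elsewhere in the project):
#
#     result = load_and_process_example(dataset, 0, torch.device('cuda'), model)
#     result['pred_skel']['camera_space']  # predicted joints in camera space
#     result['xy_heatmaps'][-1]            # final-stage XY heatmaps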
def do_validation_pass(epoch, model, tel, loader):
    vis_images = None
    model.eval()

    with torch.no_grad():
        for batch in progress_iter(loader, 'Validation'):
            in_var = batch['input'].to(global_opts['device'], torch.float32)
            target_var = batch['target'].to(global_opts['device'], torch.float32)
            mask_var = batch['joint_mask'].to(global_opts['device'], torch.float32)

            # Calculate predictions and loss
            out_var = model(in_var)
            loss = forward_loss(model, out_var, target_var, mask_var,
                                batch['valid_depth'])
            tel['val_loss'].add(loss.sum().item())

            calculate_performance_metrics(
                batch, loader.dataset,
                ensure_homogeneous(out_var.to(CPU, torch.float64).detach(), d=3),
                tel['val_mpjpe'], tel['val_pck'])

            if vis_images is None:
                preds = out_var.to(CPU, torch.float64).detach()
                vis_images = visualise_predictions(preds, batch, loader.dataset)

    tel['val_examples'].set_value(vis_images[:8])
def canonicalise_orientation(skel_desc, skel):
    """Rotate the skeleton into a canonical orientation.

    This is achieved by aligning the plane formed by the left shoulder,
    right shoulder, and pelvis joints with the XY plane. The root joint is
    positioned at the origin. The direction from the pelvis to the midpoint
    of the shoulders is aligned with the negative Y direction. "Forwards"
    for the skeleton corresponds to the negative Z direction.

    Args:
        skel_desc (SkeletonDesc): The skeleton description
        skel (torch.Tensor): The skeleton

    Returns:
        The reoriented skeleton
    """
    skel = ensure_homogeneous(skel, d=3)
    cart_skel = homogeneous_to_cartesian(skel)
    cart_skel = cart_skel - cart_skel[skel_desc.root_joint_id]
    rshoulder = cart_skel[skel_desc.joint_names.index('right_shoulder')]
    lshoulder = cart_skel[skel_desc.joint_names.index('left_shoulder')]
    pelvis = cart_skel[skel_desc.joint_names.index('pelvis')]

    v1 = rshoulder - pelvis
    v2 = lshoulder - pelvis
    forward = torch.cross(v1, v2)
    forward = forward / forward.norm(2)
    up = 0.5 * (v1 + v2)
    up = up / up.norm(2)
    right = torch.cross(forward, up)
    right = right / right.norm(2)
    up = torch.cross(forward, right)

    look_at = skel.new([
        [right[0], up[0], forward[0], 0],
        [right[1], up[1], forward[1], 0],
        [right[2], up[2], forward[2], 0],
        [0, 0, 0, 1],
    ])
    return torch.matmul(ensure_homogeneous(cart_skel, d=3), look_at)
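# Hedged usage sketch for canonicalise_orientation. `_DemoSkeletonDesc` is a
# hypothetical stand-in; the real project presumably supplies SkeletonDesc
# instances with full joint lists.
def _demo_canonicalise_orientation():
    from collections import namedtuple
    import torch

    _DemoSkeletonDesc = namedtuple('_DemoSkeletonDesc',
                                   ['joint_names', 'root_joint_id'])
    desc = _DemoSkeletonDesc(
        joint_names=['pelvis', 'left_shoulder', 'right_shoulder'],
        root_joint_id=0,
    )
    skel = torch.DoubleTensor([
        [0.0, 0.0, 0.0],         # pelvis
        [200.0, -500.0, 50.0],   # left_shoulder
        [-200.0, -500.0, 50.0],  # right_shoulder
    ])
    canon = canonicalise_orientation(desc, skel)
    # The root joint (pelvis) lands at the origin; the remaining axes follow
    # the conventions described in the docstring above.
    assert canon[desc.root_joint_id, :3].abs().max().item() < 1e-9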
def get_orig_skeleton(self, index):
    id = self.example_ids[index]
    original_skel = ensure_homogeneous(torch.from_numpy(self.joint_3d[id]), d=3)
    if self.skeleton_desc.canonical:
        if original_skel.size(-2) == H36MSkeletonDesc.n_joints:
            original_skel = h36m_to_canonical_skeleton(original_skel)
        else:
            raise Exception('unexpected number of joints: {}'.format(
                original_skel.size(-2)))
    return original_skel
def denormalise_with_depth(self, normalised_skel, z_ref, intrinsics):
    """Transform a normalised skeleton into denormalised form.

    Follow this up with point_transformer.untransform() to get a skeleton
    which is comparable with original_skel.
    """
    return self.skeleton_normaliser.denormalise_skeleton(
        ensure_homogeneous(normalised_skel, d=3), z_ref, intrinsics,
        self.data_specs.input_specs.height, self.data_specs.input_specs.width)
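# Intended two-step pipeline (sketch; `dataset`, `example`, and `z_ref` are
# assumed to come from the surrounding data loading code, and the
# `point_transformer` attribute path is an assumption based on the docstring
# above):
#
#     denorm_skel = dataset.denormalise_with_depth(
#         example['target'], z_ref, example['camera_intrinsic'])
#     orig_space_skel = dataset.point_transformer.untransform(denorm_skel)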
def untransform(self, points: torch.DoubleTensor):
    points = ensure_homogeneous(points, d=3)
    if len(self.shuffle_indices) > 0:
        # Build the inverse of the shuffle permutation applied by transform().
        inv_shuffle_indices = list(range(len(self.shuffle_indices)))
        for i, j in enumerate(self.shuffle_indices):
            inv_shuffle_indices[j] = i
        index = torch.LongTensor(
            inv_shuffle_indices).unsqueeze(-1).expand_as(points)
        points = points.gather(-2, index)
    return torch.mm(points, self.matrix.inverse().t())
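# Hedged sketch of the permutation round trip used by transform() and
# untransform() above, with a toy shuffle (not tied to any real skeleton
# layout):
def _demo_shuffle_inversion():
    import torch

    shuffle_indices = [2, 0, 1]
    inv_shuffle_indices = list(range(len(shuffle_indices)))
    for i, j in enumerate(shuffle_indices):
        inv_shuffle_indices[j] = i

    points = torch.arange(12, dtype=torch.float64).view(3, 4)
    index = torch.LongTensor(shuffle_indices).unsqueeze(-1).expand_as(points)
    shuffled = points.gather(-2, index)
    inv_index = torch.LongTensor(
        inv_shuffle_indices).unsqueeze(-1).expand_as(points)
    restored = shuffled.gather(-2, inv_index)
    # Applying the permutation and then its inverse restores the row order.
    assert torch.equal(restored, points)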
def do_training_pass(epoch, model, tel, loader, scheduler, on_progress):
    if hasattr(scheduler, 'step'):
        scheduler.step(epoch)
    optimiser = scheduler.optimizer

    vis_images = None
    samples_processed = 0

    model.train()
    for batch in generator_timer(progress_iter(loader, 'Training'),
                                 tel['data_load_time']):
        if hasattr(scheduler, 'batch_step'):
            scheduler.batch_step()

        with timer(tel['data_transfer_time']):
            in_var = batch['input'].to(global_opts['device'], torch.float32)
            target_var = batch['target'].to(global_opts['device'], torch.float32)
            mask_var = batch['joint_mask'].to(global_opts['device'], torch.float32)

        # Calculate predictions and loss
        with timer(tel['forward_time']):
            out_var = model(in_var)
            loss = forward_loss(model, out_var, target_var, mask_var,
                                batch['valid_depth'])
            tel['train_loss'].add(loss.sum().item())

        # Calculate accuracy metrics
        with timer(tel['eval_time']):
            calculate_performance_metrics(
                batch, loader.dataset,
                ensure_homogeneous(out_var.to(CPU, torch.float64).detach(), d=3),
                tel['train_mpjpe'], tel['train_pck'])

        # Calculate gradients
        with timer(tel['backward_time']):
            optimiser.zero_grad()
            loss.backward()

        # Update parameters
        with timer(tel['optim_time']):
            optimiser.step()

        # Update progress
        samples_processed += len(batch['input'])
        on_progress(samples_processed)

        if vis_images is None:
            preds = out_var.to(CPU, torch.float64).detach()
            vis_images = visualise_predictions(preds, batch, loader.dataset)

    tel['train_examples'].set_value(vis_images[:8])
def test_denormalise_skeleton(self):
    denorm_skel = ensure_homogeneous(self.points.clone(), d=3)
    denorm_skel[:, :2] -= denorm_skel[MPI3D_SKELETON_DESC.root_joint_id, :2]
    normaliser = SkeletonNormaliser()
    norm_skel = normaliser.normalise_skeleton(denorm_skel, self.z_ref,
                                              self.camera, 2048, 2048)
    recons_skel = normaliser.denormalise_skeleton(norm_skel, self.z_ref,
                                                  self.camera, 2048, 2048)
    self.assertAlmostEqual(torch.dist(recons_skel, denorm_skel).item(), 0,
                           delta=1e-4)
def test_normalise_skeleton(self):
    denorm_skel = ensure_homogeneous(self.points.clone(), d=3)
    denorm_skel[:, :2] -= denorm_skel[MPI3D_SKELETON_DESC.root_joint_id, :2]
    normaliser = SkeletonNormaliser()
    norm_skel = normaliser.normalise_skeleton(denorm_skel, self.z_ref,
                                              self.camera, 2048, 2048)
    self.assertAlmostEqual(
        torch.dist(norm_skel[1],
                   torch.DoubleTensor([0.0215, -0.1514, -0.0127, 1.0000])).item(),
        0, delta=1e-4)
def obtain_predictions(model, device, loader, known_depth=False,
                       print_progress=False):
    model.eval()

    iterable = loader
    if print_progress:
        iterable = tqdm(loader, leave=True, ascii=True)

    for batch in iterable:
        in_var = batch['input'].to(device, torch.float32)
        target_var = batch['target'].to(device, torch.float32)

        # Calculate predictions and loss
        start_time = perf_counter()
        out_var = model(in_var)
        inference_time = perf_counter() - start_time
        loss = average_loss(
            model.forward_3d_losses(out_var, target_var.narrow(-1, 0, 3)))

        norm_preds = ensure_homogeneous(out_var.to(CPU, torch.float64), d=3)

        actuals = []
        expected = None
        for i, norm_pred in enumerate(norm_preds):
            expected_i, actual_i = prepare_for_3d_evaluation(
                batch['original_skel'][i], norm_pred, loader.dataset,
                batch['camera_intrinsic'][i], batch['transform_opts'][i],
                known_depth=known_depth)
            if expected is not None:
                assert (expected_i - expected).abs().gt(1e-6).sum() == 0, \
                    'Expected all examples in batch to have the same target'
            expected = expected_i
            actuals.append(actual_i)
        actual = torch.stack(actuals, 0).mean(0)

        try:
            frame_ref = batch['frame_ref'][0]
        except KeyError:
            frame_ref = None

        prediction = dict(
            expected=expected,
            actual=actual,
            frame_ref=frame_ref,
            inference_time=inference_time,
            loss=loss.sum().item(),
        )
        yield prediction
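# Consuming the generator (sketch; `model`, `device`, and `loader` are assumed
# to be set up as in the surrounding training/validation code):
#
#     for prediction in obtain_predictions(model, device, loader,
#                                          print_progress=True):
#         print(prediction['loss'], prediction['inference_time'])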
def load_example(dataset, example_index):
    example = dataset[example_index]
    input = example['input']
    input_image = dataset.input_to_pil_image(input)
    camera = example['camera_intrinsic']
    transform_opts = example['transform_opts']

    gt_skel = None
    if 'target' in example:
        gt_skel = dict(original=example['original_skel'])
        gt_skel_norm = ensure_homogeneous(example['target'], d=3)
        gt_skel_denorm = dataset.denormalise_with_skeleton_height(
            gt_skel_norm, camera, transform_opts)
        gt_skel['image_space'] = camera.project_cartesian(gt_skel_denorm)
        gt_skel['camera_space'] = dataset.untransform_skeleton(
            gt_skel_denorm, transform_opts)

    return dict(
        input=input,
        input_image=input_image,
        camera=camera,
        transform_opts=transform_opts,
        gt_skel=gt_skel,
    )
def _build_sample(self, index, orig_camera, orig_image, orig_skel,
                  transform_opts, extrinsics):
    frame_ref = self.frame_refs[index]
    out_width = self.data_specs.input_specs.width
    out_height = self.data_specs.input_specs.height

    ctx = self.create_transformer_context(transform_opts)
    camera_int, img, joints3d = ctx.transform(orig_camera, orig_image, orig_skel)

    z_ref = joints3d[self.skeleton_desc.root_joint_id, 2]
    target = self.skeleton_normaliser.normalise_skeleton(
        joints3d, z_ref, camera_int, out_height, out_width)

    sample = {
        # Description of which video frame the example comes from
        'frame_ref': frame_ref.to_dict(),
        # Index in the dataset
        'index': index,
        'valid_depth': 1,

        # "Original" data without transforms applied (universal scale)
        'original_skel': ensure_homogeneous(orig_skel, d=3),

        # Transformed data
        'camera_intrinsic': camera_int,
        'camera_extrinsic': extrinsics,
        'target': target,  # Normalised target skeleton

        # Transformer data
        'transform_opts': transform_opts,

        'joint_mask': torch.ByteTensor(target.size(-2)).fill_(1),
    }
    if img:
        sample['input'] = self.input_to_tensor(img)
    return sample
def _build_sample(self, index, orig_camera, orig_image, orig_skel,
                  transform_opts, transform_opts_big):
    frame_ref = self.frame_refs[index]

    if orig_skel.shape[0] != 17:
        canonical_original_skel = self._mpi_inf_3dhp_to_canonical_skeleton(
            ensure_homogeneous(orig_skel, d=3)).float()
    else:
        canonical_original_skel = ensure_homogeneous(orig_skel, d=3).float()

    ctx = self.create_transformer_context(transform_opts)
    _, img, _ = ctx.transform(image=orig_image)
    big_ctx = self.create_transformer_context(transform_opts_big)
    _, img_big, _ = big_ctx.transform(image=orig_image)

    sample = {
        'index': index,  # Index in the dataset
        'original_skel': canonical_original_skel,
        'camera_original': orig_camera.matrix[:, :-1].float(),
        'original_img_shape': torch.FloatTensor(orig_image.size),
    }

    img_transform = transforms.Compose([transforms.ToTensor()])
    if img:
        sample['input'] = self.input_to_tensor(img)
    if img_big:
        sample['input_big'] = self.input_to_tensor(img_big)
        sample['input_big_img'] = img_transform(img_big)

    # Generate the GT location and scale of the crop. Joint 14 is the pelvis
    # (hip) in the canonical skeleton; it is wrapped in an extra dimension
    # because legacy code in utils takes a list of centres.
    pelvis_joint = sample['original_skel'][14, :-1].unsqueeze(0)
    all_joints = sample['original_skel'][:, :-1]
    sample['world_coord_skel_mm'] = all_joints
    relative_joints = all_joints - pelvis_joint
    sample['non_normalized_3d'] = relative_joints

    # Normalise the joints.
    normalized_joints = utils.batch_normalize_canon_human_joints(
        relative_joints.unsqueeze(0), mpi_3d_Mean, mpi_3d_Std).squeeze(0)
    sample['normalized_skel_mm'] = normalized_joints
    sample['pelvis_location_mm'] = pelvis_joint

    Ks_px = sample['camera_original']
    K = Ks_px.clone()
    K[0, 2] = 0.
    K[1, 2] = 0.
    P_px = Ks_px.clone()
    pose_2d = utils.world_2_camera_coordinates(P_px, all_joints.float())
    sample['pose2d_original'] = pose_2d
    sample['perspective_matrix'] = P_px

    if self.focal_diff != 0:
        Ks_px[0, 0] *= self.focal_diff
        Ks_px[1, 1] *= self.focal_diff
        sample['camera_original'] = Ks_px

    if self.calculate_scale_from_2d:
        scale = utils.generate_gt_scales_from2d(pose_2d)
        square_scale = torch.tensor([torch.max(scale), torch.max(scale)])
    else:
        scale = utils.generate_gt_scales(
            K, self.human_height, pelvis_joint,
            sample['original_img_shape'][0],
            sample['original_img_shape'][1])  # 2000 is the height in mm
        square_scale = scale.clone()
    square_scale_py = square_scale / sample['original_img_shape']
    sample['stn_square_scale_py'] = square_scale_py

    location_2d3d = utils.generate_gt_location(
        P_px, pelvis_joint,
        sample['original_img_shape'][0],
        sample['original_img_shape'][1])
    sample['crop_location_2d3d'] = location_2d3d

    # Location centred in the middle of the 2D pose (NOTE: not the same as
    # the location calculation in 2D->3D).
    location = torch.FloatTensor([
        (torch.max(pose_2d[:, 0]) + torch.min(pose_2d[:, 0])) / 2,
        (torch.max(pose_2d[:, 1]) + torch.min(pose_2d[:, 1])) / 2,
    ])
    sample['crop_scale'] = torch.FloatTensor(scale)
    sample['crop_location'] = torch.FloatTensor(location)
    return sample
def project_cartesian(self, coords):
    coords = ensure_homogeneous(coords, d=3)
    return ensure_cartesian(self.project(coords), d=2)
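# Sketch (assumes `camera` is an intrinsics instance like the
# `camera_intrinsic` values used elsewhere in this code): the method accepts
# cartesian or homogeneous 3D points and returns cartesian 2D coordinates.
#
#     p2d = camera.project_cartesian(torch.DoubleTensor([[100.0, -50.0, 2000.0]]))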
def __getitem__(self, index):
    frame_ref = self.frame_refs[index]
    orig_skel = self.get_univ_skeleton(index)

    if self.without_image:
        orig_image = None
        img_w = img_h = 768
    else:
        orig_image = Image.open(path.join(self.data_dir, frame_ref.image_file))
        img_w, img_h = orig_image.size

    with open(path.join(self.data_dir, frame_ref.camera_file), 'r') as f:
        cam_cal = parse_camera_calibration(f)[frame_ref.camera_id]

    # Correct the camera to account for the fact that video frames were
    # stored at a lower resolution.
    orig_camera = cam_cal['intrinsics'].clone()
    old_w = cam_cal['image_width']
    old_h = cam_cal['image_height']
    orig_camera.scale_image(img_w / old_w, img_h / old_h)

    extrinsics = cam_cal['extrinsics']

    # Bounding box details
    joints2d = homogeneous_to_cartesian(
        orig_camera.project(ensure_homogeneous(orig_skel, d=3)))
    min_x = joints2d[:, 0].min().item()
    max_x = joints2d[:, 0].max().item()
    min_y = joints2d[:, 1].min().item()
    max_y = joints2d[:, 1].max().item()
    bb_cx = (min_x + max_x) / 2
    bb_cy = (min_y + max_y) / 2
    bb_size = 1.5 * max(max_x - min_x, max_y - min_y)

    img_short_side = min(img_h, img_w)
    out_width = self.data_specs.input_specs.width
    out_height = self.data_specs.input_specs.height

    if self.multicrop:
        samples = []
        for aug_hflip in [False, True]:
            for offset in [(0, 0), (-1, 0), (0, -1), (1, 0), (0, 1)]:
                aug_x = offset[0] * 8
                aug_y = offset[1] * 8

                transform_opts = {
                    'in_camera': orig_camera,
                    'in_width': img_w,
                    'in_height': img_h,
                    'centre_x': bb_cx + aug_x,
                    'centre_y': bb_cy + aug_y,
                    'rotation': 0,
                    'scale': bb_size / img_short_side,
                    'hflip_indices': self.skeleton_desc.hflip_indices,
                    'hflip': aug_hflip,
                    'out_width': out_width,
                    'out_height': out_height,
                    'brightness': 1,
                    'contrast': 1,
                    'saturation': 1,
                    'hue': 0,
                }

                samples.append(self._build_sample(index, orig_camera,
                                                  orig_image, orig_skel,
                                                  transform_opts, extrinsics))
        return collate(samples)
    else:
        aug_bg = aug_ub = aug_lb = False
        aug_hflip = False
        aug_brightness = aug_contrast = aug_saturation = 1.0
        aug_hue = 0.0
        aug_x = aug_y = 0.0
        aug_scale = 1.0
        aug_rot = 0

        if self.use_aug:
            if not self.disable_mask_aug:
                aug_bg = frame_ref.bg_augmentable and np.random.uniform() < 0.6
                aug_ub = frame_ref.ub_augmentable and np.random.uniform() < 0.2
                aug_lb = frame_ref.lb_augmentable and np.random.uniform() < 0.5
            aug_hflip = np.random.uniform() < 0.5
            if np.random.uniform() < 0.3:
                aug_brightness = np.random.uniform(0.8, 1.2)
            if np.random.uniform() < 0.3:
                aug_contrast = np.random.uniform(0.8, 1.2)
            if np.random.uniform() < 0.3:
                aug_saturation = np.random.uniform(0.8, 1.2)
            if np.random.uniform() < 0.3:
                aug_hue = np.random.uniform(-0.1, 0.1)
            aug_x = np.random.uniform(-16, 16)
            aug_y = np.random.uniform(-16, 16)
            aug_scale = np.random.uniform(0.9, 1.1)
            if np.random.uniform() < 0.4:
                aug_rot = np.clip(np.random.normal(0, 30), -30, 30)

        if orig_image:
            if aug_bg:
                orig_image = augment_background(
                    orig_image,
                    Image.open(path.join(self.data_dir, frame_ref.bg_mask_file)),
                    random_background())
            if aug_ub:
                orig_image = augment_clothing(
                    orig_image,
                    Image.open(path.join(self.data_dir, frame_ref.ub_mask_file)),
                    random_texture())
            if aug_lb:
                orig_image = augment_clothing(
                    orig_image,
                    Image.open(path.join(self.data_dir, frame_ref.lb_mask_file)),
                    random_texture())

        transform_opts = {
            'in_camera': orig_camera,
            'in_width': img_w,
            'in_height': img_h,
            'centre_x': bb_cx + aug_x,
            'centre_y': bb_cy + aug_y,
            'rotation': aug_rot,
            'scale': bb_size * aug_scale / img_short_side,
            'hflip_indices': self.skeleton_desc.hflip_indices,
            'hflip': aug_hflip,
            'out_width': out_width,
            'out_height': out_height,
            'brightness': aug_brightness,
            'contrast': aug_contrast,
            'saturation': aug_saturation,
            'hue': aug_hue,
        }

        return self._build_sample(index, orig_camera, orig_image, orig_skel,
                                  transform_opts, extrinsics)
def __getitem__(self, index):
    id = self.example_ids[index]

    if not self.without_image:
        orig_image = self._load_image(id)
        if orig_image:
            img_w, img_h = orig_image.size
        else:
            img_w = img_h = 1000
        img_short_side = min(img_h, img_w)
        extrinsics = torch.eye(4).double()
        orig_camera = self.camera_intrinsics[id]
        orig_skel = self.get_orig_skeleton(index)

        # Bounding box details
        joints2d = homogeneous_to_cartesian(
            orig_camera.project(ensure_homogeneous(orig_skel, d=3)))
        min_x = joints2d[:, 0].min().item()
        max_x = joints2d[:, 0].max().item()
        min_y = joints2d[:, 1].min().item()
        max_y = joints2d[:, 1].max().item()
        bb_cx = (min_x + max_x) / 2
        bb_cy = (min_y + max_y) / 2
        bb_size = 1.5 * max(max_x - min_x, max_y - min_y)

        out_width = self.data_specs.input_specs.width
        out_height = self.data_specs.input_specs.height

        if self.multicrop:
            samples = []
            for aug_hflip in [False, True]:
                for offset in [(0, 0), (-1, 0), (0, -1), (1, 0), (0, 1)]:
                    aug_x = offset[0] * 8
                    aug_y = offset[1] * 8

                    transform_opts = {
                        'in_camera': orig_camera,
                        'in_width': img_w,
                        'in_height': img_h,
                        'centre_x': bb_cx + aug_x,
                        'centre_y': bb_cy + aug_y,
                        'rotation': 0,
                        'scale': bb_size / img_short_side,
                        'hflip_indices': self.skeleton_desc.hflip_indices,
                        'hflip': aug_hflip,
                        'out_width': out_width,
                        'out_height': out_height,
                        'brightness': 1,
                        'contrast': 1,
                        'saturation': 1,
                        'hue': 0,
                    }

                    samples.append(self._build_sample(
                        index, orig_camera, orig_image, orig_skel,
                        transform_opts, extrinsics, self.human_height,
                        self.focal_diff))
            return collate(samples)
        else:
            aug_hflip = False
            aug_brightness = aug_contrast = aug_saturation = 1.0
            aug_hue = 0.0
            aug_x = aug_y = 0.0
            aug_scale = 1.0
            aug_rot = 0

            if self.use_aug:
                aug_hflip = np.random.uniform() < 0.5
                if np.random.uniform() < 0.3:
                    aug_brightness = np.random.uniform(0.8, 1.2)
                if np.random.uniform() < 0.3:
                    aug_contrast = np.random.uniform(0.8, 1.2)
                if np.random.uniform() < 0.3:
                    aug_saturation = np.random.uniform(0.8, 1.2)
                if np.random.uniform() < 0.3:
                    aug_hue = np.random.uniform(-0.1, 0.1)
                aug_x = np.random.uniform(-16, 16)
                aug_y = np.random.uniform(-16, 16)
                aug_scale = np.random.uniform(0.9, 1.1)
                if np.random.uniform() < 0.4:
                    aug_rot = np.clip(np.random.normal(0, 30), -30, 30)

            transform_opts = {
                'in_camera': orig_camera,
                'in_width': img_w,
                'in_height': img_h,
                'centre_x': bb_cx + aug_x,
                'centre_y': bb_cy + aug_y,
                'rotation': aug_rot,
                'scale': bb_size * aug_scale / img_short_side,
                'hflip_indices': self.skeleton_desc.hflip_indices,
                'hflip': aug_hflip,
                'out_width': out_width,
                'out_height': out_height,
                'brightness': aug_brightness,
                'contrast': aug_contrast,
                'saturation': aug_saturation,
                'hue': aug_hue,
            }
            transform_opts_big = {
                'in_camera': orig_camera,
                'in_width': img_w,
                'in_height': img_h,
                'centre_x': bb_cx + aug_x,
                'centre_y': bb_cy + aug_y,
                'rotation': aug_rot,
                'scale': bb_size * aug_scale / img_short_side,
                'hflip_indices': self.skeleton_desc.hflip_indices,
                'hflip': aug_hflip,
                'out_width': self.img_big_size,
                'out_height': self.img_big_size,
                'brightness': aug_brightness,
                'contrast': aug_contrast,
                'saturation': aug_saturation,
                'hue': aug_hue,
            }

            return self._build_sample(index, orig_camera, orig_image,
                                      orig_skel, transform_opts,
                                      transform_opts_big, extrinsics,
                                      self.human_height, self.focal_diff)
    else:  # self.without_image == True
        orig_camera = self.camera_intrinsics[id]
        orig_skel = self.get_orig_skeleton(index)
        return self._build_sample_without_image(id, index, orig_camera,
                                                orig_skel, self.human_height,
                                                self.focal_diff)
def _build_sample_without_image(self, index, orig_skel, orig_camera, img_wh):
    frame_ref = self.frame_refs[index]

    if orig_skel.shape[0] != 17:
        canonical_original_skel = self._mpi_inf_3dhp_to_canonical_skeleton(
            ensure_homogeneous(orig_skel, d=3)).float()
    else:
        canonical_original_skel = ensure_homogeneous(orig_skel, d=3).float()

    # The video frames were originally 2048 x 2048; rescale the intrinsics to
    # match the stored resolution (e.g. 768 x 768).
    Ks_px_video_cam = orig_camera.matrix[:, :-1].float().unsqueeze(0)
    img_w_h_orig = torch.FloatTensor([2048, 2048]).unsqueeze(0)
    img_w_h_small = torch.FloatTensor([img_wh[0], img_wh[1]])
    Ks_px_image_cam = pcl_util.K_new_resolution_px(
        Ks_px_video_cam, img_w_h_orig, img_w_h_small).squeeze(0)

    sample = {
        'index': index,  # Index in the dataset
        'original_skel': canonical_original_skel,
        # Transformed data
        'camera_original': Ks_px_image_cam,
        'original_img_shape': torch.FloatTensor(img_wh),
    }

    # Generate the GT location and scale of the crop. Joint 14 is the pelvis
    # (hip) in the canonical skeleton; it is wrapped in an extra dimension
    # because legacy code in utils takes a list of centres.
    pelvis_joint = sample['original_skel'][14, :-1].unsqueeze(0)
    all_joints = sample['original_skel'][:, :-1]
    sample['world_coord_skel_mm'] = all_joints
    relative_joints = all_joints - pelvis_joint
    sample['non_normalized_3d'] = relative_joints

    # Normalise the joints.
    normalized_joints = utils.batch_normalize_canon_human_joints(
        relative_joints.unsqueeze(0), mpi_3d_Mean, mpi_3d_Std).squeeze(0)
    sample['normalized_skel_mm'] = normalized_joints
    sample['pelvis_location_mm'] = pelvis_joint

    Ks_px = sample['camera_original']
    K = Ks_px.clone()
    K[0, 2] = 0.
    K[1, 2] = 0.
    P_px = Ks_px.clone()
    pose_2d = utils.world_2_camera_coordinates(P_px, all_joints.float())
    sample['pose2d_original'] = pose_2d
    sample['perspective_matrix'] = P_px

    if self.focal_diff != 0:
        Ks_px[0, 0] *= self.focal_diff
        Ks_px[1, 1] *= self.focal_diff
        sample['camera_original'] = Ks_px

    if self.calculate_scale_from_2d:
        scale = utils.generate_gt_scales_from2d(pose_2d)
        square_scale = torch.tensor([torch.max(scale), torch.max(scale)])
    else:
        scale = utils.generate_gt_scales(
            K, self.human_height, pelvis_joint,
            sample['original_img_shape'][0],
            sample['original_img_shape'][1])  # 2000 is the height in mm
        square_scale = scale.clone()
    square_scale_py = square_scale / sample['original_img_shape']
    sample['stn_square_scale_py'] = square_scale_py

    location = utils.generate_gt_location(P_px, pelvis_joint,
                                          sample['original_img_shape'][0],
                                          sample['original_img_shape'][1])
    sample['crop_scale'] = torch.FloatTensor(scale)
    sample['crop_location'] = torch.FloatTensor(location)

    if self.use_pcl:
        canon_label_2d_with_hip = pose_2d.unsqueeze(0)
        preprocess = pcl_preprocess(
            1, canon_label_2d_with_hip.shape[1], canon_label_2d_with_hip,
            sample['original_img_shape'].unsqueeze(0),
            sample['camera_original'].unsqueeze(0), location.unsqueeze(0),
            scale.unsqueeze(0), normalize=True,
            use_slant_compensation=self.use_slant_compensation)
        sample['preprocess-model_input'] = preprocess['model_input'].squeeze(0)
        sample['preprocess-canon_virt_2d'] = preprocess['canon_virt_2d'].squeeze(0)
        sample['preprocess-R_virt2orig'] = preprocess['R_virt2orig'].squeeze(0)

    return sample