def build_coord_volume(self, coord_volume_size, position, sizes, base_point, theta, axis, device):
    """Build a rotated grid of 3D sample coordinates spanning a cuboid.

    Args:
        coord_volume_size: number of grid points along each of the three axes.
        position: indexable with three entries; the cuboid corner from which
            the grid starts.
        sizes: indexable with three entries; the cuboid extent per axis.
        base_point: NumPy array (it is passed to ``torch.from_numpy``) used as
            the pivot of the rotation.
        theta: rotation angle forwarded to ``volumetric.rotate_coord_volume``.
        axis: rotation axis forwarded to ``volumetric.rotate_coord_volume``.
        device: torch device on which the grid is created.

    Returns:
        Tensor of shape ``(coord_volume_size, coord_volume_size,
        coord_volume_size, 3)`` holding the coordinates of every grid point.
    """
    # Integer lattice indices (i, j, k) for every grid cell, flattened to (N, 3).
    ticks = [torch.arange(coord_volume_size, device=device) for _ in range(3)]
    lattice = torch.stack(torch.meshgrid(*ticks), dim=-1).type(torch.float)
    lattice = lattice.reshape((-1, 3))

    # Map lattice indices to coordinates: the first cell sits at `position`,
    # the last one at `position + sizes`.
    coords = torch.zeros_like(lattice)
    for dim in range(3):
        step = sizes[dim] / (coord_volume_size - 1)
        coords[:, dim] = position[dim] + step * lattice[:, dim]

    coord_volume = coords.reshape(coord_volume_size, coord_volume_size, coord_volume_size, 3)

    # Rotate about the base point: shift it to the origin, rotate, shift back.
    pivot = torch.from_numpy(base_point).type(torch.float).to(device)
    coord_volume = coord_volume - pivot
    coord_volume = volumetric.rotate_coord_volume(coord_volume, theta, axis)
    coord_volume = coord_volume + pivot

    if not self.transfer_cmu_to_human36m:
        return coord_volume

    # Different world coordinates: swap grid axes 1 and 2, then reverse the
    # order along axis 1.
    coord_volume = coord_volume.permute(0, 2, 1, 3)
    reversed_idx = torch.arange(coord_volume.shape[1] - 1, -1, -1).long().to(device)
    return coord_volume.index_select(1, reversed_idx)
def forward(self, images, proj_matricies, batch):
    """Run the backbone per view, lift features to a 3D volume and regress 3D keypoints.

    Args:
        images: tensor whose first two dimensions are ``(batch_size, n_views)``;
            the trailing dimensions from index 3 on are used as the image
            size (presumably the layout is ``(B, V, C, H, W)`` - confirm
            against the caller).
        proj_matricies: ignored on input; it is overwritten with projection
            matrices rebuilt from ``batch['cameras']`` after rescaling the
            cameras to the heatmap resolution.
        batch: mapping providing ``'cameras'`` (indexed ``[view][sample]``;
            each camera exposes ``update_after_resize`` and a NumPy
            ``projection``) and ``'keypoints_3d'`` or ``'pred_keypoints_3d'``
            (per-sample NumPy arrays), selected by ``self.use_gt_pelvis``.

    Returns:
        Tuple ``(vol_keypoints_3d, features, volumes, vol_confidences,
        cuboids, coord_volumes, base_points)``. ``coord_volumes`` has shape
        ``(batch_size, volume_size, volume_size, volume_size, 3)`` and
        ``base_points`` has shape ``(batch_size, 3)``.

    Raises:
        ValueError: if ``self.kind`` is neither ``"coco"`` nor ``"mpii"``.
    """
    device = images.device
    batch_size, n_views = images.shape[:2]

    # reshape for backbone forward: merge batch and view dimensions
    images = images.view(-1, *images.shape[2:])

    # forward backbone
    heatmaps, features, _, vol_confidences = self.backbone(images)

    # reshape back to (batch_size, n_views, ...)
    images = images.view(batch_size, n_views, *images.shape[1:])
    heatmaps = heatmaps.view(batch_size, n_views, *heatmaps.shape[1:])
    features = features.view(batch_size, n_views, *features.shape[1:])

    if vol_confidences is not None:
        vol_confidences = vol_confidences.view(
            batch_size, n_views, *vol_confidences.shape[1:])

    # calculate shapes
    image_shape = tuple(images.shape[3:])
    heatmap_shape = tuple(heatmaps.shape[3:])

    # norm vol confidences across views
    # (original author's note: presumably per-view weights used during unprojection)
    if self.volume_aggregation_method == 'conf_norm':
        vol_confidences = vol_confidences / vol_confidences.sum(
            dim=1, keepdim=True)

    # change camera intrinsics; work on a copy so the caller's cameras stay untouched
    new_cameras = deepcopy(batch['cameras'])
    for view_i in range(n_views):
        for batch_i in range(batch_size):
            # convert the camera parameters to the heatmap resolution
            new_cameras[view_i][batch_i].update_after_resize(
                image_shape, heatmap_shape)

    proj_matricies = torch.stack(
        [
            torch.stack(
                [torch.from_numpy(camera.projection) for camera in camera_batch],
                dim=0)
            for camera_batch in new_cameras
        ],
        dim=0).transpose(1, 0)  # shape (batch_size, n_views, 3, 4)
    proj_matricies = proj_matricies.float().to(device)

    # build coord volumes
    cuboids = []
    base_points = torch.zeros(batch_size, 3, device=device)
    # coord_volumes hold the 3D points that features are unprojected onto
    coord_volumes = torch.zeros(batch_size,
                                self.volume_size,
                                self.volume_size,
                                self.volume_size,
                                3,
                                device=device)

    # The integer index grid does not depend on the sample, so build it once
    # instead of once per batch element.
    xxx, yyy, zzz = torch.meshgrid(
        torch.arange(self.volume_size, device=device),
        torch.arange(self.volume_size, device=device),
        torch.arange(self.volume_size, device=device))
    grid = torch.stack([xxx, yyy, zzz], dim=-1).type(torch.float)
    grid = grid.reshape((-1, 3))

    for batch_i in range(batch_size):
        # keypoints used to determine base_point
        if self.use_gt_pelvis:
            # TODO (original author): the ground-truth keypoints here should be
            # in world coordinates - unverified
            keypoints_3d = batch['keypoints_3d'][batch_i]
        else:
            keypoints_3d = batch['pred_keypoints_3d'][batch_i]

        # pelvis reference point
        if self.kind == "coco":
            base_point = (keypoints_3d[11, :3] + keypoints_3d[12, :3]) / 2
        elif self.kind == "mpii":
            # original author's note: "camera coordinate system" - unverified
            base_point = keypoints_3d[6, :3]
        else:
            # Without this guard base_point would be unbound (or silently
            # reused from the previous sample).
            raise ValueError(f"Unsupported kind: {self.kind!r}")

        base_points[batch_i] = torch.from_numpy(base_point).to(device)

        # build cuboid; cuboid_side is the edge length of the cube
        # TODO (original author): the default side (2500 per the original note)
        # should presumably be derived from the camera setup - check camera
        # parameters
        sides = np.array(
            [self.cuboid_side, self.cuboid_side, self.cuboid_side])

        # shift by half a side so that base_point is the centre of the cuboid
        position = base_point - sides / 2
        cuboid = volumetric.Cuboid3D(position, sides)
        cuboids.append(cuboid)

        # build coord volume: coordinates of every grid point, spanning
        # [position, position + sides] along each axis
        grid_coord = torch.zeros_like(grid)
        for dim in range(3):
            step = sides[dim] / (self.volume_size - 1)
            grid_coord[:, dim] = position[dim] + step * grid[:, dim]

        coord_volume = grid_coord.reshape(self.volume_size,
                                          self.volume_size,
                                          self.volume_size, 3)

        # random rotation (training only)
        if self.training:
            theta = np.random.uniform(0.0, 2 * np.pi)
        else:
            theta = 0.0

        # kind was validated above, so anything that is not "coco" is "mpii"
        if self.kind == "coco":
            axis = [0, 1, 0]  # y axis
        else:
            axis = [0, 0, 1]  # z axis

        center = torch.from_numpy(base_point).type(torch.float).to(device)

        # rotate about the base point
        coord_volume = coord_volume - center
        coord_volume = volumetric.rotate_coord_volume(
            coord_volume, theta, axis)
        coord_volume = coord_volume + center

        # transfer
        if self.transfer_cmu_to_human36m:  # different world coordinates
            coord_volume = coord_volume.permute(0, 2, 1, 3)
            inv_idx = torch.arange(coord_volume.shape[1] - 1, -1,
                                   -1).long().to(device)
            coord_volume = coord_volume.index_select(1, inv_idx)

        coord_volumes[batch_i] = coord_volume

    # process features before unprojecting
    features = features.view(-1, *features.shape[2:])
    features = self.process_features(features)
    features = features.view(batch_size, n_views, *features.shape[1:])

    # lift to volume
    volumes = op.unproject_heatmaps(
        features,
        proj_matricies,
        coord_volumes,
        volume_aggregation_method=self.volume_aggregation_method,
        vol_confidences=vol_confidences)

    # integral 3d
    volumes = self.volume_net(volumes)
    vol_keypoints_3d, volumes = op.integrate_tensor_3d_with_coordinates(
        volumes * self.volume_multiplier,
        coord_volumes,
        softmax=self.volume_softmax)

    return vol_keypoints_3d, features, volumes, vol_confidences, cuboids, coord_volumes, base_points
def forward(self, images, proj_matricies, batch):
    """Run the backbone per view, lift features to a 3D volume and regress 3D keypoints.

    Args:
        images: tensor whose first two dimensions are ``(batch_size, n_views)``;
            the trailing dimensions from index 3 on are used as the image
            size (presumably the layout is ``(B, V, C, H, W)`` - confirm
            against the caller).
        proj_matricies: ignored on input; it is overwritten with projection
            matrices rebuilt from ``batch['cameras']`` after rescaling the
            cameras to the heatmap resolution.
        batch: mapping providing ``'cameras'`` (indexed ``[view][sample]``;
            each camera exposes ``update_after_resize`` and a NumPy
            ``projection``) and ``'keypoints_3d'`` or ``'pred_keypoints_3d'``
            (per-sample NumPy arrays), selected by ``self.use_gt_pelvis``.

    Returns:
        Tuple ``(vol_keypoints_3d, features, volumes, vol_confidences,
        cuboids, coord_volumes, base_points)``. ``coord_volumes`` has shape
        ``(batch_size, volume_size, volume_size, volume_size, 3)`` and
        ``base_points`` has shape ``(batch_size, 3)``.

    Raises:
        ValueError: if ``self.kind`` is not one of ``"coco"``, ``"mpii"``
            or ``"cmu"``.
    """
    device = images.device
    batch_size, n_views = images.shape[:2]

    # reshape for backbone forward: merge batch and view dimensions
    images = images.view(-1, *images.shape[2:])

    # forward backbone
    heatmaps, features, _, vol_confidences = self.backbone(images)

    # reshape back to (batch_size, n_views, ...)
    images = images.view(batch_size, n_views, *images.shape[1:])
    heatmaps = heatmaps.view(batch_size, n_views, *heatmaps.shape[1:])
    features = features.view(batch_size, n_views, *features.shape[1:])

    if vol_confidences is not None:
        vol_confidences = vol_confidences.view(
            batch_size, n_views, *vol_confidences.shape[1:])

    # calculate shapes
    image_shape = tuple(images.shape[3:])
    heatmap_shape = tuple(heatmaps.shape[3:])

    # norm vol confidences across views
    if self.volume_aggregation_method == 'conf_norm':
        vol_confidences = vol_confidences / vol_confidences.sum(
            dim=1, keepdim=True)

    # change camera intrinsics; work on a copy so the caller's cameras stay untouched
    new_cameras = deepcopy(batch['cameras'])
    for view_i in range(n_views):
        for batch_i in range(batch_size):
            new_cameras[view_i][batch_i].update_after_resize(
                image_shape, heatmap_shape)

    proj_matricies = torch.stack(
        [
            torch.stack(
                [torch.from_numpy(camera.projection) for camera in camera_batch],
                dim=0)
            for camera_batch in new_cameras
        ],
        dim=0).transpose(1, 0)  # shape (batch_size, n_views, 3, 4)
    proj_matricies = proj_matricies.float().to(device)

    # build coord volumes
    cuboids = []
    base_points = torch.zeros(batch_size, 3, device=device)
    coord_volumes = torch.zeros(batch_size,
                                self.volume_size,
                                self.volume_size,
                                self.volume_size,
                                3,
                                device=device)

    # The integer index grid does not depend on the sample, so build it once
    # instead of once per batch element.
    xxx, yyy, zzz = torch.meshgrid(
        torch.arange(self.volume_size, device=device),
        torch.arange(self.volume_size, device=device),
        torch.arange(self.volume_size, device=device))
    grid = torch.stack([xxx, yyy, zzz], dim=-1).type(torch.float)
    grid = grid.reshape((-1, 3))

    for batch_i in range(batch_size):
        if self.use_gt_pelvis:
            keypoints_3d = batch['keypoints_3d'][batch_i]
        else:
            keypoints_3d = batch['pred_keypoints_3d'][batch_i]

        # reference point around which the cuboid is centred
        if self.kind == "coco":
            base_point = (keypoints_3d[11, :3] + keypoints_3d[12, :3]) / 2
        elif self.kind == "mpii":
            base_point = keypoints_3d[6, :3]
        elif self.kind == "cmu":
            base_point = keypoints_3d[2, :3]
        else:
            # Without this guard base_point would be unbound (or silently
            # reused from the previous sample).
            raise ValueError(f"Unsupported kind: {self.kind!r}")

        base_points[batch_i] = torch.from_numpy(base_point).to(device)

        # build cuboid
        # NOTE: This is part of the paper where they build the cuboid used
        # for volumetric extrapolation from the pelvis
        sides = np.array(
            [self.cuboid_side, self.cuboid_side, self.cuboid_side])

        # shift by half a side so that base_point is the centre of the cuboid
        position = base_point - sides / 2
        cuboid = volumetric.Cuboid3D(position, sides)
        cuboids.append(cuboid)

        # build coord volume: coordinates of every grid point, spanning
        # [position, position + sides] along each axis
        grid_coord = torch.zeros_like(grid)
        for dim in range(3):
            step = sides[dim] / (self.volume_size - 1)
            grid_coord[:, dim] = position[dim] + step * grid[:, dim]

        coord_volume = grid_coord.reshape(self.volume_size,
                                          self.volume_size,
                                          self.volume_size, 3)

        # random rotation (training only)
        if self.training:
            theta = np.random.uniform(0.0, 2 * np.pi)
        else:
            theta = 0.0

        # kind was validated above, so anything that is not "coco" is
        # "mpii" or "cmu"
        if self.kind == "coco":
            axis = [0, 1, 0]  # y axis
        else:
            axis = [0, 0, 1]  # z axis

        center = torch.from_numpy(base_point).type(torch.float).to(device)

        # rotate about the base point
        coord_volume = coord_volume - center
        coord_volume = volumetric.rotate_coord_volume(
            coord_volume, theta, axis)
        coord_volume = coord_volume + center

        # transfer
        if self.transfer_cmu_to_human36m or self.kind == "cmu":  # different world coordinates
            coord_volume = coord_volume.permute(0, 2, 1, 3)
            inv_idx = torch.arange(coord_volume.shape[1] - 1, -1,
                                   -1).long().to(device)
            coord_volume = coord_volume.index_select(1, inv_idx)

        coord_volumes[batch_i] = coord_volume

    # process features before unprojecting
    features = features.view(-1, *features.shape[2:])
    features = self.process_features(features)
    features = features.view(batch_size, n_views, *features.shape[1:])

    # lift to volume
    volumes = op.unproject_heatmaps(
        features,
        proj_matricies,
        coord_volumes,
        volume_aggregation_method=self.volume_aggregation_method,
        vol_confidences=vol_confidences)

    # integral 3d
    volumes = self.volume_net(volumes)
    vol_keypoints_3d, volumes = op.integrate_tensor_3d_with_coordinates(
        volumes * self.volume_multiplier,
        coord_volumes,
        softmax=self.volume_softmax)

    return vol_keypoints_3d, features, volumes, vol_confidences, cuboids, coord_volumes, base_points