def forward(self, x):
    # Apply the convolutional blocks with ReLU activations.
    for layer in self.blocks:
        x = F.relu(layer(x))
    # Predict low-resolution scores, then upsample them bilinearly by `up_scale`.
    x = self.score_lowres(x)
    x = interpolate(x, scale_factor=self.up_scale, mode="bilinear", align_corners=False)
    return x
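# Hypothetical sketch (not part of the original module): this only illustrates the
# shape contract of the final `interpolate` call in `forward` above, assuming
# `up_scale == 2` and blocks that preserve spatial size. The tensor sizes here
# (8 ROIs, 17 keypoints, 14x14 score maps) are made up for the example.
def _upsample_shape_demo():
    import torch
    from torch.nn import functional as F

    scores = torch.randn(8, 17, 14, 14)  # (#ROIs, #keypoints, H, W) low-res scores
    up = F.interpolate(scores, scale_factor=2, mode="bilinear", align_corners=False)
    assert up.shape == (8, 17, 28, 28)   # spatial size doubled in both dimensions
    return up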
def _forward_mask_point(self, features, mask_coarse_logits, instances):
    """
    Forward logic of the mask point head.
    """
    if not self.mask_point_on:
        return {} if self.training else mask_coarse_logits

    mask_features_list = [features[k] for k in self.mask_point_in_features]
    features_scales = [self._feature_scales[k] for k in self.mask_point_in_features]

    if self.training:
        proposal_boxes = [x.proposal_boxes for x in instances]
        gt_classes = cat([x.gt_classes for x in instances])
        # Sample the most uncertain points on the coarse mask (no gradients needed
        # for the sampling itself).
        with torch.no_grad():
            point_coords = get_uncertain_point_coords_with_randomness(
                mask_coarse_logits,
                lambda logits: calculate_uncertainty_ins_seg(logits, gt_classes),
                self.mask_point_train_num_points,
                self.mask_point_oversample_ratio,
                self.mask_point_importance_sample_ratio,
            )

        fine_grained_features, point_coords_wrt_image = point_sample_fine_grained_features(
            mask_features_list, features_scales, proposal_boxes, point_coords
        )
        coarse_features = point_sample(mask_coarse_logits, point_coords, align_corners=False)
        point_logits = self.mask_point_head(fine_grained_features, coarse_features)
        return {
            "loss_mask_point": roi_mask_point_loss(
                point_logits, instances, point_coords_wrt_image
            )
        }
    else:
        pred_boxes = [x.pred_boxes for x in instances]
        pred_classes = cat([x.pred_classes for x in instances])
        # The subdivision code will fail with an empty list of boxes.
        if len(pred_classes) == 0:
            return mask_coarse_logits

        mask_logits = mask_coarse_logits.clone()
        for subdivision_step in range(self.mask_point_subdivision_steps):
            mask_logits = interpolate(
                mask_logits, scale_factor=2, mode="bilinear", align_corners=False
            )
            # If `mask_point_subdivision_num_points` is greater than or equal to the
            # resolution of the next step, we can skip this step and refine at the
            # next (higher) resolution instead.
            H, W = mask_logits.shape[-2:]
            if (
                self.mask_point_subdivision_num_points >= 4 * H * W
                and subdivision_step < self.mask_point_subdivision_steps - 1
            ):
                continue
            uncertainty_map = calculate_uncertainty_ins_seg(mask_logits, pred_classes)
            point_indices, point_coords = get_uncertain_point_coords_on_grid(
                uncertainty_map, self.mask_point_subdivision_num_points
            )
            fine_grained_features, _ = point_sample_fine_grained_features(
                mask_features_list, features_scales, pred_boxes, point_coords
            )
            coarse_features = point_sample(
                mask_coarse_logits, point_coords, align_corners=False
            )
            point_logits = self.mask_point_head(fine_grained_features, coarse_features)

            # Put mask point predictions at the right places on the upsampled grid.
            R, C, H, W = mask_logits.shape
            point_indices = point_indices.unsqueeze(1).expand(-1, C, -1)
            mask_logits = (
                mask_logits.reshape(R, C, H * W)
                .scatter_(2, point_indices, point_logits)
                .view(R, C, H, W)
            )
        return mask_logits
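# Hypothetical sketch (not part of the original module): a minimal, self-contained
# illustration of the scatter step at the end of the subdivision loop above. Given
# flat indices into an R x C x (H*W) grid and per-point refined logits, `scatter_`
# writes the point predictions back into the upsampled mask. All sizes below are
# made up for the example.
def _point_scatter_demo():
    import torch

    R, C, H, W, P = 2, 3, 4, 4, 2
    mask_logits = torch.zeros(R, C, H, W)
    point_indices = torch.randint(0, H * W, (R, P))  # flat indices into the H*W grid
    point_logits = torch.randn(R, C, P)              # refined logits at those points

    # Broadcast the indices over the channel dimension, then scatter into the
    # flattened spatial dimension and restore the (R, C, H, W) shape.
    point_indices = point_indices.unsqueeze(1).expand(-1, C, -1)
    mask_logits = (
        mask_logits.reshape(R, C, H * W)
        .scatter_(2, point_indices, point_logits)
        .view(R, C, H, W)
    )
    return mask_logits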
def heatmaps_to_keypoints(maps: torch.Tensor, rois: torch.Tensor) -> torch.Tensor:
    """
    Extract predicted keypoint locations from heatmaps.

    Args:
        maps (Tensor): (#ROIs, #keypoints, POOL_H, POOL_W). The predicted heatmap of
            logits for each ROI and each keypoint.
        rois (Tensor): (#ROIs, 4). The box of each ROI.

    Returns:
        Tensor of shape (#ROIs, #keypoints, 4) with the last dimension corresponding to
        (x, y, logit, score) for each keypoint.

    When converting discrete pixel indices in an NxN image to a continuous keypoint
    coordinate, we maintain consistency with :meth:`Keypoints.to_heatmap` by using the
    conversion from Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c
    is a continuous coordinate.
    """
    offset_x = rois[:, 0]
    offset_y = rois[:, 1]

    widths = (rois[:, 2] - rois[:, 0]).clamp(min=1)
    heights = (rois[:, 3] - rois[:, 1]).clamp(min=1)
    widths_ceil = widths.ceil()
    heights_ceil = heights.ceil()

    num_rois, num_keypoints = maps.shape[:2]
    xy_preds = maps.new_zeros(rois.shape[0], num_keypoints, 4)

    width_corrections = widths / widths_ceil
    height_corrections = heights / heights_ceil

    keypoints_idx = torch.arange(num_keypoints, device=maps.device)

    for i in range(num_rois):
        outsize = (int(heights_ceil[i]), int(widths_ceil[i]))
        roi_map = interpolate(
            maps[[i]], size=outsize, mode="bicubic", align_corners=False
        ).squeeze(0)  # #keypoints x H x W

        # softmax over the spatial region
        max_score, _ = roi_map.view(num_keypoints, -1).max(1)
        max_score = max_score.view(num_keypoints, 1, 1)
        tmp_full_resolution = (roi_map - max_score).exp_()
        tmp_pool_resolution = (maps[i] - max_score).exp_()
        # Produce scores over the region H x W, but normalize with POOL_H x POOL_W,
        # so that the scores of objects of different absolute sizes will be more comparable.
        roi_map_scores = tmp_full_resolution / tmp_pool_resolution.sum((1, 2), keepdim=True)

        w = roi_map.shape[2]
        pos = roi_map.view(num_keypoints, -1).argmax(1)

        x_int = pos % w
        y_int = (pos - x_int) // w

        assert (
            roi_map_scores[keypoints_idx, y_int, x_int]
            == roi_map_scores.view(num_keypoints, -1).max(1)[0]
        ).all()

        x = (x_int.float() + 0.5) * width_corrections[i]
        y = (y_int.float() + 0.5) * height_corrections[i]

        xy_preds[i, :, 0] = x + offset_x[i]
        xy_preds[i, :, 1] = y + offset_y[i]
        xy_preds[i, :, 2] = roi_map[keypoints_idx, y_int, x_int]
        xy_preds[i, :, 3] = roi_map_scores[keypoints_idx, y_int, x_int]

    return xy_preds
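# Hypothetical usage sketch (not part of the original module): exercises
# `heatmaps_to_keypoints` above on random logits, purely to document the expected
# tensor shapes. The specific sizes (2 ROIs, 17 keypoints, 56x56 heatmaps) and box
# coordinates are made up for the example.
def _heatmaps_to_keypoints_demo():
    import torch

    maps = torch.randn(2, 17, 56, 56)                 # (#ROIs, #keypoints, POOL_H, POOL_W)
    rois = torch.tensor([[0.0, 0.0, 64.0, 48.0],
                         [10.0, 20.0, 90.0, 100.0]])  # (#ROIs, 4) boxes in image coordinates
    preds = heatmaps_to_keypoints(maps, rois)
    assert preds.shape == (2, 17, 4)                  # (x, y, logit, score) per keypoint
    return preds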