def create_instances(predictions, image_size):
    ret = Instances(image_size)

    # keep only detections above the confidence threshold.
    score = np.asarray([x["score"] for x in predictions])
    chosen = (score > args.conf_threshold).nonzero()[0]
    score = score[chosen]
    bbox = np.asarray([predictions[i]["bbox"] for i in chosen])
    if score.shape[0] == 0:
        bbox = np.zeros((0, 4))
    else:
        # COCO-format results store boxes as XYWH; Boxes expects XYXY.
        bbox = BoxMode.convert(bbox, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)

    labels = np.asarray([dataset_id_map(predictions[i]["category_id"]) for i in chosen])

    ret.scores = score
    ret.pred_boxes = Boxes(bbox)
    ret.pred_classes = labels

    # segmentation is optional in the JSON results (box-only models omit it).
    try:
        ret.pred_masks = [predictions[i]["segmentation"] for i in chosen]
    except KeyError:
        pass
    return ret
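# Usage sketch (illustrative): build per-image Instances from a COCO-format
# results JSON. The file path is hypothetical; `args.conf_threshold` and
# `dataset_id_map` are assumed to exist in the surrounding script, exactly as
# create_instances itself assumes.
import json
from collections import defaultdict

with open("coco_instances_results.json") as f:  # hypothetical path
    coco_results = json.load(f)

# Group the flat per-detection dicts by image id, since create_instances
# expects the predictions belonging to a single image.
by_image = defaultdict(list)
for p in coco_results:
    by_image[p["image_id"]].append(p)

for image_id, preds in by_image.items():
    inst = create_instances(preds, image_size=(480, 640))  # (height, width)
    print(image_id, len(inst))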
def _postprocess(results, result_mask_info, output_height, output_width, mask_threshold=0.5):
    """
    Post-process the output boxes for TensorMask.
    The input images are often resized when entering an object detector.
    As a result, we often need the outputs of the detector in a different
    resolution from its inputs.

    This function will postprocess the raw outputs of TensorMask
    to produce outputs according to the desired output resolution.

    Args:
        results (Instances): the raw outputs from the detector.
            `results.image_size` contains the input image resolution the detector sees.
            This object might be modified in-place. Note that it does not contain the field
            `pred_masks`, which is provided by the other input `result_mask_info`.
        result_mask_info (list[Tensor], Boxes): a pair of two items for mask related results.
            The first item is a list of #detection tensors, each is the predicted masks.
            The second item is the anchors corresponding to the predicted masks.
        output_height, output_width: the desired output resolution.

    Returns:
        Instances: the postprocessed output from the model, based on the output resolution
    """
    scale_x, scale_y = (
        output_width / results.image_size[1],
        output_height / results.image_size[0],
    )
    results = Instances((output_height, output_width), **results.get_fields())

    output_boxes = results.pred_boxes
    output_boxes.tensor[:, 0::2] *= scale_x
    output_boxes.tensor[:, 1::2] *= scale_y
    output_boxes.clip(results.image_size)

    inds_nonempty = output_boxes.nonempty()
    results = results[inds_nonempty]
    result_masks, result_anchors = result_mask_info
    if result_masks:
        result_anchors.tensor[:, 0::2] *= scale_x
        result_anchors.tensor[:, 1::2] *= scale_y
        result_masks = [x for (i, x) in zip(inds_nonempty.tolist(), result_masks) if i]
        results.pred_masks = _paste_mask_lists_in_image(
            result_masks,
            result_anchors[inds_nonempty],
            results.image_size,
            threshold=mask_threshold,
        )
    return results
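# Usage sketch (illustrative): rescale raw TensorMask detections from the
# resolution the model saw to the desired output size. The tensors below are
# hand-built placeholders, not real TensorMask outputs; passing (None, None)
# as result_mask_info skips the mask-pasting branch, so only the
# box-rescaling path is exercised.
import torch
from detectron2.structures import Boxes, Instances

raw = Instances((800, 1216))  # resolution the detector saw
raw.pred_boxes = Boxes(torch.tensor([[10.0, 20.0, 110.0, 220.0]]))
raw.scores = torch.tensor([0.9])
raw.pred_classes = torch.tensor([0])

out = _postprocess(raw, (None, None), output_height=600, output_width=912)
print(out.pred_boxes.tensor)  # boxes scaled by (912/1216, 600/800)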
def detector_postprocess(results, output_height, output_width, mask_threshold=0.5):
    """
    Resize the output instances.
    The input images are often resized when entering an object detector.
    As a result, we often need the outputs of the detector in a different
    resolution from its inputs.

    This function will resize the raw outputs of an R-CNN detector
    to produce outputs according to the desired output resolution.

    Args:
        results (Instances): the raw outputs from the detector.
            `results.image_size` contains the input image resolution the detector sees.
            This object might be modified in-place.
        output_height, output_width: the desired output resolution.

    Returns:
        Instances: the resized output from the model, based on the output resolution
    """
    scale_x, scale_y = (
        output_width / results.image_size[1],
        output_height / results.image_size[0],
    )
    results = Instances((output_height, output_width), **results.get_fields())

    if results.has("pred_boxes"):
        output_boxes = results.pred_boxes
    elif results.has("proposal_boxes"):
        output_boxes = results.proposal_boxes
    else:
        # guard against an UnboundLocalError when neither field is present.
        raise ValueError("Predictions must contain pred_boxes or proposal_boxes!")

    output_boxes.scale(scale_x, scale_y)
    output_boxes.clip(results.image_size)

    results = results[output_boxes.nonempty()]

    if results.has("pred_masks"):
        results.pred_masks = paste_masks_in_image(
            results.pred_masks[:, 0, :, :],  # N, 1, M, M
            results.pred_boxes,
            results.image_size,
            threshold=mask_threshold,
        )

    if results.has("pred_keypoints"):
        results.pred_keypoints[:, :, 0] *= scale_x
        results.pred_keypoints[:, :, 1] *= scale_y

    return results
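# Usage sketch (illustrative): this is the per-image rescaling step that
# detectron2's GeneralizedRCNN applies after inference, shown here on
# hand-built dummy outputs rather than a real model.
import torch
from detectron2.structures import Boxes, Instances

raw = Instances((800, 1067))  # resolution the model saw after resizing
raw.pred_boxes = Boxes(torch.tensor([[100.0, 50.0, 400.0, 300.0]]))
raw.scores = torch.tensor([0.8])
raw.pred_classes = torch.tensor([17])

out = detector_postprocess(raw, output_height=600, output_width=800)
print(out.pred_boxes.tensor)  # boxes mapped into the 600x800 output frame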
def postprocess(self, results, output_height, output_width, resized_in_h,
                resized_in_w, padded_im_h, padded_im_w):
    scale_x, scale_y = (output_width / resized_in_w, output_height / resized_in_h)

    # gather detection results into Instances
    results = Instances((output_height, output_width), **results.get_fields())

    # scale detection boxes from resized-padded-image space to source image space and clip
    output_boxes = results.pred_boxes
    output_boxes.scale(scale_x, scale_y)
    output_boxes.clip(results.image_size)

    # filter empty detections in source image space
    results = results[output_boxes.nonempty()]

    if results.has("pred_global_logits"):
        mask_h, mask_w = results.pred_global_logits.shape[-2:]
        factor_h = padded_im_h // mask_h
        factor_w = padded_im_w // mask_w
        assert factor_h == factor_w
        factor = factor_h

        # upsample instance masks (aligned) to the resized-padded-image shape
        pred_global_masks = aligned_bilinear(results.pred_global_logits.sigmoid(), factor)
        # crop away the padding to recover the resized-image region
        pred_global_masks = pred_global_masks[:, :, :resized_in_h, :resized_in_w]

        # scale masks from resized-image shape to source image shape;
        # this is the inverse of an OpenCV/PIL resize, whose align_corners is False
        pred_global_masks = F.interpolate(
            pred_global_masks,
            size=(output_height, output_width),
            mode="bilinear",
            align_corners=False,
        )
        pred_global_masks = pred_global_masks[:, 0, :, :]

        # binarize, dropping mask pixels with low confidence
        results.pred_masks = pred_global_masks > self.infer_mask_threshold

    return results
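# Illustrative sketch of the two-stage mask rescaling above, on dummy logits.
# Plain F.interpolate stands in for aligned_bilinear here (the real helper
# uses an alignment-aware integer-factor upsample, so results differ slightly
# at the borders); all sizes below are hypothetical.
import torch
import torch.nn.functional as F

padded_im_h, padded_im_w = 800, 1088     # network input after padding
resized_in_h, resized_in_w = 768, 1024   # image region before padding
output_height, output_width = 480, 640   # original (source) image size

logits = torch.randn(3, 1, 100, 136)     # stride-8 global mask logits
factor = padded_im_h // logits.shape[-2]  # 800 // 100 == 8

masks = F.interpolate(logits.sigmoid(), scale_factor=factor,
                      mode="bilinear", align_corners=False)  # padded-image space
masks = masks[:, :, :resized_in_h, :resized_in_w]            # drop the padding
masks = F.interpolate(masks, size=(output_height, output_width),
                      mode="bilinear", align_corners=False)[:, 0]
print((masks > 0.5).shape)               # (3, 480, 640) boolean masks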
def inference_single_image(self, cate_preds, seg_preds, featmap_size, img_shape, ori_shape):
    """
    Args:
        cate_preds, seg_preds: see :meth:`inference`.
        featmap_size (tuple): size (height and width) of the mask-branch feature map.
        img_shape (tuple): the size of the image fed into the model (height and width).
        ori_shape (tuple): original image shape (height and width).

    Returns:
        result (Instances): predicted results of single image after post-processing.
    """
    assert len(cate_preds) == len(seg_preds)
    result = Instances(ori_shape)

    # overall info.
    h, w = img_shape
    upsampled_size_out = (featmap_size[0] * 4, featmap_size[1] * 4)

    # process.
    inds = (cate_preds > self.score_threshold)
    # category scores.
    cate_scores = cate_preds[inds]
    if len(cate_scores) == 0:
        return result
    # category labels.
    inds = inds.nonzero(as_tuple=False)
    cate_labels = inds[:, 1]

    # strides.
    size_trans = cate_labels.new_tensor(self.seg_num_grids).pow(2).cumsum(0)  # [1600, 2896, 3472, 3728, 3872]
    strides = cate_scores.new_ones(size_trans[-1])
    n_stage = len(self.seg_num_grids)
    strides[:size_trans[0]] *= self.feature_strides[0]
    for ind_ in range(1, n_stage):
        strides[size_trans[ind_ - 1]:size_trans[ind_]] *= self.feature_strides[ind_]
    strides = strides[inds[:, 0]]

    # masks.
    seg_preds = seg_preds[inds[:, 0]]
    seg_masks = seg_preds > self.mask_threshold
    sum_masks = seg_masks.sum((1, 2)).float()

    # filter out masks whose area is not larger than the cell's stride.
    keep = sum_masks > strides
    if keep.sum() == 0:
        return result
    seg_masks = seg_masks[keep, ...]
    seg_preds = seg_preds[keep, ...]
    sum_masks = sum_masks[keep]
    cate_scores = cate_scores[keep]
    cate_labels = cate_labels[keep]

    # mask scoring.
    seg_scores = (seg_preds * seg_masks.float()).sum((1, 2)) / sum_masks
    cate_scores *= seg_scores

    # sort and keep top nms_pre.
    sort_inds = torch.argsort(cate_scores, descending=True)
    if len(sort_inds) > self.nms_per_image:
        sort_inds = sort_inds[:self.nms_per_image]
    seg_masks = seg_masks[sort_inds, :, :]
    seg_preds = seg_preds[sort_inds, :, :]
    sum_masks = sum_masks[sort_inds]
    cate_scores = cate_scores[sort_inds]
    cate_labels = cate_labels[sort_inds]

    # Matrix NMS.
    cate_scores = matrix_nms(seg_masks, cate_labels, cate_scores,
                             kernel=self.nms_kernel, sigma=self.nms_sigma,
                             sum_masks=sum_masks)

    # filter out low-scoring detections after NMS.
    keep = cate_scores >= self.update_threshold
    if keep.sum() == 0:
        return result
    seg_preds = seg_preds[keep, :, :]
    cate_scores = cate_scores[keep]
    cate_labels = cate_labels[keep]

    # sort and keep top_k.
    sort_inds = torch.argsort(cate_scores, descending=True)
    if len(sort_inds) > self.max_detections_per_image:
        sort_inds = sort_inds[:self.max_detections_per_image]
    seg_preds = seg_preds[sort_inds, :, :]
    cate_scores = cate_scores[sort_inds]
    cate_labels = cate_labels[sort_inds]

    # upsample to 1/1 of the input, crop the padding, then resize to the original image.
    seg_preds = F.interpolate(seg_preds.unsqueeze(0), size=upsampled_size_out,
                              mode='bilinear', align_corners=False)[:, :, :h, :w]
    seg_masks = F.interpolate(seg_preds, size=ori_shape,
                              mode='bilinear', align_corners=False).squeeze(0)
    seg_masks = seg_masks > self.mask_threshold

    seg_masks = BitMasks(seg_masks)
    result.pred_masks = seg_masks
    result.pred_boxes = seg_masks.get_bounding_boxes()
    result.scores = cate_scores
    result.pred_classes = cate_labels
    return result
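# Illustrative sketch of the per-cell stride lookup above: grid cells from
# all FPN levels share one flattened index space, and the cumulative cell
# counts in size_trans assign each cell its level's stride. The grid sizes
# and strides below are the SOLO defaults (assumed).
import torch

seg_num_grids = [40, 36, 24, 16, 12]
feature_strides = [8, 8, 16, 32, 32]

size_trans = torch.tensor(seg_num_grids).pow(2).cumsum(0)  # [1600, 2896, 3472, 3728, 3872]
strides = torch.ones(size_trans[-1])
strides[:size_trans[0]] *= feature_strides[0]
for i in range(1, len(seg_num_grids)):
    strides[size_trans[i - 1]:size_trans[i]] *= feature_strides[i]

print(strides[0].item(), strides[1600].item(), strides[-1].item())  # 8.0 8.0 32.0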
def inference_single_image(self, cate_preds, seg_preds_x, seg_preds_y,
                           featmap_size, img_shape, ori_shape):
    result = Instances(ori_shape)

    # overall info.
    h, w = img_shape
    upsampled_size_out = (featmap_size[0] * 4, featmap_size[1] * 4)

    # per-cell lookup tables: trans_diff, seg_diff, num_grids and strides.
    trans_size = torch.Tensor(self.seg_num_grids).pow(2).cumsum(0).long()
    trans_diff = torch.ones(trans_size[-1].item(), device=self.device).long()
    num_grids = torch.ones(trans_size[-1].item(), device=self.device).long()
    seg_size = torch.Tensor(self.seg_num_grids).cumsum(0).long()
    seg_diff = torch.ones(trans_size[-1].item(), device=self.device).long()
    strides = torch.ones(trans_size[-1].item(), device=self.device)

    n_stage = len(self.seg_num_grids)
    trans_diff[:trans_size[0]] *= 0
    seg_diff[:trans_size[0]] *= 0
    num_grids[:trans_size[0]] *= self.seg_num_grids[0]
    strides[:trans_size[0]] *= self.feature_strides[0]
    for ind_ in range(1, n_stage):
        trans_diff[trans_size[ind_ - 1]:trans_size[ind_]] *= trans_size[ind_ - 1]
        seg_diff[trans_size[ind_ - 1]:trans_size[ind_]] *= seg_size[ind_ - 1]
        num_grids[trans_size[ind_ - 1]:trans_size[ind_]] *= self.seg_num_grids[ind_]
        strides[trans_size[ind_ - 1]:trans_size[ind_]] *= self.feature_strides[ind_]

    # process.
    inds = (cate_preds > self.score_threshold)
    # category scores.
    cate_scores = cate_preds[inds]
    # category labels.
    inds = inds.nonzero(as_tuple=False)
    trans_diff = torch.index_select(trans_diff, dim=0, index=inds[:, 0])
    seg_diff = torch.index_select(seg_diff, dim=0, index=inds[:, 0])
    num_grids = torch.index_select(num_grids, dim=0, index=inds[:, 0])
    strides = torch.index_select(strides, dim=0, index=inds[:, 0])

    # split each flattened grid index into row/column indices into the
    # level-wise concatenated y/x mask banks of the decoupled head.
    y_inds = (inds[:, 0] - trans_diff) // num_grids
    x_inds = (inds[:, 0] - trans_diff) % num_grids
    y_inds += seg_diff
    x_inds += seg_diff
    cate_labels = inds[:, 1]

    seg_masks_soft = seg_preds_x[x_inds, ...] * seg_preds_y[y_inds, ...]
    seg_masks = seg_masks_soft > self.mask_threshold
    sum_masks = seg_masks.sum((1, 2)).float()

    # filter out masks whose area is not larger than the cell's stride.
    keep = sum_masks > strides
    if keep.sum() == 0:
        return result
    seg_masks_soft = seg_masks_soft[keep, ...]
    seg_masks = seg_masks[keep, ...]
    cate_scores = cate_scores[keep]
    sum_masks = sum_masks[keep]
    cate_labels = cate_labels[keep]

    # mask scoring.
    seg_score = (seg_masks_soft * seg_masks.float()).sum((1, 2)) / sum_masks
    cate_scores *= seg_score
    if len(cate_scores) == 0:
        return result

    # sort and keep top nms_pre.
    sort_inds = torch.argsort(cate_scores, descending=True)
    if len(sort_inds) > self.nms_per_image:
        sort_inds = sort_inds[:self.nms_per_image]
    seg_masks_soft = seg_masks_soft[sort_inds, :, :]
    seg_masks = seg_masks[sort_inds, :, :]
    cate_scores = cate_scores[sort_inds]
    sum_masks = sum_masks[sort_inds]
    cate_labels = cate_labels[sort_inds]

    # Matrix NMS.
    cate_scores = matrix_nms(seg_masks, cate_labels, cate_scores,
                             kernel=self.nms_kernel, sigma=self.nms_sigma,
                             sum_masks=sum_masks)

    # filter out low-scoring detections after NMS.
    keep = cate_scores >= self.update_threshold
    seg_masks_soft = seg_masks_soft[keep, :, :]
    cate_scores = cate_scores[keep]
    cate_labels = cate_labels[keep]

    # sort and keep top_k.
    sort_inds = torch.argsort(cate_scores, descending=True)
    if len(sort_inds) > self.max_detections_per_image:
        sort_inds = sort_inds[:self.max_detections_per_image]
    seg_masks_soft = seg_masks_soft[sort_inds, :, :]
    cate_scores = cate_scores[sort_inds]
    cate_labels = cate_labels[sort_inds]

    # upsample to 1/1 of the input, crop the padding, then resize to the original image.
    seg_masks_soft = F.interpolate(seg_masks_soft.unsqueeze(0), size=upsampled_size_out,
                                   mode='bilinear', align_corners=False)[:, :, :h, :w]
    seg_masks = F.interpolate(seg_masks_soft, size=ori_shape,
                              mode='bilinear', align_corners=False).squeeze(0)
    seg_masks = seg_masks > self.mask_threshold

    seg_masks = BitMasks(seg_masks)
    result.pred_masks = seg_masks
    result.pred_boxes = seg_masks.get_bounding_boxes()
    result.scores = cate_scores
    result.pred_classes = cate_labels
    return result
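# Illustrative sketch of the decoupled-head index math above: a flattened
# grid index i is split into a level, then into row/column indices into the
# level-wise concatenated seg_preds_y / seg_preds_x banks. split_index is a
# hypothetical helper written for this sketch; grid sizes are SOLO defaults
# (assumed).
import torch

seg_num_grids = [40, 36, 24, 16, 12]
trans_size = torch.tensor(seg_num_grids).pow(2).cumsum(0)  # cumulative cells per level
seg_size = torch.tensor(seg_num_grids).cumsum(0)           # cumulative rows/cols per level

def split_index(i):
    # which level the flattened cell index i falls into
    level = int(torch.searchsorted(trans_size, torch.tensor(i), right=True))
    trans_diff = 0 if level == 0 else int(trans_size[level - 1])
    seg_diff = 0 if level == 0 else int(seg_size[level - 1])
    n = seg_num_grids[level]
    y = (i - trans_diff) // n + seg_diff   # row index into the y mask bank
    x = (i - trans_diff) % n + seg_diff    # column index into the x mask bank
    return level, y, x

print(split_index(0))     # (0, 0, 0)   first cell of level 0
print(split_index(1600))  # (1, 40, 40) first cell of level 1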