def annotations_to_instances(annos, image_size, mask_format="polygon"):
    """
    Create an :class:`Instances` object used by the models,
    from instance annotations in the dataset dict.

    Args:
        annos (list[dict]): a list of instance annotations in one image, each
            element for one instance.
        image_size (tuple): height, width

    Returns:
        Instances:
            It will contain fields "gt_boxes", "gt_classes",
            "gt_masks", "gt_keypoints", if they can be obtained from `annos`.
            This is the format that builtin models expect.
    """
    # boxes is list[np.array]
    boxes = [
        BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS)
        for obj in annos
    ]
    # Create one Instances container holding all objects of this image.
    target = Instances(image_size)
    # Set the gt_boxes field.
    target.gt_boxes = Boxes(boxes)

    classes = [int(obj["category_id"]) for obj in annos]
    classes = torch.tensor(classes, dtype=torch.int64)
    # Set the class ids.
    target.gt_classes = classes

    # Segmentation handling below.
    if len(annos) and "segmentation" in annos[0]:
        segms = [obj["segmentation"] for obj in annos]
        if mask_format == "polygon":
            # TODO check type and provide better error
            masks = PolygonMasks(segms)
        else:
            assert mask_format == "bitmask", mask_format
            masks = []
            for segm in segms:
                if isinstance(segm, list):
                    # polygon
                    masks.append(polygons_to_bitmask(segm, *image_size))
                elif isinstance(segm, dict):
                    # COCO RLE
                    masks.append(mask_util.decode(segm))
                elif isinstance(segm, np.ndarray):
                    assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
                        segm.ndim)
                    # mask array
                    masks.append(segm)
                else:
                    raise ValueError(
                        "Cannot convert segmentation of type '{}' to BitMasks!"
                        "Supported types are: polygons as list[list[float] or ndarray],"
                        " COCO-style RLE as a dict, or a full-image segmentation mask "
                        "as a 2D ndarray.".format(type(segm)))
            # torch.from_numpy does not support array with negative stride.
            masks = BitMasks(
                torch.stack([
                    torch.from_numpy(np.ascontiguousarray(x)) for x in masks
                ]))
        target.gt_masks = masks

    if len(annos) and "keypoints" in annos[0]:
        kpts = [obj.get("keypoints", []) for obj in annos]
        target.gt_keypoints = Keypoints(kpts)

    return target
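# --- Usage sketch (not from the original source): a minimal call to the
# annotations_to_instances() above. It assumes the same module-level names the
# function itself uses (BoxMode, Instances, Boxes, PolygonMasks, torch, ...).
# The single annotation below is made up purely for illustration.
def _example_annotations_to_instances():
    annos = [
        {
            "bbox": [10.0, 20.0, 50.0, 80.0],   # x, y, w, h in absolute pixels
            "bbox_mode": BoxMode.XYWH_ABS,      # converted to XYXY_ABS inside
            "category_id": 3,
            # one polygon per instance: [x0, y0, x1, y1, ...]
            "segmentation": [[10.0, 20.0, 60.0, 20.0, 60.0, 100.0, 10.0, 100.0]],
        }
    ]
    instances = annotations_to_instances(annos, image_size=(120, 160))
    # instances.gt_boxes, instances.gt_classes and instances.gt_masks are now set.
    return instances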
def get_empty_instance(h, w):
    inst = Instances((h, w))
    inst.gt_boxes = Boxes(torch.rand(0, 4))
    inst.gt_classes = torch.tensor([]).to(dtype=torch.int64)
    inst.gt_masks = BitMasks(torch.rand(0, h, w))
    return inst
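# --- Usage sketch (illustrative, not part of the original code): an empty
# Instances object like this is typically used as a placeholder for images
# without ground-truth objects, so batching code can still call Instances.cat().
def _example_empty_instance():
    empty = get_empty_instance(h=480, w=640)
    assert len(empty) == 0 and empty.image_size == (480, 640)
    return empty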
def get_class_masks_from_instances(
    instances,
    class_id=1,
    add_ignore=True,
    rend_size=REND_SIZE,
    bbox_expansion=BBOX_EXPANSION_FACTOR,
    min_confidence=0.0,
    image_size=IMAGE_SIZE,
):
    """
    Gets occlusion-aware masks for a specific class index and additional metadata
    from PointRend instances.

    Args:
        instances: Detectron2 Instances with segmentation predictions.
        class_id (int): Object class id (using COCO dense ordering).
        add_ignore (bool): If True, adds occlusion-aware masking.
        rend_size (int): Mask size.
        bbox_expansion (float): Amount to pad the masks. This is important to
            prevent ignoring background pixels right outside the bounding box.
        min_confidence (float): Minimum confidence threshold for masks.

    Returns:
        keep_masks (N x rend_size x rend_size).
        keep_annotations (list[dict]): per-instance metadata with keys
            "bbox", "class_id", "mask", "score", "square_bbox".
    """
    if len(instances) == 0:
        return [], []
    instances = instances.to(torch.device("cpu:0"))
    boxes = instances.pred_boxes.tensor.numpy()
    class_ids = instances.pred_classes.numpy()
    scores = instances.scores.numpy()
    keep_ids = np.logical_and(class_ids == class_id, scores > min_confidence)
    bit_masks = BitMasks(instances.pred_masks)
    keep_annotations = []
    keep_masks = []
    full_boxes = torch.tensor([[0, 0, image_size, image_size]] * len(boxes)).float()
    full_sized_masks = bit_masks.crop_and_resize(full_boxes, image_size)
    for k in np.where(keep_ids)[0]:
        bbox = bbox_xy_to_wh(boxes[k])
        square_bbox = make_bbox_square(bbox, bbox_expansion)
        square_boxes = torch.FloatTensor(
            np.tile(bbox_wh_to_xy(square_bbox), (len(instances), 1))
        )
        masks = bit_masks.crop_and_resize(square_boxes, rend_size).clone().detach()
        if add_ignore:
            ignore_mask = masks[0]
            for i in range(1, len(masks)):
                ignore_mask = ignore_mask | masks[i]
            ignore_mask = -ignore_mask.float().numpy()
        else:
            ignore_mask = np.zeros((rend_size, rend_size))
        m = ignore_mask.copy()
        mask = masks[k]
        m[mask] = mask[mask]
        keep_masks.append(m)
        keep_annotations.append(
            {
                "bbox": bbox,
                "class_id": class_ids[k],
                "mask": full_sized_masks[k],
                "score": scores[k],
                "square_bbox": square_bbox,
            }
        )
    return keep_masks, keep_annotations
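# --- Usage sketch (assumption-heavy, not from the original source): build a tiny
# fake detection result and extract occlusion-aware masks for one class. It relies
# on the same module-level helpers the function above references (bbox_xy_to_wh,
# make_bbox_square, bbox_wh_to_xy, REND_SIZE, BBOX_EXPANSION_FACTOR) and on
# detectron2's Instances/Boxes API; all values are made up.
def _example_class_masks():
    import torch
    h = w = 256
    inst = Instances((h, w))
    inst.pred_boxes = Boxes(torch.tensor([[30.0, 40.0, 120.0, 200.0]]))
    inst.pred_classes = torch.tensor([1])   # the class id we will query
    inst.scores = torch.tensor([0.9])
    mask = torch.zeros(1, h, w, dtype=torch.bool)
    mask[0, 40:200, 30:120] = True          # crude rectangular mask
    inst.pred_masks = mask
    keep_masks, keep_annotations = get_class_masks_from_instances(
        inst, class_id=1, image_size=w)
    return keep_masks, keep_annotations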
def process_inst(self, classes, scores, pred_inst, img_shape, ori_shape):
    """
    Simple process to generate predictions of Things.

    Args:
        classes: predicted classes of Things
        scores: predicted scores of Things
        pred_inst: predicted instances of Things
        img_shape: input image shape
        ori_shape: original image shape

    Returns:
        result_instance: preserved results for Things
        pred_mask: preserved binary masks for Things
        classes: preserved object classes
        scores: processed object scores
    """
    pred_inst = pred_inst.sigmoid()[0]
    pred_mask = pred_inst > self.inst_thres

    # object rescore.
    sum_masks = pred_mask.sum((1, 2)).float() + 1e-6
    seg_score = (pred_inst * pred_mask.float()).sum((1, 2)) / sum_masks
    scores *= seg_score
    print('scores: ', scores.shape)

    keep = torch.argsort(scores, descending=True)
    pred_inst = pred_inst[keep]
    pred_mask = pred_mask[keep]
    scores = scores[keep]
    classes = classes[keep]
    sum_masks = sum_masks[keep]
    print('keep: ', keep.shape)
    print('pred_inst: ', pred_inst.shape)
    print('pred_mask: ', pred_mask.shape)
    print('scores: ', scores.shape)
    print('classes: ', classes.shape)
    print('sum_masks: ', sum_masks.shape)

    # object score filter.
    keep = scores >= 0.05
    print(keep)
    if keep.sum() == 0:
        result_instance = Instances(ori_shape,
                                    pred_masks=[],
                                    pred_boxes=[],
                                    pred_classes=[],
                                    scores=[])
        return result_instance, pred_mask, None, None
    pred_inst = pred_inst[keep]
    scores = scores[keep]
    classes = classes[keep]

    # sort and keep top_k
    keep = torch.argsort(scores, descending=True)
    keep = keep[:self.center_top_num]
    pred_inst = pred_inst[keep]
    scores = scores[keep].reshape(-1)
    classes = classes[keep].reshape(-1).to(torch.int32)

    pred_inst = F.interpolate(pred_inst.unsqueeze(0),
                              scale_factor=self.common_stride,
                              mode="bilinear",
                              align_corners=False)[..., :img_shape[0], :img_shape[1]]
    pred_inst = F.interpolate(pred_inst,
                              size=ori_shape,
                              mode="bilinear",
                              align_corners=False)[0]
    pred_mask = pred_inst > self.inst_thres
    pred_bitinst = BitMasks(pred_mask)
    result_instance = Instances(ori_shape,
                                pred_masks=pred_bitinst,
                                pred_boxes=pred_bitinst.get_bounding_boxes(),
                                pred_classes=classes,
                                scores=scores)
    return result_instance, pred_mask, classes, scores
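# --- Worked example (illustrative only): the "object rescore" step above
# multiplies each classification score by the mean foreground probability
# inside its predicted mask, down-weighting instances with weak masks.
def _example_mask_rescore(inst_thres=0.5):
    import torch
    pred_inst = torch.rand(3, 64, 64)       # stand-in for sigmoid mask probabilities
    pred_mask = pred_inst > inst_thres
    sum_masks = pred_mask.sum((1, 2)).float() + 1e-6
    seg_score = (pred_inst * pred_mask.float()).sum((1, 2)) / sum_masks
    scores = torch.tensor([0.9, 0.6, 0.3]) * seg_score
    return scores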
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * "image": Tensor, image in (C, H, W) format.
            * "sem_seg": semantic segmentation ground truth
            * "center": center points heatmap ground truth
            * "offset": pixel offsets to center points ground truth
            * Other information that's included in the original dicts, such as:
              "height", "width" (int): the output resolution of the model (may be
              different from input resolution), used in inference.

    Returns:
        list[dict]:
            each dict is the results for one image. The dict contains the following keys:

            * "instances": see :meth:`GeneralizedRCNN.forward` for its format.
            * "sem_seg": see :meth:`SemanticSegmentor.forward` for its format.
            * "panoptic_seg": see :func:`combine_semantic_and_instance_outputs` for its format.
    """
    images = [x["image"].to(self.device) for x in batched_inputs]
    images = [(x - self.pixel_mean) / self.pixel_std for x in images]
    size_divisibility = self.backbone.size_divisibility
    images = ImageList.from_tensors(images, size_divisibility)

    features = self.backbone(images.tensor)

    losses = {}
    if "sem_seg" in batched_inputs[0]:
        targets = [x["sem_seg"].to(self.device) for x in batched_inputs]
        targets = ImageList.from_tensors(
            targets, size_divisibility, self.sem_seg_head.ignore_value).tensor
        if "sem_seg_weights" in batched_inputs[0]:
            # The default D2 DatasetMapper may not contain "sem_seg_weights"
            # Avoid error in testing when default DatasetMapper is used.
            weights = [
                x["sem_seg_weights"].to(self.device) for x in batched_inputs
            ]
            weights = ImageList.from_tensors(weights, size_divisibility).tensor
        else:
            weights = None
    else:
        targets = None
        weights = None
    sem_seg_results, sem_seg_losses = self.sem_seg_head(
        features, targets, weights)
    losses.update(sem_seg_losses)

    if "center" in batched_inputs[0] and "offset" in batched_inputs[0]:
        center_targets = [
            x["center"].to(self.device) for x in batched_inputs
        ]
        center_targets = ImageList.from_tensors(
            center_targets, size_divisibility).tensor.unsqueeze(1)
        center_weights = [
            x["center_weights"].to(self.device) for x in batched_inputs
        ]
        center_weights = ImageList.from_tensors(center_weights,
                                                size_divisibility).tensor

        offset_targets = [
            x["offset"].to(self.device) for x in batched_inputs
        ]
        offset_targets = ImageList.from_tensors(offset_targets,
                                                size_divisibility).tensor
        offset_weights = [
            x["offset_weights"].to(self.device) for x in batched_inputs
        ]
        offset_weights = ImageList.from_tensors(offset_weights,
                                                size_divisibility).tensor
    else:
        center_targets = None
        center_weights = None
        offset_targets = None
        offset_weights = None

    center_results, offset_results, center_losses, offset_losses = self.ins_embed_head(
        features, center_targets, center_weights, offset_targets,
        offset_weights)
    losses.update(center_losses)
    losses.update(offset_losses)

    if self.training:
        return losses

    processed_results = []
    for sem_seg_result, center_result, offset_result, input_per_image, image_size in zip(
            sem_seg_results, center_results, offset_results, batched_inputs,
            images.image_sizes):
        height = input_per_image.get("height")
        width = input_per_image.get("width")
        r = sem_seg_postprocess(sem_seg_result, image_size, height, width)
        c = sem_seg_postprocess(center_result, image_size, height, width)
        o = sem_seg_postprocess(offset_result, image_size, height, width)
        # Post-processing to get panoptic segmentation.
        panoptic_image, _ = get_panoptic_segmentation(
            r.argmax(dim=0, keepdim=True),
            c,
            o,
            thing_ids=self.meta.thing_dataset_id_to_contiguous_id.values(),
            label_divisor=self.meta.label_divisor,
            stuff_area=self.stuff_area,
            void_label=-1,
            threshold=self.threshold,
            nms_kernel=self.nms_kernel,
            top_k=self.top_k,
        )
        # For semantic segmentation evaluation.
        processed_results.append({"sem_seg": r})
        panoptic_image = panoptic_image.squeeze(0)
        semantic_prob = F.softmax(r, dim=0)
        # For panoptic segmentation evaluation.
        processed_results[-1]["panoptic_seg"] = (panoptic_image, None)
        # For instance segmentation evaluation.
        if self.predict_instances:
            instances = []
            panoptic_image_cpu = panoptic_image.cpu().numpy()
            for panoptic_label in np.unique(panoptic_image_cpu):
                if panoptic_label == -1:
                    continue
                pred_class = panoptic_label // self.meta.label_divisor
                isthing = pred_class in list(
                    self.meta.thing_dataset_id_to_contiguous_id.values())
                # Get instance segmentation results.
                if isthing:
                    instance = Instances((height, width))
                    # Evaluation code takes continuous id starting from 0
                    instance.pred_classes = torch.tensor(
                        [pred_class], device=panoptic_image.device)
                    mask = panoptic_image == panoptic_label
                    instance.pred_masks = mask.unsqueeze(0)
                    # Average semantic probability
                    sem_scores = semantic_prob[pred_class, ...]
                    sem_scores = torch.mean(sem_scores[mask])
                    # Center point probability
                    mask_indices = torch.nonzero(mask).float()
                    center_y, center_x = (
                        torch.mean(mask_indices[:, 0]),
                        torch.mean(mask_indices[:, 1]),
                    )
                    center_scores = c[0, int(center_y.item()),
                                      int(center_x.item())]
                    # Confidence score is semantic prob * center prob.
                    instance.scores = torch.tensor(
                        [sem_scores * center_scores],
                        device=panoptic_image.device)
                    # Get bounding boxes
                    instance.pred_boxes = BitMasks(
                        instance.pred_masks).get_bounding_boxes()
                    instances.append(instance)
            if len(instances) > 0:
                processed_results[-1]["instances"] = Instances.cat(
                    instances)

    return processed_results
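# --- Decoding sketch (not from the original source): the instance-extraction
# loop above relies on panoptic ids produced by get_panoptic_segmentation()
# encoding the class as id // label_divisor; the remainder distinguishes
# instances of the same class.
def _example_decode_panoptic_id(panoptic_label, label_divisor=1000):
    pred_class = panoptic_label // label_divisor
    instance_index = panoptic_label % label_divisor
    return pred_class, instance_index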
def annotations_to_instances(annos, image_size, mask_format="polygon"):
    """
    Create an :class:`Instances` object used by the models,
    from instance annotations in the dataset dict.

    Args:
        annos (list[dict]): a list of instance annotations in one image, each
            element for one instance.
        image_size (tuple): height, width

    Returns:
        Instances:
            It will contain fields "gt_boxes", "gt_classes",
            "gt_masks", "gt_keypoints", if they can be obtained from `annos`.
            This is the format that builtin models expect.
    """
    boxes = [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS)
             for obj in annos]
    target = Instances(image_size)
    boxes = target.gt_boxes = Boxes(boxes)
    boxes.clip(image_size)

    classes = [obj["category_id"] for obj in annos]
    classes = torch.tensor(classes, dtype=torch.int64)
    target.gt_classes = classes

    if len(annos) and "segmentation" in annos[0]:
        segms = [obj["segmentation"] for obj in annos]
        visible = [obj["visible_mask"] for obj in annos]
        invisible = []
        for obj in annos:
            if "invisible_mask" in obj:
                invisible.append(obj["invisible_mask"])
            else:
                # degenerate polygon used as a placeholder when the instance
                # has no occluded region annotated
                invisible.append([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])
        if mask_format == "polygon":
            # gt amodal masks per image
            a_masks = PolygonMasks(segms)
            # gt visible masks per image
            v_masks = PolygonMasks(visible)
            # gt invisible masks per image
            i_masks = PolygonMasks(invisible)
        else:
            assert mask_format == "bitmask", mask_format
            a_masks = []
            v_masks = []
            i_masks = []
            for segm, vis, invis in zip(segms, visible, invisible):
                if isinstance(segm, list):
                    # polygon
                    a_masks.append(polygons_to_bitmask(segm, *image_size))
                    v_masks.append(polygons_to_bitmask(vis, *image_size))
                    i_masks.append(polygons_to_bitmask(invis, *image_size))
                elif isinstance(segm, dict):
                    # COCO RLE
                    a_masks.append(mask_util.decode(segm))
                    v_masks.append(mask_util.decode(vis))
                    i_masks.append(mask_util.decode(invis))
                elif isinstance(segm, np.ndarray):
                    assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
                        segm.ndim
                    )
                    # mask array
                    a_masks.append(segm)
                    v_masks.append(vis)
                    i_masks.append(invis)
                else:
                    raise ValueError(
                        "Cannot convert segmentation of type '{}' to BitMasks!"
                        "Supported types are: polygons as list[list[float] or ndarray],"
                        " COCO-style RLE as a dict, or a full-image segmentation mask "
                        "as a 2D ndarray.".format(type(segm))
                    )
            # torch.from_numpy does not support array with negative stride.
            a_masks = BitMasks(
                torch.stack([torch.from_numpy(np.ascontiguousarray(x)) for x in a_masks])
            )
            v_masks = BitMasks(
                torch.stack([torch.from_numpy(np.ascontiguousarray(x)) for x in v_masks])
            )
            i_masks = BitMasks(
                torch.stack([torch.from_numpy(np.ascontiguousarray(x)) for x in i_masks])
            )
        # the original mask head is now the amodal mask head
        target.gt_masks = a_masks
        target.gt_v_masks = v_masks
        target.gt_i_masks = i_masks

    if len(annos) and "keypoints" in annos[0]:
        kpts = [obj.get("keypoints", []) for obj in annos]
        target.gt_keypoints = Keypoints(kpts)

    return target
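# --- Annotation sketch (made up for illustration): the amodal variant above
# additionally expects "visible_mask" and, optionally, "invisible_mask" polygons
# in each annotation dict. A hypothetical single-instance example:
_example_amodal_anno = {
    "bbox": [10.0, 10.0, 40.0, 40.0],
    "bbox_mode": BoxMode.XYWH_ABS,
    "category_id": 0,
    # full (amodal) extent of the object
    "segmentation": [[10.0, 10.0, 50.0, 10.0, 50.0, 50.0, 10.0, 50.0]],
    # the part of the object that is actually visible
    "visible_mask": [[10.0, 10.0, 30.0, 10.0, 30.0, 50.0, 10.0, 50.0]],
    # the occluded part; when absent, a degenerate polygon is substituted above
    "invisible_mask": [[30.0, 10.0, 50.0, 10.0, 50.0, 50.0, 30.0, 50.0]],
}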
def process_inst_onnx(self, classes, scores, pred_inst, img_shape, vis=False):
    pred_inst = pred_inst.sigmoid()[0]
    pred_mask = pred_inst > self.inst_thres

    # object rescore.
    sum_masks = pred_mask.sum((1, 2)).float() + 1e-6
    seg_score = (pred_inst * pred_mask.float()).sum((1, 2)) / sum_masks
    # scores *= seg_score
    scores = scores * seg_score

    # keep = torch.argsort(scores, descending=True)
    dim = 0
    _, keep = torch.sort(scores, descending=True, dim=dim)
    pred_inst = pred_inst[keep]
    # pred_mask = pred_mask[keep]
    scores = scores[keep]
    classes = classes[keep]
    sum_masks = sum_masks[keep]
    print('keep: ', keep.shape)
    print('pred_inst: ', pred_inst.shape)
    print('pred_mask: ', pred_mask.shape)
    print('scores: ', scores.shape)
    print('classes: ', classes.shape)

    if vis:
        ori_shape = [720, 1280]
        # object score filter.
        keep = scores >= 0.05
        if keep.sum() == 0:
            result_instance = Instances(ori_shape,
                                        pred_masks=[],
                                        pred_boxes=[],
                                        pred_classes=[],
                                        scores=[])
            return {'instances': result_instance}
        pred_inst = pred_inst[keep]
        scores = scores[keep]
        classes = classes[keep]

        # sort and keep top_k
        keep = torch.argsort(scores, descending=True)
        keep = keep[:self.center_top_num]
        pred_inst = pred_inst[keep]
        scores = scores[keep].reshape(-1)
        classes = classes[keep].reshape(-1).to(torch.int32)

    pred_inst = F.interpolate(pred_inst.unsqueeze(0),
                              scale_factor=self.common_stride,
                              mode="bilinear",
                              align_corners=False)[..., :img_shape[0], :img_shape[1]]
    if vis:
        pred_inst = F.interpolate(pred_inst,
                                  size=ori_shape,
                                  mode="bilinear",
                                  align_corners=False)[0]
        pred_mask = pred_inst > self.inst_thres
        pred_bitinst = BitMasks(pred_mask)
        result_instance = Instances(ori_shape,
                                    pred_masks=pred_bitinst,
                                    pred_boxes=pred_bitinst.get_bounding_boxes(),
                                    pred_classes=classes,
                                    scores=scores)
        return {"instances": result_instance}
    else:
        # Visualise the raw instance mask output here if needed
        # (should be the same output as TensorRT):
        # print('pred_inst shape: ', pred_inst.shape)
        # for i in pred_inst[0]:
        #     import cv2
        #     i = i.cpu().numpy()
        #     cv2.imshow('aa', i)
        #     cv2.waitKey(0)
        return pred_inst, classes, scores
def __getitem__(self, idx):
    # Retrieve meta data of image
    img_data = self.meta_data[idx]

    # Load image
    path_img = os.path.join(self.root_dir, 'leftImg8bit', self.split,
                            img_data['file_name'].split('_')[0],
                            img_data['file_name'].replace('gtFine_', ''))
    image = np.asarray(Image.open(path_img))

    # Get label info
    path_label = os.path.join(self.root_dir, 'gtFine',
                              'cityscapes_panoptic_' + self.split,
                              img_data['labelfile_name'])
    panoptic = np.asarray(Image.open(path_label))
    panoptic = rgb2id(panoptic)

    # Get bbox info
    rpn_bbox = []
    class_bbox = []
    for seg in img_data['segments_info']:
        seg_category = self.semantic_class_mapper[seg['category_id']]
        if seg_category['isthing']:
            rpn_bbox.append(seg["bbox"])
            class_bbox.append(self.instance_class_mapper[seg['category_id']])

    # Apply augmentation with albumentations
    if self.transform is not None:
        transformed = self.transform(
            image=image,
            mask=panoptic,
            bboxes=rpn_bbox,
            class_labels=class_bbox
        )
        image = transformed['image']
        panoptic = transformed['mask']
        rpn_bbox = transformed['bboxes']
        class_bbox = transformed['class_labels']

    # Create instance class for detectron (Mask RCNN Head)
    instance = Instances(panoptic.shape)

    # Create semantic segmentation target with augmented data
    semantic = np.zeros_like(panoptic, dtype=np.int64)
    rpn_mask = np.zeros_like(panoptic)
    instance_mask = []
    instance_cls = []
    for seg in img_data['segments_info']:
        seg_category = self.semantic_class_mapper[seg['category_id']]
        semantic[panoptic == seg["id"]] = seg_category['train_id']
        # If the segment is a thing, generate a mask for the Mask R-CNN target
        # and collect information for the RPN targets
        if seg_category['isthing']:
            seg_category = self.instance_class_mapper[seg['category_id']]
            mask = np.zeros_like(panoptic)
            mask[panoptic == seg["id"]] = 1  # seg_category['train_id']
            instance_cls.append(seg_category['train_id'])
            instance_mask.append(mask)
            # RPN targets
            rpn_mask[panoptic == seg["id"]] = 1

    # Create same size of bbox and mask instance
    if len(rpn_bbox) > 0:
        rpn_bbox = coco_to_pascal_bbox(np.stack([*rpn_bbox]))
        instance.gt_masks = BitMasks(instance_mask)
        instance.gt_classes = torch.as_tensor(instance_cls)
        instance.gt_boxes = Boxes(rpn_bbox)
    else:
        instance.gt_masks = BitMasks(torch.Tensor([]).view(0, 1, 1))
        instance.gt_classes = torch.as_tensor([])
        instance.gt_boxes = Boxes([])

    return {
        'image': np.array(image),
        'semantic': semantic,
        'instance': instance,
        'image_id': img_data['image_id']
    }
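# --- Helper sketch (assumption: rgb2id above follows the panopticapi convention
# for Cityscapes/COCO panoptic PNGs, where the segment id is packed into the RGB
# channels): a minimal reimplementation for reference.
def _example_rgb2id(color_img):
    import numpy as np
    color_img = np.asarray(color_img, dtype=np.uint32)
    return color_img[..., 0] + 256 * color_img[..., 1] + 256 * 256 * color_img[..., 2]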
def annotations_to_instances_with_attributes(annos,
                                             image_size,
                                             mask_format="polygon",
                                             load_attributes=False,
                                             max_attr_per_ins=16):
    """
    Extend the function annotations_to_instances() to support attributes.
    """
    boxes = [
        BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS)
        for obj in annos
    ]
    target = Instances(image_size)
    boxes = target.gt_boxes = Boxes(boxes)
    boxes.clip(image_size)

    classes = [obj["category_id"] for obj in annos]
    classes = torch.tensor(classes, dtype=torch.int64)
    target.gt_classes = classes

    if len(annos) and "segmentation" in annos[0]:
        segms = [obj["segmentation"] for obj in annos]
        if mask_format == "polygon":
            masks = PolygonMasks(segms)
        else:
            assert mask_format == "bitmask", mask_format
            masks = []
            for segm in segms:
                if isinstance(segm, list):
                    # polygon
                    masks.append(polygons_to_bitmask(segm, *image_size))
                elif isinstance(segm, dict):
                    # COCO RLE
                    masks.append(mask_util.decode(segm))
                elif isinstance(segm, np.ndarray):
                    assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
                        segm.ndim)
                    # mask array
                    masks.append(segm)
                else:
                    raise ValueError(
                        "Cannot convert segmentation of type '{}' to BitMasks!"
                        "Supported types are: polygons as list[list[float] or ndarray],"
                        " COCO-style RLE as a dict, or a full-image segmentation mask "
                        "as a 2D ndarray.".format(type(segm)))
            masks = BitMasks(
                torch.stack([
                    torch.from_numpy(np.ascontiguousarray(x)) for x in masks
                ]))
        target.gt_masks = masks

    if len(annos) and "keypoints" in annos[0]:
        kpts = [obj.get("keypoints", []) for obj in annos]
        target.gt_keypoints = Keypoints(kpts)

    if len(annos) and load_attributes:
        attributes = -torch.ones(
            (len(annos), max_attr_per_ins), dtype=torch.int64)
        for idx, anno in enumerate(annos):
            if "attribute_ids" in anno:
                for jdx, attr_id in enumerate(anno["attribute_ids"]):
                    attributes[idx, jdx] = attr_id
        target.gt_attributes = attributes

    return target
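# --- Worked example (illustrative): gt_attributes is a fixed-width tensor padded
# with -1, so an instance with attribute_ids [3, 7] and max_attr_per_ins=4
# becomes the row [3, 7, -1, -1].
def _example_attribute_padding(attribute_ids=(3, 7), max_attr_per_ins=4):
    import torch
    row = -torch.ones(max_attr_per_ins, dtype=torch.int64)
    for j, attr_id in enumerate(attribute_ids):
        row[j] = attr_id
    return row  # tensor([ 3,  7, -1, -1])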
def _desc_to_example(desc: Dict):
    # Detectron2 Model Input Format:
    #   image: Tensor[C, H, W];
    #   height, width: output height and width;
    #   instances: Instances object for training, with the following fields:
    #     "gt_boxes":
    #     "gt_classes":
    #     "gt_masks": a PolygonMasks or BitMasks object storing N masks, one for each instance.
    desc = copy.deepcopy(desc)  # it will be modified by code below

    image_path = os.path.join(images_dir, f'{desc["image_id"]}.jpg')
    # shape: [H, W, C]
    origin_image = detection_utils.read_image(image_path, format="BGR")
    oh, ow, oc = origin_height, origin_width, origin_channels = origin_image.shape

    if augmentations is not None:
        aug_input = T.AugInput(origin_image)
        transforms = augmentations(aug_input)
        auged_image = aug_input.image
    else:
        auged_image = origin_image
    ah, aw, ac = auged_height, auged_width, auged_channels = auged_image.shape

    if not is_train:
        return {
            "image_id": desc['image_id'],  # COCOEvaluator.process() needs it.
            # expected shape: [C, H, W]
            "image": torch.as_tensor(
                np.ascontiguousarray(auged_image.transpose(2, 0, 1))),
            "height": auged_height,
            "width": auged_width,
        }

    target = Instances(image_size=(ah, aw))

    # The always-true string conditions below only serve as named sections.
    if 'fill gt_boxes':
        # shape: n_box, 4
        boxes_abs = np.array(
            [anno['bbox'] for anno in desc['annotations']])
        if augmentations is not None:
            # clip transformed bbox to image size
            boxes_auged = transforms.apply_box(
                np.array(boxes_abs)).clip(min=0)
            boxes_auged = np.minimum(
                boxes_auged,
                np.array([aw, ah, aw, ah])[np.newaxis, :])
        else:
            boxes_auged = boxes_abs
        target.gt_boxes = Boxes(boxes_auged)

    if 'fill gt_classes':
        classes = [anno['category_id'] for anno in desc['annotations']]
        classes = torch.tensor(classes, dtype=torch.int64)
        target.gt_classes = classes

    if 'fill gt_masks':
        mask_paths = [
            os.path.join(masks_dir, f'{anno["mask_id"]}.png')
            for anno in desc['annotations']
        ]
        masks = np.array(
            list(
                map(
                    lambda p: cv2.resize(
                        cv2.imread(p, flags=cv2.IMREAD_GRAYSCALE),
                        dsize=(ow, oh)),
                    mask_paths)))
        if augmentations is not None:
            masks_auged = np.array(
                list(map(lambda x: transforms.apply_segmentation(x), masks)))
        else:
            masks_auged = masks
        masks_auged = masks_auged > MASK_THRESHOLD
        masks_auged = BitMasks(
            torch.stack([
                torch.from_numpy(np.ascontiguousarray(x))
                for x in masks_auged
            ]))
        target.gt_masks = masks_auged

    return {
        "image_id": desc['image_id'],  # COCOEvaluator.process() needs it.
        # expected shape: [C, H, W]
        "image": torch.as_tensor(
            np.ascontiguousarray(auged_image.transpose(2, 0, 1))),
        "height": auged_height,
        "width": auged_width,
        "instances": target,  # refer: annotations_to_instances()
    }
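# --- Input sketch (made up for illustration): _desc_to_example() expects a desc
# dict roughly shaped as below. images_dir, masks_dir, augmentations and is_train
# are closure variables of the enclosing mapper; the field values here are
# hypothetical.
_example_desc = {
    "image_id": "000001",                      # resolved to <images_dir>/000001.jpg
    "annotations": [
        {
            "bbox": [15.0, 25.0, 80.0, 90.0],  # absolute XYXY, clipped after augmentation
            "category_id": 2,
            "mask_id": "000001_0",             # resolved to <masks_dir>/000001_0.png
        }
    ],
}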
def inference(self, box_cls, box_pred, mask_pred, image_sizes):
    """
    Arguments:
        box_cls (Tensor): tensor of shape (batch_size, num_queries, K).
            The tensor predicts the classification probability for each query.
        box_pred (Tensor): tensors of shape (batch_size, num_queries, 4).
            The tensor predicts 4-vector (x,y,w,h) box regression values for every query.
        image_sizes (List[torch.Size]): the input image sizes

    Returns:
        results (List[Instances]): a list of #images elements.
    """
    assert len(box_cls) == len(image_sizes)
    results = []

    # For each box we assign the best class, or the second best if the best one is `no_object`.
    if self.use_focal_loss:
        prob = box_cls.sigmoid()
        # TODO make top-100 as an option for non-focal-loss as well
        scores, topk_indexes = torch.topk(prob.view(box_cls.shape[0], -1),
                                          100,
                                          dim=1)
        topk_boxes = topk_indexes // box_cls.shape[2]
        labels = topk_indexes % box_cls.shape[2]
    else:
        scores, labels = F.softmax(box_cls, dim=-1)[:, :, :-1].max(-1)

    for i, (
            scores_per_image,
            labels_per_image,
            box_pred_per_image,
            image_size,
    ) in enumerate(zip(scores, labels, box_pred, image_sizes)):
        result = Instances(image_size)
        boxes = box_cxcywh_to_xyxy(box_pred_per_image)
        if self.use_focal_loss:
            boxes = torch.gather(
                boxes.unsqueeze(0), 1,
                topk_boxes.unsqueeze(-1).repeat(1, 1, 4)).squeeze()
        result.pred_boxes = Boxes(boxes)
        result.pred_boxes.scale(scale_x=image_size[1],
                                scale_y=image_size[0])
        if self.mask_on:
            mask = F.interpolate(
                mask_pred[i].unsqueeze(0),
                size=image_size,
                mode="bilinear",
                align_corners=False,
            )
            mask = mask[0].sigmoid() > 0.5
            B, N, H, W = mask_pred.shape
            mask = BitMasks(mask.cpu()).crop_and_resize(
                result.pred_boxes.tensor.cpu(), 32)
            result.pred_masks = mask.unsqueeze(1).to(mask_pred[0].device)
        result.scores = scores_per_image
        result.pred_classes = labels_per_image
        results.append(result)
    return results
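# --- Worked example (illustrative only): with focal loss the class probabilities
# are flattened to shape (B, num_queries * K) before torch.topk, so each flat
# index is decoded back into a query index (// K) and a class label (% K),
# mirroring the inference() method above.
def _example_topk_decode(num_queries=300, num_classes=80, k=100):
    import torch
    prob = torch.rand(1, num_queries, num_classes)     # stand-in for sigmoid scores
    scores, topk_indexes = torch.topk(prob.view(1, -1), k, dim=1)
    topk_boxes = topk_indexes // num_classes   # which query each detection came from
    labels = topk_indexes % num_classes        # which class it is
    return scores, topk_boxes, labels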
def annotations_to_instances(annos, image_size, mask_format="polygon"):
    """
    Create an :class:`Instances` object used by the models,
    from instance annotations in the dataset dict.

    Args:
        annos (list[dict]): a list of instance annotations in one image, each
            element for one instance.
        image_size (tuple): height, width

    Returns:
        Instances:
            It will contain fields "gt_boxes", "gt_classes",
            "gt_masks", "gt_keypoints", if they can be obtained from `annos`.
            This is the format that builtin models expect.
    """
    boxes = [
        BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS)
        for obj in annos
    ]
    target = Instances(image_size)
    boxes = target.gt_boxes = Boxes(boxes)
    boxes.clip(image_size)

    classes = [obj["category_id"] for obj in annos]
    classes = torch.tensor(classes, dtype=torch.int64)
    target.gt_classes = classes

    if len(annos) and "segmentation" in annos[0]:
        # both may be bitmasks instead of polygons
        segm = [obj["segmentation"] for obj in annos]
        visible_segm = [obj["visible_mask"] for obj in annos]
        if mask_format == "polygon":
            masks = PolygonMasks(segm)
            if not isinstance(visible_segm[0], list):
                visible_masks = visible_segm
                visible_masks = BitMasks(
                    torch.stack([torch.from_numpy(x) for x in visible_masks]))
            else:
                # visible_masks = BitMasks.from_polygon_masks(visible_segm, *image_size)
                visible_masks = PolygonMasks(visible_segm)
        else:
            assert mask_format == "bitmask", mask_format
            if not isinstance(segm[0], list):
                masks = BitMasks(
                    torch.stack([torch.from_numpy(x) for x in segm]))
            else:
                masks = BitMasks.from_polygon_masks(segm, *image_size)
            if not isinstance(visible_segm[0], list):
                visible_masks = visible_segm
                visible_masks = BitMasks(
                    torch.stack([torch.from_numpy(x) for x in visible_masks]))
            else:
                visible_masks = BitMasks.from_polygon_masks(
                    visible_segm, *image_size)
        target.gt_masks = masks
        target.gt_visible_masks = visible_masks

    if len(annos) and "keypoints" in annos[0]:
        kpts = [obj.get("keypoints", []) for obj in annos]
        target.gt_keypoints = Keypoints(kpts)

    return target
def read_from_ddict(self, ddict, inplace=True):
    """
    Reads ground truth annotations from data dicts and stores the information
    as attributes of the InstanceSet object. The descriptions of the attributes
    are provided in the documentation for self.__init__().

    Parameters
    -----------
    ddict: list
        List of data dicts in format described below in Notes.
    inplace: bool
        If True, the object is modified in-place. Else, the InstanceSet object
        is returned.

    Returns
    -----------
    self (optional): InstanceSet
        only returned if inplace == False

    Notes
    ------
    Data dicts should have the following format:

    -'file_name': str or Path object
        path to image corresponding to annotations
    -'mask_format': str
        'polygonmask' if segmentation masks are lists of XY coordinates, or
        'bitmask' if segmentation masks are RLE encoded segmentation masks
    -'height': int
        image height in pixels
    -'width': int
        image width in pixels
    -'annotations': list(dic)
        list of annotations. See the annotation format below.
    -'num_instances': int
        equal to len(annotations)- number of instances present in the image

    The dictionary format for the annotation dictionaries is as follows:

    -'category_id': int
        numeric class label for the instance.
    -'bbox_mode': detectron2.structures.BoxMode object
        describes the format of the bounding box coordinates. The default is
        BoxMode.XYXY_ABS.
    -'bbox': list(int)
        4-element list of bbox coordinates
    -'segmentation': list
        list containing either:
          - a list of polygon coordinates (mask format is polygonmasks)
          - dictionaries of RLE mask encodings (mask format is bitmasks)
    """
    # default values- always set
    self.pred_or_gt = 'gt'  # ddict assumed to be ground truth labels from get_ddict function

    # required values- function will error out if these are not set
    self.filepath = Path(ddict['file_name'])
    self.mask_format = ddict['mask_format']
    image_size = (ddict['height'], ddict['width'])

    # instances_gt = annotations_to_instances(ddict['annotations'], image_size, self.mask_format)
    class_idx = np.asarray(
        [anno['category_id'] for anno in ddict['annotations']], np.int64)
    bbox = np.stack([anno['bbox'] for anno in ddict['annotations']])
    segs = [anno['segmentation'] for anno in ddict['annotations']]

    segtype = type(segs[0])
    if segtype == dict:
        # RLE encoded mask
        masks = RLEMasks(segs)
    elif segtype == np.ndarray and segs[0].dtype == np.bool_:
        # bitmask
        masks = BitMasks(np.stack(segs))
    else:
        # list of (list or array) of coords in format [x0,y0,x1,y1,...xn,yn]
        masks = PolygonMasks(segs)

    instances = Instances(
        image_size, **{
            'masks': masks,
            'boxes': bbox,
            'class_idx': class_idx
        })
    self.instances = instances
    self.instances.colors = visualize.random_colors(
        len(instances), self.randomstate)

    # optional values- default to None if not in ddict
    self.dataset_class = ddict.get('dataset_class', None)
    HFW = ddict.get('HFW', None)
    HFW_units = None
    if HFW is not None:
        try:
            HFW = float(HFW)
        except ValueError:
            split = HFW.split(' ')
            if len(split) == 2:
                HFW = float(split[0])
                HFW_units = split[1]
    self.HFW = HFW
    self.HFW_units = HFW_units

    if not inplace:
        return self
    return
def annotations_to_instances(annos, image_size, mask_format="polygon", max_num_planes=20):
    """
    Create an :class:`Instances` object used by the models,
    from instance annotations in the dataset dict.

    Args:
        annos (list[dict]): a list of annotations, one per instance.
        image_size (tuple): height, width

    Returns:
        Instances: It will contain fields "gt_boxes", "gt_classes",
            "gt_masks", "gt_keypoints", if they can be obtained from `annos`.
    """
    boxes = [
        BoxMode.convert(obj["bbox"], BoxMode(obj["bbox_mode"]), BoxMode.XYXY_ABS)
        for obj in annos
    ]
    target = Instances(image_size)
    boxes = target.gt_boxes = Boxes(boxes)
    boxes.clip(image_size)

    classes = [obj["category_id"] for obj in annos]
    classes = torch.tensor(classes, dtype=torch.int64)
    target.gt_classes = classes

    if len(annos) and "segmentation" in annos[0]:
        segms = [obj["segmentation"] for obj in annos]
        if mask_format == "polygon":
            masks = PolygonMasks(segms)
        else:
            assert mask_format == "bitmask", mask_format
            masks = []
            for segm in segms:
                if isinstance(segm, list):
                    # polygon
                    masks.append(polygons_to_bitmask(segm, *image_size))
                elif isinstance(segm, dict):
                    # COCO RLE
                    masks.append(mask_util.decode(segm))
                elif isinstance(segm, np.ndarray):
                    assert (
                        segm.ndim == 2
                    ), "Expect segmentation of 2 dimensions, got {}.".format(
                        segm.ndim)
                    # mask array
                    masks.append(segm)
                else:
                    raise ValueError(
                        "Cannot convert segmentation of type '{}' to BitMasks!"
                        "Supported types are: polygons as list[list[float] or ndarray],"
                        " COCO-style RLE as a dict, or a full-image segmentation mask "
                        "as a 2D ndarray.".format(type(segm)))
            # torch.from_numpy does not support array with negative stride.
            masks = BitMasks(
                torch.stack([
                    torch.from_numpy(np.ascontiguousarray(x)) for x in masks
                ]))
        target.gt_masks = masks

    if len(annos) and "plane" in annos[0]:
        plane = [torch.tensor(obj["plane"]) for obj in annos]
        plane_idx = [torch.tensor([i]) for i in range(len(plane))]
        target.gt_planes = torch.stack(plane, dim=0)
        target.gt_plane_idx = torch.stack(plane_idx, dim=0)
    return target