def test_get_bounding_box(self):
    # Two non-empty masks plus one empty mask; the empty mask must map to
    # the all-zero box.
    masks = torch.tensor([
        [
            [False, False, False, True],
            [False, False, True, True],
            [False, True, True, False],
            [False, True, True, False],
        ],
        [
            [False, False, False, False],
            [False, False, True, False],
            [False, True, True, False],
            [False, True, True, False],
        ],
        torch.zeros(4, 4),
    ])
    bitmask = BitMasks(masks)
    box_true = torch.tensor(
        [[1, 0, 4, 4], [1, 1, 3, 4], [0, 0, 0, 0]], dtype=torch.float32
    )
    box = bitmask.get_bounding_boxes()
    self.assertTrue(torch.all(box.tensor == box_true).item())

    for box in box_true:
        # Expand the XYXY box into its 4 corner points, rasterize the
        # resulting rectangle, and check that both mask representations
        # recover the original box.
        poly = box[[0, 1, 2, 1, 2, 3, 0, 3]].numpy()
        mask = polygons_to_bitmask([poly], 4, 4)

        reconstruct_box = BitMasks(mask[None, :, :]).get_bounding_boxes()[0].tensor
        self.assertTrue(torch.all(box == reconstruct_box).item())

        reconstruct_box = PolygonMasks([[poly]]).get_bounding_boxes()[0].tensor
        self.assertTrue(torch.all(box == reconstruct_box).item())
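# A minimal self-contained sketch (assuming the detectron2 APIs used above) of
# the polygon/box round trip the test relies on: an XYXY box is expanded into
# its four corners, rasterized with polygons_to_bitmask, and the mask's
# bounding box recovers the original coordinates.
import torch
from detectron2.structures import BitMasks, polygons_to_bitmask

box = torch.tensor([1.0, 0.0, 4.0, 4.0])
# The index pattern [x0, y0, x1, y0, x1, y1, x0, y1] lists the corners
# starting at the top-left (image y axis pointing down).
poly = box[[0, 1, 2, 1, 2, 3, 0, 3]].numpy()
mask = polygons_to_bitmask([poly], 4, 4)
recovered = BitMasks(torch.from_numpy(mask[None, :, :])).get_bounding_boxes().tensor[0]
assert torch.equal(recovered, box)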
def add_bitmasks(self, instances, im_h, im_w):
    for per_im_gt_inst in instances:
        if not per_im_gt_inst.has("gt_masks"):
            continue
        start = int(self.mask_out_stride // 2)
        if isinstance(per_im_gt_inst.get("gt_masks"), PolygonMasks):
            polygons = per_im_gt_inst.get("gt_masks").polygons
            per_im_bitmasks = []
            per_im_bitmasks_full = []
            for per_polygons in polygons:
                bitmask = polygons_to_bitmask(per_polygons, im_h, im_w)
                bitmask = torch.from_numpy(bitmask).to(self.device).float()
                bitmask_full = bitmask.clone()
                # Sample the full-resolution mask at the centers of the
                # stride-sized output cells.
                bitmask = bitmask[start::self.mask_out_stride,
                                  start::self.mask_out_stride]
                assert bitmask.size(0) * self.mask_out_stride == im_h
                assert bitmask.size(1) * self.mask_out_stride == im_w
                per_im_bitmasks.append(bitmask)
                per_im_bitmasks_full.append(bitmask_full)
            per_im_gt_inst.gt_bitmasks = torch.stack(per_im_bitmasks, dim=0)
            per_im_gt_inst.gt_bitmasks_full = torch.stack(per_im_bitmasks_full, dim=0)
        else:  # RLE-format bitmasks
            bitmasks = per_im_gt_inst.get("gt_masks").tensor
            h, w = bitmasks.size()[1:]
            # Pad to the padded image size before subsampling.
            bitmasks_full = F.pad(bitmasks, (0, im_w - w, 0, im_h - h), "constant", 0)
            bitmasks = bitmasks_full[:, start::self.mask_out_stride,
                                     start::self.mask_out_stride]
            per_im_gt_inst.gt_bitmasks = bitmasks
            per_im_gt_inst.gt_bitmasks_full = bitmasks_full
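# A hedged sketch of the subsampling above in isolation: the full-resolution
# mask is read at the pixel centers of a stride-s output grid (start = s // 2),
# so gt_bitmasks lines up with a prediction map that is s times smaller.
# `mask_out_stride` here stands in for self.mask_out_stride; the asserts in
# add_bitmasks require im_h and im_w to be divisible by the stride.
import torch

mask_out_stride = 4
im_h, im_w = 32, 48
full = torch.rand(im_h, im_w) > 0.5  # dummy full-resolution bitmask
start = mask_out_stride // 2
sub = full[start::mask_out_stride, start::mask_out_stride]
assert sub.shape == (im_h // mask_out_stride, im_w // mask_out_stride)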
def process_annotation(self, ann, mask_side_len=28):
    # Parse annotation data
    img_info = self.coco.loadImgs(ids=[ann["image_id"]])[0]
    height, width = img_info["height"], img_info["width"]
    gt_polygons = [np.array(p, dtype=np.float64) for p in ann["segmentation"]]
    gt_bbox = BoxMode.convert(ann["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
    gt_bit_mask = polygons_to_bitmask(gt_polygons, height, width)

    # Rasterize the GT into box-aligned mask_side_len x mask_side_len masks
    # with three different methods.
    torch_gt_bbox = torch.tensor(gt_bbox).to(dtype=torch.float32).reshape(-1, 4)
    box_bitmasks = {
        "polygon": PolygonMasks([gt_polygons]).crop_and_resize(
            torch_gt_bbox, mask_side_len
        )[0],
        "gridsample": rasterize_polygons_with_grid_sample(
            gt_bit_mask, gt_bbox, mask_side_len
        ),
        "roialign": BitMasks(torch.from_numpy(gt_bit_mask[None, :, :])).crop_and_resize(
            torch_gt_bbox, mask_side_len
        )[0],
    }

    # Paste each box mask back into the full image with both paste
    # implementations.
    results = defaultdict(dict)
    for k, box_bitmask in box_bitmasks.items():
        padded_bitmask, scale = pad_masks(box_bitmask[None, :, :], 1)
        scaled_boxes = scale_boxes(torch_gt_bbox, scale)

        r = results[k]
        r["old"] = paste_mask_in_image_old(
            padded_bitmask[0], scaled_boxes[0], height, width, threshold=0.5
        )
        r["aligned"] = paste_masks_in_image(
            box_bitmask[None, :, :], Boxes(torch_gt_bbox), (height, width)
        )[0]

    # Score every rasterize/paste combination by IoU against the GT mask.
    table = []
    for rasterize_method, r in results.items():
        for paste_method, mask in r.items():
            mask = np.asarray(mask)
            iou = iou_between_full_image_bit_masks(gt_bit_mask.astype("uint8"), mask)
            table.append((rasterize_method, paste_method, iou))
    return table
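# iou_between_full_image_bit_masks is not shown in this excerpt; a
# straightforward implementation consistent with its name and its uint8 inputs
# (an assumption, not necessarily the original helper) would be:
import numpy as np

def iou_between_full_image_bit_masks(a, b):
    # Intersection-over-union of two full-image binary masks.
    intersection = np.logical_and(a, b).sum()
    union = np.logical_or(a, b).sum()
    return float(intersection) / max(float(union), 1.0)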
def get_single_image_pcd(plane_params, segmentations, height=480, width=640):
    plane_params = np.array(plane_params)
    # Guard against zero-norm planes when normalizing.
    offsets = np.maximum(np.linalg.norm(plane_params, ord=2, axis=1), 1e-5)
    norms = plane_params / offsets.reshape(-1, 1)
    if isinstance(segmentations[0], dict):
        poly_segmentations = rle2polygon(segmentations)
    else:
        poly_segmentations = segmentations
    verts_list = []
    for segm, normal, offset in zip(poly_segmentations, norms, offsets):
        if len(segm) == 0:
            # Keep list lengths aligned with the inputs: emit a single dummy
            # vertex for empty segmentations.
            verts_list.append(torch.tensor([[0, 0, 0]], dtype=torch.float32))
            continue
        bitmask = polygons_to_bitmask(segm, height=height, width=width)
        # Foreground pixel coordinates, flipped from (row, col) to (x, y).
        verts = np.transpose(bitmask.nonzero())
        verts_3d = get_pcd(verts[:, ::-1], normal, offset)
        verts_list.append(torch.tensor(verts_3d, dtype=torch.float32))
    return verts_list
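# Hypothetical usage sketch (values are illustrative): one plane with unit
# normal (0, 0, 1) at offset 1, masked by a square polygon. This assumes the
# project-local get_pcd helper is in scope, as in the function above.
plane_params = [[0.0, 0.0, 1.0]]
segmentations = [[[100.0, 100.0, 200.0, 100.0, 200.0, 200.0, 100.0, 200.0]]]
pcds = get_single_image_pcd(plane_params, segmentations)
print(pcds[0].shape)  # expected: (num_foreground_pixels, 3)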
def add_bitmasks(self, instances, im_h, im_w):
    for per_im_gt_inst in instances:
        if not per_im_gt_inst.has("gt_masks"):
            continue
        polygons = per_im_gt_inst.get("gt_masks").polygons
        per_im_bitmasks = []
        per_im_bitmasks_full = []
        for per_polygons in polygons:
            bitmask = polygons_to_bitmask(per_polygons, im_h, im_w)
            bitmask = torch.from_numpy(bitmask).to(self.device).float()
            start = int(self.mask_out_stride // 2)
            bitmask_full = bitmask.clone()
            bitmask = bitmask[start::self.mask_out_stride,
                              start::self.mask_out_stride]
            assert bitmask.size(0) * self.mask_out_stride == im_h
            assert bitmask.size(1) * self.mask_out_stride == im_w
            per_im_bitmasks.append(bitmask)
            per_im_bitmasks_full.append(bitmask_full)
        per_im_gt_inst.gt_bitmasks = torch.stack(per_im_bitmasks, dim=0)
        per_im_gt_inst.gt_bitmasks_full = torch.stack(per_im_bitmasks_full, dim=0)
def annotations_to_instances(annos, image_size, mask_format="polygon"):
    """
    Create an :class:`Instances` object used by the models,
    from instance annotations in the dataset dict.

    Args:
        annos (list[dict]): a list of instance annotations in one image, each
            element for one instance.
        image_size (tuple): height, width
        mask_format (str): "polygon" or "bitmask", the format of the returned
            "gt_masks".

    Returns:
        Instances:
            It will contain fields "gt_boxes", "gt_classes",
            "gt_masks", "gt_keypoints", if they can be obtained from `annos`.
            This is the format that builtin models expect.
    """
    boxes = [
        BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS)
        for obj in annos
    ]
    target = Instances(image_size)
    boxes = target.gt_boxes = Boxes(boxes)
    boxes.clip(image_size)

    classes = [obj["category_id"] for obj in annos]
    classes = torch.tensor(classes, dtype=torch.int64)
    target.gt_classes = classes

    if len(annos) and "segmentation" in annos[0]:
        segms = [obj["segmentation"] for obj in annos]
        if mask_format == "polygon":
            masks = PolygonMasks(segms)
        else:
            assert mask_format == "bitmask", mask_format
            masks = []
            for segm in segms:
                if isinstance(segm, list):
                    # polygon
                    masks.append(polygons_to_bitmask(segm, *image_size))
                elif isinstance(segm, dict):
                    # COCO RLE
                    masks.append(mask_util.decode(segm))
                elif isinstance(segm, np.ndarray):
                    assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
                        segm.ndim
                    )
                    # mask array
                    masks.append(segm)
                else:
                    raise ValueError(
                        "Cannot convert segmentation of type '{}' to BitMasks! "
                        "Supported types are: polygons as list[list[float] or ndarray],"
                        " COCO-style RLE as a dict, or a full-image segmentation mask "
                        "as a 2D ndarray.".format(type(segm))
                    )
            # torch.from_numpy does not support array with negative stride.
            masks = BitMasks(
                torch.stack([torch.from_numpy(np.ascontiguousarray(x)) for x in masks])
            )
        target.gt_masks = masks

    if len(annos) and "keypoints" in annos[0]:
        kpts = [obj.get("keypoints", []) for obj in annos]
        target.gt_keypoints = Keypoints(kpts)

    return target
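# A minimal hedged usage sketch with one hand-written annotation (bbox and
# polygon values are illustrative):
from detectron2.structures import BoxMode

annos = [{
    "bbox": [10.0, 10.0, 20.0, 30.0],  # XYWH: x, y, width, height
    "bbox_mode": BoxMode.XYWH_ABS,
    "category_id": 0,
    "segmentation": [[10.0, 10.0, 30.0, 10.0, 30.0, 40.0, 10.0, 40.0]],
}]
instances = annotations_to_instances(annos, image_size=(480, 640))
print(instances.gt_boxes)    # Boxes([[10., 10., 30., 40.]])
print(instances.gt_classes)  # tensor([0])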
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.

            For now, each item in the list is a dict that contains:

            * "image": Tensor, image in (C, H, W) format.
            * "instances": Instances
            * "sem_seg": semantic segmentation ground truth.
            * Other information that's included in the original dicts, such as:
              "height", "width" (int): the output resolution of the model, used
              in inference. See :meth:`postprocess` for details.

    Returns:
        list[dict]:
            each dict is the results for one image. The dict contains the
            following keys:

            * "instances": see :meth:`GeneralizedRCNN.forward` for its format.
            * "sem_seg": see :meth:`SemanticSegmentor.forward` for its format.
            * "panoptic_seg": available when `PANOPTIC_FPN.COMBINE.ENABLED`.
              See the return value of
              :func:`combine_semantic_and_instance_outputs` for its format.
    """
    images = [x["image"].to(self.device) for x in batched_inputs]
    images = [(x - self.pixel_mean) / self.pixel_std for x in images]
    images = ImageList.from_tensors(images, SIZE_DIVISIBILITY)

    score_sem, score_inst, score_conf = self.seg_model(images.tensor)
    h, w = images.tensor.size(2), images.tensor.size(3)
    # F.upsample is deprecated; F.interpolate is the equivalent call.
    score_inst = F.interpolate(score_inst, size=(h, w), mode="bilinear",
                               align_corners=False)
    score_sem = F.interpolate(score_sem, size=(h, w), mode="bilinear",
                              align_corners=False)

    score_conf_softmax = self.softmax_layer(score_conf)
    score_inst_sig = self.sigmoid_layer(score_inst)
    # The first BACKGROUND_NUM channels predict stuff, the rest things.
    score_inst_sig_stuff = score_inst_sig[:, :BACKGROUND_NUM]
    score_inst_sig_thing = score_inst_sig[:, BACKGROUND_NUM:]

    if "sem_seg" in batched_inputs[0]:
        gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs]
        gt_sem_seg = ImageList.from_tensors(
            gt_sem_seg, SIZE_DIVISIBILITY, IGNORE_LABEL_SEM
        ).tensor
    else:
        gt_sem_seg = None
    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    if self.training:
        # Semantic labels are 1-based here; shift to 0-based for the criterion.
        assert (gt_sem_seg - 1 < 0).sum() == 0
        sem_seg_losses = self.criterion_sem(score_sem, gt_sem_seg - 1)

        # One-hot stuff targets, dropping the "not stuff" channel.
        gt_sem_seg[gt_sem_seg > BACKGROUND_NUM] = 0
        gt_stuff = F.one_hot(
            gt_sem_seg, num_classes=BACKGROUND_NUM + 1
        ).permute(0, 3, 1, 2)
        gt_stuff = gt_stuff[:, 1:]

        # Normalize per-image losses by the (distributed) mean number of
        # ground-truth instances.
        num_inst = sum(len(gt_instances[i]) for i in range(len(gt_instances)))
        num_inst = torch.as_tensor([num_inst], dtype=torch.float, device=self.device)
        if is_dist_avail_and_initialized():
            torch.distributed.all_reduce(num_inst)
        num_inst = torch.clamp(num_inst / get_world_size(), min=1).item()

        loss_stuff_dice = 0.
        loss_thing_dice = 0.
        loss_stuff_focal = 0.
        loss_conf = 0.
        for i in range(len(batched_inputs)):
            gt_inst = gt_instances[i]
            gt_classes = gt_inst.gt_classes
            if gt_inst.has("gt_masks"):
                gt_masks = gt_inst.gt_masks
                # Rasterize the GT polygons, then pad them to the padded
                # batch resolution.
                masks = torch.stack(
                    [
                        torch.from_numpy(
                            polygons_to_bitmask(
                                poly, gt_inst.image_size[0], gt_inst.image_size[1]
                            )
                        ).to(self.device)
                        for poly in gt_masks.polygons
                    ],
                    0,
                )
                masks_pad = masks.new_full(
                    (masks.shape[0], images.tensor.shape[-2], images.tensor.shape[-1]),
                    False,
                )
                masks_pad[:, :masks.shape[-2], :masks.shape[-1]].copy_(masks)
            else:
                masks_pad = torch.zeros(
                    [0, images.tensor.shape[-2], images.tensor.shape[-1]],
                    dtype=torch.bool,
                    device=self.device,
                )

            # Match predicted thing channels to GT masks; col_ind_empty holds
            # the unmatched prediction slots.
            row_ind, col_ind = MatchDice(
                score_inst_sig_thing[i:i + 1],
                torch.unsqueeze(masks_pad, 0),
                score_conf_softmax[i:i + 1],
                gt_classes,
            )
            col_ind_empty = np.setdiff1d(
                np.arange(score_inst_sig_thing[i:i + 1].shape[1]), col_ind
            )

            score_inst_sig_perm = torch.cat(
                (score_inst_sig_stuff[i], score_inst_sig_thing[i, col_ind, :, :]), 0
            )
            target_inst_perm = torch.cat(
                (gt_stuff[i].float(), masks_pad[row_ind].float()), 0
            )
            loss_stuff_dice_tmp, loss_thing_dice_tmp = dice_loss(
                score_inst_sig_perm,
                target_inst_perm,
                num_inst,
                background_channels=BACKGROUND_NUM,
                valid_mask=None,
                sigmoid_clip=True,
            )
            loss_stuff_dice += loss_stuff_dice_tmp
            loss_thing_dice += loss_thing_dice_tmp

            # Unmatched slots get the "no object" label (FOREGROUND_NUM).
            target_conf = gt_classes.new_full((score_conf.shape[1],), FOREGROUND_NUM)
            target_conf[:len(gt_classes[row_ind])] = gt_classes[row_ind]
            loss_conf_tmp = conf_loss(
                torch.cat((score_conf[i, col_ind], score_conf[i, col_ind_empty]), 0),
                target_conf.long(),
                neg_factor=10,
                neg_idx=FOREGROUND_NUM,
            )
            loss_conf += loss_conf_tmp

            loss_stuff_focal_tmp = focal_loss(
                score_inst_sig_stuff[i],
                gt_stuff[i].float(),
                valid_mask=None,
                sigmoid_clip=True,
            )
            loss_stuff_focal += loss_stuff_focal_tmp

        loss_stuff_focal = loss_stuff_focal / len(batched_inputs)
        loss_stuff_dice = loss_stuff_dice / len(batched_inputs)
        loss_conf = loss_conf / len(batched_inputs)
        loss_stuff_focal = loss_stuff_focal * 100.
        loss_conf = loss_conf * 5

        losses = {}
        losses.update({"loss_sem_seg": sem_seg_losses})
        losses.update({"loss_stuff_focal": loss_stuff_focal})
        losses.update({"loss_stuff_dice": loss_stuff_dice})
        losses.update({"loss_thing_dice": loss_thing_dice})
        losses.update({"loss_conf": loss_conf})
        return losses

    score_sem_null = score_sem.new_full(
        (score_sem.shape[0], 1, score_sem.shape[-2], score_sem.shape[-1]), -1000.
    )
    processed_results = []
    for i in range(len(batched_inputs)):
        height = batched_inputs[i].get("height", images.image_sizes[i][0])
        width = batched_inputs[i].get("width", images.image_sizes[i][1])
        # Crop away the batch padding, then resize to the output resolution.
        score_inst_sig_stuff_b = F.interpolate(
            score_inst_sig_stuff[
                i:i + 1, :, :images.image_sizes[i][0], :images.image_sizes[i][1]
            ],
            size=(height, width),
            mode="bilinear",
            align_corners=False,
        )
        score_inst_sig_thing_b = F.interpolate(
            score_inst_sig_thing[
                i:i + 1, :, :images.image_sizes[i][0], :images.image_sizes[i][1]
            ],
            size=(height, width),
            mode="bilinear",
            align_corners=False,
        )

        # Debug visualization: dump the raw and thresholded sigmoid map of
        # each thing channel to disk.
        img_name = os.path.basename(batched_inputs[i]["file_name"])
        img_name_split = img_name.split(".")
        save_dir = "/home/yz9244/detectron2/output/vis_inst_sig"
        for j in range(80):
            pred_inst_tmp = np.asarray(
                255 * (score_inst_sig_thing_b[0, j].cpu().numpy()), dtype=np.uint8
            )
            img = Image.fromarray(pred_inst_tmp)
            save_img = Image.new("RGB", (img.width, 2 * img.height))
            save_img.paste(img, (0, 0))
            pred_inst_tmp = np.asarray(255 * (pred_inst_tmp > 127), dtype=np.uint8)
            img = Image.fromarray(pred_inst_tmp)
            save_img.paste(img, (0, img.height))
            save_img.save(
                os.path.join(save_dir, img_name_split[0] + "_%02d.png" % (j))
            )

        res = {}
        # Collapse the thing logits into a single "foreground" channel via
        # log-sum-exp, so the semantic output has 1 + BACKGROUND_NUM channels.
        score_sem_foreground = torch.log(
            torch.exp(score_sem[i:i + 1, BACKGROUND_NUM:]).sum(dim=1, keepdim=True)
        )
        sem_seg_result = torch.cat(
            (score_sem_foreground, score_sem[i:i + 1, :BACKGROUND_NUM]), 1
        )
        sem_seg_r = sem_seg_postprocess(
            sem_seg_result[0], images.image_sizes[i], height, width
        )
        res.update({"sem_seg": sem_seg_r})

        result = Instances((height, width))
        inst_sem_id = torch.argmax(score_conf_softmax[i], dim=1)
        scores = score_conf_softmax[i, range(score_conf.shape[1]), inst_sem_id]
        # Keep only slots not classified as "no object" and with a nonempty
        # thresholded mask.
        scores = scores[inst_sem_id != FOREGROUND_NUM]
        pred_classes = inst_sem_id[inst_sem_id != FOREGROUND_NUM]
        pred_masks = score_inst_sig_thing_b[0, inst_sem_id != FOREGROUND_NUM]
        pred_mask_sum = torch.sum(pred_masks > 0.5, (1, 2))
        result.pred_masks = pred_masks[pred_mask_sum > 0] > 0.5
        result.pred_classes = pred_classes[pred_mask_sum > 0]
        result.scores = scores[pred_mask_sum > 0]

        # Derive XYXY boxes from the extent of each binary mask.
        box_tmp = torch.zeros(result.pred_masks.shape[0], 4)
        for j in range(result.pred_masks.shape[0]):
            nonzero_idx = torch.nonzero(result.pred_masks[j])
            box_tmp[j, 0] = nonzero_idx[:, 1].min().item()
            box_tmp[j, 2] = nonzero_idx[:, 1].max().item()
            box_tmp[j, 1] = nonzero_idx[:, 0].min().item()
            box_tmp[j, 3] = nonzero_idx[:, 0].max().item()
        result.pred_boxes = Boxes(box_tmp)

        # detector_r = detector_postprocess(result, height, width)
        detector_r = result
        res.update({"instances": detector_r})

        panoptic_r = combine_semantic_and_instance_outputs(
            result.scores,
            result.pred_classes,
            pred_masks[pred_mask_sum > 0],
            score_inst_sig_stuff_b[0],
        )
        res.update({"panoptic_seg": panoptic_r})
        processed_results.append(res)
    return processed_results
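# The box-from-mask loop above takes, per mask, the min/max of the nonzero
# coordinates. For reference, recent torchvision versions ship the same
# extraction as torchvision.ops.masks_to_boxes; a hedged equivalence sketch
# (assuming torchvision >= 0.11):
import torch
from torchvision.ops import masks_to_boxes

masks = torch.zeros(1, 8, 8, dtype=torch.bool)
masks[0, 2:5, 3:7] = True
print(masks_to_boxes(masks))  # tensor([[3., 2., 6., 4.]])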
def get_single_image_mesh_depth(
    depth, segmentations, img_file, height=480, width=640, webvis=True
):
    if isinstance(segmentations[0], dict):
        poly_segmentations = rle2polygon(segmentations)
    else:
        poly_segmentations = segmentations
    verts_list = []
    faces_list = []
    verts_uvs = []
    img_files = []
    imgs = []
    for segm in poly_segmentations:
        if len(segm) == 0:
            continue
        verts_3d = []
        faces = []
        uvs = []
        bitmask = polygons_to_bitmask(segm, height=height, width=width)
        verts = np.transpose(bitmask.nonzero())
        # Map (row, col) pixel coordinates to vertex indices.
        vert_id_map = defaultdict(dict)
        for idx, vert in enumerate(verts):
            vert_id_map[vert[0]][vert[1]] = idx + len(verts_3d)
        pcd = get_pcd_depth(verts[:, ::-1], depth.T)
        if webvis:
            # Rotate by 11 degrees around the x axis to push things onto
            # the ground.
            pcd = (
                np.array([[-1, 0, 0], [0, 1, 0], [0, 0, -1]])
                @ np.array(
                    [[1, 0, 0], [0, 0.9816272, -0.1908090], [0, 0.1908090, 0.9816272]]
                )
                @ np.array([[-1, 0, 0], [0, -1, 0], [0, 0, 1]])
                @ pcd.T
            ).T
        # Two triangles per fully-foreground 2x2 pixel quad.
        triangles = []
        for vert in verts:
            # upper-right triangle
            if (
                vert[0] < height - 1
                and vert[1] < width - 1
                and bitmask[vert[0]][vert[1] + 1]
                and bitmask[vert[0] + 1][vert[1] + 1]
            ):
                triangles.append(
                    [
                        vert_id_map[vert[0]][vert[1]],
                        vert_id_map[vert[0] + 1][vert[1] + 1],
                        vert_id_map[vert[0]][vert[1] + 1],
                    ]
                )
            # bottom-left triangle
            if (
                vert[0] < height - 1
                and vert[1] < width - 1
                and bitmask[vert[0] + 1][vert[1]]
                and bitmask[vert[0] + 1][vert[1] + 1]
            ):
                triangles.append(
                    [
                        vert_id_map[vert[0]][vert[1]],
                        vert_id_map[vert[0] + 1][vert[1]],
                        vert_id_map[vert[0] + 1][vert[1] + 1],
                    ]
                )
        triangles = np.array(triangles)
        verts_3d.extend(pcd)
        faces.extend(triangles)
        # UVs flip the y axis: image rows grow downward, texture v grows up.
        uvs.extend(
            np.array([0, 1])
            + np.array([1, -1]) * verts[:, ::-1] / np.array([width, height])
        )
        verts_list.append(torch.tensor(verts_3d, dtype=torch.float32))
        faces_list.append(torch.tensor(faces, dtype=torch.int32))
        verts_uvs.append(torch.tensor(uvs, dtype=torch.float32))
        img_files.append(img_file)
        imgs.append(torch.FloatTensor(imageio.imread(img_file)))
    verts_uvs = pad_sequence(verts_uvs, batch_first=True)
    faces_uvs = pad_sequence(faces_list, batch_first=True, padding_value=-1)
    tex = Textures(verts_uvs=verts_uvs, faces_uvs=faces_uvs, maps=imgs)
    # Initialise the mesh with textures
    meshes = Meshes(verts=verts_list, faces=faces_list, textures=tex)
    return meshes, img_files
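# The webvis transform above composes a coordinate flip, an 11-degree rotation
# about the x axis (cos 11° ≈ 0.9816272, sin 11° ≈ 0.1908090), and another
# flip. A quick sanity check that the product is a proper rotation:
import numpy as np

R = (
    np.array([[-1, 0, 0], [0, 1, 0], [0, 0, -1]])
    @ np.array([[1, 0, 0], [0, 0.9816272, -0.1908090], [0, 0.1908090, 0.9816272]])
    @ np.array([[-1, 0, 0], [0, -1, 0], [0, 0, 1]])
)
assert np.allclose(R @ R.T, np.eye(3), atol=1e-6)   # orthonormal
assert np.isclose(np.linalg.det(R), 1.0, atol=1e-6)  # determinant +1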
def get_single_image_mesh(
    plane_params,
    segmentations,
    img_file,
    height=480,
    width=640,
    focal_length=517.97,
    webvis=False,
    reduce_size=True,
):
    plane_params = np.array(plane_params)
    offsets = np.linalg.norm(plane_params, ord=2, axis=1)
    norms = plane_params / offsets.reshape(-1, 1)
    if isinstance(segmentations[0], dict):
        poly_segmentations = rle2polygon(segmentations)
    else:
        poly_segmentations = segmentations
    verts_list = []
    faces_list = []
    verts_uvs = []
    img_files = []
    imgs = []
    for segm, normal, offset in zip(poly_segmentations, norms, offsets):
        if len(segm) == 0:
            continue
        verts_3d = []
        faces = []
        uvs = []
        if reduce_size:
            # Lightweight mesh: triangulate each polygon ring directly.
            for ring in segm:
                verts = np.array(ring).reshape(-1, 2)
                # get 3d pointcloud
                pcd = get_pcd(verts, normal, offset, focal_length)
                if webvis:
                    # Rotate by 11 degrees around the x axis to push things
                    # onto the ground.
                    pcd = (
                        np.array([[-1, 0, 0], [0, 1, 0], [0, 0, -1]])
                        @ np.array(
                            [
                                [1, 0, 0],
                                [0, 0.9816272, -0.1908090],
                                [0, 0.1908090, 0.9816272],
                            ]
                        )
                        @ np.array([[-1, 0, 0], [0, -1, 0], [0, 0, 1]])
                        @ pcd.T
                    ).T
                # Triangulate the polygon using the earcut algorithm.
                triangles = earcut.triangulate_float32(verts, [len(verts)])
                # Offset by the base index of this ring's vertices.
                triangles += len(verts_3d)
                triangles = triangles.reshape(-1, 3)
                # Convert the winding order to counter-clockwise.
                triangles[:, [0, 2]] = triangles[:, [2, 0]]
                verts_3d.extend(pcd)
                faces.extend(triangles)
                uvs.extend(
                    np.array([0, 1])
                    + np.array([1, -1]) * verts / np.array([width, height])
                )
        else:
            # Dense mesh: one vertex per foreground pixel, two triangles per
            # fully-foreground 2x2 quad (same scheme as
            # get_single_image_mesh_depth above).
            bitmask = polygons_to_bitmask(segm, height=height, width=width)
            verts = np.transpose(bitmask.nonzero())
            vert_id_map = defaultdict(dict)
            for idx, vert in enumerate(verts):
                vert_id_map[vert[0]][vert[1]] = idx + len(verts_3d)
            verts_3d = get_pcd(verts[:, ::-1], normal, offset)
            if webvis:
                # Rotate by 11 degrees around the x axis to push things onto
                # the ground.
                verts_3d = (
                    np.array([[-1, 0, 0], [0, 1, 0], [0, 0, -1]])
                    @ np.array(
                        [
                            [1, 0, 0],
                            [0, 0.9816272, -0.1908090],
                            [0, 0.1908090, 0.9816272],
                        ]
                    )
                    @ np.array([[-1, 0, 0], [0, -1, 0], [0, 0, 1]])
                    @ verts_3d.T
                ).T
            triangles = []
            for vert in verts:
                # upper-right triangle
                if (
                    vert[0] < height - 1
                    and vert[1] < width - 1
                    and bitmask[vert[0]][vert[1] + 1]
                    and bitmask[vert[0] + 1][vert[1] + 1]
                ):
                    triangles.append(
                        [
                            vert_id_map[vert[0]][vert[1]],
                            vert_id_map[vert[0] + 1][vert[1] + 1],
                            vert_id_map[vert[0]][vert[1] + 1],
                        ]
                    )
                # bottom-left triangle
                if (
                    vert[0] < height - 1
                    and vert[1] < width - 1
                    and bitmask[vert[0] + 1][vert[1]]
                    and bitmask[vert[0] + 1][vert[1] + 1]
                ):
                    triangles.append(
                        [
                            vert_id_map[vert[0]][vert[1]],
                            vert_id_map[vert[0] + 1][vert[1]],
                            vert_id_map[vert[0] + 1][vert[1] + 1],
                        ]
                    )
            triangles = np.array(triangles)
            faces.extend(triangles)
            uvs.extend(
                np.array([0, 1])
                + np.array([1, -1]) * verts[:, ::-1] / np.array([width, height])
            )
        verts_list.append(torch.tensor(verts_3d, dtype=torch.float32))
        faces_list.append(torch.tensor(faces, dtype=torch.int32))
        verts_uvs.append(torch.tensor(uvs, dtype=torch.float32))
        img_files.append(img_file)
        imgs.append(torch.FloatTensor(imageio.imread(img_file)))
    verts_uvs = pad_sequence(verts_uvs, batch_first=True)
    faces_uvs = pad_sequence(faces_list, batch_first=True, padding_value=-1)
    tex = Textures(verts_uvs=verts_uvs, faces_uvs=faces_uvs, maps=imgs)
    # Initialise the mesh with textures
    meshes = Meshes(verts=verts_list, faces=faces_list, textures=tex)
    return meshes, img_files
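# A hedged sketch of the earcut step above on a unit square, assuming `earcut`
# is the mapbox_earcut package (whose triangulate_float32 takes an (N, 2)
# vertex array plus the end index of each polygon ring):
import numpy as np
from mapbox_earcut import triangulate_float32

verts = np.array([[0, 0], [1, 0], [1, 1], [0, 1]], dtype=np.float32)
tris = triangulate_float32(verts, [len(verts)]).reshape(-1, 3)
print(tris)  # two triangles covering the square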