def evaluate_box_proposal(predictions, dic, limit=100, threshold=0.5, aspect_ratio_range=(0, 1 / 3)):
    gt_overlaps = []
    num_pos = 0

    anno = dic["annotations"]
    new_dic = []
    gt_boxes = [
        BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
        for obj in anno
    ]
    gt_aspect_ratios = [
        ratio_of_polygon(obj["segmentation"]) if not obj["iscrowd"] else ratio_of_bbox(obj["bbox"])
        for obj in anno
    ]
    gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4)  # guard against no boxes
    gt_boxes = Boxes(gt_boxes)
    gt_aspect_ratios = torch.as_tensor(gt_aspect_ratios)
    if len(gt_boxes) == 0:
        return None

    predict_boxes = [
        BoxMode.convert(prediction['bbox'], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
        for prediction in predictions
    ]
    predict_boxes = torch.as_tensor(predict_boxes).reshape(-1, 4)
    predict_boxes = Boxes(predict_boxes)

    valid_gt_inds = (gt_aspect_ratios >= aspect_ratio_range[0]) & \
        (gt_aspect_ratios <= aspect_ratio_range[1])
    gt_boxes = gt_boxes[valid_gt_inds]
    if len(gt_boxes) == 0 or len(predictions) == 0:
        return None

    num_pos += len(gt_boxes)
    if limit is not None and len(predictions) > limit:
        predict_boxes = predict_boxes[:limit]

    overlaps = pairwise_iou(predict_boxes, gt_boxes)

    selected_gt = [anno[i] for i, bl in enumerate(valid_gt_inds) if bl]
    selected_pred = []
    _gt_overlaps = torch.zeros(len(gt_boxes))
    pred_classes = []
    for j in range(min(len(predictions), len(gt_boxes))):
        # find which proposal box maximally covers each gt box
        # and get the iou amount of coverage for each gt box
        max_overlaps, argmax_overlaps = overlaps.max(dim=0)
        # find which gt box is 'best' covered (i.e. 'best' = most iou)
        gt_ovr, gt_ind = max_overlaps.max(dim=0)
        assert gt_ovr >= 0
        # find the proposal box that covers the best covered gt box
        box_ind = argmax_overlaps[gt_ind]
        # record the iou coverage of this gt box
        _gt_overlaps[j] = overlaps[box_ind, gt_ind]
        overlaped_box_ind = overlaps[:, gt_ind] > threshold
        if overlaped_box_ind.sum() > 0:
            pred_classes += [
                predictions[i]["category_id"] for i, bl in enumerate(overlaped_box_ind) if bl
            ]
            selected_pred += [
                predictions[i] for i, bl in enumerate(overlaped_box_ind) if bl
            ]
        assert _gt_overlaps[j] == gt_ovr
        # mark the proposal box and the gt box as used
        overlaps[box_ind, :] = -1
        overlaps[:, gt_ind] = -1

    # append recorded iou coverage level
    gt_overlaps = _gt_overlaps
    gt_overlaps, _ = torch.sort(gt_overlaps)

    dic["annotations"] = selected_gt
    return selected_pred, dic
def _convert_xywh_to_xywha(self, x): return BoxMode.convert(x, BoxMode.XYWH_ABS, BoxMode.XYWHA_ABS)
def test_vis(): dset_name = sys.argv[1] assert dset_name in DatasetCatalog.list() meta = MetadataCatalog.get(dset_name) dprint("MetadataCatalog: ", meta) objs = meta.objs t_start = time.perf_counter() dicts = DatasetCatalog.get(dset_name) logger.info("Done loading {} samples with {:.3f}s.".format(len(dicts), time.perf_counter() - t_start)) dirname = "output/{}-data-vis".format(dset_name) os.makedirs(dirname, exist_ok=True) for d in dicts: img = read_image_cv2(d["file_name"], format="BGR") depth = mmcv.imread(d["depth_file"], "unchanged") / 10000.0 imH, imW = img.shape[:2] annos = d["annotations"] masks = [cocosegm2mask(anno["segmentation"], imH, imW) for anno in annos] bboxes = [anno["bbox"] for anno in annos] bbox_modes = [anno["bbox_mode"] for anno in annos] bboxes_xyxy = np.array( [BoxMode.convert(box, box_mode, BoxMode.XYXY_ABS) for box, box_mode in zip(bboxes, bbox_modes)] ) kpts_3d_list = [anno["bbox3d_and_center"] for anno in annos] quats = [anno["quat"] for anno in annos] transes = [anno["trans"] for anno in annos] Rs = [quat2mat(quat) for quat in quats] # 0-based label cat_ids = [anno["category_id"] for anno in annos] K = d["cam"] kpts_2d = [misc.project_pts(kpt3d, K, R, t) for kpt3d, R, t in zip(kpts_3d_list, Rs, transes)] labels = [objs[cat_id] for cat_id in cat_ids] for _i in range(len(annos)): img_vis = vis_image_mask_bbox_cv2( img, masks[_i : _i + 1], bboxes=bboxes_xyxy[_i : _i + 1], labels=labels[_i : _i + 1] ) img_vis_kpts2d = misc.draw_projected_box3d(img_vis.copy(), kpts_2d[_i]) xyz_path = annos[_i]["xyz_path"] xyz_info = mmcv.load(xyz_path) x1, y1, x2, y2 = xyz_info["xyxy"] xyz_crop = xyz_info["xyz_crop"].astype(np.float32) xyz = np.zeros((imH, imW, 3), dtype=np.float32) xyz[y1 : y2 + 1, x1 : x2 + 1, :] = xyz_crop xyz_show = get_emb_show(xyz) xyz_crop_show = get_emb_show(xyz_crop) img_xyz = img.copy() / 255.0 mask_xyz = ((xyz[:, :, 0] != 0) | (xyz[:, :, 1] != 0) | (xyz[:, :, 2] != 0)).astype("uint8") fg_idx = np.where(mask_xyz != 0) img_xyz[fg_idx[0], fg_idx[1], :] = xyz_show[fg_idx[0], fg_idx[1], :3] img_xyz_crop = img_xyz[y1 : y2 + 1, x1 : x2 + 1, :] img_vis_crop = img_vis[y1 : y2 + 1, x1 : x2 + 1, :] # diff mask diff_mask_xyz = np.abs(masks[_i] - mask_xyz)[y1 : y2 + 1, x1 : x2 + 1] grid_show( [ img[:, :, [2, 1, 0]], img_vis[:, :, [2, 1, 0]], img_vis_kpts2d[:, :, [2, 1, 0]], depth, # xyz_show, diff_mask_xyz, xyz_crop_show, img_xyz[:, :, [2, 1, 0]], img_xyz_crop[:, :, [2, 1, 0]], img_vis_crop, ], [ "img", "vis_img", "img_vis_kpts2d", "depth", "diff_mask_xyz", "xyz_crop_show", "img_xyz", "img_xyz_crop", "img_vis_crop", ], row=3, col=3, )
def _evaluate_box_proposals(dataset_predictions, coco_api, thresholds=None, area="all", limit=None): """ Evaluate detection proposal recall metrics. This function is a much faster alternative to the official COCO API recall evaluation code. However, it produces slightly different results. """ # Record max overlap value for each gt box # Return vector of overlap values areas = { "all": 0, "small": 1, "medium": 2, "large": 3, "96-128": 4, "128-256": 5, "256-512": 6, "512-inf": 7, } area_ranges = [ [0 ** 2, 1e5 ** 2], # all [0 ** 2, 32 ** 2], # small [32 ** 2, 96 ** 2], # medium [96 ** 2, 1e5 ** 2], # large [96 ** 2, 128 ** 2], # 96-128 [128 ** 2, 256 ** 2], # 128-256 [256 ** 2, 512 ** 2], # 256-512 [512 ** 2, 1e5 ** 2], ] # 512-inf assert area in areas, "Unknown area range: {}".format(area) area_range = area_ranges[areas[area]] gt_overlaps = [] num_pos = 0 for prediction_dict in dataset_predictions: predictions = prediction_dict["proposals"] # sort predictions in descending order # TODO maybe remove this and make it explicit in the documentation inds = predictions.objectness_logits.sort(descending=True)[1] predictions = predictions[inds] ann_ids = coco_api.getAnnIds(imgIds=prediction_dict["image_id"]) anno = coco_api.loadAnns(ann_ids) gt_boxes = [ BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) for obj in anno if obj["iscrowd"] == 0 ] gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4) # guard against no boxes gt_boxes = Boxes(gt_boxes) gt_areas = torch.as_tensor([obj["area"] for obj in anno if obj["iscrowd"] == 0]) if len(gt_boxes) == 0 or len(predictions) == 0: continue valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1]) gt_boxes = gt_boxes[valid_gt_inds] num_pos += len(gt_boxes) if len(gt_boxes) == 0: continue if limit is not None and len(predictions) > limit: predictions = predictions[:limit] overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes) _gt_overlaps = torch.zeros(len(gt_boxes)) for j in range(min(len(predictions), len(gt_boxes))): # find which proposal box maximally covers each gt box # and get the iou amount of coverage for each gt box max_overlaps, argmax_overlaps = overlaps.max(dim=0) # find which gt box is 'best' covered (i.e. 'best' = most iou) gt_ovr, gt_ind = max_overlaps.max(dim=0) assert gt_ovr >= 0 # find the proposal box that covers the best covered gt box box_ind = argmax_overlaps[gt_ind] # record the iou coverage of this gt box _gt_overlaps[j] = overlaps[box_ind, gt_ind] assert _gt_overlaps[j] == gt_ovr # mark the proposal box and the gt box as used overlaps[box_ind, :] = -1 overlaps[:, gt_ind] = -1 # append recorded iou coverage level gt_overlaps.append(_gt_overlaps) gt_overlaps = torch.cat(gt_overlaps, dim=0) gt_overlaps, _ = torch.sort(gt_overlaps) if thresholds is None: step = 0.05 thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32) recalls = torch.zeros_like(thresholds) # compute recall for each iou threshold for i, t in enumerate(thresholds): recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos) # ar = 2 * np.trapz(recalls, thresholds) ar = recalls.mean() return { "ar": ar, "recalls": recalls, "thresholds": thresholds, "gt_overlaps": gt_overlaps, "num_pos": num_pos, }
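# A minimal, self-contained sketch of how the recall/AR numbers above fall out of
# `gt_overlaps` and `num_pos`. The overlap values below are made up for illustration.
import torch

gt_overlaps = torch.tensor([0.92, 0.81, 0.66, 0.55, 0.40, 0.10])  # best IoU per gt box
num_pos = 6                                                       # gt boxes considered

thresholds = torch.arange(0.5, 0.95 + 1e-5, 0.05, dtype=torch.float32)
recalls = torch.zeros_like(thresholds)
for i, t in enumerate(thresholds):
    recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)

ar = recalls.mean()  # average recall over IoU thresholds 0.5:0.05:0.95
print(recalls.tolist(), ar.item())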
def _convert_xy_to_wh(self, x): return BoxMode.convert(x, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
def combine_association(instance, association): pred_masks = [mask.numpy() for mask in instance.pred_masks] pred_scores = instance.scores.numpy() pred_boxes = instance.pred_boxes.tensor.numpy().tolist() pred_classes = instance.pred_classes.numpy() h, w = pred_masks[0].shape pred_associations = instance.pred_associations.numpy() pred_light = association.pred_light.tensor.numpy() ret = Instances((h, w)) ins = Instances((h, w)) if np.sum(pred_associations) == 0: ret.pred_boxes = association.pred_boxes ret.scores = association.scores ret.pred_classes = association.pred_classes ret.pred_light = association.pred_light.tensor.numpy().tolist() segm = np.zeros((h, w, 1), order='F', dtype='uint8') ret.pred_masks = [segm] * len(association.pred_boxes) ret.pred_associations = association.pred_associations.numpy().astype( 'int').tolist() instance.pred_associations = pred_associations.astype('int').tolist() return ret, instance mask_map = {} for i, ass in enumerate(pred_associations): if ass != 0: if ass in mask_map: if pred_classes[i] == 1: mask_map[ass].append((pred_masks[i], pred_scores[i], pred_classes[i], pred_boxes[i])) else: mask_map[ass] = [(pred_masks[i], pred_scores[i], pred_classes[i], pred_boxes[i]), mask_map[ass][0]] else: mask_map[ass] = [(pred_masks[i], pred_scores[i], pred_classes[i], pred_boxes[i])] results = [] boxes = [] scores = [] classes = [] associations = [] light = [] for i, ass in enumerate(association.pred_associations): if ass != 0: light.append(pred_light[i].tolist()) for k, v in mask_map.items(): associations.append(int(k)) s, o = v avg_score = float((s[1] + o[1]) / 2) _s = s[0].reshape(h, w, 1) _o = o[0].reshape(h, w, 1) comb = _s + _o classes.append(0) segm = encode(np.array(comb, order='F', dtype='uint8'))[0] boxes.append( BoxMode.convert(eval.maskUtils.toBbox(segm), BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)) results.append(comb) scores.append(avg_score) ret.pred_masks = results ret.pred_boxes = boxes ret.scores = scores ret.pred_classes = classes ret.pred_associations = associations ret.pred_light = light instance.pred_associations = instance.pred_associations.numpy().astype( 'int').tolist() return ret, instance
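# A small sketch of the mask-merging step in combine_association: union two binary
# masks, RLE-encode the result in the same way, and recover an XYXY box from it.
# Shapes and values are made up; clamping the sum to {0, 1} is a safety choice here.
import numpy as np
from pycocotools import mask as mask_util
from detectron2.structures import BoxMode

h, w = 4, 6
subj = np.zeros((h, w), dtype="uint8")
subj[1:3, 1:3] = 1
obj = np.zeros((h, w), dtype="uint8")
obj[2:4, 3:5] = 1

comb = np.clip(subj + obj, 0, 1).reshape(h, w, 1)
rle = mask_util.encode(np.array(comb, order="F", dtype="uint8"))[0]

xywh = mask_util.toBbox([rle])[0]  # [x, y, w, h]
xyxy = BoxMode.convert(xywh.tolist(), BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
print(xyxy)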
def transform_instance_annotations(annotation, transforms, image_size, *, keypoint_hflip_indices=None): """ Apply transforms to box, segmentation and keypoints annotations of a single instance. It will use `transforms.apply_box` for the box, and `transforms.apply_coords` for segmentation polygons & keypoints. If you need anything more specially designed for each data structure, you'll need to implement your own version of this function or the transforms. Args: annotation (dict): dict of instance annotations for a single instance. It will be modified in-place. transforms (TransformList or list[Transform]): image_size (tuple): the height, width of the transformed image keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`. Returns: dict: the same input dict with fields "bbox", "segmentation", "keypoints" transformed according to `transforms`. The "bbox_mode" field will be set to XYXY_ABS. """ if isinstance(transforms, (tuple, list)): transforms = T.TransformList(transforms) # bbox is 1d (per-instance bounding box) bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS) # clip transformed bbox to image size bbox = transforms.apply_box(np.array([bbox]))[0].clip(min=0) annotation["bbox"] = np.minimum(bbox, list(image_size + image_size)[::-1]) annotation["bbox_mode"] = BoxMode.XYXY_ABS if "segmentation" in annotation: # each instance contains 1 or more polygons segm = annotation["segmentation"] if isinstance(segm, list): # polygons polygons = [np.asarray(p).reshape(-1, 2) for p in segm] annotation["segmentation"] = [ p.reshape(-1) for p in transforms.apply_polygons(polygons) ] elif isinstance(segm, dict): # RLE mask = mask_util.decode(segm) mask = transforms.apply_segmentation(mask) assert tuple(mask.shape[:2]) == image_size annotation["segmentation"] = mask else: raise ValueError( "Cannot transform segmentation of type '{}'!" "Supported types are: polygons as list[list[float] or ndarray]," " COCO-style RLE as a dict.".format(type(segm))) if "keypoints" in annotation: keypoints = transform_keypoint_annotations(annotation["keypoints"], transforms, image_size, keypoint_hflip_indices) annotation["keypoints"] = keypoints return annotation
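# A usage sketch for transform_instance_annotations above, assuming detectron2's
# transform API; the annotation values and the 480x640 image size are made up.
# A horizontal flip mirrors the box/polygon, and the bbox comes back in XYXY_ABS.
from detectron2.data import transforms as T
from detectron2.structures import BoxMode

image_size = (480, 640)  # (height, width)
anno = {
    "bbox": [10.0, 20.0, 100.0, 50.0],  # XYWH_ABS
    "bbox_mode": BoxMode.XYWH_ABS,
    "segmentation": [[10.0, 20.0, 110.0, 20.0, 110.0, 70.0, 10.0, 70.0]],
}
tfms = T.TransformList([T.HFlipTransform(width=image_size[1])])
out = transform_instance_annotations(anno, tfms, image_size)
print(out["bbox"], out["bbox_mode"])  # flipped box, now in XYXY_ABS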
pred_by_image[dic["image_id"]], img.shape[:2]) if not len(predictions) > 0: continue grouped_gt = vis.group_by(dic["annotations"], ratios, ratios_ranges) visualized = False for range_name in ratios_ranges.keys(): if not len(grouped_gt[range_name]) > 0: continue visualized = True vis = Visualizer(img, metadata, scale=scale) topk_boxes, topk_indices = vis.topk_iou_boxes( predictions.pred_boxes, Boxes([ BoxMode.convert(x["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) for x in grouped_gt[range_name] ])) topk_indices = topk_indices.reshape((-1, )) # Transform indices to list since shape 1 tensors will be regarded as scalars. vis.draw_dataset_dict({"annotations": grouped_gt[range_name]}) vis_boxes = vis.draw_instance_predictions( predictions[topk_indices.tolist()]) if args.show: webcv2.imshow(basename + "-boxes@" + range_name, vis_boxes.get_image()[..., ::-1]) else: save(vis_boxes.get_image()[..., ::-1], args.output, "boxes", basename + "@%s.jpg" % range_name)
def test_vis(): dset_name = sys.argv[1] assert dset_name in DatasetCatalog.list() meta = MetadataCatalog.get(dset_name) dprint("MetadataCatalog: ", meta) objs = meta.objs t_start = time.perf_counter() dicts = DatasetCatalog.get(dset_name) logger.info("Done loading {} samples with {:.3f}s.".format( len(dicts), time.perf_counter() - t_start)) dirname = "output/{}-data-vis".format(dset_name) os.makedirs(dirname, exist_ok=True) for d in dicts: img = read_image_cv2(d["file_name"], format="BGR") depth = mmcv.imread(d["depth_file"], "unchanged") / 1000.0 imH, imW = img.shape[:2] annos = d["annotations"] masks = [ cocosegm2mask(anno["segmentation"], imH, imW) for anno in annos ] bboxes = [anno["bbox"] for anno in annos] bbox_modes = [anno["bbox_mode"] for anno in annos] bboxes_xyxy = np.array([ BoxMode.convert(box, box_mode, BoxMode.XYXY_ABS) for box, box_mode in zip(bboxes, bbox_modes) ]) kpts_3d_list = [anno["bbox3d_and_center"] for anno in annos] quats = [anno["quat"] for anno in annos] transes = [anno["trans"] for anno in annos] Rs = [quat2mat(quat) for quat in quats] # 0-based label cat_ids = [anno["category_id"] for anno in annos] K = d["cam"] kpts_2d = [ misc.project_pts(kpt3d, K, R, t) for kpt3d, R, t in zip(kpts_3d_list, Rs, transes) ] # # TODO: visualize pose and keypoints labels = [objs[cat_id] for cat_id in cat_ids] # img_vis = vis_image_bboxes_cv2(img, bboxes=bboxes_xyxy, labels=labels) img_vis = vis_image_mask_bbox_cv2(img, masks, bboxes=bboxes_xyxy, labels=labels) img_vis_kpts2d = img.copy() for anno_i in range(len(annos)): img_vis_kpts2d = misc.draw_projected_box3d(img_vis_kpts2d, kpts_2d[anno_i]) grid_show( [ img[:, :, [2, 1, 0]], img_vis[:, :, [2, 1, 0]], img_vis_kpts2d[:, :, [2, 1, 0]], depth ], [f"img:{d['file_name']}", "vis_img", "img_vis_kpts2d", "depth"], row=2, col=2, )
def evaluate_for_pix3d( predictions, dataset, metadata, filter_iou, mesh_models=None, iou_thresh=0.5, mask_thresh=0.5, device=None, vis_preds=False, ): from PIL import Image if device is None: device = torch.device("cpu") F1_TARGET = "[email protected]" # classes cat_ids = sorted(dataset.getCatIds()) reverse_id_mapping = { v: k for k, v in metadata.thing_dataset_id_to_contiguous_id.items() } # initialize tensors to record box & mask AP, number of gt positives box_apscores, box_aplabels = {}, {} mask_apscores, mask_aplabels = {}, {} mesh_apscores, mesh_aplabels = {}, {} npos = {} for cat_id in cat_ids: box_apscores[cat_id] = [ torch.tensor([], dtype=torch.float32, device=device) ] box_aplabels[cat_id] = [ torch.tensor([], dtype=torch.uint8, device=device) ] mask_apscores[cat_id] = [ torch.tensor([], dtype=torch.float32, device=device) ] mask_aplabels[cat_id] = [ torch.tensor([], dtype=torch.uint8, device=device) ] mesh_apscores[cat_id] = [ torch.tensor([], dtype=torch.float32, device=device) ] mesh_aplabels[cat_id] = [ torch.tensor([], dtype=torch.uint8, device=device) ] npos[cat_id] = 0.0 box_covered = [] mask_covered = [] mesh_covered = [] # number of gt positive instances per class for gt_ann in dataset.dataset["annotations"]: gt_label = gt_ann["category_id"] # examples with imgfiles = {img/table/1749.jpg, img/table/0045.png} # have a mismatch between images and masks. Thus, ignore image_file_name = dataset.loadImgs([gt_ann["image_id"] ])[0]["file_name"] if image_file_name in ["img/table/1749.jpg", "img/table/0045.png"]: continue npos[gt_label] += 1.0 for prediction in predictions: original_id = prediction["image_id"] image_width = dataset.loadImgs([original_id])[0]["width"] image_height = dataset.loadImgs([original_id])[0]["height"] image_size = [image_height, image_width] image_file_name = dataset.loadImgs([original_id])[0]["file_name"] # examples with imgfiles = {img/table/1749.jpg, img/table/0045.png} # have a mismatch between images and masks. 
Thus, ignore if image_file_name in ["img/table/1749.jpg", "img/table/0045.png"]: continue if "instances" not in prediction: continue num_img_preds = len(prediction["instances"]) if num_img_preds == 0: continue # predictions scores = prediction["instances"].scores boxes = prediction["instances"].pred_boxes.to(device) labels = prediction["instances"].pred_classes masks_rles = prediction["instances"].pred_masks_rle if hasattr(prediction["instances"], "pred_meshes"): meshes = prediction["instances"].pred_meshes # preditected meshes verts = [mesh[0] for mesh in meshes] faces = [mesh[1] for mesh in meshes] meshes = Meshes(verts=verts, faces=faces).to(device) else: meshes = ico_sphere(4, device) meshes = meshes.extend(num_img_preds).to(device) if hasattr(prediction["instances"], "pred_dz"): pred_dz = prediction["instances"].pred_dz heights = boxes.tensor[:, 3] - boxes.tensor[:, 1] # NOTE see appendix for derivation of pred dz pred_dz = pred_dz[:, 0] * heights.cpu() else: raise ValueError("Z range of box not predicted") assert prediction["instances"].image_size[0] == image_height assert prediction["instances"].image_size[1] == image_width # ground truth # anotations corresponding to original_id (aka coco image_id) gt_ann_ids = dataset.getAnnIds(imgIds=[original_id]) assert len( gt_ann_ids) == 1 # note that pix3d has one annotation per image gt_anns = dataset.loadAnns(gt_ann_ids)[0] assert gt_anns["image_id"] == original_id # get original ground truth mask, box, label & mesh maskfile = os.path.join(metadata.image_root, gt_anns["segmentation"]) with PathManager.open(maskfile, "rb") as f: gt_mask = torch.tensor( np.asarray(Image.open(f), dtype=np.float32) / 255.0) assert gt_mask.shape[0] == image_height and gt_mask.shape[ 1] == image_width gt_mask = (gt_mask > 0).to(dtype=torch.uint8) # binarize mask gt_mask_rle = [ mask_util.encode(np.array(gt_mask[:, :, None], order="F"))[0] ] gt_box = np.array(gt_anns["bbox"]).reshape(-1, 4) # xywh from coco gt_box = BoxMode.convert(gt_box, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) gt_label = gt_anns["category_id"] faux_gt_targets = Boxes( torch.tensor(gt_box, dtype=torch.float32, device=device)) # load gt mesh and extrinsics/intrinsics gt_R = torch.tensor(gt_anns["rot_mat"]).to(device) gt_t = torch.tensor(gt_anns["trans_mat"]).to(device) gt_K = torch.tensor(gt_anns["K"]).to(device) if mesh_models is not None: modeltype = gt_anns["model"] gt_verts, gt_faces = ( mesh_models[modeltype][0].clone(), mesh_models[modeltype][1].clone(), ) gt_verts = gt_verts.to(device) gt_faces = gt_faces.to(device) else: # load from disc raise NotImplementedError gt_verts = shape_utils.transform_verts(gt_verts, gt_R, gt_t) gt_zrange = torch.stack([gt_verts[:, 2].min(), gt_verts[:, 2].max()]) gt_mesh = Meshes(verts=[gt_verts], faces=[gt_faces]) # box iou boxiou = pairwise_iou(boxes, faux_gt_targets) # filter predictions with iou > filter_iou valid_pred_ids = boxiou > filter_iou # mask iou miou = mask_util.iou(masks_rles, gt_mask_rle, [0]) # # gt zrange (zrange stores min_z and max_z) # # zranges = torch.stack([gt_zrange] * len(meshes), dim=0) # predicted zrange (= pred_dz) assert hasattr(prediction["instances"], "pred_dz") # It's impossible to predict the center location in Z (=tc) # from the image. See appendix for more. tc = (gt_zrange[1] + gt_zrange[0]) / 2.0 # Given a center location (tc) and a focal_length, # pred_dz = pred_dz * box_h * tc / focal_length # See appendix for more. 
zranges = torch.stack( [ torch.stack([ tc - tc * pred_dz[i] / 2.0 / gt_K[0], tc + tc * pred_dz[i] / 2.0 / gt_K[0] ]) for i in range(len(meshes)) ], dim=0, ) gt_Ks = gt_K.view(1, 3).expand(len(meshes), 3) meshes = transform_meshes_to_camera_coord_system( meshes, boxes.tensor, zranges, gt_Ks, image_size) if vis_preds: vis_utils.visualize_predictions( original_id, image_file_name, scores, labels, boxes.tensor, masks_rles, meshes, metadata, "/tmp/output", ) shape_metrics = compare_meshes(meshes, gt_mesh, reduce=False) # sort predictions in descending order scores_sorted, idx_sorted = torch.sort(scores, descending=True) for pred_id in range(num_img_preds): # remember we only evaluate the preds that have overlap more than # iou_filter with the ground truth prediction if valid_pred_ids[idx_sorted[pred_id], 0] == 0: continue # map to dataset category id pred_label = reverse_id_mapping[labels[idx_sorted[pred_id]].item()] pred_miou = miou[idx_sorted[pred_id]].item() pred_biou = boxiou[idx_sorted[pred_id]].item() pred_score = scores[idx_sorted[pred_id]].view(1).to(device) # note that metrics returns f1 in % (=x100) pred_f1 = shape_metrics[F1_TARGET][ idx_sorted[pred_id]].item() / 100.0 # mask tpfp = torch.tensor([0], dtype=torch.uint8, device=device) if ((pred_label == gt_label) and (pred_miou > iou_thresh) and (original_id not in mask_covered)): tpfp[0] = 1 mask_covered.append(original_id) mask_apscores[pred_label].append(pred_score) mask_aplabels[pred_label].append(tpfp) # box tpfp = torch.tensor([0], dtype=torch.uint8, device=device) if ((pred_label == gt_label) and (pred_biou > iou_thresh) and (original_id not in box_covered)): tpfp[0] = 1 box_covered.append(original_id) box_apscores[pred_label].append(pred_score) box_aplabels[pred_label].append(tpfp) # mesh tpfp = torch.tensor([0], dtype=torch.uint8, device=device) if ((pred_label == gt_label) and (pred_f1 > iou_thresh) and (original_id not in mesh_covered)): tpfp[0] = 1 mesh_covered.append(original_id) mesh_apscores[pred_label].append(pred_score) mesh_aplabels[pred_label].append(tpfp) # check things for eval # assert npos.sum() == len(dataset.dataset["annotations"]) # convert to tensors pix3d_metrics = {} boxap, maskap, meshap = 0.0, 0.0, 0.0 valid = 0.0 for cat_id in cat_ids: cat_name = dataset.loadCats([cat_id])[0]["name"] if npos[cat_id] == 0: continue valid += 1 cat_box_ap = VOCap.compute_ap(torch.cat(box_apscores[cat_id]), torch.cat(box_aplabels[cat_id]), npos[cat_id]) boxap += cat_box_ap pix3d_metrics["box_ap@%.1f - %s" % (iou_thresh, cat_name)] = cat_box_ap cat_mask_ap = VOCap.compute_ap(torch.cat(mask_apscores[cat_id]), torch.cat(mask_aplabels[cat_id]), npos[cat_id]) maskap += cat_mask_ap pix3d_metrics["mask_ap@%.1f - %s" % (iou_thresh, cat_name)] = cat_mask_ap cat_mesh_ap = VOCap.compute_ap(torch.cat(mesh_apscores[cat_id]), torch.cat(mesh_aplabels[cat_id]), npos[cat_id]) meshap += cat_mesh_ap pix3d_metrics["mesh_ap@%.1f - %s" % (iou_thresh, cat_name)] = cat_mesh_ap pix3d_metrics["box_ap@%.1f" % iou_thresh] = boxap / valid pix3d_metrics["mask_ap@%.1f" % iou_thresh] = maskap / valid pix3d_metrics["mesh_ap@%.1f" % iou_thresh] = meshap / valid # print test ground truth vis_utils.print_instances_class_histogram( [npos[cat_id] for cat_id in cat_ids], # number of instances [dataset.loadCats([cat_id])[0]["name"] for cat_id in cat_ids], # class names pix3d_metrics, ) return pix3d_metrics
def instances_to_coco_json(instances, img_id, input_format): """ Dump an "Instances" object to a COCO-format json that's used for evaluation. Args: instances (Instances): img_id (int): the image id Returns: list[dict]: list of json annotations in COCO format. """ num_instance = len(instances) if num_instance == 0: return [] boxes = instances.pred_boxes.tensor.numpy() boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) boxes = boxes.tolist() scores = instances.scores.tolist() classes = instances.pred_classes.tolist() has_exts = instances.has("ext_points") if has_exts: exts = instances.ext_points.tensor.numpy().tolist() has_mask = instances.has("pred_masks") has_poly = instances.has("pred_polys") if has_mask or has_poly: if has_poly: output_height = instances.image_size[0] output_width = instances.image_size[1] rles = get_polygon_rles(instances.pred_polys.flatten(), (output_height, output_width)) else: if input_format == 'rle': # input is directly in rle format from polygons rles = instances.pred_masks else: rles = [ mask_util.encode( np.array(mask[:, :, None], order="F", dtype="uint8"))[0] for mask in instances.pred_masks ] for rle in rles: # "counts" is an array encoded by mask_util as a byte-stream. Python3's # json writer which always produces strings cannot serialize a bytestream # unless you decode it. Thankfully, utf-8 works out (which is also what # the pycocotools/_mask.pyx does). rle["counts"] = rle["counts"].decode("utf-8") has_keypoints = instances.has("pred_keypoints") if has_keypoints: keypoints = instances.pred_keypoints results = [] for k in range(num_instance): result = { "image_id": img_id, "category_id": classes[k], "bbox": boxes[k], "score": scores[k], } if has_mask or has_poly: result["segmentation"] = rles[k] if has_exts: result["extreme_points"] = exts[k] if has_keypoints: # In COCO annotations, # keypoints coordinates are pixel indices. # However our predictions are floating point coordinates. # Therefore we subtract 0.5 to be consistent with the annotation format. # This is the inverse of data loading logic in `datasets/coco.py`. keypoints[k][:, :2] -= 0.5 result["keypoints"] = keypoints[k].flatten().tolist() results.append(result) return results
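# Minimal usage sketch for the variant above: two hypothetical detections on a
# 480x640 image, with no masks/polygons/extreme points attached, so only boxes,
# scores and classes end up in the JSON records (boxes in COCO XYWH format).
import torch
from detectron2.structures import Boxes, Instances

inst = Instances((480, 640))
inst.pred_boxes = Boxes(torch.tensor([[10.0, 20.0, 110.0, 70.0],
                                      [200.0, 100.0, 260.0, 180.0]]))
inst.scores = torch.tensor([0.9, 0.75])
inst.pred_classes = torch.tensor([0, 3])

records = instances_to_coco_json(inst, img_id=42, input_format="rle")
print(records[0]["bbox"])  # [10.0, 20.0, 100.0, 50.0]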
def annotations_to_instances(annos, image_size, mask_format="polygon"): """ Create an :class:`Instances` object used by the models, from instance annotations in the dataset dict. Args: annos (list[dict]): a list of instance annotations in one image, each element for one instance. image_size (tuple): height, width Returns: Instances: It will contain fields "gt_boxes", "gt_classes", "gt_masks", "gt_keypoints", if they can be obtained from `annos`. This is the format that builtin models expect. """ boxes = [ BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos ] target = Instances(image_size) boxes = target.gt_boxes = Boxes(boxes) boxes.clip(image_size) classes = [obj["category_id"] for obj in annos] classes = torch.tensor(classes, dtype=torch.int64) target.gt_classes = classes if len(annos) and "segmentation" in annos[0]: segm = [obj["segmentation"] for obj in annos] # it may be bitmask instead of polygon visible_segm = [obj["visible_mask"] for obj in annos ] # it may be bitmask instead of polygon if mask_format == "polygon": masks = PolygonMasks(segm) if not isinstance(visible_segm[0], list): visible_masks = visible_segm visible_masks = BitMasks( torch.stack([torch.from_numpy(x) for x in visible_masks])) else: # visible_masks = BitMasks.from_polygon_masks(visible_polygons, *image_size) visible_masks = PolygonMasks(visible_segm) else: assert mask_format == "bitmask", mask_format if not isinstance(segm[0], list): masks = BitMasks( torch.stack([torch.from_numpy(x) for x in segm])) # visible_masks = visible_polygons # visible_masks = BitMasks(torch.stack([torch.from_numpy(x) for x in visible_masks])) else: masks = BitMasks.from_polygon_masks(segm, *image_size) # visible_masks = BitMasks.from_polygon_masks(visible_polygons, *image_size) # print('masks:{}'.format(polygons)) if not isinstance(visible_segm[0], list): visible_masks = visible_segm visible_masks = BitMasks( torch.stack([torch.from_numpy(x) for x in visible_masks])) else: # print('visible_masks:{}'.format(visible_polygons)) visible_masks = BitMasks.from_polygon_masks( visible_segm, *image_size) target.gt_masks = masks target.gt_visible_masks = visible_masks if len(annos) and "keypoints" in annos[0]: kpts = [obj.get("keypoints", []) for obj in annos] target.gt_keypoints = Keypoints(kpts) return target
def transform_instance_annotations(annotation, transforms, image_size, *, keypoint_hflip_indices=None): """ Apply transforms to box, segmentation and keypoints annotations of a single instance. It will use `transforms.apply_box` for the box, and `transforms.apply_coords` for segmentation polygons & keypoints. If you need anything more specially designed for each data structure, you'll need to implement your own version of this function or the transforms. Args: annotation (dict): dict of instance annotations for a single instance. It will be modified in-place. transforms (TransformList): image_size (tuple): the height, width of the transformed image keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`. Returns: dict: the same input dict with fields "bbox", "segmentation", "keypoints" transformed according to `transforms`. The "bbox_mode" field will be set to XYXY_ABS. """ bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS) # Note that bbox is 1d (per-instance bounding box) annotation["bbox"] = transforms.apply_box([bbox])[0] annotation["bbox_mode"] = BoxMode.XYXY_ABS if "segmentation" in annotation: if isinstance(annotation['segmentation'], dict): mask = mask_utils.decode(annotation['segmentation']) for transform in transforms.transforms: mask = transform.apply_image(mask) annotation['segmentation'] = mask else: # each instance contains 1 or more polygons polygons = [ np.asarray(p).reshape(-1, 2) for p in annotation["segmentation"] ] annotation["segmentation"] = [ p.reshape(-1) for p in transforms.apply_polygons(polygons) ] if "visible_mask" in annotation: if isinstance(annotation['visible_mask'], dict): mask = mask_utils.decode(annotation['visible_mask']) for transform in transforms.transforms: mask = transform.apply_image(mask) annotation['visible_mask'] = mask else: # each instance contains 1 or more polygons polygons = [ np.asarray(p).reshape(-1, 2) for p in annotation["visible_mask"] ] annotation["visible_mask"] = [ p.reshape(-1) for p in transforms.apply_polygons(polygons) ] if "keypoints" in annotation: keypoints = transform_keypoint_annotations(annotation["keypoints"], transforms, image_size, keypoint_hflip_indices) annotation["keypoints"] = keypoints return annotation
def convert_to_dict_list(image_root, id_map, imgs, anns, dataset_name=None): num_instances_without_valid_segmentation = 0 num_instances_without_valid_bounding_box = 0 dataset_dicts = [] count_ignore_image_root_warning = 0 for (img_dict, anno_dict_list) in zip(imgs, anns): record = {} # NOTE: besides using (relative path) in the "file_name" filed to represent # the image resource, "extended coco" also supports using uri which # represents an image using a single string, eg. "everstore_handle://xxx", if "://" not in img_dict["file_name"]: record["file_name"] = os.path.join(image_root, img_dict["file_name"]) else: if image_root is not None: count_ignore_image_root_warning += 1 if count_ignore_image_root_warning == 1: logger.warning( ( "Found '://' in file_name: {}, ignore image_root: {}" "(logged once per dataset)." ).format(img_dict["file_name"], image_root) ) record["file_name"] = img_dict["file_name"] if "height" in img_dict or "width" in img_dict: record["height"] = img_dict["height"] record["width"] = img_dict["width"] image_id = record["image_id"] = img_dict["id"] objs = [] for anno in anno_dict_list: # Check that the image_id in this annotation is the same. This fails # only when the data parsing logic or the annotation file is buggy. assert anno["image_id"] == image_id assert anno.get("ignore", 0) == 0 obj = { field: anno[field] # NOTE: maybe use MetadataCatalog for this for field in ["iscrowd", "bbox", "bbox_mode", "keypoints", "category_id", "extras"] if field in anno } bbox_object = obj.get("bbox", None) if bbox_object is not None and "bbox_mode" in obj: bbox_object = BoxMode.convert(bbox_object, obj["bbox_mode"], BoxMode.XYWH_ABS) if "width" in record and "height" in record and (not valid_bbox(bbox_object, record["width"], record["height"])): num_instances_without_valid_bounding_box += 1 continue if obj.get("category_id", None) not in id_map: continue segm = anno.get("segmentation", None) if segm: # either list[list[float]] or dict(RLE) if not isinstance(segm, dict): # filter out invalid polygons (< 3 points) segm = [ poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6 ] if len(segm) == 0: num_instances_without_valid_segmentation += 1 continue # ignore this instance obj["segmentation"] = segm if "bbox_mode" not in obj: if len(obj["bbox"]) == 5: obj["bbox_mode"] = BoxMode.XYWHA_ABS else: obj["bbox_mode"] = BoxMode.XYWH_ABS if id_map: obj["category_id"] = id_map[obj["category_id"]] objs.append(obj) record["annotations"] = objs if len(objs) == 0: continue if dataset_name is not None: record["dataset_name"] = dataset_name dataset_dicts.append(record) if count_ignore_image_root_warning > 0: logger.warning( "The 'ignore image_root: {}' warning occurred {} times".format( image_root, count_ignore_image_root_warning ) ) if num_instances_without_valid_segmentation > 0: logger.warning( "Filtered out {} instances without valid segmentation. " "There might be issues in your dataset generation process.".format( num_instances_without_valid_segmentation ) ) if num_instances_without_valid_bounding_box > 0: logger.warning( "Filtered out {} instances without valid bounding boxes. " "There might be issues in your dataset generation process.".format( num_instances_without_valid_bounding_box ) ) return dataset_dicts
def draw_dataset_dict(self, dic, category=None):
    """
    Draw annotations/segmentations in Detectron2 Dataset format.

    Args:
        dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format.
        category (list[int] or None): category ids of the annotations to draw,
            or None to draw all of them.

    Returns:
        output (VisImage): image object with visualizations.
    """
    # start additional code
    unfiltered_annos = dic.get("annotations", None)
    if category is None:
        annos = unfiltered_annos
    else:
        annos = []
        for annotation in unfiltered_annos:
            if annotation["category_id"] in category:
                annos.append(annotation)
    # end additional code
    if annos:
        if "segmentation" in annos[0]:
            masks = [x["segmentation"] for x in annos]
        else:
            masks = None
        if "keypoints" in annos[0]:
            keypts = [x["keypoints"] for x in annos]
            keypts = np.array(keypts).reshape(len(annos), -1, 3)
        else:
            keypts = None
        boxes = [
            BoxMode.convert(x["bbox"], x["bbox_mode"], BoxMode.XYXY_ABS)
            for x in annos
        ]
        labels = [x["category_id"] for x in annos]
        colors = None
        if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"):
            colors = [
                self._jitter([x / 255 for x in self.metadata.thing_colors[c]])
                for c in labels
            ]
        names = self.metadata.get("thing_classes", None)
        if names:
            labels = [names[i] for i in labels]
        labels = [
            "{}".format(i) + ("|crowd" if a.get("iscrowd", 0) else "")
            for i, a in zip(labels, annos)
        ]
        self.overlay_instances(labels=labels,
                               boxes=boxes,
                               masks=masks,
                               keypoints=keypts,
                               assigned_colors=colors,
                               alpha=1.0)

    sem_seg = dic.get("sem_seg", None)
    if sem_seg is None and "sem_seg_file_name" in dic:
        with PathManager.open(dic["sem_seg_file_name"], "rb") as f:
            sem_seg = Image.open(f)
            sem_seg = np.asarray(sem_seg, dtype="uint8")
    if sem_seg is not None:
        self.draw_sem_seg(sem_seg, area_threshold=0, alpha=0.5)
    return self.output
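# Usage sketch for the category-filtered override above, assuming it is defined on a
# Visualizer subclass (FilteredVisualizer is a hypothetical name) and that the dataset
# "my_dataset" is a placeholder for an already-registered dataset.
import cv2
from detectron2.data import DatasetCatalog, MetadataCatalog

dicts = DatasetCatalog.get("my_dataset")
metadata = MetadataCatalog.get("my_dataset")

d = dicts[0]
img = cv2.imread(d["file_name"])[:, :, ::-1]        # BGR -> RGB
vis = FilteredVisualizer(img, metadata, scale=1.0)  # hypothetical subclass
out = vis.draw_dataset_dict(d, category=[0, 2])     # draw only classes 0 and 2
cv2.imwrite("vis_filtered.jpg", out.get_image()[:, :, ::-1])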
def draw_dataset_dict(self, dic): """ Draw annotations/segmentaions in Detectron2 Dataset format. Args: dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format. Returns: output (VisImage): image object with visualizations. """ annos = dic.get("annotations", None) if annos: if "segmentation" in annos[0]: masks = [x["segmentation"] for x in annos] else: masks = None if "keypoints" in annos[0]: keypts = [x["keypoints"] for x in annos] keypts = np.array(keypts).reshape(len(annos), -1, 3) else: keypts = None boxes = [ BoxMode.convert(x["bbox"], x["bbox_mode"], BoxMode.XYXY_ABS) if len(x["bbox"]) == 4 else x["bbox"] for x in annos ] colors = None category_ids = [x["category_id"] for x in annos] if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get( "thing_colors"): colors = [ self._jitter( [x / 255 for x in self.metadata.thing_colors[c]]) for c in category_ids ] names = self.metadata.get("thing_classes", None) labels = _create_text_labels( category_ids, scores=None, class_names=[ "Hv", "Hp", "CLS", "BL", "PD", "PB", "CC", "LM", "D/P" ], is_crowd=[x.get("iscrowd", 0) for x in annos], ) boxes = None alpha = 0 self.overlay_instances(labels=labels, boxes=boxes, masks=masks, keypoints=keypts, assigned_colors=colors, alpha=alpha) sem_seg = dic.get("sem_seg", None) if sem_seg is None and "sem_seg_file_name" in dic: with PathManager.open(dic["sem_seg_file_name"], "rb") as f: sem_seg = Image.open(f) sem_seg = np.asarray(sem_seg, dtype="uint8") if sem_seg is not None: self.draw_sem_seg(sem_seg, area_threshold=0, alpha=0.5) pan_seg = dic.get("pan_seg", None) if pan_seg is None and "pan_seg_file_name" in dic: with PathManager.open(dic["pan_seg_file_name"], "rb") as f: pan_seg = Image.open(f) pan_seg = np.asarray(pan_seg) from panopticapi.utils import rgb2id pan_seg = rgb2id(pan_seg) if pan_seg is not None: segments_info = dic["segments_info"] pan_seg = torch.Tensor(pan_seg) self.draw_panoptic_seg(pan_seg, segments_info, area_threshold=0, alpha=0.5) return self.output
def instances_to_coco_json(instances, img_id): """ Dump an "Instances" object to a COCO-format json that's used for evaluation. Args: instances (Instances): img_id (int): the image id Returns: list[dict]: list of json annotations in COCO format. """ num_instance = len(instances) if num_instance == 0: return [] boxes = instances.pred_boxes.tensor.numpy() boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) boxes = boxes.tolist() scores = instances.scores.tolist() classes = instances.pred_classes.tolist() attr_classes = instances.attr_classes.tolist() attr_scores = instances.attr_scores.tolist() #print (len(scores), len(attr_scores), len(attr_classes)) has_mask = instances.has("pred_masks") if has_mask: # use RLE to encode the masks, because they are too large and takes memory # since this evaluator stores outputs of the entire dataset rles = [ mask_util.encode( np.array(mask[:, :, None], order="F", dtype="uint8"))[0] for mask in instances.pred_masks ] for rle in rles: # "counts" is an array encoded by mask_util as a byte-stream. Python3's # json writer which always produces strings cannot serialize a bytestream # unless you decode it. Thankfully, utf-8 works out (which is also what # the pycocotools/_mask.pyx does). rle["counts"] = rle["counts"].decode("utf-8") has_keypoints = instances.has("pred_keypoints") if has_keypoints: keypoints = instances.pred_keypoints results = [] for k in range(num_instance): result = { "image_id": img_id, "category_id": classes[k], "bbox": boxes[k], "score": scores[k], "attributes": attr_classes[k], "attr_scores": attr_scores[k], } if has_mask: result["segmentation"] = rles[k] if has_keypoints: # In COCO annotations, # keypoints coordinates are pixel indices. # However our predictions are floating point coordinates. # Therefore we subtract 0.5 to be consistent with the annotation format. # This is the inverse of data loading logic in `datasets/coco.py`. keypoints[k][:, :2] -= 0.5 result["keypoints"] = keypoints[k].flatten().tolist() results.append(result) return results
def _original_call(self, dataset_dict): """ Modified from detectron2's original __call__ in DatasetMapper """ dataset_dict = copy.deepcopy( dataset_dict) # it will be modified by code below image = self._read_image(dataset_dict, format=self.img_format) if not self.backfill_size: utils.check_image_size(dataset_dict, image) if "annotations" not in dataset_dict: image, transforms = T.apply_transform_gens( ([self.crop_gen] if self.crop_gen else []) + self.tfm_gens, image) else: # Crop around an instance if there are instances in the image. # USER: Remove if you don't use cropping if self.crop_gen: crop_tfm = utils.gen_crop_transform_with_instance( self.crop_gen.get_crop_size(image.shape[:2]), image.shape[:2], np.random.choice(dataset_dict["annotations"]), ) image = crop_tfm.apply_image(image) image, transforms = T.apply_transform_gens(self.tfm_gens, image) if self.crop_gen: transforms = crop_tfm + transforms image_shape = image.shape[:2] # h, w dataset_dict["image"] = torch.as_tensor( image.transpose(2, 0, 1).astype("float32")) # Can use uint8 if it turns out to be slow some day assert not self.load_proposals, "Not supported!" if not self.is_train: dataset_dict.pop("annotations", None) dataset_dict.pop("sem_seg_file_name", None) return dataset_dict if "annotations" in dataset_dict: for anno in dataset_dict["annotations"]: if not self.mask_on: anno.pop("segmentation", None) if not self.keypoint_on: anno.pop("keypoints", None) # Convert dataset_dict["annotations"] to dataset_dict["instances"] annotations = [ obj for obj in dataset_dict.pop("annotations") if obj.get("iscrowd", 0) == 0 ] # Convert either rotated box or horizontal box to XYWHA_ABS format original_boxes = [ BoxMode.convert( box=obj["bbox"], from_mode=obj["bbox_mode"], to_mode=BoxMode.XYWHA_ABS, ) for obj in annotations ] transformed_boxes = transforms.apply_rotated_box( np.array(original_boxes, dtype=np.float64)) instances = Instances(image_shape) instances.gt_classes = torch.tensor( [obj["category_id"] for obj in annotations], dtype=torch.int64) instances.gt_boxes = RotatedBoxes(transformed_boxes) instances.gt_boxes.clip(image_shape) dataset_dict["instances"] = instances[ instances.gt_boxes.nonempty()] return dataset_dict
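# A small sketch of the rotated-box conversion used above: horizontal XYWH boxes
# become XYWHA (center-x, center-y, w, h, angle=0) and are wrapped in RotatedBoxes.
# Box values and the 480x640 image size are made up.
import torch
from detectron2.structures import BoxMode, RotatedBoxes

boxes_xywh = [[10.0, 20.0, 100.0, 50.0], [200.0, 100.0, 60.0, 80.0]]
boxes_xywha = [
    BoxMode.convert(b, BoxMode.XYWH_ABS, BoxMode.XYWHA_ABS) for b in boxes_xywh
]
gt_boxes = RotatedBoxes(torch.tensor(boxes_xywha, dtype=torch.float32))
gt_boxes.clip((480, 640))  # (height, width), same call as in _original_call
print(gt_boxes.tensor)     # e.g. first row: [60., 45., 100., 50., 0.]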
def __call__(self, dataset_dict):
    """
    Args:
        dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

    Returns:
        dict: a format that builtin models in detectron2 accept
    """
    dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
    # USER: Write your own image loading if it's not from a file
    try:
        image = utils.read_image(dataset_dict["file_name"], format=self.image_format)
    except Exception as e:
        print(dataset_dict["file_name"])
        print(e)
        raise e
    try:
        utils.check_image_size(dataset_dict, image)
    except SizeMismatchError as e:
        expected_wh = (dataset_dict["width"], dataset_dict["height"])
        image_wh = (image.shape[1], image.shape[0])
        if (image_wh[1], image_wh[0]) == expected_wh:
            print("transposing image {}".format(dataset_dict["file_name"]))
            image = image.transpose(1, 0, 2)
        else:
            raise e

    # USER: Remove if you don't do semantic/panoptic segmentation.
    if "sem_seg_file_name" in dataset_dict:
        sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name"), "L").squeeze(2)
    else:
        sem_seg_gt = None

    boxes = np.asarray([
        BoxMode.convert(instance["bbox"], instance["bbox_mode"], BoxMode.XYXY_ABS)
        for instance in dataset_dict["annotations"]
    ])
    aug_input = T.StandardAugInput(image, boxes=boxes, sem_seg=sem_seg_gt)
    transforms = aug_input.apply_augmentations(self.augmentation)
    image, sem_seg_gt = aug_input.image, aug_input.sem_seg

    image_shape = image.shape[:2]  # h, w
    # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
    # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
    # Therefore it's important to use torch.Tensor.
    dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
    if sem_seg_gt is not None:
        dataset_dict["sem_seg"] = torch.as_tensor(sem_seg_gt.astype("long"))

    # USER: Remove if you don't use pre-computed proposals.
    # Most users would not need this feature.
    if self.proposal_topk:
        utils.transform_proposals(
            dataset_dict,
            image_shape,
            transforms,
            proposal_topk=self.proposal_topk,
            min_box_size=self.proposal_min_box_size,
        )

    if not self.is_train:
        dataset_dict.pop("annotations", None)
        dataset_dict.pop("sem_seg_file_name", None)
        dataset_dict.pop("pano_seg_file_name", None)
        return dataset_dict

    if "annotations" in dataset_dict:
        # USER: Modify this if you want to keep them for some reason.
        for anno in dataset_dict["annotations"]:
            if not self.use_instance_mask:
                anno.pop("segmentation", None)
            if not self.use_keypoint:
                anno.pop("keypoints", None)

        # USER: Implement additional transformations if you have other types of data
        annos = [
            transform_instance_annotations(
                obj,
                transforms,
                image_shape,
                keypoint_hflip_indices=self.keypoint_hflip_indices,
            )
            for obj in dataset_dict.pop("annotations")
            if obj.get("iscrowd", 0) == 0
        ]
        instances = annotations_to_instances(annos, image_shape, mask_format=self.instance_mask_format)

        # After transforms such as cropping are applied, the bounding box may no longer
        # tightly bound the object. As an example, imagine a triangle object
        # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight
        # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal
        # to the intersection of the original bounding box and the cropping box.
        if self.recompute_boxes:
            instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
        dataset_dict["instances"] = utils.filter_empty_instances(instances)

    if self.basis_loss_on and self.is_train:
        # load basis supervisions
        if self.ann_set == "coco":
            basis_sem_path = (dataset_dict["file_name"]
                              .replace("train2017", "thing_train2017")
                              .replace("image/train", "thing_train"))
        else:
            basis_sem_path = (dataset_dict["file_name"]
                              .replace("coco", "lvis")
                              .replace("train2017", "thing_train"))
        # change extension to npz
        basis_sem_path = osp.splitext(basis_sem_path)[0] + ".npz"
        basis_sem_gt = np.load(basis_sem_path)["mask"]
        basis_sem_gt = transforms.apply_segmentation(basis_sem_gt)
        basis_sem_gt = torch.as_tensor(basis_sem_gt.astype("long"))
        dataset_dict["basis_sem"] = basis_sem_gt
    return dataset_dict
def convert_output_to_json(outputs, image_filename, metadata): reverse_id_mapping = { v: k for k, v in metadata.thing_dataset_id_to_contiguous_id.items() } uid = common.createUUID('pred') boxes = outputs['instances'].pred_boxes.tensor.cpu().numpy() boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) boxes = boxes.tolist() scores = outputs['instances'].scores.tolist() category_id = outputs['instances'].pred_classes.tolist() classes = [] for cat in category_id: cat_name = reverse_id_mapping[cat] classes.append(cat_name) num_instances = len(scores) print(outputs) if num_instances == 0: return [] for k in range(num_instances): if k == 0: jsonres = { image_filename: { "filename": image_filename, "size": 0, "regions": [ { "region_attributes": { "label": classes[k], "score": scores[k], }, "shape_attributes": { "name": "rect", "y": boxes[k][0], "x": boxes[k][1], "height": boxes[k][2], "width": boxes[k][3] } }, ], "file_attributes": { "width": 1920, "height": 1280, "uuid": uid } } } else: jsonres[image_filename]["regions"].append({ "region_attributes": { "label": classes[k], "score": scores[k], }, "shape_attributes": { "name": "rect", "y": boxes[k][0], "x": boxes[k][1], "height": boxes[k][2], "width": boxes[k][3] } }) return jsonres
def convert_to_coco_dict(dataset_name):
    """
    Convert an instance detection/segmentation or keypoint detection dataset
    in detectron2's standard format into COCO json format.

    Generic dataset description can be found here:
    https://detectron2.readthedocs.io/tutorials/datasets.html#register-a-dataset

    COCO data format description can be found here:
    http://cocodataset.org/#format-data

    Args:
        dataset_name (str):
            name of the source dataset
            Must be registered in DatasetCatalog and in detectron2's standard format.
            Must have corresponding metadata "thing_classes"
    Returns:
        coco_dict: serializable dict in COCO json format
    """
    dataset_dicts = DatasetCatalog.get(dataset_name)
    metadata = MetadataCatalog.get(dataset_name)

    # unmap the category mapping ids for COCO
    if hasattr(metadata, "thing_dataset_id_to_contiguous_id"):
        reverse_id_mapping = {
            v: k for k, v in metadata.thing_dataset_id_to_contiguous_id.items()
        }
        reverse_id_mapper = lambda contiguous_id: reverse_id_mapping[contiguous_id]  # noqa
    else:
        reverse_id_mapper = lambda contiguous_id: contiguous_id  # noqa

    categories = [{
        "id": reverse_id_mapper(id),
        "name": name
    } for id, name in enumerate(metadata.thing_classes)]

    logger.info("Converting dataset dicts into COCO format")
    coco_images = []
    coco_annotations = []

    for image_id, image_dict in enumerate(dataset_dicts):
        coco_image = {
            "id": image_dict.get("image_id", image_id),
            "width": image_dict["width"],
            "height": image_dict["height"],
            "file_name": image_dict["file_name"],
        }
        coco_images.append(coco_image)

        anns_per_image = image_dict["annotations"]
        for annotation in anns_per_image:
            # create a new dict with only COCO fields
            coco_annotation = {}

            # COCO requirement: XYWH box format
            bbox = annotation["bbox"]
            bbox_mode = annotation["bbox_mode"]
            bbox = BoxMode.convert(bbox, bbox_mode, BoxMode.XYWH_ABS)

            # COCO requirement: instance area
            if "segmentation" in annotation:
                # Computing areas for instances by counting the pixels
                segmentation = annotation["segmentation"]
                # TODO: check segmentation type: RLE, BinaryMask or Polygon
                if isinstance(segmentation, list):
                    polygons = PolygonMasks([segmentation])
                    area = polygons.area()[0].item()
                elif isinstance(segmentation, dict):  # RLE
                    area = mask_util.area(segmentation)
                else:
                    raise TypeError(f"Unknown segmentation type {type(segmentation)}!")
            else:
                # Computing areas using bounding boxes
                bbox_xy = BoxMode.convert(bbox, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
                area = Boxes([bbox_xy]).area()[0].item()

            if "keypoints" in annotation:
                keypoints = annotation["keypoints"]  # list[int]
                for idx, v in enumerate(keypoints):
                    if idx % 3 != 2:
                        # COCO's segmentation coordinates are floating points in [0, H or W],
                        # but keypoint coordinates are integers in [0, H-1 or W-1]
                        # For COCO format consistency we subtract 0.5
                        # https://github.com/facebookresearch/detectron2/pull/175#issuecomment-551202163
                        keypoints[idx] = v - 0.5
                if "num_keypoints" in annotation:
                    num_keypoints = annotation["num_keypoints"]
                else:
                    num_keypoints = sum(kp > 0 for kp in keypoints[2::3])

            # COCO requirement:
            #   linking annotations to images
            #   "id" field must start with 1
            coco_annotation["id"] = len(coco_annotations) + 1
            coco_annotation["image_id"] = coco_image["id"]
            coco_annotation["bbox"] = [round(float(x), 3) for x in bbox]
            coco_annotation["area"] = area
            coco_annotation["iscrowd"] = annotation.get("iscrowd", 0)
            coco_annotation["category_id"] = reverse_id_mapper(annotation["category_id"])

            # Add optional fields
            if "keypoints" in annotation:
                coco_annotation["keypoints"] = keypoints
                coco_annotation["num_keypoints"] = num_keypoints

            if "segmentation" in annotation:
                coco_annotation["segmentation"] = annotation["segmentation"]

            coco_annotations.append(coco_annotation)

    logger.info(
        "Conversion finished, "
        f"num images: {len(coco_images)}, num annotations: {len(coco_annotations)}"
    )

    info = {
        "date_created": str(datetime.datetime.now()),
        "description": "Automatically generated COCO json file for Detectron2.",
    }
    coco_dict = {
        "info": info,
        "images": coco_images,
        "annotations": coco_annotations,
        "categories": categories,
        "licenses": None,
    }
    return coco_dict
def lincomb_mask_loss(self, gt_classes, mask_coef, proto_mask, gt_instances, gt_matched_idxs): """ Args: gt_classes: shapes are (N, R). See :meth:`Yolact.get_ground_truth`. mask_coef (list[Tensor]): lvl tensors, each has shape (N, Ax#masks, Hi, Wi). See :meth:`YolactHead.forward`. proto_mask (Tensor): shapes are (N, #masks, M, M). gt_instances (list[Instances]): a list of N `Instances`s. gt_matched_idxs (list[Tensor[int64]]): each element is a vector of length R, where gt_matched_idxs[i] is a matched ground-truth index in [0, #objects) Return: loss_mask [dict]: mask loss scalar. maskiou_data (list[inputs, targets, classes]): the input of maskiou_net. """ mask_size = proto_mask.size()[-2:] mask_area = mask_size[0] * mask_size[1] # shape: (N, M, M, #masks) proto_mask = proto_mask.permute(0, 2, 3, 1).contiguous() gt_masks = [] gt_boxes = [] gt_boxes_area = [] # for normalize weight gt_masks_area = [] # for discard_mask_area mask_weights = [] with torch.no_grad(): for i, instance_per_image in enumerate(gt_instances): gt_mask = instance_per_image.gt_masks.to(device=proto_mask.device).tensor gt_mask = gt_mask.permute(1,2,0).contiguous() gt_mask = F.interpolate(gt_mask, mask_size, mode="bilinear", align_corners=False) # gt_mask: shape (M, M, #objects) gt_mask = gt_mask.gt(0.5).float() gt_masks.append(gt_mask) gt_masks_area.append(gt_mask.sum(dim=(0, 1))) # mask weights gt_foreground_norm = gt_mask / (gt_mask.sum(dim=(0,1), keepdim=True) + 0.0001) gt_background_norm = (1-gt_mask) / ((1-gt_mask).sum(dim=(0,1), keepdim=True) + 0.0001) mask_weight = (gt_foreground_norm * self.mask_reweight + gt_background_norm) * mask_area mask_weights.append(mask_weight) # :class:`Boxes` shape (#objects, 4) # convert to relative coordinate to crop mask gt_box = BoxMode.convert(instance_per_image.gt_boxes, BoxMode.XYXY_ABS, BoxMode.XYXY_REL) gt_boxes.append(gt_box.tensor) # area(#objects) gt_boxes_area.append(gt_box.area()) # convert to aligned with gt_classes mask_coef = [permute_to_N_HWA_K(x, self.num_masks) for x in mask_coef] # Tensor shape (N, R, #masks) mask_coef = cat(mask_coef, dim=1) mask_loss = 0 maskiou_inputs = [] maskiou_targets = [] maskiou_classes = [] # combine mask_coef and proto_mask to generate pred_mask of each image # and calculate loss for i in range(len(gt_instances)): # gt_class gt_class = gt_classes[i] # -1: ignore, #num_classes: background foreground_idxs = (gt_class >= 0) & (gt_class != self.num_classes) pred_coef = mask_coef[i, foreground_idxs] # matrix multiply get shape (M, M, #pos) pred_mask = F.sigmoid(proto_mask[i] @ pred_coef.t()) # matched ground truth objects' idx gt_matched_idx = gt_matched_idxs[i][foreground_idxs] # generate gt_masks gt_box = gt_boxes[i][gt_matched_idx] gt_mask = gt_masks[i][gt_matched_idx] # crop mask using gt_box pred_mask = crop(pred_mask, gt_box) pre_loss = F.binary_cross_entropy( torch.clamp(pred_mask, 0, 1), gt_mask, reduction='none') # mask_proto_reweight_mask_loss: foreground and background has different weights pre_loss = pre_loss * mask_weights[i][:, :, gt_matched_idx] # mask_proto_normalize_emulate_roi_pooling: # Normalize the mask loss to emulate roi pooling's affect on loss. 
pre_loss = pre_loss.sum(dim=(0, 1)) * (mask_area / gt_boxes_area[i]) mask_loss += pre_loss.sum() # cfg.use_maskiou select = gt_masks_area[i] > self.discard_mask_area if select.sum() > 0: pred_mask = pred_mask[:, :, select] gt_mask = gt_mask[:, :, select] gt_class = gt_class[select] # maskiou net input: (N, 1, H, W) maskiou_input = pred_mask.permute(2, 0, 1).contiguous().unsqueeze(1) pred_mask = pred_mask.gt(0.5).float() # maskiou net target: (N) maskiou_target = mask_iou(pred_mask, gt_mask) maskiou_inputs.append(maskiou_input) maskiou_targets.append(maskiou_target) maskiou_classes.append(gt_class) losses = {"loss_mask": mask_loss / mask_area * self.mask_alpha} if len(maskiou_targets) == 0: return losses, None else: # all images have same size masks # so the tensor are shape (N*I, 1, H, W) maskiou_targets = torch.cat(maskiou_targets) maskiou_classes = torch.cat(maskiou_classes) maskiou_inputs = torch.cat(maskiou_inputs) return losses, (maskiou_inputs, maskiou_targets, maskiou_classes)
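# Core of the "lincomb" step used above, as a standalone sketch: each positive anchor
# predicts a coefficient vector, and its mask is a sigmoid of a linear combination of
# the shared prototype masks. All shapes and values here are made up.
import torch
import torch.nn.functional as F

num_protos, M = 8, 34                       # number of prototypes and their resolution
num_pos = 3                                 # matched (foreground) anchors

proto_mask = torch.randn(M, M, num_protos)  # (M, M, #masks) after the permute above
pred_coef = torch.randn(num_pos, num_protos)

# (M, M, #protos) @ (#protos, #pos) -> (M, M, #pos): one soft mask per anchor
pred_mask = torch.sigmoid(proto_mask @ pred_coef.t())

gt_mask = (torch.rand(M, M, num_pos) > 0.5).float()
pre_loss = F.binary_cross_entropy(pred_mask.clamp(0, 1), gt_mask, reduction="none")
print(pre_loss.shape)                       # torch.Size([34, 34, 3])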
def annotations_to_instances(annos, image_size, mask_format="polygon"): """ Create an :class:`Instances` object used by the models, from instance annotations in the dataset dict. Args: annos (list[dict]): a list of instance annotations in one image, each element for one instance. image_size (tuple): height, width Returns: Instances: It will contain fields "gt_boxes", "gt_classes", "gt_masks", "gt_keypoints", if they can be obtained from `annos`. This is the format that builtin models expect. """ boxes = [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos] target = Instances(image_size) target.gt_boxes = Boxes(boxes) classes = [obj["category_id"] for obj in annos] classes = torch.tensor(classes, dtype=torch.int64) target.gt_classes = classes if len(annos) and "segmentation" in annos[0]: segms = [obj["segmentation"] for obj in annos] if mask_format == "polygon": # TODO check type and provide better error masks = PolygonMasks(segms) else: assert mask_format == "bitmask", mask_format masks = [] for segm in segms: if isinstance(segm, list): # polygon masks.append(polygons_to_bitmask(segm, *image_size)) elif isinstance(segm, dict): # COCO RLE masks.append(mask_util.decode(segm)) elif isinstance(segm, np.ndarray): assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format( segm.ndim ) # mask array masks.append(segm) else: raise ValueError( "Cannot convert segmentation of type '{}' to BitMasks!" "Supported types are: polygons as list[list[float] or ndarray]," " COCO-style RLE as a dict, or a full-image segmentation mask " "as a 2D ndarray.".format(type(segm)) ) # torch.from_numpy does not support array with negative stride. masks = BitMasks( torch.stack([torch.from_numpy(np.ascontiguousarray(x)) for x in masks]) ) target.gt_masks = masks if len(annos) and "keypoints" in annos[0]: kpts = [obj.get("keypoints", []) for obj in annos] target.gt_keypoints = Keypoints(kpts) return target
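# Minimal usage sketch for annotations_to_instances: two made-up instances on a
# 480x640 image with XYWH boxes and polygon segmentations.
from detectron2.structures import BoxMode

annos = [
    {
        "bbox": [10.0, 20.0, 100.0, 50.0],
        "bbox_mode": BoxMode.XYWH_ABS,
        "category_id": 0,
        "segmentation": [[10.0, 20.0, 110.0, 20.0, 110.0, 70.0, 10.0, 70.0]],
    },
    {
        "bbox": [200.0, 100.0, 60.0, 80.0],
        "bbox_mode": BoxMode.XYWH_ABS,
        "category_id": 2,
        "segmentation": [[200.0, 100.0, 260.0, 100.0, 260.0, 180.0, 200.0, 180.0]],
    },
]

target = annotations_to_instances(annos, image_size=(480, 640), mask_format="polygon")
print(target.gt_boxes.tensor)  # boxes converted to XYXY_ABS
print(target.gt_classes)       # tensor([0, 2])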
def annotations_to_instances_with_attributes(annos, image_size, mask_format="polygon", load_attributes=False, max_attr_per_ins=16): """ Extend the function annotations_to_instances() to support attributes """ boxes = [ BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos ] target = Instances(image_size) boxes = target.gt_boxes = Boxes(boxes) boxes.clip(image_size) classes = [obj["category_id"] for obj in annos] classes = torch.tensor(classes, dtype=torch.int64) target.gt_classes = classes if len(annos) and "segmentation" in annos[0]: segms = [obj["segmentation"] for obj in annos] if mask_format == "polygon": masks = PolygonMasks(segms) else: assert mask_format == "bitmask", mask_format masks = [] for segm in segms: if isinstance(segm, list): # polygon masks.append(polygons_to_bitmask(segm, *image_size)) elif isinstance(segm, dict): # COCO RLE masks.append(mask_util.decode(segm)) elif isinstance(segm, np.ndarray): assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format( segm.ndim) # mask array masks.append(segm) else: raise ValueError( "Cannot convert segmentation of type '{}' to BitMasks!" "Supported types are: polygons as list[list[float] or ndarray]," " COCO-style RLE as a dict, or a full-image segmentation mask " "as a 2D ndarray.".format(type(segm))) masks = BitMasks( torch.stack([ torch.from_numpy(np.ascontiguousarray(x)) for x in masks ])) target.gt_masks = masks if len(annos) and "keypoints" in annos[0]: kpts = [obj.get("keypoints", []) for obj in annos] target.gt_keypoints = Keypoints(kpts) if len(annos) and load_attributes: attributes = -torch.ones( (len(annos), max_attr_per_ins), dtype=torch.int64) for idx, anno in enumerate(annos): if "attribute_ids" in anno: for jdx, attr_id in enumerate(anno["attribute_ids"]): attributes[idx, jdx] = attr_id target.gt_attributes = attributes return target
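# ---------------------------------------------------------------------------
# Tiny illustration (made-up ids) of the attribute padding performed above:
# each instance gets a fixed-length row, filled with -1 beyond its own ids.
import torch

max_attr_per_ins = 16
toy_annos_attr = [{"attribute_ids": [3, 7]}, {}]   # second instance has no attributes
attributes = -torch.ones((len(toy_annos_attr), max_attr_per_ins), dtype=torch.int64)
for idx, anno in enumerate(toy_annos_attr):
    for jdx, attr_id in enumerate(anno.get("attribute_ids", [])):
        attributes[idx, jdx] = attr_id
# attributes[0, :3] == tensor([3, 7, -1]); attributes[1] is all -1
# ---------------------------------------------------------------------------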
def _convert_xywha_to_xyxy(self, x): return BoxMode.convert(x, BoxMode.XYWHA_ABS, BoxMode.XYXY_ABS)
def evaluate_box_proposal(predictions, coco_api, thresholds=None, aspect_ratio_range=None, limit=None, oriented=False): gt_overlaps = [] num_pos = 0 for prediction_dict in tqdm.tqdm(predictions): image_id = prediction_dict["image_id"] predictions = prediction_dict["instances"] predict_boxes = [ BoxMode.convert(prediction['bbox'], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) for prediction in predictions ] predict_boxes = torch.as_tensor(predict_boxes).reshape(-1, 4) predict_boxes = Boxes(predict_boxes) ann_ids = coco_api.getAnnIds(imgIds=image_id) anno = coco_api.loadAnns(ann_ids) anno = [obj for obj in anno if obj["iscrowd"] == 0] gt_boxes = [ BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) for obj in anno ] if oriented: gt_aspect_ratios = [] for obj in anno: if obj["iscrowd"]: gt_aspect_ratios.append(obj["bbox"][2] / obj["bbox"][3]) else: segmentations = PolygonMasks([obj["segmentation"]]) ratios = segmentations.get_ratios(oriented=True) gt_aspect_ratios += ratios else: gt_aspect_ratios = [ obj["bbox"][2] / obj["bbox"][3] # w / h ==> aspect ratio for obj in anno ] gt_boxes = torch.as_tensor(gt_boxes).reshape( -1, 4) # guard against no boxes gt_boxes = Boxes(gt_boxes) gt_aspect_ratios = torch.as_tensor(gt_aspect_ratios) if len(gt_boxes) == 0 or len(predictions) == 0: continue valid_gt_inds = (gt_aspect_ratios >= aspect_ratio_range[0]) & \ (gt_aspect_ratios <= aspect_ratio_range[1]) gt_boxes = gt_boxes[valid_gt_inds] if len(gt_boxes) == 0: continue num_pos += len(gt_boxes) if limit is not None and len(predictions) > limit: predict_boxes = predict_boxes[:limit] overlaps = pairwise_iou(predict_boxes, gt_boxes) _gt_overlaps = torch.zeros(len(gt_boxes)) for j in range(min(len(predictions), len(gt_boxes))): # find which proposal box maximally covers each gt box # and get the iou amount of coverage for each gt box max_overlaps, argmax_overlaps = overlaps.max(dim=0) # find which gt box is 'best' covered (i.e. 'best' = most iou) gt_ovr, gt_ind = max_overlaps.max(dim=0) assert gt_ovr >= 0 # find the proposal box that covers the best covered gt box box_ind = argmax_overlaps[gt_ind] # record the iou coverage of this gt box _gt_overlaps[j] = overlaps[box_ind, gt_ind] assert _gt_overlaps[j] == gt_ovr # mark the proposal box and the gt box as used overlaps[box_ind, :] = -1 overlaps[:, gt_ind] = -1 # append recorded iou coverage level gt_overlaps.append(_gt_overlaps) gt_overlaps = (torch.cat(gt_overlaps, dim=0) if len(gt_overlaps) else torch.zeros(0, dtype=torch.float32)) gt_overlaps, _ = torch.sort(gt_overlaps) if thresholds is None: step = 0.05 thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32) recalls = torch.zeros_like(thresholds) # compute recall for each iou threshold for i, t in enumerate(thresholds): recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos) # ar = 2 * np.trapz(recalls, thresholds) ar = recalls.mean() return { "ar": ar, "recalls": recalls, "thresholds": thresholds, "gt_overlaps": gt_overlaps, "num_pos": num_pos, }
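# ---------------------------------------------------------------------------
# Illustrative sketch (made-up numbers): how the recall / AR values returned by
# evaluate_box_proposal() fall out of the sorted per-GT best-IoU values.
import torch

toy_gt_overlaps = torch.tensor([0.30, 0.55, 0.72, 0.91])   # best IoU reached for each GT box
toy_num_pos = len(toy_gt_overlaps)
toy_thresholds = torch.arange(0.5, 0.95 + 1e-5, 0.05)
toy_recalls = torch.tensor(
    [(toy_gt_overlaps >= t).float().sum() / toy_num_pos for t in toy_thresholds]
)
toy_ar = toy_recalls.mean()   # e.g. recall@0.5 = 3/4, recall@0.95 = 0/4, AR is their average
# ---------------------------------------------------------------------------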
def draw_dataset_dict(self, dic, assigned_colors=None):
    """
    Draw annotations/segmentations in Detectron2 Dataset format.

    Args:
        dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format.

    Returns:
        output (VisImage): image object with visualizations.
    """
    annos = dic.get("annotations", None)
    if annos:
        if "segmentation" in annos[0]:
            masks = [x["segmentation"] for x in annos]
        else:
            masks = None
        if "keypoints" in annos[0]:
            keypts = [x["keypoints"] for x in annos]
            keypts = np.array(keypts).reshape(len(annos), -1, 3)
        else:
            keypts = None
        boxes = [BoxMode.convert(x["bbox"], x["bbox_mode"], BoxMode.XYXY_ABS) for x in annos]
        labels = [x["category_id"] for x in annos]
        colors = assigned_colors
        if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"):
            colors = [
                self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in labels
            ]
        names = self.metadata.get("thing_classes", None)
        if names:
            labels = [names[i] for i in labels]
        labels = [
            "{}".format(i) + ("|crowd" if a.get("iscrowd", 0) else "")
            for i, a in zip(labels, annos)
        ]
        self.overlay_instances(
            labels=labels, boxes=boxes, masks=masks, keypoints=keypts, assigned_colors=colors
        )

    sem_seg = dic.get("sem_seg", None)
    if sem_seg is None and "sem_seg_file_name" in dic:
        with PathManager.open(dic["sem_seg_file_name"], "rb") as f:
            sem_seg = Image.open(f)
            sem_seg = np.asarray(sem_seg, dtype="uint8")
    if sem_seg is not None:
        self.draw_sem_seg(sem_seg, area_threshold=0, alpha=0.5)

    pan_seg = dic.get("pan_seg", None)
    if pan_seg is None and "pan_seg_file_name" in dic:
        assert "segments_info" in dic
        with PathManager.open(dic["pan_seg_file_name"], "rb") as f:
            pan_seg = Image.open(f)
            pan_seg = np.asarray(pan_seg)
        from panopticapi.utils import rgb2id

        pan_seg = rgb2id(pan_seg)
    if pan_seg is not None:
        segments_info = dic["segments_info"]
        pan_seg = torch.Tensor(pan_seg)
        self.draw_panoptic_seg_predictions(pan_seg, segments_info, area_threshold=0, alpha=0.5)
    return self.output
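# ---------------------------------------------------------------------------
# Hypothetical usage of draw_dataset_dict() above: visualize the ground-truth
# annotations of one dataset dict. The dataset name is a placeholder, and
# `Visualizer` stands in for whichever detectron2 Visualizer subclass actually
# defines the method above.
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.detection_utils import read_image

dicts = DatasetCatalog.get("coco_2017_val")          # placeholder dataset name
meta = MetadataCatalog.get("coco_2017_val")
img = read_image(dicts[0]["file_name"], format="BGR")
vis = Visualizer(img[:, :, ::-1], metadata=meta)      # BGR -> RGB for the visualizer
out = vis.draw_dataset_dict(dicts[0])
out.save("output/gt_vis.jpg")
# ---------------------------------------------------------------------------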
def _evaluate_predictions_ar(predictions, coco_api, metadata,
                             thresholds=None, aspect_ratios={}, areas={}, limit=None):
    cats = coco_api.cats.values()
    ratios = list(aspect_ratios.values())
    areas = list(areas.values())
    K = len(cats) + 1  # +1: the last index aggregates over all classes
    R = len(ratios)
    A = len(areas)  # area ranges
    counts_matrixes = []
    overlap_matrixes = []
    gt_overlaps = []
    for prediction_dict in predictions:
        count_matrix = torch.zeros((K, R, A), dtype=torch.int32)
        image_id = prediction_dict["image_id"]
        predictions = prediction_dict["instances"]
        predict_boxes = [
            BoxMode.convert(prediction['bbox'], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
            for prediction in predictions
        ]
        predict_classes = torch.tensor([
            prediction["category_id"] for prediction in predictions
        ])
        predict_boxes = torch.as_tensor(predict_boxes).reshape(-1, 4)
        predict_boxes = Boxes(predict_boxes)

        ann_ids = coco_api.getAnnIds(imgIds=image_id)
        anno = coco_api.loadAnns(ann_ids)
        anno = [obj for obj in anno if obj["iscrowd"] == 0]
        gt_boxes = [
            BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
            for obj in anno
        ]
        gt_classes = torch.tensor([
            metadata.thing_dataset_id_to_contiguous_id[obj["category_id"]]
            for obj in anno
        ])
        gt_aspect_ratios = [obj["ratio"] for obj in anno]
        gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4)  # guard against no boxes
        gt_boxes = Boxes(gt_boxes)
        gt_aspect_ratios = torch.as_tensor(gt_aspect_ratios)
        gt_areas = torch.as_tensor(
            [(box[2] - box[0]) * (box[3] - box[1]) for box in gt_boxes])

        if len(gt_boxes) == 0 or len(predictions) == 0:
            continue

        N = len(gt_boxes)
        overlap_matrix = torch.zeros((K, R, A, N), dtype=torch.float32)
        for i in range(len(gt_boxes)):
            k = gt_classes[i]
            r = between_ranges(gt_aspect_ratios[i], ratios)
            a = torch.tensor(between_ranges(gt_areas[i], areas)).nonzero()
            count_matrix[k, r, a] += 1
            count_matrix[-1, r, a] += 1

        if limit is not None and len(predictions) > limit:
            predict_boxes = predict_boxes[:limit]

        overlaps = pairwise_iou(predict_boxes, gt_boxes)
        class_matched = predict_classes[:, None] == gt_classes[None]
        overlaps_when_matched = overlaps * class_matched

        for j in range(min(len(predictions), len(gt_boxes))):
            # find which proposal box maximally covers each gt box
            # and get the iou amount of coverage for each gt box
            max_overlaps, argmax_overlaps = overlaps.max(dim=0)
            max_overlaps_m, argmax_overlaps_m = overlaps_when_matched.max(dim=0)
            # find which gt box is 'best' covered (i.e. 'best' = most iou)
            gt_ovr, gt_ind = max_overlaps.max(dim=0)
            gt_ovr_m, gt_ind_m = max_overlaps_m.max(dim=0)
            assert gt_ovr >= 0
            # find the proposal box that covers the best covered gt box
            box_ind = argmax_overlaps[gt_ind]
            box_ind_m = argmax_overlaps_m[gt_ind_m]
            # record the iou coverage of this gt box
            k = gt_classes[gt_ind_m]
            r = between_ranges(gt_aspect_ratios[gt_ind_m], ratios)
            a = torch.tensor(between_ranges(gt_areas[gt_ind_m], areas)).nonzero()
            n = (torch.arange(N) == j).nonzero()
            overlap_matrix[k, r, a, n] = overlaps_when_matched[box_ind_m, gt_ind_m]
            overlap_matrix[-1, r, a, n] = overlaps[box_ind, gt_ind]
            assert torch.all(overlap_matrix[k, r, a, n] == gt_ovr_m)
            assert torch.all(overlap_matrix[-1, r, a, n] == gt_ovr)
            # mark the proposal box and the gt box as used
            overlaps[box_ind, :] = -1
            overlaps[:, gt_ind] = -1
            overlaps_when_matched[box_ind_m, :] = -1
            overlaps_when_matched[:, gt_ind_m] = -1

        # append recorded iou coverage level
        overlap_matrixes.append(overlap_matrix)
        counts_matrixes.append(count_matrix)

    if thresholds is None:
        step = 0.05
        thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
    T = len(thresholds)
    recalls = torch.zeros((T, K, R, A))
    # compute recall for each iou threshold
    for i, t in enumerate(thresholds):
        count = torch.zeros((K, R, A))
        hit = torch.zeros((K, R, A))
        for count_matrix, overlap_matrix in zip(counts_matrixes, overlap_matrixes):
            hit_matrix = (overlap_matrix >= t).float().sum(-1)
            count += count_matrix
            hit += hit_matrix
        recalls[i] = hit / torch.max(count.float(), torch.tensor(1).float())
    # ar = 2 * np.trapz(recalls, thresholds)
    ar = recalls[:, -1, 0, 0].mean()
    mar = recalls[:, :-1, 0, 0].mean()
    return {
        "ar": ar,
        "mar": mar,
        "recalls": recalls,
        "thresholds": thresholds,
        "gt_overlaps": gt_overlaps,
        "num_pos": torch.stack(counts_matrixes).sum(0),
    }
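# ---------------------------------------------------------------------------
# between_ranges() is not defined in this file; the sketch below is only a
# guess at its contract, inferred from how _evaluate_predictions_ar() uses its
# result to index the (K, R, A) matrices: one boolean per candidate range,
# True where the scalar value falls inside that range.
def between_ranges_sketch(value, ranges):
    """Hypothetical helper: `ranges` is a list of (low, high) pairs."""
    return [low <= value < high for (low, high) in ranges]

# e.g. between_ranges_sketch(0.4, [(0.0, 0.5), (0.5, 1.0), (0.0, 1e9)]) -> [True, False, True]
# ---------------------------------------------------------------------------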
def process(self, inputs, outputs): """ Args: inputs: the inputs to a model (e.g., GeneralizedRCNN). It is a list of dict. Each dict corresponds to an image and contains keys like "height", "width", "file_name", "image_id". outputs: the outputs of a model. It is a list of dicts with key "instances" that contains :class:`Instances`. """ for input, output in zip(inputs, outputs): prediction = {"0": {}, "1": {}} tmp_instances = {"0": {}, "1": {}} for i in range(2): # TODO this is ugly prediction[str(i)]["image_id"] = input[str(i)]["image_id"] prediction[str(i)]["file_name"] = input[str(i)]["file_name"] if "instances" in output[str(i)]: instances = output[str(i)]["instances"].to( self._cpu_device) prediction[str(i)]["instances"] = instances_to_coco_json( instances, input[str(i)]["image_id"]) tmp_instances[str(i)]["embeddingbox"] = { "pred_boxes": instances.pred_boxes, "scores": instances.scores, } if "proposals" in output[str(i)]: prediction[str(i)]["proposals"] = output[str( i)]["proposals"].to(self._cpu_device) if "annotations" in input[str(i)]: tmp_instances[str(i)]["gt_bbox"] = [ ann["bbox"] for ann in input[str(i)]["annotations"] ] if len(input[str(i)]["annotations"]) > 0: tmp_instances[str(i)]["gt_bbox"] = np.array( tmp_instances[str(i)]["gt_bbox"]).reshape( -1, 4) # xywh from coco original_mode = input[str( i)]["annotations"][0]["bbox_mode"] tmp_instances[str(i)]["gt_bbox"] = BoxMode.convert( tmp_instances[str(i)]["gt_bbox"], BoxMode(original_mode), BoxMode.XYXY_ABS, ) if hasattr(output[str(i)]["instances"], "pred_plane"): prediction[str(i)]["pred_plane"] = output[str( i)]["instances"].pred_plane.to( self._cpu_device) if output["depth"][str(i)] is not None: prediction[str(i)]["pred_depth"] = output["depth"][str( i)].to(self._cpu_device) xyz = self.depth2XYZ(output["depth"][str(i)]) prediction[str(i)] = self.override_offset( xyz, prediction[str(i)], output[str(i)]) depth_rst = get_depth_err( output["depth"][str(i)], input[str(i)]["depth"].to(self._device)) prediction[str(i)]["depth_l1_dist"] = depth_rst.to( self._cpu_device) if "pred_aff" in output: tmp_instances["pred_aff"] = output["pred_aff"].to( self._cpu_device) if "geo_aff" in output: tmp_instances["geo_aff"] = output["geo_aff"].to( self._cpu_device) if "emb_aff" in output: tmp_instances["emb_aff"] = output["emb_aff"].to( self._cpu_device) if "gt_corrs" in input: tmp_instances["gt_corrs"] = input["gt_corrs"] prediction["corrs"] = tmp_instances if "embedding" in self._plane_tasks: if self._eval_gt_box: aff_rst = get_affinity_label_score( tmp_instances, filter_iou=self._filter_iou, filter_score=self._filter_score, device=self._device, ) else: aff_rst = get_affinity_label_score( tmp_instances, hungarian_threshold=[], filter_iou=self._filter_iou, filter_score=self._filter_score, device=self._device, ) prediction.update(aff_rst) if "camera" in self._plane_tasks: camera_dict = { "logits": { "tran": output["camera"]["tran"].to(self._cpu_device), "rot": output["camera"]["rot"].to(self._cpu_device), }, "gts": { "tran": input["rel_pose"]["position"], "rot": input["rel_pose"]["rotation"], "tran_cls": input["rel_pose"]["tran_cls"], "rot_cls": input["rel_pose"]["rot_cls"], }, } prediction["camera"] = camera_dict self._predictions.append(prediction)
def amodal_instances_to_coco_json(instances, img_id, type="amodal"):
    """
    Dump an "Instances" object to a COCO-format json that's used for evaluation.

    Args:
        instances (Instances):
        img_id (int): the image id

    Returns:
        (list[dict], list[dict]): amodal and visible json annotations in COCO format.
    """
    num_instance = len(instances)
    if num_instance == 0:
        return [], []

    boxes = instances.pred_boxes.tensor.numpy()
    boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
    boxes = boxes.tolist()
    scores = instances.scores.tolist()
    classes = instances.pred_classes.tolist()

    # use RLE to encode the masks, because they are too large and take too much
    # memory, since this evaluator stores outputs of the entire dataset
    if type == "amodal":
        amodal_rles = [
            mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
            for mask in instances.pred_amodal_masks
        ]
        visible_rles = [
            mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
            for mask in instances.pred_visible_masks
        ]
        area = [
            (torch.sum(amodal_mask * visible_mask).float() / torch.sum(amodal_mask).float()).item()
            for amodal_mask, visible_mask in zip(instances.pred_amodal_masks, instances.pred_visible_masks)
        ]
    elif type == "amodal2":
        amodal_rles = [
            mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
            for mask in instances.pred_amodal2_masks
        ]
        visible_rles = [
            mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
            for mask in instances.pred_visible2_masks
        ]
        area = [
            (torch.sum(amodal_mask * visible_mask).float() / torch.sum(amodal_mask).float()).item()
            for amodal_mask, visible_mask in zip(instances.pred_amodal2_masks, instances.pred_visible2_masks)
        ]
    elif type == "amodal_ensemble":
        amodal_rles = [
            mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
            for mask in instances.pred_amodal_ensemble_masks
        ]
        visible_rles = [
            mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
            for mask in instances.pred_visible_ensemble_masks
        ]
        area = [
            (torch.sum(amodal_mask * visible_mask).float() / torch.sum(amodal_mask).float()).item()
            for amodal_mask, visible_mask in zip(instances.pred_amodal_ensemble_masks, instances.pred_visible_ensemble_masks)
        ]
    else:
        raise ValueError("type == {} is not available".format(type))

    for amodal_rle, visible_rle in zip(amodal_rles, visible_rles):
        # "counts" is an array encoded by mask_util as a byte-stream. Python3's
        # json writer, which always produces strings, cannot serialize a bytestream
        # unless you decode it. Thankfully, utf-8 works out (which is also what
        # the pycocotools/_mask.pyx does).
        amodal_rle["counts"] = amodal_rle["counts"].decode("utf-8")
        visible_rle["counts"] = visible_rle["counts"].decode("utf-8")

    amodal_results = []
    visible_results = []
    for k in range(num_instance):
        result = {
            "image_id": img_id,
            "category_id": classes[k],
            "bbox": boxes[k],
            "score": scores[k],
            "segmentation": amodal_rles[k],
            "area": area[k],
        }
        amodal_results.append(result)
    for k in range(num_instance):
        result = {
            "image_id": img_id,
            "category_id": classes[k],
            "bbox": boxes[k],
            "score": scores[k],
            "segmentation": visible_rles[k],
            "area": area[k],
        }
        visible_results.append(result)
    return amodal_results, visible_results