def test_yuv_color_transforms(self):
    default_cfg = Detectron2GoRunner().get_default_cfg()
    img = np.concatenate(
        [
            np.random.uniform(0, 1, size=(80, 60, 1)),
            np.random.uniform(-0.5, 0.5, size=(80, 60, 1)),
            np.random.uniform(-0.5, 0.5, size=(80, 60, 1)),
        ],
        axis=2,
    )

    default_cfg.D2GO_DATA.AUG_OPS.TRAIN = [
        'RandomContrastYUVOp::{"intensity_min": 0.3, "intensity_max": 0.5}',
    ]
    low_contrast_tfm = build_transform_gen(default_cfg, is_train=True)
    low_contrast, _ = apply_augmentations(low_contrast_tfm, img)

    default_cfg.D2GO_DATA.AUG_OPS.TRAIN = [
        'RandomSaturationYUVOp::{"intensity_min": 1.5, "intensity_max": 1.7}',
    ]
    high_saturation_tfm = build_transform_gen(default_cfg, is_train=True)
    high_saturation, _ = apply_augmentations(high_saturation_tfm, img)

    # Use pixel statistics to roughly check transformed images as expected
    # All channels have less variance
    self.assertLess(np.var(low_contrast[:, :, 0]), np.var(img[:, :, 0]))
    self.assertLess(np.var(low_contrast[:, :, 1]), np.var(img[:, :, 1]))
    self.assertLess(np.var(low_contrast[:, :, 2]), np.var(img[:, :, 2]))

    # 1st channel is unchanged (test w/ mean, var), 2nd + 3rd channels more variance
    self.assertAlmostEqual(np.mean(high_saturation[:, :, 0]), np.mean(img[:, :, 0]))
    self.assertAlmostEqual(np.var(high_saturation[:, :, 0]), np.var(img[:, :, 0]))
    self.assertGreater(np.var(high_saturation[:, :, 1]), np.var(img[:, :, 1]))
    self.assertGreater(np.var(high_saturation[:, :, 2]), np.var(img[:, :, 2]))
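# Added note (illustrative, not part of the test suite): the saturation assertions
# above rely on the op scaling only the chroma (U/V) channels -- an assumption
# about RandomSaturationYUVOp, not something shown here. Scaling a channel by a
# factor s multiplies its variance by s**2 while the Y channel stays untouched:
import numpy as np

uv = np.random.uniform(-0.5, 0.5, size=(80, 60))
s = 1.6  # a factor inside the [1.5, 1.7] intensity range used above
assert np.isclose(np.var(s * uv), s ** 2 * np.var(uv))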
def test_apply_rotated_boxes_unequal_scaling_factor(self):
    np.random.seed(125)
    h, w = 400, 200
    newh, neww = 800, 800
    image = np.random.rand(h, w)
    augs = []
    augs.append(T.Resize(shape=(newh, neww)))
    image, transforms = T.apply_augmentations(augs, image)
    image_shape = image.shape[:2]  # h, w
    assert image_shape == (newh, neww)

    boxes = np.array(
        [
            [150, 100, 40, 20, 0],
            [150, 100, 40, 20, 30],
            [150, 100, 40, 20, 90],
            [150, 100, 40, 20, -90],
        ],
        dtype=np.float64,
    )
    transformed_boxes = transforms.apply_rotated_box(boxes)

    expected_bboxes = np.array(
        [
            [600, 200, 160, 40, 0],
            [600, 200, 144.22205102, 52.91502622, 49.10660535],
            [600, 200, 80, 80, 90],
            [600, 200, 80, 80, -90],
        ],
        dtype=np.float64,
    )
    err_msg = "transformed_boxes = {}, expected {}".format(transformed_boxes, expected_bboxes)
    assert np.allclose(transformed_boxes, expected_bboxes), err_msg
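# Added sketch (illustrative, not part of the test suite): the second expected row
# above can be reproduced with the usual rotated-box rule for unequal scale factors,
# assuming this is what `apply_rotated_box` implements for a plain resize.
import numpy as np


def _scaled_rotated_box(cx, cy, w, h, angle_deg, sx, sy):
    """Scale a rotated box (cx, cy, w, h, angle in degrees) by factors (sx, sy)."""
    a = np.deg2rad(angle_deg)
    c, s = np.cos(a), np.sin(a)
    return [
        sx * cx,
        sy * cy,
        w * np.hypot(sx * c, sy * s),
        h * np.hypot(sy * c, sx * s),
        np.rad2deg(np.arctan2(sx * s, sy * c)),
    ]


# _scaled_rotated_box(150, 100, 40, 20, 30, sx=800 / 200, sy=800 / 400)
# -> [600.0, 200.0, 144.22205102, 52.91502622, 49.10660535]  (matches the second row)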
def __call__(self, dataset_dict):
    """
    Args:
        dataset_dict (dict): a dict in standard model input format. See tutorials for details.

    Returns:
        list[dict]:
            a list of dicts, which contain augmented versions of the input image.
            The total number of dicts is ``len(min_sizes) * (2 if flip else 1)``.
            Each dict has field "transforms" which is a TransformList,
            containing the transforms that are used to generate this image.
    """
    numpy_image = dataset_dict["image"].permute(1, 2, 0).numpy()
    shape = numpy_image.shape
    orig_shape = (dataset_dict["height"], dataset_dict["width"])
    if shape[:2] != orig_shape:
        # It transforms the "original" image in the dataset to the input image
        pre_tfm = ResizeTransform(orig_shape[0], orig_shape[1], shape[0], shape[1])
    else:
        pre_tfm = NoOpTransform()

    # Create all combinations of augmentations to use
    aug_candidates = []  # each element is a list[Augmentation]
    for min_size in self.min_sizes:
        resize = ResizeShortestEdge(min_size, self.max_size)
        aug_candidates.append([resize])  # resize only
        if self.flip:
            flip = RandomFlip(prob=1.0)
            aug_candidates.append([resize, flip])  # resize + flip

    # Apply all the augmentations
    ret = []
    for aug in aug_candidates:
        new_image, tfms = apply_augmentations(aug, np.copy(numpy_image))
        torch_image = torch.from_numpy(np.ascontiguousarray(new_image.transpose(2, 0, 1)))

        dic = copy.deepcopy(dataset_dict)
        dic["transforms"] = pre_tfm + tfms
        dic["image"] = torch_image
        if self.proposal_topk is not None:
            image_shape = new_image.shape[:2]  # h, w
            transform_proposals(dic, image_shape, tfms, proposal_topk=self.proposal_topk)
        ret.append(dic)
    return ret
def test_apply_rotated_boxes(self):
    np.random.seed(125)
    cfg = get_cfg()
    is_train = True
    augs = detection_utils.build_augmentation(cfg, is_train)
    image = np.random.rand(200, 300)
    image, transforms = T.apply_augmentations(augs, image)
    image_shape = image.shape[:2]  # h, w
    assert image_shape == (800, 1200)

    annotation = {"bbox": [179, 97, 62, 40, -56]}
    boxes = np.array([annotation["bbox"]], dtype=np.float64)  # boxes.shape = (1, 5)
    transformed_bbox = transforms.apply_rotated_box(boxes)[0]

    expected_bbox = np.array([484, 388, 248, 160, 56], dtype=np.float64)
    err_msg = "transformed_bbox = {}, expected {}".format(transformed_bbox, expected_bbox)
    assert np.allclose(transformed_bbox, expected_bbox), err_msg
def test_resize_and_crop(self):
    np.random.seed(125)
    min_scale = 0.2
    max_scale = 2.0
    target_height = 1100
    target_width = 1000
    resize_aug = T.ResizeScale(min_scale, max_scale, target_height, target_width)
    fixed_size_crop_aug = T.FixedSizeCrop((target_height, target_width))
    hflip_aug = T.RandomFlip()
    augs = [resize_aug, fixed_size_crop_aug, hflip_aug]
    original_image = np.random.rand(900, 800)
    image, transforms = T.apply_augmentations(augs, original_image)
    image_shape = image.shape[:2]  # h, w
    self.assertEqual((1100, 1000), image_shape)

    boxes = np.array(
        [[91, 46, 144, 111], [523, 251, 614, 295]],
        dtype=np.float64,
    )
    transformed_bboxs = transforms.apply_box(boxes)
    expected_bboxs = np.array(
        [
            [895.42, 33.42666667, 933.91125, 80.66],
            [554.0825, 182.39333333, 620.17125, 214.36666667],
        ],
        dtype=np.float64,
    )
    err_msg = "transformed_bbox = {}, expected {}".format(transformed_bboxs, expected_bboxs)
    self.assertTrue(np.allclose(transformed_bboxs, expected_bboxs), err_msg)

    polygon = np.array([[91, 46], [144, 46], [144, 111], [91, 111]])
    transformed_polygons = transforms.apply_polygons([polygon])
    expected_polygon = np.array([[934.0, 33.0], [934.0, 80.0], [896.0, 80.0], [896.0, 33.0]])
    self.assertEqual(1, len(transformed_polygons))
    err_msg = "transformed_polygon = {}, expected {}".format(
        transformed_polygons[0], expected_polygon
    )
    self.assertTrue(polygon_allclose(transformed_polygons[0], expected_polygon), err_msg)
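# Added sketch (hypothetical): `polygon_allclose` is assumed to be a small helper
# defined elsewhere in this test module; it is needed because the expected polygon
# above lists the same vertices in a different order. A minimal order-insensitive
# vertex match that would satisfy the assertion could look like this:
import numpy as np


def polygon_allclose(poly1, poly2, atol=1e-6):
    """True if the two polygons have the same vertices, regardless of ordering."""
    p1 = np.asarray(poly1, dtype=np.float64)
    p2 = np.asarray(poly2, dtype=np.float64)
    if p1.shape != p2.shape:
        return False
    # distance from every vertex of p1 to every vertex of p2
    dists = np.linalg.norm(p1[:, None, :] - p2[None, :, :], axis=-1)
    return bool(np.all(dists.min(axis=1) < atol))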
def read_data(self, dataset_dict):
    """Load image and annos; random shift & scale bbox; crop, rescale."""
    cfg = self.cfg
    r_head_cfg = cfg.MODEL.CDPN.ROT_HEAD
    pnp_net_cfg = cfg.MODEL.CDPN.PNP_NET

    dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
    dataset_name = dataset_dict["dataset_name"]

    image = read_image_cv2(dataset_dict["file_name"], format=self.img_format)
    # should be consistent with the size in dataset_dict
    utils.check_image_size(dataset_dict, image)
    im_H_ori, im_W_ori = image.shape[:2]

    # currently only replace bg for train ###############################
    if self.split == "train":
        # some synthetic data already has bg, img_type should be real or something else but not syn
        img_type = dataset_dict.get("img_type", "real")
        if img_type == "syn":
            log_first_n(logging.WARNING, "replace bg", n=10)
            assert "segmentation" in dataset_dict["inst_infos"]
            mask = cocosegm2mask(dataset_dict["inst_infos"]["segmentation"], im_H_ori, im_W_ori)
            image, mask_trunc = self.replace_bg(image.copy(), mask, return_mask=True)
        else:  # real image
            if np.random.rand() < cfg.INPUT.CHANGE_BG_PROB:
                log_first_n(logging.WARNING, "replace bg for real", n=10)
                assert "segmentation" in dataset_dict["inst_infos"]
                mask = cocosegm2mask(dataset_dict["inst_infos"]["segmentation"], im_H_ori, im_W_ori)
                image, mask_trunc = self.replace_bg(image.copy(), mask, return_mask=True)
            else:
                mask_trunc = None

    # NOTE: maybe add or change color augment here ===================================
    if self.split == "train" and self.color_aug_prob > 0 and self.color_augmentor is not None:
        if np.random.rand() < self.color_aug_prob:
            if cfg.INPUT.COLOR_AUG_SYN_ONLY and img_type not in ["real"]:
                image = self._color_aug(image, self.color_aug_type)
            else:
                image = self._color_aug(image, self.color_aug_type)

    # other transforms (mainly geometric ones);
    # for 6d pose task, flip is not allowed in general except for some 2d keypoints methods
    image, transforms = T.apply_augmentations(self.augmentation, image)
    im_H, im_W = image_shape = image.shape[:2]  # h, w

    # NOTE: scale camera intrinsics if necessary ================================
    scale_x = im_W / im_W_ori
    scale_y = im_H / im_H_ori  # NOTE: generally scale_x should be equal to scale_y
    if "cam" in dataset_dict:
        if im_W != im_W_ori or im_H != im_H_ori:
            dataset_dict["cam"][0] *= scale_x
            dataset_dict["cam"][1] *= scale_y
        K = dataset_dict["cam"].astype("float32")
        dataset_dict["cam"] = torch.as_tensor(K)

    input_res = cfg.MODEL.CDPN.BACKBONE.INPUT_RES
    out_res = cfg.MODEL.CDPN.BACKBONE.OUTPUT_RES

    # CHW -> HWC
    coord_2d = get_2d_coord_np(im_W, im_H, low=0, high=1).transpose(1, 2, 0)

    #################################################################################
    if self.split != "train":
        # don't load annotations at test time
        test_bbox_type = cfg.TEST.TEST_BBOX_TYPE
        if test_bbox_type == "gt":
            bbox_key = "bbox"
        else:
            bbox_key = f"bbox_{test_bbox_type}"
        assert not self.flatten, "Do not use flattened dicts for test!"
        # here get batched rois
        roi_infos = {}
        # yapf: disable
        roi_keys = ["scene_im_id", "file_name", "cam", "im_H", "im_W",
                    "roi_img", "inst_id", "roi_coord_2d", "roi_cls", "score", "roi_extent",
                    bbox_key, "bbox_mode", "bbox_center", "roi_wh",
                    "scale", "resize_ratio", "model_info",
                    ]
        for _key in roi_keys:
            roi_infos[_key] = []
        # yapf: enable
        # TODO: how to handle image without detections
        #   filter those when load annotations or detections, implement a function for this
        # "annotations" means detections
        for inst_i, inst_infos in enumerate(dataset_dict["annotations"]):
            # inherent image-level infos
            roi_infos["scene_im_id"].append(dataset_dict["scene_im_id"])
            roi_infos["file_name"].append(dataset_dict["file_name"])
            roi_infos["im_H"].append(im_H)
            roi_infos["im_W"].append(im_W)
            roi_infos["cam"].append(dataset_dict["cam"].cpu().numpy())

            # roi-level infos
            roi_infos["inst_id"].append(inst_i)
            roi_infos["model_info"].append(inst_infos["model_info"])

            roi_cls = inst_infos["category_id"]
            roi_infos["roi_cls"].append(roi_cls)
            roi_infos["score"].append(inst_infos["score"])

            # extent
            roi_extent = self._get_extents(dataset_name)[roi_cls]
            roi_infos["roi_extent"].append(roi_extent)

            bbox = BoxMode.convert(inst_infos[bbox_key], inst_infos["bbox_mode"], BoxMode.XYXY_ABS)
            bbox = np.array(transforms.apply_box([bbox])[0])
            roi_infos[bbox_key].append(bbox)
            roi_infos["bbox_mode"].append(BoxMode.XYXY_ABS)
            x1, y1, x2, y2 = bbox
            bbox_center = np.array([0.5 * (x1 + x2), 0.5 * (y1 + y2)])
            bw = max(x2 - x1, 1)
            bh = max(y2 - y1, 1)
            scale = max(bh, bw) * cfg.INPUT.DZI_PAD_SCALE
            scale = min(scale, max(im_H, im_W)) * 1.0

            roi_infos["bbox_center"].append(bbox_center.astype("float32"))
            roi_infos["scale"].append(scale)
            roi_infos["roi_wh"].append(np.array([bw, bh], dtype=np.float32))
            roi_infos["resize_ratio"].append(out_res / scale)

            # CHW, float32 tensor
            # roi_image
            roi_img = crop_resize_by_warp_affine(
                image, bbox_center, scale, input_res, interpolation=cv2.INTER_LINEAR
            ).transpose(2, 0, 1)
            roi_img = self.normalize_image(cfg, roi_img)
            roi_infos["roi_img"].append(roi_img.astype("float32"))

            # roi_coord_2d
            roi_coord_2d = crop_resize_by_warp_affine(
                coord_2d, bbox_center, scale, out_res, interpolation=cv2.INTER_LINEAR
            ).transpose(2, 0, 1)  # HWC -> CHW
            roi_infos["roi_coord_2d"].append(roi_coord_2d.astype("float32"))

        for _key in roi_keys:
            if _key in ["roi_img", "roi_coord_2d"]:
                dataset_dict[_key] = torch.as_tensor(roi_infos[_key]).contiguous()
            elif _key in ["model_info", "scene_im_id", "file_name"]:
                # can not convert to tensor
                dataset_dict[_key] = roi_infos[_key]
            else:
                dataset_dict[_key] = torch.tensor(roi_infos[_key])

        return dataset_dict

    #######################################################################################
    # NOTE: currently assume flattened dicts for train
    assert self.flatten, "Only support flattened dicts for train now"
    inst_infos = dataset_dict.pop("inst_infos")
    dataset_dict["roi_cls"] = roi_cls = inst_infos["category_id"]

    # extent
    roi_extent = self._get_extents(dataset_name)[roi_cls]
    dataset_dict["roi_extent"] = torch.tensor(roi_extent, dtype=torch.float32)

    # load xyz =======================================================
    xyz_info = mmcv.load(inst_infos["xyz_path"])
    x1, y1, x2, y2 = xyz_info["xyxy"]
    # float16 does not affect performance (classification/regression)
    xyz_crop = xyz_info["xyz_crop"]
    xyz = np.zeros((im_H, im_W, 3), dtype=np.float32)
    xyz[y1:y2 + 1, x1:x2 + 1, :] = xyz_crop
    # NOTE: full mask
    mask_obj = ((xyz[:, :, 0] != 0) | (xyz[:, :, 1] != 0) | (xyz[:, :, 2] != 0)).astype(bool).astype(np.float32)
    if cfg.INPUT.SMOOTH_XYZ:
        xyz = self.smooth_xyz(xyz)

    if cfg.TRAIN.VIS:
        xyz = self.smooth_xyz(xyz)

    # override bbox info using xyz_infos
    inst_infos["bbox"] = [x1, y1, x2, y2]
    inst_infos["bbox_mode"] = BoxMode.XYXY_ABS

    # USER: Implement additional transformations if you have other types of data
    # inst_infos.pop("segmentation")  # NOTE: use mask from xyz
    anno = transform_instance_annotations(inst_infos, transforms, image_shape, keypoint_hflip_indices=None)

    # augment bbox ===================================================
    bbox_xyxy = anno["bbox"]
    bbox_center, scale = self.aug_bbox(cfg, bbox_xyxy, im_H, im_W)
    bw = max(bbox_xyxy[2] - bbox_xyxy[0], 1)
    bh = max(bbox_xyxy[3] - bbox_xyxy[1], 1)

    # CHW, float32 tensor
    ## roi_image ------------------------------------
    roi_img = crop_resize_by_warp_affine(
        image, bbox_center, scale, input_res, interpolation=cv2.INTER_LINEAR
    ).transpose(2, 0, 1)
    roi_img = self.normalize_image(cfg, roi_img)

    # roi_coord_2d ----------------------------------------------------
    roi_coord_2d = crop_resize_by_warp_affine(
        coord_2d, bbox_center, scale, out_res, interpolation=cv2.INTER_LINEAR
    ).transpose(2, 0, 1)

    ## roi_mask ---------------------------------------
    # (mask_trunc < mask_visib < mask_obj)
    mask_visib = anno["segmentation"].astype("float32") * mask_obj
    if mask_trunc is None:
        mask_trunc = mask_visib
    else:
        mask_trunc = mask_visib * mask_trunc.astype("float32")

    if cfg.TRAIN.VIS:
        mask_xyz_interp = cv2.INTER_LINEAR
    else:
        mask_xyz_interp = cv2.INTER_NEAREST

    # maybe truncated mask (true mask for rgb)
    roi_mask_trunc = crop_resize_by_warp_affine(
        mask_trunc[:, :, None], bbox_center, scale, out_res, interpolation=mask_xyz_interp
    )

    # use original visible mask to calculate xyz loss (try full obj mask?)
    roi_mask_visib = crop_resize_by_warp_affine(
        mask_visib[:, :, None], bbox_center, scale, out_res, interpolation=mask_xyz_interp
    )

    roi_mask_obj = crop_resize_by_warp_affine(
        mask_obj[:, :, None], bbox_center, scale, out_res, interpolation=mask_xyz_interp
    )

    ## roi_xyz ----------------------------------------------------
    roi_xyz = crop_resize_by_warp_affine(xyz, bbox_center, scale, out_res, interpolation=mask_xyz_interp)

    # region label
    if r_head_cfg.NUM_REGIONS > 1:
        fps_points = self._get_fps_points(dataset_name)[roi_cls]
        roi_region = xyz_to_region(roi_xyz, fps_points)  # HW
        dataset_dict["roi_region"] = torch.as_tensor(roi_region.astype(np.int32)).contiguous()

    roi_xyz = roi_xyz.transpose(2, 0, 1)  # HWC --> CHW
    # normalize xyz to [0, 1] using extent
    roi_xyz[0] = roi_xyz[0] / roi_extent[0] + 0.5
    roi_xyz[1] = roi_xyz[1] / roi_extent[1] + 0.5
    roi_xyz[2] = roi_xyz[2] / roi_extent[2] + 0.5

    if ("CE" in r_head_cfg.XYZ_LOSS_TYPE) or ("cls" in cfg.MODEL.CDPN.NAME):  # convert target to int for cls
        # assume roi_xyz has been normalized in [0, 1]
        roi_xyz_bin = np.zeros_like(roi_xyz)
        roi_x_norm = roi_xyz[0]
        roi_x_norm[roi_x_norm < 0] = 0  # clip
        roi_x_norm[roi_x_norm > 0.999999] = 0.999999
        # [0, BIN-1]
        roi_xyz_bin[0] = np.asarray(roi_x_norm * r_head_cfg.XYZ_BIN, dtype=np.uint8)
        roi_y_norm = roi_xyz[1]
        roi_y_norm[roi_y_norm < 0] = 0
        roi_y_norm[roi_y_norm > 0.999999] = 0.999999
        roi_xyz_bin[1] = np.asarray(roi_y_norm * r_head_cfg.XYZ_BIN, dtype=np.uint8)
        roi_z_norm = roi_xyz[2]
        roi_z_norm[roi_z_norm < 0] = 0
        roi_z_norm[roi_z_norm > 0.999999] = 0.999999
        roi_xyz_bin[2] = np.asarray(roi_z_norm * r_head_cfg.XYZ_BIN, dtype=np.uint8)

        # the last bin is for bg
        roi_masks = {"trunc": roi_mask_trunc, "visib": roi_mask_visib, "obj": roi_mask_obj}
        roi_mask_xyz = roi_masks[r_head_cfg.XYZ_LOSS_MASK_GT]
        roi_xyz_bin[0][roi_mask_xyz == 0] = r_head_cfg.XYZ_BIN
        roi_xyz_bin[1][roi_mask_xyz == 0] = r_head_cfg.XYZ_BIN
        roi_xyz_bin[2][roi_mask_xyz == 0] = r_head_cfg.XYZ_BIN

        if "CE" in r_head_cfg.XYZ_LOSS_TYPE:
            dataset_dict["roi_xyz_bin"] = torch.as_tensor(roi_xyz_bin.astype("uint8")).contiguous()
        if "/" in r_head_cfg.XYZ_LOSS_TYPE and len(r_head_cfg.XYZ_LOSS_TYPE.split("/")[1]) > 0:
            dataset_dict["roi_xyz"] = torch.as_tensor(roi_xyz.astype("float32")).contiguous()
    else:
        dataset_dict["roi_xyz"] = torch.as_tensor(roi_xyz.astype("float32")).contiguous()

    # pose targets ----------------------------------------------------------------------
    pose = inst_infos["pose"]
    allo_pose = egocentric_to_allocentric(pose)
    quat = inst_infos["quat"]
    allo_quat = mat2quat(allo_pose[:3, :3])

    # ====== actually not needed ==========
    if pnp_net_cfg.ROT_TYPE == "allo_quat":
        dataset_dict["allo_quat"] = torch.as_tensor(allo_quat.astype("float32"))
    elif pnp_net_cfg.ROT_TYPE == "ego_quat":
        dataset_dict["ego_quat"] = torch.as_tensor(quat.astype("float32"))
    # rot6d
    elif pnp_net_cfg.ROT_TYPE == "ego_rot6d":
        dataset_dict["ego_rot6d"] = torch.as_tensor(mat_to_ortho6d_np(pose[:3, :3].astype("float32")))
    elif pnp_net_cfg.ROT_TYPE == "allo_rot6d":
        dataset_dict["allo_rot6d"] = torch.as_tensor(mat_to_ortho6d_np(allo_pose[:3, :3].astype("float32")))
    # log quat
    elif pnp_net_cfg.ROT_TYPE == "ego_log_quat":
        dataset_dict["ego_log_quat"] = quaternion_lf.qlog(torch.as_tensor(quat.astype("float32"))[None])[0]
    elif pnp_net_cfg.ROT_TYPE == "allo_log_quat":
        dataset_dict["allo_log_quat"] = quaternion_lf.qlog(torch.as_tensor(allo_quat.astype("float32"))[None])[0]
    # lie vec
    elif pnp_net_cfg.ROT_TYPE == "ego_lie_vec":
        dataset_dict["ego_lie_vec"] = lie_algebra.rot_to_lie_vec(torch.as_tensor(pose[:3, :3].astype("float32")[None]))[0]
    elif pnp_net_cfg.ROT_TYPE == "allo_lie_vec":
        dataset_dict["allo_lie_vec"] = lie_algebra.rot_to_lie_vec(torch.as_tensor(allo_pose[:3, :3].astype("float32"))[None])[0]
    else:
        raise ValueError(f"Unknown rot type: {pnp_net_cfg.ROT_TYPE}")

    dataset_dict["ego_rot"] = torch.as_tensor(pose[:3, :3].astype("float32"))
    dataset_dict["trans"] = torch.as_tensor(inst_infos["trans"].astype("float32"))

    dataset_dict["roi_points"] = torch.as_tensor(self._get_model_points(dataset_name)[roi_cls].astype("float32"))
    dataset_dict["sym_info"] = self._get_sym_infos(dataset_name)[roi_cls]

    dataset_dict["roi_img"] = torch.as_tensor(roi_img.astype("float32")).contiguous()
    dataset_dict["roi_coord_2d"] = torch.as_tensor(roi_coord_2d.astype("float32")).contiguous()

    dataset_dict["roi_mask_trunc"] = torch.as_tensor(roi_mask_trunc.astype("float32")).contiguous()
    dataset_dict["roi_mask_visib"] = torch.as_tensor(roi_mask_visib.astype("float32")).contiguous()
    dataset_dict["roi_mask_obj"] = torch.as_tensor(roi_mask_obj.astype("float32")).contiguous()

    dataset_dict["bbox_center"] = torch.as_tensor(bbox_center, dtype=torch.float32)
    dataset_dict["scale"] = scale
    dataset_dict["bbox"] = anno["bbox"]  # NOTE: original bbox
    dataset_dict["roi_wh"] = torch.as_tensor(np.array([bw, bh], dtype=np.float32))
    dataset_dict["resize_ratio"] = resize_ratio = out_res / scale
    z_ratio = inst_infos["trans"][2] / resize_ratio
    obj_center = anno["centroid_2d"]
    delta_c = obj_center - bbox_center
    dataset_dict["trans_ratio"] = torch.as_tensor([delta_c[0] / bw, delta_c[1] / bh, z_ratio]).to(torch.float32)
    return dataset_dict
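# Added illustration (not part of the mapper): how the XYZ binning above turns a
# normalized coordinate into class labels. XYZ_BIN = 64 is a hypothetical value;
# values in [0, 1) map to bins 0..XYZ_BIN-1 and background pixels get bin XYZ_BIN.
import numpy as np

xyz_bin = 64                                    # stand-in for r_head_cfg.XYZ_BIN
x_norm = np.array([0.0, 0.25, 0.999999, 1.2])   # coordinate already normalized by the extent
x_norm = np.clip(x_norm, 0, 0.999999)           # same clipping as in read_data above
labels = (x_norm * xyz_bin).astype(np.uint8)    # -> [0, 16, 63, 63]; bg pixels would get 64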
def __call__(self, dataset_dict):
    """
    Args:
        dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

    Returns:
        dict: a format that builtin models in detectron2 accept
    """
    dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
    # USER: Write your own image loading if it's not from a file
    try:
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
    except Exception as e:
        print(dataset_dict["file_name"])
        print(e)
        raise e
    try:
        utils.check_image_size(dataset_dict, image)
    except SizeMismatchError as e:
        expected_wh = (dataset_dict["width"], dataset_dict["height"])
        image_wh = (image.shape[1], image.shape[0])
        if (image_wh[1], image_wh[0]) == expected_wh:
            print("transposing image {}".format(dataset_dict["file_name"]))
            image = image.transpose(1, 0, 2)
        else:
            raise e

    if "annotations" not in dataset_dict or len(dataset_dict["annotations"]) == 0:
        image, transforms = T.apply_augmentations(
            ([self.crop] if self.crop else []) + self.augmentation, image
        )
    else:
        # Crop around an instance if there are instances in the image.
        # USER: Remove if you don't use cropping
        if self.crop:
            crop_tfm = gen_crop_transform_with_instance(
                self.crop.get_crop_size(image.shape[:2]),
                image.shape[:2],
                dataset_dict["annotations"],
                crop_box=self.crop_box,
            )
            image = crop_tfm.apply_image(image)
        try:
            image, transforms = T.apply_augmentations(self.augmentation, image)
        except ValueError as e:
            print(dataset_dict["file_name"])
            raise e
        if self.crop:
            transforms = crop_tfm + transforms

    image_shape = image.shape[:2]  # h, w

    # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
    # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
    # Therefore it's important to use torch.Tensor.
    dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))

    # USER: Remove if you don't use pre-computed proposals.
    # Most users would not need this feature.
    if self.load_proposals:
        utils.transform_proposals(
            dataset_dict,
            image_shape,
            transforms,
            proposal_topk=self.proposal_topk,
            min_box_size=self.proposal_min_box_size,
        )

    if not self.is_train:
        dataset_dict.pop("annotations", None)
        dataset_dict.pop("sem_seg_file_name", None)
        dataset_dict.pop("pano_seg_file_name", None)
        return dataset_dict

    if "annotations" in dataset_dict:
        # USER: Modify this if you want to keep them for some reason.
        for anno in dataset_dict["annotations"]:
            if not self.mask_on:
                anno.pop("segmentation", None)
            if not self.keypoint_on:
                anno.pop("keypoints", None)

        # USER: Implement additional transformations if you have other types of data
        annos = [
            transform_instance_annotations(
                obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices
            )
            for obj in dataset_dict.pop("annotations")
            if obj.get("iscrowd", 0) == 0
        ]
        instances = annotations_to_instances(annos, image_shape, mask_format=self.mask_format)

        # After transforms such as cropping are applied, the bounding box may no longer
        # tightly bound the object. As an example, imagine a triangle object
        # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight
        # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to
        # the intersection of original bounding box and the cropping box.
        if self.crop and instances.has("gt_masks"):
            instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
        dataset_dict["instances"] = utils.filter_empty_instances(instances)

    # USER: Remove if you don't do semantic/panoptic segmentation.
if "sem_seg_file_name" in dataset_dict: sem_seg_gt = utils.read_image( dataset_dict.pop("sem_seg_file_name"), "L").squeeze(2) sem_seg_gt = transforms.apply_segmentation(sem_seg_gt) sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) dataset_dict["sem_seg"] = sem_seg_gt if self.basis_loss_on and self.is_train: # load basis supervisions if self.ann_set == "coco": basis_sem_path = dataset_dict["file_name"].replace( 'train2017', 'thing_train2017').replace('image/train', 'thing_train') else: basis_sem_path = dataset_dict["file_name"].replace( 'coco', 'lvis').replace('train2017', 'thing_train') # change extension to npz basis_sem_path = osp.splitext(basis_sem_path)[0] + ".npz" basis_sem_gt = np.load(basis_sem_path)["mask"] basis_sem_gt = transforms.apply_segmentation(basis_sem_gt) basis_sem_gt = torch.as_tensor(basis_sem_gt.astype("long")) dataset_dict["basis_sem"] = basis_sem_gt return dataset_dict