def preprocess_training_image(self, batched_inputs):
    images = [x["image"].to(self.device) for x in batched_inputs]
    images_norm = [self.normalizer(x) for x in images]
    images_norm = ImageList.from_tensors(images_norm,
                                         self.backbone.size_divisibility)

    # get shape information
    unpadded_im_shape = [x.shape[1] for x in images]
    ori_im_shape = [x["height"] for x in batched_inputs]

    # build padded_mask to ignore the bottom edge and the extra padded area
    padded_mask = [
        images[0].new_ones(x.shape[1], x.shape[2], dtype=torch.float)
        for x in images
    ]

    # pad the images, downsample them and convert them to LAB color space
    images = ImageList.from_tensors(images, self.backbone.size_divisibility)
    downsampled_images = F.avg_pool2d(images.tensor.float(),
                                      kernel_size=self.mask_out_stride,
                                      stride=self.mask_out_stride,
                                      padding=0)
    lab_images = torch.stack(
        [self.bgr_to_lab(x) for x in downsampled_images])

    # Mask out the bottom area where the COCO dataset probably has wrong annotations.
    # This trick is adopted from AdelaiDet's BoxInst codebase.
    # In fact, it has no influence on the final result.
    for i in range(len(padded_mask)):
        pixels_removed = int(self.bottom_pixels_removed *
                             unpadded_im_shape[i] / ori_im_shape[i])
        if pixels_removed > 0:
            padded_mask[i][-pixels_removed:, :] = 0
    padded_mask = ImageList.from_tensors(padded_mask,
                                         self.backbone.size_divisibility)
    padded_mask = padded_mask.tensor.unsqueeze(1)  # B,H,W -> B,1,H,W

    return images_norm, lab_images, padded_mask
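# Hedged usage sketch for the method above. The concrete sizes are assumptions
# (mask_out_stride=4, size_divisibility=32, a batch of 800x1216 BGR images);
# only the shape relations are implied by the code itself:
#
#   images_norm, lab_images, padded_mask = self.preprocess_training_image(batched_inputs)
#   # images_norm.tensor : (B, 3, 800, 1216)  normalized, padded batch
#   # lab_images         : LAB images at 1/mask_out_stride resolution (here 200x304)
#   # padded_mask        : (B, 1, 800, 1216)  1 inside the image, 0 in the padding
#   #                      and in the removed bottom strip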
def test_imagelist_padding_shape(self):
    ret = ImageList.from_tensors(
        [torch.ones((3, 15, 20), dtype=torch.float32)], 4).tensor
    self.assertEqual(list(ret.shape), [1, 3, 16, 20], str(ret.shape))

    ret = ImageList.from_tensors([
        torch.ones((3, 25, 20), dtype=torch.float32),
        torch.ones((3, 10, 10), dtype=torch.float32),
    ], 4).tensor
    self.assertEqual(list(ret.shape), [2, 3, 28, 20], str(ret.shape))
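# A hedged helper illustrating the padding rule the test above relies on:
# ImageList.from_tensors pads every image to the per-batch maximum H and W,
# each rounded up to the nearest multiple of size_divisibility. This is a
# sketch of that arithmetic, not detectron2's implementation.
import math


def expected_padded_hw(shapes, size_divisibility):
    """shapes: list of (H, W) tuples; returns the padded (H, W) of the batch."""
    max_h = max(h for h, _ in shapes)
    max_w = max(w for _, w in shapes)
    round_up = lambda v: int(math.ceil(v / size_divisibility)) * size_divisibility
    return round_up(max_h), round_up(max_w)


# expected_padded_hw([(15, 20)], 4)           -> (16, 20)
# expected_padded_hw([(25, 20), (10, 10)], 4) -> (28, 20)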
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * "image": Tensor, image in (C, H, W) format.
            * "sem_seg": semantic segmentation ground truth
            * Other information that's included in the original dicts, such as:
              "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        list[dict]: Each dict is the output for one input image. The dict
            contains one key "sem_seg" whose value is a Tensor of the output
            resolution that represents the per-pixel segmentation prediction.
    """
    images = [x["image"].to(self.device) for x in batched_inputs]
    images = [self.normalizer(x) for x in images]
    images = ImageList.from_tensors(images, self.backbone.size_divisibility)

    features = self.backbone(images.tensor)

    if "sem_seg" in batched_inputs[0]:
        targets = [x["sem_seg"].to(self.device) for x in batched_inputs]
        targets = ImageList.from_tensors(
            targets, self.backbone.size_divisibility,
            pad_value=self.sem_seg_head.ignore_value).tensor
    else:
        targets = None

    if self.training:
        _, losses = self.sem_seg_head(features, targets)
        return losses
    else:
        results, _ = self.sem_seg_head(features, images.tensor)
        processed_results = []
        for result, input_per_image, image_size in zip(
                results, batched_inputs, images.image_sizes):
            height = input_per_image.get("height")
            width = input_per_image.get("width")
            r = sem_seg_postprocess(result, image_size, height, width)
            processed_results.append({"sem_seg": r})
        return processed_results
def convert_batched_inputs_to_c2_format(batched_inputs, size_divisibility,
                                        device):
    """
    See get_caffe2_inputs() below.
    """
    assert all(isinstance(x, dict) for x in batched_inputs)
    assert all(x["image"].dim() == 3 for x in batched_inputs)

    images = [x["image"] for x in batched_inputs]
    images = ImageList.from_tensors(images, size_divisibility)

    im_info = []
    for input_per_image, image_size in zip(batched_inputs,
                                           images.image_sizes):
        target_height = input_per_image.get("height", image_size[0])
        target_width = input_per_image.get("width", image_size[1])  # noqa
        # NOTE: The scale inside im_info is kept as convention and for providing
        # post-processing information if further processing is needed. For
        # current Caffe2 model definitions that don't include post-processing
        # inside the model, this number is not used.
        # NOTE: There can be a slight difference between width and height
        # scales; using a single number can result in a numerical difference
        # compared with D2's post-processing.
        scale = target_height / image_size[0]
        im_info.append([image_size[0], image_size[1], scale])
    im_info = torch.Tensor(im_info)

    return images.tensor.to(device), im_info.to(device)
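# Hedged usage sketch (the tensors and sizes below are made-up illustrations,
# not values from any real dataset):
#
#   batched_inputs = [
#       {"image": torch.zeros(3, 480, 640), "height": 960, "width": 1280},
#       {"image": torch.zeros(3, 400, 600), "height": 800, "width": 1200},
#   ]
#   data, im_info = convert_batched_inputs_to_c2_format(
#       batched_inputs, size_divisibility=32, device="cpu")
#   # data    : (2, 3, 480, 640), both images padded to the /32-aligned max size
#   # im_info : (2, 3), each row is [unpadded_h, unpadded_w, target_h / unpadded_h],
#   #           i.e. [[480, 640, 2.0], [400, 600, 2.0]] here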
def preprocess_image(self, batched_inputs):
    """
    Normalize, pad and batch the input images.
    """
    images = [x["image"].float().to(self.device) for x in batched_inputs]
    images = [self.normalizer(x.div(255)) for x in images]
    images = ImageList.from_tensors(images, self.network.size_divisibility)
    return images
def preprocess_image(self, batched_inputs):
    """
    Normalize, pad and batch the input images.
    """
    images = [x["image"].to(self.device) for x in batched_inputs]
    images = [(x - self.pixel_mean) / self.pixel_std for x in images]
    images = ImageList.from_tensors(images, self.backbone.size_divisibility)
    return images
def test_rpn_proposals_inf(self):
    N, Hi, Wi, A = 3, 3, 3, 3
    images = ImageList.from_tensors(
        [torch.rand(3, 20, 30) for i in range(N)])
    proposals = [torch.rand(N, Hi * Wi * A, 4)]
    pred_logits = [torch.rand(N, Hi * Wi * A)]
    pred_logits[0][1][3:5].fill_(float("inf"))
    find_top_rpn_proposals(proposals, pred_logits, images, 0.5, "normal",
                           1000, 1000, 0, False)
def preprocess_image(self, batched_inputs):
    """
    Normalize, pad and batch the input images.
    """
    images = [x['image'].to(self.device) for x in batched_inputs]
    images = [self.normalizer(x) for x in images]
    images = ImageList.from_tensors(images, self.backbone.size_divisibility)
    return images
def forward(self, batched_inputs):
    """
    Args:
        Same as in :class:`GeneralizedRCNN.forward`

    Returns:
        list[dict]: Each dict is the output for one input image. The dict
            contains one key "proposals" whose value is a :class:`Instances`
            with keys "proposal_boxes" and "objectness_logits".
    """
    images = [x["image"].to(self.device) for x in batched_inputs]
    images = [self.normalizer(x) for x in images]
    images = ImageList.from_tensors(images, self.backbone.size_divisibility)
    features = self.backbone(images.tensor)

    if "instances" in batched_inputs[0]:
        gt_instances = [
            x["instances"].to(self.device) for x in batched_inputs
        ]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [
            x["targets"].to(self.device) for x in batched_inputs
        ]
    else:
        gt_instances = None

    proposals, proposal_losses = self.proposal_generator(
        images, features, gt_instances)

    # In training, the proposals are not useful at all but we generate them anyway.
    # This makes RPN-only models about 5% slower.
    if self.training:
        return proposal_losses

    processed_results = []
    for results_per_image, input_per_image, image_size in zip(
            proposals, batched_inputs, images.image_sizes):
        height = input_per_image.get("height", image_size[0])
        width = input_per_image.get("width", image_size[1])
        r = detector_postprocess(results_per_image, height, width)
        processed_results.append({"proposals": r})
    return processed_results
def preprocess_image(self, batched_inputs):
    """
    Normalize, pad and batch the input images.
    """
    images = [x["image"].float().to(self.device) for x in batched_inputs]
    images = [self.normalizer(img) for img in images]
    images = ImageList.from_tensors(images, self.backbone.size_divisibility)

    images_whwh = list()
    for bi in batched_inputs:
        h, w = bi["image"].shape[-2:]
        images_whwh.append(
            torch.tensor([w, h, w, h], dtype=torch.float32,
                         device=self.device))
    images_whwh = torch.stack(images_whwh)

    return images, images_whwh
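# Hedged note on images_whwh: it is commonly used to map box predictions given
# in normalized [0, 1] coordinates back to the absolute pixel coordinates of
# each input image. The line below is an illustrative sketch, not code from
# this model's head:
#
#   # boxes_norm: (B, N, 4) in [0, 1]; images_whwh: (B, 4) as (w, h, w, h)
#   boxes_abs = boxes_norm * images_whwh[:, None, :]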
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            image: Tensor, image in (C, H, W) format.
            sem_seg: semantic segmentation ground truth
            Other information that's included in the original dicts, such as:
            "height", "width" (int): the output resolution of the model,
            used in inference. See :meth:`postprocess` for details.

    Returns:
        list[dict]: Each dict is the output for one input image. The dict
            contains one key "sem_seg" whose value is a Tensor of the output
            resolution that represents the per-pixel segmentation prediction.
    """
    images = [x["image"].to(self.device) for x in batched_inputs]
    images = [self.normalizer(x) for x in images]
    images = ImageList.from_tensors(images, self.backbone.size_divisibility)

    # step_rate: a float, calculated by current_step / total_step.
    # This parameter is used for Scheduled Drop Path.
    step_rate = self.iter * 1.0 / self.max_iter
    self.iter += 1

    features, expt_flops, real_flops = self.backbone(
        images.tensor, step_rate)

    if "sem_seg" in batched_inputs[0]:
        targets = [x["sem_seg"].to(self.device) for x in batched_inputs]
        targets = ImageList.from_tensors(
            targets, self.backbone.size_divisibility, False,
            self.sem_seg_head.ignore_value).tensor
    else:
        targets = None

    results, losses = self.sem_seg_head(features, targets)

    # calculate flops
    real_flops += self.sem_seg_head.flops
    # detach the flops, to avoid adding them to the loss sum via gradients
    real_flops = real_flops.detach().requires_grad_(False)
    expt_flops = expt_flops.detach().requires_grad_(False)
    flops = {'real_flops': real_flops, 'expt_flops': expt_flops}

    # use the budget constraint for training
    if self.training:
        if self.constrain_on and step_rate >= self.unupdate_rate:
            warm_up_rate = min(1.0, (step_rate - self.unupdate_rate) / 0.02)
            loss_budget = self.budget_constrint(expt_flops,
                                                warm_up_rate=warm_up_rate)
            losses.update({'loss_budget': loss_budget})
        losses.update(flops)
        return losses

    processed_results = []
    for result, input_per_image, image_size in zip(results, batched_inputs,
                                                   images.image_sizes):
        height = input_per_image.get("height")
        width = input_per_image.get("width")
        r = sem_seg_postprocess(result, image_size, height, width)
        processed_results.append({"sem_seg": r, "flops": flops})
    return processed_results
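# Worked example of the warm-up schedule above (numbers chosen for illustration):
# with unupdate_rate = 0.5, the budget loss is disabled for the first half of
# training; at step_rate = 0.51, warm_up_rate = min(1.0, (0.51 - 0.5) / 0.02) = 0.5,
# and from step_rate >= 0.52 onward it stays at 1.0.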
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * "image": Tensor, image in (C, H, W) format.
            * "instances": Instances
            * "sem_seg": semantic segmentation ground truth.
            * Other information that's included in the original dicts, such as:
              "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        list[dict]: each dict is the results for one image. The dict contains
            the following keys:

            * "instances": see :meth:`GeneralizedRCNN.forward` for its format.
            * "sem_seg": see :meth:`SemanticSegmentor.forward` for its format.
            * "panoptic_seg": available when `PANOPTIC_FPN.COMBINE.ENABLED`.
              See the return value of
              :func:`combine_semantic_and_instance_outputs` for its format.
    """
    images = [x["image"].to(self.device) for x in batched_inputs]
    images = [self.normalizer(x) for x in images]
    images = ImageList.from_tensors(images, self.backbone.size_divisibility)
    features = self.backbone(images.tensor)

    if "proposals" in batched_inputs[0]:
        proposals = [
            x["proposals"].to(self.device) for x in batched_inputs
        ]
        proposal_losses = {}

    if "sem_seg" in batched_inputs[0]:
        gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs]
        gt_sem_seg = ImageList.from_tensors(
            gt_sem_seg, self.backbone.size_divisibility,
            pad_value=self.sem_seg_head.ignore_value).tensor
    else:
        gt_sem_seg = None
    sem_seg_results, sem_seg_losses = self.sem_seg_head(
        features, gt_sem_seg)

    if "instances" in batched_inputs[0]:
        gt_instances = [
            x["instances"].to(self.device) for x in batched_inputs
        ]
    else:
        gt_instances = None
    if self.proposal_generator:
        proposals, proposal_losses = self.proposal_generator(
            images, features, gt_instances)
    detector_results, detector_losses = self.roi_heads(
        images, features, proposals, gt_instances)

    if self.training:
        losses = {}
        losses.update(sem_seg_losses)
        losses.update({
            k: v * self.instance_loss_weight
            for k, v in detector_losses.items()
        })
        losses.update(proposal_losses)
        return losses

    processed_results = []
    for sem_seg_result, detector_result, input_per_image, image_size in zip(
            sem_seg_results, detector_results, batched_inputs,
            images.image_sizes):
        height = input_per_image.get("height", image_size[0])
        width = input_per_image.get("width", image_size[1])
        sem_seg_r = sem_seg_postprocess(sem_seg_result, image_size, height,
                                        width)
        detector_r = detector_postprocess(detector_result, height, width)

        processed_results.append({
            "sem_seg": sem_seg_r,
            "instances": detector_r
        })

        if self.combine_on:
            panoptic_r = combine_semantic_and_instance_outputs(
                detector_r,
                sem_seg_r.argmax(dim=0),
                self.combine_overlap_threshold,
                self.combine_stuff_area_limit,
                self.combine_instances_confidence_threshold,
            )
            processed_results[-1]["panoptic_seg"] = panoptic_r
    return processed_results
def preprocess_image(self, batched_inputs, training):
    """
    Normalize, pad and batch the input images.
    """
    images = [x["image"].to(self.device) for x in batched_inputs]
    bs = len(images)
    images = [self.normalizer(x) for x in images]

    images = ImageList.from_tensors(images,
                                    size_divisibility=0,
                                    pad_ref_long=True)

    # sync image size for all gpus
    comm.synchronize()
    if training and self.iter % self.change_iter == 0:
        if self.iter < self.max_iter - 20000:
            meg = torch.LongTensor(1).to(self.device)
            comm.synchronize()
            if comm.is_main_process():
                size = np.random.choice(self.multi_size)
                meg.fill_(size)

            if comm.get_world_size() > 1:
                comm.synchronize()
                dist.broadcast(meg, 0)
            self.size = meg.item()
            comm.synchronize()
        else:
            self.size = 608

    if training:
        # resize image inputs
        modes = ['bilinear', 'nearest', 'bicubic', 'area']
        mode = modes[random.randrange(4)]
        if mode == 'bilinear' or mode == 'bicubic':
            images.tensor = F.interpolate(images.tensor,
                                          size=[self.size, self.size],
                                          mode=mode,
                                          align_corners=False)
        else:
            images.tensor = F.interpolate(images.tensor,
                                          size=[self.size, self.size],
                                          mode=mode)

        if "instances" in batched_inputs[0]:
            gt_instances = [
                x["instances"].to(self.device) for x in batched_inputs
            ]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                logging.WARN,
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10)
            gt_instances = [
                x["targets"].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None

        targets = [
            torch.cat([
                instance.gt_classes.float().unsqueeze(-1),
                instance.gt_boxes.tensor
            ], dim=-1) for instance in gt_instances
        ]
        labels = torch.zeros((bs, 100, 5))
        for i, target in enumerate(targets):
            labels[i][:target.shape[0]] = target
        labels[:, :, 1:] = labels[:, :, 1:] / 512. * self.size
    else:
        labels = None

    self.iter += 1
    return images, labels
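# Hedged sketch of the label layout produced above (assuming the dataset mapper
# letter-boxes images to 512, which is what the "/ 512." rescaling implies):
# each image gets a fixed (100, 5) tensor of [class, x1, y1, x2, y2] rows;
# unused rows stay zero and box coordinates are rescaled from the 512-sized
# space to the randomly chosen square size self.size.
#
#   images, labels = self.preprocess_image(batched_inputs, training=True)
#   # labels.shape == (len(batched_inputs), 100, 5)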