def _postprocess(self, instances, proposals, batched_inputs, image_sizes):
    """
    Rescale the output instances to the target size.
    """
    # note: private function; subject to changes
    processed_results = []
    for results_per_image, proposal_per_image, input_per_image, image_size in zip(
        instances, proposals, batched_inputs, image_sizes
    ):
        height = input_per_image.get("height", image_size[0])
        width = input_per_image.get("width", image_size[1])
        r = detector_postprocess(results_per_image, height, width)
        processed_results.append(
            {"instances": r, "proposals": detector_postprocess(proposal_per_image, height, width)}
        )
    return processed_results
def __call__(self, inputs, tensor_inputs, tensor_outputs):
    results_per_image = self.outputs_schema(tensor_outputs)
    assert len(inputs) == 1, "only support single batch"
    width, height = inputs[0]["width"], inputs[0]["height"]
    r = detector_postprocess(results_per_image, height, width)
    return [{"instances": r}]
def visualize_training(self, batched_inputs, results):
    """
    A function used to visualize ground truth images and final network predictions.
    It shows ground truth bounding boxes on the original image and up to 20
    predicted object bounding boxes on the original image.

    Args:
        batched_inputs (list): a list that contains input to the model.
        results (List[Instances]): a list of #images elements.
    """
    from detectron2.utils.visualizer import Visualizer

    assert len(batched_inputs) == len(
        results
    ), "Cannot visualize inputs and results of different sizes"
    storage = get_event_storage()
    max_boxes = 20

    image_index = 0  # only visualize a single image
    img = batched_inputs[image_index]["image"]
    img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
    v_gt = Visualizer(img, None)
    v_gt = v_gt.overlay_instances(boxes=batched_inputs[image_index]["instances"].gt_boxes)
    anno_img = v_gt.get_image()
    processed_results = detector_postprocess(results[image_index], img.shape[0], img.shape[1])
    predicted_boxes = processed_results.pred_boxes.tensor.detach().cpu().numpy()

    v_pred = Visualizer(img, None)
    v_pred = v_pred.overlay_instances(boxes=predicted_boxes[0:max_boxes])
    prop_img = v_pred.get_image()
    vis_img = np.vstack((anno_img, prop_img))
    vis_img = vis_img.transpose(2, 0, 1)
    vis_name = f"Top: GT bounding boxes; Bottom: {max_boxes} Highest Scoring Results"
    storage.put_image(vis_name, vis_img)
def postprocess(instances, batched_inputs):
    processed_results = []
    for results_per_image, input_per_image in zip(instances, batched_inputs):
        # rescale every image's predictions to a fixed 300x300 output resolution
        r = detector_postprocess(results_per_image.to('cpu'), 300, 300)
        processed_results.append({"instances": r})
    return processed_results
def f(batched_inputs, c2_inputs, c2_results):
    image_sizes = [[int(im[0]), int(im[1])] for im in c2_inputs["im_info"]]
    detector_results = assemble_rcnn_outputs_by_name(
        image_sizes, c2_results, force_mask_on=True
    )
    sem_seg_results = c2_results["sem_seg"]

    # copied from meta_arch/panoptic_fpn.py ...
    processed_results = []
    for sem_seg_result, detector_result, input_per_image, image_size in zip(
        sem_seg_results, detector_results, batched_inputs, image_sizes
    ):
        height = input_per_image.get("height", image_size[0])
        width = input_per_image.get("width", image_size[1])
        sem_seg_r = sem_seg_postprocess(sem_seg_result, image_size, height, width)
        detector_r = detector_postprocess(detector_result, height, width)
        processed_results.append({"sem_seg": sem_seg_r, "instances": detector_r})
        if combine_on:
            panoptic_r = combine_semantic_and_instance_outputs(
                detector_r,
                sem_seg_r.argmax(dim=0),
                combine_overlap_threshold,
                combine_stuff_area_limit,
                combine_instances_confidence_threshold,
            )
            processed_results[-1]["panoptic_seg"] = panoptic_r
    return processed_results
def postprocess(self, outputs, images, image_ids, to_cpu):
    frames = []
    for instances, image, image_id in zip(outputs, images, image_ids):
        height, width = image.shape[:2]
        instances = detector_postprocess(instances, height, width)
        type_valid = [
            self.model_meta.thing_classes[pred_class] in TYPE_MAPPING
            for pred_class in instances.pred_classes
        ]
        instances = instances[type_valid]
        instances.pred_classes = torch.as_tensor([
            TYPE_MAPPING[self.model_meta.thing_classes[pred_class]]
            for pred_class in instances.pred_classes
        ])
        if len(instances) > 0:
            nms_mapping = torch.as_tensor([
                NMS_MAPPING[pred_class.item()]
                for pred_class in instances.pred_classes
            ], dtype=torch.int, device=self.device)
            nms_types = nms_mapping[:, 0]
            nms_scores = instances.scores + nms_mapping[:, 1]
            keep_indices = batched_nms(
                instances.pred_boxes.tensor, nms_scores, nms_types, self.nms_threshold
            )
            instances = instances[keep_indices]
        features = instances.roi_features.mean(dim=(2, 3))
        features = features / features.norm(dim=1, keepdim=True)
        instances.roi_features = features
        if to_cpu:
            instances = instances.to('cpu')
        frame = Frame(image_id, image, instances)
        frames.append(frame)
    return frames
def assemble(self, batched_inputs, c2_inputs, c2_results):
    c2_results = {k: torch.Tensor(v) for k, v in c2_results.items()}
    image_sizes = [[int(im[0]), int(im[1])] for im in c2_inputs["im_info"]]

    num_features = len([x for x in c2_results.keys() if x.startswith("box_cls_")])
    box_cls = [c2_results["box_cls_{}".format(i)] for i in range(num_features)]
    box_delta = [c2_results["box_delta_{}".format(i)] for i in range(num_features)]

    # For each feature level, the feature should have the same batch size and
    # spatial dimension as box_cls and box_delta.
    dummy_features = [box_delta[i].clone()[:, 0:0, :, :] for i in range(num_features)]
    anchors = self.anchor_generator(dummy_features)

    # self.num_classes can be inferred from the output shapes
    self.num_classes = box_cls[0].shape[1] // (box_delta[0].shape[1] // 4)

    results = self.inference(box_cls, box_delta, anchors, image_sizes)
    processed_results = []
    for results_per_image, input_per_image, image_size in zip(
        results, batched_inputs, image_sizes
    ):
        height = input_per_image.get("height", image_size[0])
        width = input_per_image.get("width", image_size[1])
        r = detector_postprocess(results_per_image, height, width)
        processed_results.append({"instances": r})
    return processed_results
def det_inference(self, batched_inputs, detected_instances=None, do_postprocess=True):
    """
    Run inference on the given inputs.

    Args:
        batched_inputs (list[dict]): same as in :meth:`forward`
        detected_instances (None or list[Instances]): if not None, it
            contains an `Instances` object per image. The `Instances`
            object contains "pred_boxes" and "pred_classes" which are
            known boxes in the image.
            The inference will then skip the detection of bounding boxes,
            and only predict other per-ROI outputs.
        do_postprocess (bool): whether to apply post-processing on the outputs.

    Returns:
        same as in :meth:`forward`.
    """
    assert not self.training

    images = self.preprocess_image(batched_inputs)
    features = self.backbone(images.tensor)

    if detected_instances is None:
        if self.proposal_generator:
            proposals, _ = self.proposal_generator(images, features, None)
        else:
            assert "proposals" in batched_inputs[0]
            proposals = [x["proposals"].to(self.device) for x in batched_inputs]

        results, others = self.roi_heads(images, features, proposals, None)
        if isinstance(others, tuple):
            others, box_features = others
        else:
            box_features = None
    else:
        detected_instances = [x.to(self.device) for x in detected_instances]
        results = self.roi_heads.forward_with_given_boxes(features, detected_instances)
        box_features = None

    if do_postprocess:
        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
            results, batched_inputs, images.image_sizes
        ):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results, box_features
    else:
        return results, box_features
def run_detector(raw_image, predictor, num_objects=100, verbose=True):
    with torch.no_grad():
        raw_height, raw_width = raw_image.shape[:2]
        if verbose:
            tqdm.write("Original image size: " + str((raw_height, raw_width)))

        # Preprocessing
        image = predictor.transform_gen.get_transform(raw_image).apply_image(raw_image)
        if verbose:
            tqdm.write("Transformed image size: " + str(image.shape[:2]))
        image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
        inputs = [{"image": image, "height": raw_height, "width": raw_width}]
        images = predictor.model.preprocess_image(inputs)

        # Run Backbone Res1-Res4
        features = predictor.model.backbone(images.tensor)

        # Generate proposals with RPN
        proposals, _ = predictor.model.proposal_generator(images, features, None)
        proposal = proposals[0]
        if verbose:
            tqdm.write("Proposal Boxes size: " + str(proposal.proposal_boxes.tensor.shape))

        # Run RoI head for each proposal (RoI Pooling + Res5)
        proposal_boxes = [x.proposal_boxes for x in proposals]
        features = [features[f] for f in predictor.model.roi_heads.in_features]
        box_features = predictor.model.roi_heads._shared_roi_transform(
            features, proposal_boxes
        )
        feature_pooled = box_features.mean(dim=[2, 3])  # pooled to 1x1
        if verbose:
            tqdm.write("Pooled features size: " + str(feature_pooled.shape))

        # Predict classes and boxes for each proposal.
        pred_class_logits, pred_proposal_deltas = predictor.model.roi_heads.box_predictor(
            feature_pooled
        )
        outputs = FastRCNNOutputs(
            predictor.model.roi_heads.box2box_transform,
            pred_class_logits,
            pred_proposal_deltas,
            proposals,
            predictor.model.roi_heads.smooth_l1_beta,
        )
        probs = outputs.predict_probs()[0]
        boxes = outputs.predict_boxes()[0]

        # Note: BUTD uses raw RoI predictions,
        # we use the predicted boxes instead.
        # boxes = proposal_boxes[0].tensor

        # NMS
        for nms_thresh in np.arange(0.5, 1.0, 0.1):
            instances, ids = fast_rcnn_inference_single_image(
                boxes, probs, image.shape[1:],
                score_thresh=0.2, nms_thresh=nms_thresh, topk_per_image=num_objects
            )
            if len(ids) == num_objects:
                break

        instances = detector_postprocess(instances, raw_height, raw_width)
        roi_features = feature_pooled[ids].detach()
        if verbose:
            tqdm.write(str(instances))
        return instances, roi_features
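A hypothetical caller for `run_detector` above, assuming a `DefaultPredictor` built from a C4 Faster R-CNN config from the detectron2 model zoo (the config name, weights, and image path are placeholders, not from the source):

import cv2
from detectron2 import model_zoo
from detectron2.config import get_cfg
from detectron2.engine import DefaultPredictor

# Assumed setup: a C4 R-CNN whose roi_heads expose _shared_roi_transform, matching the
# attribute accesses in run_detector; all paths/names here are illustrative only.
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_50_C4_1x.yaml"))
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_R_50_C4_1x.yaml")
predictor = DefaultPredictor(cfg)

raw_image = cv2.imread("example.jpg")  # BGR HWC uint8, as DefaultPredictor expects
instances, roi_features = run_detector(raw_image, predictor, num_objects=36, verbose=False)
print(len(instances), roi_features.shape)  # up to 36 boxes, each with a pooled feature vector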
def eval_wrapper(inputs):
    """
    The exported model does not contain the final resize step, which is typically
    unused in deployment but needed for evaluation. We add it manually here.
    """
    input = inputs[0]
    instances = traceable_model.outputs_schema(ts_model(input["image"]))[0]["instances"]
    postprocessed = detector_postprocess(instances, input["height"], input["width"])
    return [{"instances": postprocessed}]
def forward(self, batched_inputs: Tuple[Dict[str, Tensor]]):
    """
    Args:
        batched_inputs (list): batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model, used
              in inference. See :meth:`postprocess` for details.

    Returns:
        dict[str: Tensor]:
    """
    images = self.preprocess_image(batched_inputs)
    features = self.backbone(images.tensor)[self.head_in_features]
    features = self.upsample(features)
    pred_dict = self.head(features)

    if self.training:
        assert not torch.jit.is_scripting(), "Not supported"
        assert "instances" in batched_inputs[0], "Instance annotations are missing in training!"
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
        gt_dict = self.gt_generator(gt_instances, images.tensor.shape)
        losses = self.losses(pred_dict, gt_dict)

        if self.vis_period > 0:
            storage = get_event_storage()
            if storage.iter % self.vis_period == 0:
                results = self.inference(
                    pred_dict['pred_hm'], pred_dict['pred_wh'],
                    pred_dict['pred_reg'], images.image_sizes
                )
                self.visualize_training(batched_inputs, results)
        return losses
    else:
        results = self.inference(
            pred_dict['pred_hm'], pred_dict['pred_wh'],
            pred_dict['pred_reg'], images.image_sizes
        )
        if torch.jit.is_scripting():
            return results
        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
            results, batched_inputs, images.image_sizes
        ):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results
def post_processing(self, batched_inputs, results, image_sizes):
    processed_results = []
    for results_per_image, input_per_image, image_size in zip(
        results, batched_inputs, image_sizes
    ):
        height = input_per_image.get("height", image_size[0])
        width = input_per_image.get("width", image_size[1])
        r = detector_postprocess(results_per_image, height, width)
        processed_results.append({"instances": r})
    return processed_results
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model, used
              in inference. See :meth:`postprocess` for details.

    Returns:
        dict[str: Tensor]:
            mapping from a named loss to a tensor storing the loss. Used during training only.
    """
    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
        )
        gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)
    features = [features[f] for f in self.in_features]
    box_cls, box_delta = self.head(features)
    anchors = self.anchor_generator(features)

    if self.training:
        gt_classes, gt_anchors_reg_deltas = self.get_ground_truth(anchors, gt_instances)
        losses = self.losses(gt_classes, gt_anchors_reg_deltas, box_cls, box_delta)

        if self.vis_period > 0:
            storage = get_event_storage()
            if storage.iter % self.vis_period == 0:
                results = self.inference(box_cls, box_delta, anchors, images.image_sizes)
                self.visualize_training(batched_inputs, results)

        return losses
    else:
        results = self.inference(box_cls, box_delta, anchors, images.image_sizes)
        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
            results, batched_inputs, images.image_sizes
        ):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model, used
              in inference. See :meth:`postprocess` for details.

    Returns:
        dict[str: Tensor]:
            mapping from a named loss to a tensor storing the loss. Used during training only.
    """
    images = self.preprocess_image(batched_inputs)
    features = self.backbone(images.tensor)
    features = [features[f] for f in self.in_features]

    anchors = self.anchor_generator(features)
    pred_logits, pred_anchor_deltas = self.head(features)
    # Transpose the Hi*Wi*A dimension to the middle:
    pred_logits = [permute_to_N_HWA_K(x, self.num_classes) for x in pred_logits]
    pred_anchor_deltas = [permute_to_N_HWA_K(x, 4) for x in pred_anchor_deltas]

    if self.training:
        assert "instances" in batched_inputs[0], "Instance annotations are missing in training!"
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]

        gt_labels, gt_boxes = self.label_anchors(anchors, gt_instances)
        losses = self.losses(anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes)

        if self.vis_period > 0:
            storage = get_event_storage()
            if storage.iter % self.vis_period == 0:
                results = self.inference(
                    anchors, pred_logits, pred_anchor_deltas, images.image_sizes
                )
                self.visualize_training(batched_inputs, results)

        return losses
    else:
        results = self.inference(anchors, pred_logits, pred_anchor_deltas, images.image_sizes)
        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
            results, batched_inputs, images.image_sizes
        ):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results
def convert_outputs(self, batched_inputs, inputs, results):
    results = self._wrapped_model.inference(results, inputs['im_info'])
    processed_results = []
    for results_per_image, input_per_image, image_size in zip(
        results, batched_inputs, inputs['im_info']
    ):
        original_height = input_per_image.get("height", image_size[0])
        original_width = input_per_image.get("width", image_size[1])
        r = detector_postprocess(results_per_image, original_height, original_width)
        processed_results.append({"instances": r})
    return processed_results
def forward(self, batched_inputs):
    """
    Args:
        Same as in :class:`GeneralizedRCNN.forward`

    Returns:
        list[dict]:
            Each dict is the output for one input image.
            The dict contains one key "proposals" whose value is a
            :class:`Instances` with keys "proposal_boxes" and "objectness_logits".
    """
    images = [x["image"].to(self.device) for x in batched_inputs]
    images = [(x - self.pixel_mean) / self.pixel_std for x in images]
    images = ImageList.from_tensors(images, self.backbone.size_divisibility)
    features = self.backbone(images.tensor)

    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
        )
        gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    masks = {
        key: ImageList.from_tensors(
            [x[key] for x in batched_inputs], self.backbone.size_divisibility
        )
        for key in self.masks
    }

    proposals, proposal_losses = self.proposal_generator(images, features, gt_instances, **masks)
    # In training, the proposals are not useful at all but we generate them anyway.
    # This makes RPN-only models about 5% slower.
    if self.training:
        return proposal_losses

    processed_results = []
    for results_per_image, input_per_image, image_size in zip(
        proposals, batched_inputs, images.image_sizes
    ):
        height = input_per_image.get("height", image_size[0])
        width = input_per_image.get("width", image_size[1])
        r = detector_postprocess(results_per_image, height, width)
        processed_results.append({"proposals": r})
    return processed_results
def __call__(self, tracing_adapter_wrapper, batch):
    """
    This function describes how to run the predictor using the exported model. Note
    that `tracing_adapter_wrapper` runs the traced model under the hood and behaves
    exactly the same as the forward function.
    """
    assert len(batch) == 1, "only support single batch"
    width, height = batch[0]["width"], batch[0]["height"]
    inputs = D2RCNNTracingWrapper.generator_trace_inputs(batch)
    results_per_image = tracing_adapter_wrapper(inputs)
    r = detector_postprocess(results_per_image, height, width)
    return [{"instances": r}]
def inference(self, batched_inputs):
    images = self.preprocess_image(batched_inputs)
    features = self.backbone(images.tensor)

    if self.proposal_generator:
        proposals, _ = self.proposal_generator(images, features)
    else:
        raise NotImplementedError

    detector_results, pan_detector_results = self.roi_heads(images, features, proposals)
    sem_seg_results, _ = self.sem_seg_head(features)
    pan_seg_results, _ = self.panoptic_head(None, sem_seg_results, pan_detector_results)

    processed_results = []
    for sem_seg_result, detector_result, pan_seg_result, input_per_image, image_size in zip(
        sem_seg_results, detector_results, pan_seg_results, batched_inputs, images.image_sizes
    ):
        processed_result = {}
        height = input_per_image.get("height")
        width = input_per_image.get("width")
        sem_seg_r = sem_seg_postprocess(sem_seg_result, image_size, height, width)
        detector_r = detector_postprocess(detector_result, height, width)
        processed_result.update({"sem_seg": sem_seg_r, "instances": detector_r})

        if self.combine_on:
            panoptic_r = combine_semantic_and_instance_outputs(
                detector_r,
                sem_seg_r.argmax(dim=0),
                self.combine_overlap_threshold,
                self.combine_stuff_area_limit,
                self.combine_instances_confidence_threshold,
            )
        else:
            pan_pred = sem_seg_postprocess(
                pan_seg_result["pan_logit"], image_size, height, width
            )
            del pan_seg_result["pan_logit"]
            pan_seg_result["pan_pred"] = pan_pred.argmax(dim=0)
            panoptic_r = pan_seg_postprocess(
                pan_seg_result, sem_seg_r.argmax(dim=0),
                self.stuff_num_classes, self.stuff_area_limit
            )
        processed_result.update({"panoptic_seg": panoptic_r})
        processed_results.append(processed_result)
    return processed_results
def forward(self, batched_inputs: Tuple[Dict[str, Tensor]]):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model, used
              in inference. See :meth:`postprocess` for details.

    Returns:
        in training, dict[str: Tensor]:
            mapping from a named loss to a tensor storing the loss. Used during training only.
        in inference, the standard output format, described in :doc:`/tutorials/models`.
    """
    num_images = len(batched_inputs)
    images = self.preprocess_image(batched_inputs)
    features = self.backbone(images.tensor)
    features = [features[self.backbone_level]]
    anchors_image = self.anchor_generator(features)
    anchors = [copy.deepcopy(anchors_image) for _ in range(num_images)]
    pred_logits, pred_anchor_deltas = self.decoder(self.encoder(features[0]))
    # Transpose the Hi*Wi*A dimension to the middle:
    pred_logits = [permute_to_N_HWA_K(pred_logits, self.num_classes)]
    pred_anchor_deltas = [permute_to_N_HWA_K(pred_anchor_deltas, 4)]

    results = self.inference(anchors_image, pred_logits, pred_anchor_deltas, images.image_sizes)
    if torch.jit.is_scripting():
        return results
    processed_results = []
    for results_per_image, input_per_image, image_size in zip(
        results, batched_inputs, images.image_sizes
    ):
        height = input_per_image.get("height", image_size[0])
        width = input_per_image.get("width", image_size[1])
        r = detector_postprocess(results_per_image, height, width)
        processed_results.append({"instances": r})
    return processed_results
def _postprocess(instances, batched_inputs: Tuple[Dict[str, torch.Tensor]], image_sizes):
    """
    Rescale the output instances to the target size.
    """
    # note: private function; subject to changes
    processed_results = []
    for results_per_image, input_per_image, image_size in zip(
        instances, batched_inputs, image_sizes
    ):
        height = input_per_image.get("height", image_size[0])
        width = input_per_image.get("width", image_size[1])
        r = detector_postprocess(results_per_image, height, width)
        processed_results.append({"instances": r})
    return processed_results
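Most of the snippets here funnel into the same detectron2 helper; below is a minimal, self-contained sketch of what `detector_postprocess` does to a single `Instances` object (the sizes and field values are made up for illustration):

import torch
from detectron2.structures import Boxes, Instances
from detectron2.modeling.postprocessing import detector_postprocess

# Predictions made in the resized/padded 800x1216 space used by the network...
results = Instances((800, 1216))
results.pred_boxes = Boxes(torch.tensor([[10.0, 20.0, 110.0, 220.0]]))
results.scores = torch.tensor([0.9])
results.pred_classes = torch.tensor([0])

# ...rescaled to the 480x640 resolution requested via the "height"/"width" input keys.
r = detector_postprocess(results, 480, 640)
print(r.image_size)         # (480, 640)
print(r.pred_boxes.tensor)  # boxes scaled by 640/1216 in x and 480/800 in y, then clipped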
def convert_outputs(self, batched_inputs, inputs, results):
    output_names = self.get_output_names()
    assert len(results) == len(output_names)
    results = self._ns.inference(results, inputs['im_info'])

    from detectron2.modeling.postprocessing import detector_postprocess

    processed_results = []
    for results_per_image, input_per_image, image_size in zip(
        results, batched_inputs, inputs['im_info']
    ):
        original_height = input_per_image.get("height", image_size[0])
        original_width = input_per_image.get("width", image_size[1])
        r = detector_postprocess(results_per_image, original_height, original_width)
        processed_results.append({"instances": r})
    return processed_results
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dict, such as:

            * "height", "width" (int): the output resolution of the model, used
              in inference. See :meth:`postprocess` for details.

    Returns:
        dict[str: Tensor]:
            mapping from a named loss to a tensor storing the loss. Used during training only.
            At inference stage, return predicted bboxes.
    """
    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [x['instances'].to(self.device) for x in batched_inputs]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
        )
        # read from the legacy "targets" key in this branch
        gt_instances = [x['targets'].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)
    features = [features[f] for f in self.in_features]
    cls_outs, pts_outs_init, pts_outs_refine = self.head(features)
    center_pts = self.shift_generator(features)

    if self.training:
        return self.losses(center_pts, cls_outs, pts_outs_init, pts_outs_refine, gt_instances)
    else:
        results = self.inference(center_pts, cls_outs, pts_outs_init, pts_outs_refine, images)
        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
            results, batched_inputs, images.image_sizes
        ):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results
def inference(self, batched_inputs):
    assert not self.training

    images = self.preprocess_image(batched_inputs)
    features = self.backbone(images.tensor)
    features = [features[f] for f in self.head.in_features]
    results = self.head(images, features)

    processed_results = []
    for results_per_image, input_per_image, image_size in zip(
        results, batched_inputs, images.image_sizes
    ):
        height = input_per_image.get("height", image_size[0])
        width = input_per_image.get("width", image_size[1])
        r = detector_postprocess(results_per_image, height, width)
        processed_results.append({"instances": r})
    return processed_results
def _test_retinanet_model(self, config_path):
    model = model_zoo.get(config_path, trained=True)
    model.eval()

    fields = {
        "pred_boxes": Boxes,
        "scores": Tensor,
        "pred_classes": Tensor,
    }
    script_model = export_torchscript_with_instances(model, fields)

    img = get_sample_coco_image()
    inputs = [{"image": img}]
    with torch.no_grad():
        instance = model(inputs)[0]["instances"]
        scripted_instance = convert_scripted_instances(script_model(inputs)[0])
        scripted_instance = detector_postprocess(scripted_instance, img.shape[1], img.shape[2])
    assert_instances_allclose(instance, scripted_instance)
def _inference_one_image(self, input):
    """
    Args:
        input (dict): one dataset dict with "image" field being a CHW tensor

    Returns:
        dict: one output dict
    """
    orig_shape = (input["height"], input["width"])
    # For some reason, resize with uint8 slightly increases box AP but decreases densepose AP
    input["image"] = input["image"].to(torch.uint8)
    augmented_inputs, tfms = self._get_augmented_inputs(input)
    # Detect boxes from all augmented versions
    with self._turn_off_roi_heads(["mask_on", "keypoint_on", "densepose_on"]):
        # temporarily disable roi heads
        all_boxes, all_scores, all_classes = self._get_augmented_boxes(augmented_inputs, tfms)
    merged_instances = self._merge_detections(all_boxes, all_scores, all_classes, orig_shape)

    if self.cfg.MODEL.MASK_ON or self.cfg.MODEL.DENSEPOSE_ON:
        # Use the detected boxes to obtain new fields
        augmented_instances = self._rescale_detected_boxes(
            augmented_inputs, merged_instances, tfms
        )
        # run forward on the detected boxes
        outputs = self._batch_inference(augmented_inputs, augmented_instances)
        # Delete now useless variables to avoid being out of memory
        del augmented_inputs, augmented_instances
        # average the predictions
        if self.cfg.MODEL.MASK_ON:
            merged_instances.pred_masks = self._reduce_pred_masks(outputs, tfms)
        if self.cfg.MODEL.DENSEPOSE_ON:
            merged_instances.pred_densepose = self._reduce_pred_densepose(outputs, tfms)
        # postprocess
        merged_instances = detector_postprocess(merged_instances, *orig_shape)
        return {"instances": merged_instances}
    else:
        return {"instances": merged_instances}
def postprocess(self, outputs, images, image_ids, to_cpu):
    frames = []
    for instances, image, image_id in zip(outputs, images, image_ids):
        height, width = image.shape[:2]
        instances = detector_postprocess(instances, height, width)
        obj_types = [
            self.model_meta.thing_classes[pred_class]
            for pred_class in instances.pred_classes
        ]
        type_valid = [obj_type in TYPE_MAPPING for obj_type in obj_types]
        instances = instances[type_valid]

        features = instances.roi_features.mean(dim=(2, 3))
        features = features / features.norm(dim=1, keepdim=True)
        instances.roi_features = features
        instances.pred_classes = torch.as_tensor([
            TYPE_MAPPING[self.model_meta.thing_classes[pred_class]]
            for pred_class in instances.pred_classes
        ])

        if to_cpu:
            instances = instances.to('cpu')
        frame = Frame(image_id, image, instances)
        frames.append(frame)
    return frames
def __call__(self, batch, inputs, outputs):
    """
    This function describes how to run the predictor using the exported model. Note
    that `tracing_adapter_wrapper` runs the traced model under the hood and behaves
    exactly the same as the forward function.
    """
    assert len(batch) == 1, "only support single batch"
    width, height = batch[0]["width"], batch[0]["height"]
    if self.detector_postprocess_done_in_model:
        image_shape = batch[0]["image"].shape  # chw
        if image_shape[1] != height or image_shape[2] != width:
            raise NotImplementedError(
                f"Image tensor (shape: {image_shape}) doesn't match the"
                f" input width ({width}) height ({height}). Since post-process"
                f" has been done inside the torchscript without width/height"
                f" information, can't recover the post-processed output to"
                f" original resolution."
            )
        return [{"instances": outputs}]
    else:
        r = detector_postprocess(outputs, height, width)
        return [{"instances": r}]
def convert_outputs(self, batched_inputs, inputs, results):
    image_sizes = inputs["image_sizes"]
    m_results = [Instances(image_size) for image_size in image_sizes]
    proposal_boxes = results["proposal_boxes"]
    for i in range(len(batched_inputs)):
        indices = (proposal_boxes[:, 0] == i).nonzero(as_tuple=True)
        proposals = proposal_boxes[indices][:, 1:]
        m_results[i].proposal_boxes = Boxes(proposals)
        m_results[i].objectness_logits = torch.linspace(
            10, -10, steps=proposals.size(0), device=proposals.device
        )

    # postprocess
    processed_results = []
    for results_per_image, input_per_image, image_size in zip(
        m_results, batched_inputs, image_sizes
    ):
        height = input_per_image.get("height", image_size[0])
        width = input_per_image.get("width", image_size[1])
        r = detector_postprocess(results_per_image, height, width)
        processed_results.append({"proposals": r})
    return processed_results
def _postprocess(
    self,
    batched_inputs: Sequence[Dict[str, Any]],
    images_sizes: Sequence[Tuple[int, int]],
    detector_results: Sequence[Optional[Instances]],
    unsupervised_results: Sequence[Optional[torch.Tensor]],
) -> List[Dict[str, Any]]:
    n_inputs = len(batched_inputs)
    if n_inputs != len(images_sizes):
        raise ValueError(f"length mismatch; {n_inputs=} but {len(images_sizes)=}")
    if n_inputs != len(detector_results):
        raise ValueError(f"length mismatch; {n_inputs=} but {len(detector_results)=}")
    if n_inputs != len(unsupervised_results):
        raise ValueError(f"length mismatch; {n_inputs=} but {len(unsupervised_results)=}")

    results: List[Dict[str, Any]] = [{} for _ in range(n_inputs)]
    for i in range(n_inputs):
        image_input = batched_inputs[i]
        image_size = images_sizes[i]
        image_instances = detector_results[i]
        image_unsup_result = unsupervised_results[i]

        h: int = image_input.get("height", image_size[0])
        w: int = image_input.get("width", image_size[1])

        if image_instances is not None:
            r = detector_postprocess(image_instances, h, w)
            results[i]["instances"] = r
        if image_unsup_result is not None:
            u = self.unsupervised_head.postprocess(image_unsup_result, image_size, h, w)
            results[i]["unsupervised"] = u
    return results
def postprocess(self, instances, batched_inputs, image_sizes):
    """
    Rescale the output instances to the target size.
    """
    # note: private function; subject to changes
    processed_results = []
    for results_per_image, input_per_image, image_size in zip(
        instances, batched_inputs, image_sizes
    ):
        boxes = results_per_image.pred_boxes.tensor
        scores = results_per_image.scores
        class_idxs = results_per_image.pred_classes
        # Apply per-class NMS for each image
        keep = batched_nms(boxes, scores, class_idxs, self.nms_thresh)
        keep = keep[:self.max_detections_per_image]
        results_per_image = results_per_image[keep]

        height = input_per_image.get("height", image_size[0])
        width = input_per_image.get("width", image_size[1])
        r = detector_postprocess(results_per_image, height, width)
        processed_results.append({"instances": r})
    return processed_results
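For reference, a toy illustration of the class-aware `batched_nms` call used above; the box coordinates and threshold are invented for the example, and boxes of different classes never suppress each other:

import torch
from detectron2.layers import batched_nms

# Three made-up boxes: the first two overlap heavily and share a class,
# the third is identical to the first but belongs to a different class.
boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0],
                      [1.0, 1.0, 11.0, 11.0],
                      [0.0, 0.0, 10.0, 10.0]])
scores = torch.tensor([0.9, 0.8, 0.7])
class_idxs = torch.tensor([0, 0, 1])

keep = batched_nms(boxes, scores, class_idxs, 0.5)
print(keep)  # tensor([0, 2]): box 1 is suppressed by box 0; box 2 survives (different class)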