def forward(self, batched_inputs):
    if not self.training and not self.visualize_path:
        return self.single_test(batched_inputs)

    images = [x["image"].to(self.device) for x in batched_inputs]
    images = [self.normalizer(x) for x in images]
    images = ImageList.from_tensors(images, self.backbone.size_divisibility)
    features = self.backbone(images.tensor)

    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    if "sem_seg" in batched_inputs[0]:
        gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs]
        gt_sem_seg = ImageList.from_tensors(
            gt_sem_seg, self.backbone.size_divisibility,
            self.refinement_head.ignore_value).tensor
    else:
        gt_sem_seg = None

    proposals, proposal_losses = self.proposal_generator(
        images, features, gt_instances)
    edge_map, head_losses, proposals = self.refinement_head(
        features, proposals, (gt_sem_seg, [gt_instances, images.image_sizes]))

    # In training, the proposals are not useful at all in RPN models; but not here.
    # This makes RPN-only models about 5% slower.
    if self.training:
        proposal_losses.update(head_losses)
        return proposal_losses

    processed_results = []
    for per_edge_map, results_per_image, input_per_image, image_size in zip(
            edge_map, proposals, batched_inputs, images.image_sizes):
        height = input_per_image.get("height", image_size[0])
        width = input_per_image.get("width", image_size[1])
        edge_map_r = edge_map_postprocess(per_edge_map, image_size)
        # NOTE: postprocess each image's own proposals (the original code
        # passed proposals[0], which always used the first image's results).
        instance_r = detector_postprocess(results_per_image, height, width)
        processed_results.append({
            "instances": instance_r,
            "edge_map": edge_map_r
        })
    return processed_results
def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DatasetMapper` . Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: * image: Tensor, image in (C, H, W) format. * instances: Instances Other information that's included in the original dicts, such as: * "height", "width" (int): the output resolution of the model, used in inference. See :meth:`postprocess` for details. Returns: dict[str: Tensor]: mapping from a named loss to a tensor storing the loss. Used during training only. """ images = self.preprocess_image(batched_inputs) if "instances" in batched_inputs[0]: gt_instances = [ x["instances"].to(self.device) for x in batched_inputs ] elif "targets" in batched_inputs[0]: log_first_n( logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10) gt_instances = [ x["targets"].to(self.device) for x in batched_inputs ] else: gt_instances = None features = self.backbone(images.tensor) features = [features[f] for f in self.in_features] box_cls, box_delta = self.head(features) anchors = self.anchor_generator(features) if self.training: gt_classes, gt_anchors_reg_deltas = self.get_ground_truth( anchors, gt_instances) mask = self.get_picky_ground_truth(anchors, gt_instances) return images.tensor, {"pred_class_logits": box_cls, "pred_proposal_deltas": box_delta}, gt_classes, mask, \ self.losses(gt_classes, gt_anchors_reg_deltas, box_cls, box_delta) else: results = self.inference(box_cls, box_delta, anchors, images) processed_results = [] for results_per_image, input_per_image, image_size in zip( results, batched_inputs, images.image_sizes): height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) r = detector_postprocess(results_per_image, height, width) processed_results.append({"instances": r}) return images.tensor, { "pred_class_logits": box_cls, "pred_proposal_deltas": box_delta }, None, processed_results
def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DatasetMapper` . Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: * image: Tensor, image in (C, H, W) format. * instances (optional): groundtruth :class:`Instances` * proposals (optional): :class:`Instances`, precomputed proposals. Other information that's included in the original dicts, such as: * "height", "width" (int): the output resolution of the model, used in inference. See :meth:`postprocess` for details. Returns: list[dict]: Each dict is the output for one input image. The dict contains two key instances whose value is a :class:`Instances`. * "box_instances" has the following keys: "pred_boxes", "pred_classes", "scores" * "hoi_instances" has the following keys: "person_boxes", "object_boxes", "object_classes", "action_classes", "scores" """ if not self.training: return self.inference(batched_inputs) images = self.preprocess_image(batched_inputs) if "instances" in batched_inputs[0]: gt_instances = [x["instances"].to(self.device) for x in batched_inputs] elif "targets" in batched_inputs[0]: log_first_n( logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10 ) gt_instances = [x["targets"].to(self.device) for x in batched_inputs] else: gt_instances = None features = self.backbone(images.tensor) if self.proposal_generator: proposals, proposal_losses = self.proposal_generator(images, features, gt_instances) else: assert "proposals" in batched_inputs[0] proposals = [x["proposals"].to(self.device) for x in batched_inputs] proposal_losses = {} _, detector_losses = self.roi_heads(images, features, proposals, gt_instances) if self.vis_period > 0: storage = get_event_storage() if storage.iter % self.vis_period == 0: self.visualize_training(batched_inputs, proposals) losses = {} losses.update(detector_losses) losses.update(proposal_losses) return losses
def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DatasetMapper` . Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: * image: Tensor, image in (C, H, W) format. * instances (optional): groundtruth :class:`Instances` * proposals (optional): :class:`Instances`, precomputed proposals. Other information that's included in the original dicts, such as: * "height", "width" (int): the output resolution of the model, used in inference. See :meth:`postprocess` for details. Returns: list[dict]: Each dict is the output for one input image. The dict contains one key "instances" whose value is a :class:`Instances`. The :class:`Instances` object has the following keys: "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints" """ if not self.training: return self.inference(batched_inputs) images = self.preprocess_image(batched_inputs) if "instances" in batched_inputs[0]: gt_instances = [x["instances"].to(self.device) for x in batched_inputs] elif "targets" in batched_inputs[0]: log_first_n( logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10 ) gt_instances = [x["targets"].to(self.device) for x in batched_inputs] else: gt_instances = None if "sem_seg" in batched_inputs[0] and self.cfg.MODEL.ROI_DENSEPOSE_HEAD.SEMSEG_ON: extra = self.preprocess_semseg_image(batched_inputs) #[x["sem_seg"] for x in batched_inputs] else: extra = None features = self.backbone(images.tensor) if self.proposal_generator: proposals, proposal_losses = self.proposal_generator(images, features, gt_instances) else: assert "proposals" in batched_inputs[0] proposals = [x["proposals"].to(self.device) for x in batched_inputs] proposal_losses = {} _, detector_losses = self.roi_heads(images, features, proposals, gt_instances, extra) losses = {} losses.update(detector_losses) losses.update(proposal_losses) return losses
def inference_single(self, batched_inputs, do_postprocess=True):
    images = self.preprocess_image(batched_inputs)
    features = self.backbone(images.tensor)
    if self._eval_gt_box:
        if "instances" in batched_inputs[0]:
            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                logging.WARN,
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10,
            )
            gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
        else:
            gt_instances = None
        for inst in gt_instances:
            inst.proposal_boxes = inst.gt_boxes
            inst.objectness_logits = torch.ones(len(inst.gt_boxes))
        proposals = gt_instances
    else:
        if self.proposal_generator:
            proposals, _ = self.proposal_generator(images, features, None)
        else:
            assert "proposals" in batched_inputs[0]
            proposals = [x["proposals"].to(self.device) for x in batched_inputs]

    pred_depth = [None] * len(proposals)
    if self.depth_head_on:
        if "depth_head" not in self._freeze:
            pred_depth = self.depth_head(features, None)

    results, _ = self.roi_heads(images, features, proposals, None)
    if self.camera_on:
        rtn_feature = features
    else:
        rtn_feature = None

    if do_postprocess:
        return (
            SiamesePlaneRCNN._postprocess(
                results,
                batched_inputs,
                images.image_sizes,
                mask_threshold=self.mask_threshold,
                nms=self.nms,
            ),
            pred_depth,
            rtn_feature,
        )
    else:
        return results, pred_depth, rtn_feature
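# The `_eval_gt_box` branch above swaps RPN proposals for ground-truth boxes,
# which gives an oracle upper bound for the ROI heads. A minimal standalone
# sketch of that conversion (assumes detectron2-style Instances with gt_boxes;
# the helper name is hypothetical):
import torch

def gt_as_oracle_proposals(gt_instances):
    """Reuse ground-truth boxes as 'perfect' proposals for oracle evaluation."""
    for inst in gt_instances:
        inst.proposal_boxes = inst.gt_boxes  # boxes the ROI heads will pool over
        inst.objectness_logits = torch.ones(len(inst.gt_boxes))  # max confidence
    return gt_instances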
def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DetectionTransform` . Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: image: Tensor, image in (C, H, W) format. instances: Instances Other information that's included in the original dicts, such as: "height", "width" (int): the output resolution of the model, used in inference. See :meth:`postprocess` for details. Returns: losses (dict[str: Tensor]): mapping from a named loss to a tensor storing the loss. Used during training only. """ images = self.preprocess_image(batched_inputs) if "instances" in batched_inputs[0]: gt_instances = [x["instances"].to(self.device) for x in batched_inputs] elif "targets" in batched_inputs[0]: log_first_n( logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10 ) gt_instances = [x["targets"].to(self.device) for x in batched_inputs] else: gt_instances = None features = self.backbone(images.tensor) # ins branch ins_features = [features[f] for f in self.instance_in_features] ins_features = self.split_feats(ins_features) cate_pred, kernel_pred = self.ins_head(ins_features) # mask branch mask_features = [features[f] for f in self.mask_in_features] mask_pred = self.mask_head(mask_features) if self.training: """ get_ground_truth. return loss and so on. """ mask_feat_size = mask_pred.size()[-2:] targets = self.get_ground_truth(gt_instances, mask_feat_size) losses = self.loss(cate_pred, kernel_pred, mask_pred, targets) return losses else: # point nms. cate_pred = [self.point_nms(cate_p.sigmoid(), kernel=2).permute(0, 2, 3, 1) for cate_p in cate_pred] # do inference for results. results = self.inference(cate_pred, kernel_pred, mask_pred, images.image_sizes, batched_inputs) return results
def forward(self, batched_inputs): """ Args: Same as in :class:`GeneralizedRCNN.forward` Returns: list[dict]: Each dict is the output for one input image. The dict contains one key "proposals" whose value is a :class:`Instances` with keys "proposal_boxes" and "objectness_logits". """ images = [x["image"].to(self.device) for x in batched_inputs] images = [(x - self.pixel_mean) / self.pixel_std for x in images] images = ImageList.from_tensors(images, self.backbone.size_divisibility) features = self.backbone(images.tensor) if "instances" in batched_inputs[0]: gt_instances = [ x["instances"].to(self.device) for x in batched_inputs ] elif "targets" in batched_inputs[0]: log_first_n( logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10) gt_instances = [ x["targets"].to(self.device) for x in batched_inputs ] else: gt_instances = None masks = { key: ImageList.from_tensors([x[key] for x in batched_inputs], self.backbone.size_divisibility) for key in self.masks } proposals, proposal_losses = self.proposal_generator( images, features, gt_instances, **masks) # In training, the proposals are not useful at all but we generate them anyway. # This makes RPN-only models about 5% slower. if self.training: return proposal_losses processed_results = [] for results_per_image, input_per_image, image_size in zip( proposals, batched_inputs, images.image_sizes): height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) r = detector_postprocess(results_per_image, height, width) processed_results.append({"proposals": r}) return processed_results
def print_instances_class_histogram(dataset_dicts, class_names): """ Args: dataset_dicts (list[dict]): list of dataset dicts. class_names (list[str]): list of class names (zero-indexed). """ num_classes = len(class_names) hist_bins = np.arange(num_classes + 1) # [0,1] histogram = np.zeros((num_classes, ), dtype=np.int) for entry in dataset_dicts: annos = entry["annotations"] # iscrowd 为0 的才有效 classes = [x["category_id"] for x in annos if not x.get("iscrowd", 0)] # np.histogram(a, bin:int or list, range:tuple..) # a: 待统计数组,bin可以是int,那么要指定range,或者list表示bin的范围, # return hist:统计好的每个区间的数,bin:区间的划分 histogram += np.histogram(classes, bins=hist_bins)[0] N_COLS = min(6, len(class_names) * 2) # 一个图cols? def short_name(x): # make long class names shorter. useful for lvis if len(x) > 13: return x[:11] + ".." return x # 改短名字 data = list( itertools.chain( *[[short_name(class_names[i]), int(v)] for i, v in enumerate(histogram)])) # data = [name1, num1, name2, v2, ...] total_num_instances = sum( data[1::2]) # data[1::2] a[i:j:step]表示从index = 1 开始切片,直到末尾,step = 2 data.extend([None] * (N_COLS - (len(data) % N_COLS))) # 补齐为N_COLS 的倍数 if num_classes > 1: data.extend(["total", total_num_instances]) #data[i::N_COLS],为什么要搞出这么多个呢(6个) # zip_longest, 跟zip差不多,给定多个列表,a,b,.. ,按照最长的那个枚举,不足的填None data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)]) # 不知道干啥。。。 table = tabulate( data, headers=["category", "#instances"] * (N_COLS // 2), tablefmt="pipe", numalign="left", stralign="center", ) log_first_n( logging.INFO, "Distribution of instances among all {} categories:\n".format( num_classes) + colored(table, "cyan"), key="message", )
def print_instances_class_histogram(dataset_dicts, class_names): """ Args: dataset_dicts (list[dict]): list of dataset dicts. class_names (list[str]): list of class names (zero-indexed). """ num_classes = len(class_names) hist_bins = np.arange(num_classes + 1) histogram = np.zeros((num_classes, ), dtype=np.int) for entry in dataset_dicts: annos = entry["annotations"] classes = np.asarray( [x["category_id"] for x in annos if not x.get("iscrowd", 0)], dtype=np.int) if len(classes): assert classes.min( ) >= 0, f"Got an invalid category_id={classes.min()}" assert ( classes.max() < num_classes ), f"Got an invalid category_id={classes.max()} for a dataset of {num_classes} classes" histogram += np.histogram(classes, bins=hist_bins)[0] N_COLS = min(6, len(class_names) * 2) def short_name(x): # make long class names shorter. useful for lvis if len(x) > 13: return x[:11] + ".." return x data = list( itertools.chain( *[[short_name(class_names[i]), int(v)] for i, v in enumerate(histogram)])) total_num_instances = sum(data[1::2]) data.extend([None] * (N_COLS - (len(data) % N_COLS))) if num_classes > 1: data.extend(["total", total_num_instances]) data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)]) table = tabulate( data, headers=["category", "#instances"] * (N_COLS // 2), tablefmt="pipe", numalign="left", stralign="center", ) log_first_n( logging.INFO, "Distribution of instances among all {} categories:\n".format( num_classes) + colored(table, "cyan"), key="message", )
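# Both histogram printers above pack a flat [name, count, name, count, ...]
# list into N_COLS interleaved columns and transpose them with zip_longest.
# A tiny standalone demo of that packing:
import itertools

data = ["cat", 3, "dog", 5, "bird", 2, None, None]  # padded to a multiple of N_COLS
N_COLS = 4
rows = list(itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)]))
assert rows == [("cat", 3, "dog", 5), ("bird", 2, None, None)]  # one tabulate row each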
def __getattr__(self, key): if key in self._RENAMED: log_first_n( logging.WARNING, "Metadata '{}' was renamed to '{}'!".format( key, self._RENAMED[key]), n=10, ) return getattr(self, self._RENAMED[key]) raise AttributeError( "Attribute '{}' does not exist in the metadata of '{}'. Available keys are {}." .format(key, self.name, str(self.__dict__.keys())))
def forward(self, batched_inputs, c_iter, max_iter):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        dict[str: Tensor]:
            mapping from a named loss to a tensor storing the loss.
            Used during training only.
    """
    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)
    features_list = [features[f] for f in self.in_features]

    box_cls, box_regression, centerness = self.head(features_list)
    locations = self.compute_locations(features_list)

    if self.training:
        return self._forward_train(features_list, locations, box_cls,
                                   box_regression, centerness, gt_instances,
                                   batched_inputs, images, c_iter, max_iter)
    else:
        return self._forward_test(features_list, locations, box_cls,
                                  box_regression, centerness, batched_inputs,
                                  images)
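# `compute_locations` above maps every feature-map cell back to input-image
# coordinates, as in FCOS. A sketch of the per-level computation (the function
# name and stride argument are assumptions, not this project's exact API):
import torch

def compute_locations_per_level(h, w, stride, device):
    """Return (h*w, 2) centers of feature cells in input-image coordinates."""
    shifts_x = torch.arange(0, w * stride, step=stride, dtype=torch.float32, device=device)
    shifts_y = torch.arange(0, h * stride, step=stride, dtype=torch.float32, device=device)
    shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x, indexing="ij")
    # offset by stride // 2 so each location sits at the center of its cell
    return torch.stack((shift_x.reshape(-1), shift_y.reshape(-1)), dim=1) + stride // 2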
def forward(self, batched_inputs):
    if not self.training:
        return self.inference(batched_inputs)

    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
        if "associations" in batched_inputs[0]:
            gt_associations = [x["associations"].to(self.device) for x in batched_inputs]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)

    if self.association_proposal_generator:
        association_proposals, association_losses, pre_features, pre_proposals = \
            self.association_proposal_generator(images, features, gt_associations)

    if self.proposal_generator:
        # alternative: concatenate pre_features onto the backbone features
        # before generating proposals:
        # concat_features = {}
        # for pre_feature, (k, v) in zip(pre_features, features.items()):
        #     concat_features[k] = torch.cat([v, pre_feature], 1)
        proposals, proposal_losses = self.proposal_generator(
            images, features, gt_instances, pre_proposals)

    _, detector_losses = self.roi_heads(images, features,
                                        association_proposals, proposals,
                                        gt_associations, gt_instances)

    losses = {}
    losses.update(detector_losses)
    losses.update(proposal_losses)
    losses.update(association_losses)
    return losses
def _parse_arch_def(cfg):
    arch = cfg.MODEL.FBNET_V2.ARCH
    arch_def = cfg.MODEL.FBNET_V2.ARCH_DEF
    assert (arch != "" and not arch_def) ^ (not arch and arch_def != []), (
        "Only allow one unset node between MODEL.FBNET_V2.ARCH ({}) and "
        "MODEL.FBNET_V2.ARCH_DEF ({})".format(arch, arch_def))
    arch_def = FBNetV2ModelArch.get(arch) if arch else _merge_fbnetv2_arch_def(cfg)
    # NOTE: arch_def is a dictionary describing the CNN architecture for creating
    # the detection model. It can describe a wide range of models including the
    # original FBNet. Each key-value pair expresses either a sub part of the model
    # like trunk or head, or stores other meta information.
    message = 'Using un-unified arch_def for ARCH "{}" (without scaling):\n{}'.format(
        arch, format_dict_expanding_list_values(arch_def))
    log_first_n(logging.INFO, message, n=1, key="message")
    return arch_def
def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of : class:`DatasetMapper`. Each item in the list contains the input for one image. For now, each item in the list is a dict that contains: * images: Tensor, image in (C, H, W) format. * instances: Instances. Other information that' s included in the original dict ,such as: * "height", "width"(int): the output resolution of the model, used in inference.See `postprocess` for detail Return: dict[str: Tensor]: mapping from a named loss to a tensor storing the loss, Used during training only. At inference stage, return predicted bboxes. """ images = self.preprocess_image(batched_inputs) if "instances" in batched_inputs[0]: gt_instances = [ x['instances'].to(self.device) for x in batched_inputs ] elif "targets" in batched_inputs[0]: log_first_n(logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10) gt_instances = [ x['instances'].to(self.device) for x in batched_inputs ] else: gt_instances = None features = self.backbone(images.tensor) features = [features[f] for f in self.in_features] cls_outs, pts_outs_init, pts_outs_refine = self.head(features) center_pts = self.shift_generator(features) if self.training: return self.losses(center_pts, cls_outs, pts_outs_init, pts_outs_refine, gt_instances) else: results = self.inference(center_pts, cls_outs, pts_outs_init, pts_outs_refine, images) processed_results = [] for results_per_image, input_per_image, image_size in zip( results, batched_inputs, images.image_sizes): height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) r = detector_postprocess(results_per_image, height, width) processed_results.append({"instances": r}) return processed_results
def forward(self, batched_inputs):
    if not self.training:
        return self.inference(batched_inputs)

    images_to_compare = self.preprocess_image(batched_inputs, norm=False)
    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)

    if self.proposal_generator:
        proposals, proposal_losses = self.proposal_generator(
            images, features, gt_instances)
    else:
        assert "proposals" in batched_inputs[0]
        proposals = [x["proposals"].to(self.device) for x in batched_inputs]
        proposal_losses = {}

    _, detector_losses = self.roi_heads(images, features, proposals, gt_instances)
    if self.vis_period > 0:
        storage = get_event_storage()
        if storage.iter % self.vis_period == 0:
            self.visualize_training(batched_inputs, proposals)

    _, reconstruction_losses = self.reconstruct_heads(images_to_compare, features)

    losses = {}
    losses.update(detector_losses)
    losses.update(proposal_losses)
    losses.update(reconstruction_losses)
    return losses
def forward(self, batched_inputs): # print("you're in faster rcnn focal loss") if not self.training: return self.inference(batched_inputs) images = self.preprocess_image(batched_inputs) if "instances" in batched_inputs[0]: gt_instances = [ x["instances"].to(self.device) for x in batched_inputs ] elif "targets" in batched_inputs[0]: log_first_n( logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10) gt_instances = [ x["targets"].to(self.device) for x in batched_inputs ] else: gt_instances = None features = self.backbone(images.tensor) if self.proposal_generator: proposals, proposal_losses = self.proposal_generator( images, features, gt_instances) else: assert "proposals" in batched_inputs[0] proposals = [ x["proposals"].to(self.device) for x in batched_inputs ] proposal_losses = {} _, detector_losses = self.roi_heads(images, features, proposals, gt_instances) # print("lala") if self.vis_period > 0: storage = get_event_storage() if storage.iter % self.vis_period == 0: self.visualize_training(batched_inputs, proposals) losses = {} losses.update(detector_losses) losses.update(proposal_losses) # print("you're in faster rcnn focal loss") return losses
def forward_single(self, batched_inputs):
    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10,
        )
        gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)

    if self.proposal_generator:
        proposals, proposal_losses = self.proposal_generator(
            images, features, gt_instances)
    else:
        assert "proposals" in batched_inputs[0]
        proposals = [x["proposals"].to(self.device) for x in batched_inputs]
        proposal_losses = {}

    pred_instances, detector_losses = self.roi_heads(
        images, features, proposals, gt_instances)

    depth_losses = {}
    if self.depth_head_on:
        if "depth_head" not in self._freeze:
            gt_depth = self.process_depth(batched_inputs)
            depth_losses = self.depth_head(features, gt_depth)

    losses = {}
    losses.update(detector_losses)
    losses.update(proposal_losses)
    losses.update(depth_losses)
    if self.camera_on:
        return pred_instances, losses, features
    else:
        return pred_instances, losses, None
def __setattr__(self, key, val):
    if key in self._RENAMED:
        log_first_n(
            logging.WARNING,
            "Metadata '{}' was renamed to '{}'!".format(key, self._RENAMED[key]),
            n=10,
        )
        setattr(self, self._RENAMED[key], val)

    # Ensure that metadata of the same name stays consistent
    try:
        oldval = getattr(self, key)
        assert oldval == val, (
            "Attribute '{}' in the metadata of '{}' cannot be set "
            "to a different value!\n{} != {}".format(key, self.name, oldval, val)
        )
    except AttributeError:
        super().__setattr__(key, val)
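# The __getattr__/__setattr__ pair above transparently forwards renamed
# metadata keys. A self-contained sketch of the same pattern (the _RENAMED
# mapping and class are hypothetical, not detectron2's actual Metadata):
import logging

class _MetaSketch:
    _RENAMED = {"class_names": "thing_classes"}
    name = "demo"

    def __getattr__(self, key):
        if key in self._RENAMED:
            logging.warning("Metadata '%s' was renamed to '%s'!", key, self._RENAMED[key])
            return getattr(self, self._RENAMED[key])
        raise AttributeError(f"Attribute '{key}' does not exist")

    def __setattr__(self, key, val):
        if key in self._RENAMED:
            key = self._RENAMED[key]  # store under the new name
        super().__setattr__(key, val)

meta = _MetaSketch()
meta.class_names = ["cat", "dog"]          # stored as thing_classes
assert meta.thing_classes == ["cat", "dog"]
assert meta.class_names == ["cat", "dog"]  # warns, then resolves via the rename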
def get_mask_feature(self, batched_inputs):
    """
    A part of self.forward; see the forward function above for the meaning
    of the args.

    :return: mask_features, a tensor of shape (M, C, output_size, output_size),
        where M is the total number of masks for the image from batched_inputs
        (the batch size is 1).
    """
    if not self.training:
        return self.inference(batched_inputs)

    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)

    if self.proposal_generator:
        proposals, proposal_losses = self.proposal_generator(
            images, features, gt_instances)
        mask_features = self.roi_heads.get_mask_feature(
            images, features, proposals, gt_instances)
        return mask_features
    else:
        assert "proposals" in batched_inputs[0]
        proposals = [x["proposals"].to(self.device) for x in batched_inputs]
        proposal_losses = {}
        mask_features = self.roi_heads.get_mask_feature(
            images, features, proposals, gt_instances)
        return mask_features
def __getattr__(self, key):
    if key in self._RENAMED:
        log_first_n(
            logging.WARNING,
            "Metadata '{}' was renamed to '{}'!".format(key, self._RENAMED[key]),
            n=10,
        )
        return getattr(self, self._RENAMED[key])

    # "name" exists in every metadata
    if len(self.__dict__) > 1:
        raise AttributeError(
            "Attribute '{}' does not exist in the metadata of dataset '{}'. Available "
            "keys are {}.".format(key, self.name, str(self.__dict__.keys())))
    else:
        raise AttributeError(
            f"Attribute '{key}' does not exist in the metadata of dataset '{self.name}': "
            "metadata is empty.")
def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of : class:`DatasetMapper`. Each item in the list contains the input for one image. For now, each item in the list is a dict that contains: * images: Tensor, image in (C, H, W) format. * instances: Instances. Other information that' s included in the original dict ,such as: * "height", "width"(int): the output resolution of the model, used in inference.See `postprocess` for detail Return: dict[str: Tensor]: mapping from a named loss to a tensor storing the loss, Used during training only. At inference stage, return predicted bboxes. """ if not self.training: return self.inference(batched_inputs) images = self.preprocess_image(batched_inputs) if "instances" in batched_inputs[0]: gt_instances = [ x['instances'].to(self.device) for x in batched_inputs ] elif "targets" in batched_inputs[0]: log_first_n(logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10) gt_instances = [ x['instances'].to(self.device) for x in batched_inputs ] else: gt_instances = None features = self.backbone(images.tensor) features = [features[f] for f in self.head.in_features] losses = self.head(images, features, gt_instances) return losses
def read_data(self, dataset_dict):
    """Load image and annos; random shift & scale bbox; crop, rescale."""
    cfg = self.cfg
    r_head_cfg = cfg.MODEL.CDPN.ROT_HEAD
    pnp_net_cfg = cfg.MODEL.CDPN.PNP_NET

    dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
    dataset_name = dataset_dict["dataset_name"]

    image = read_image_cv2(dataset_dict["file_name"], format=self.img_format)
    # should be consistent with the size in dataset_dict
    utils.check_image_size(dataset_dict, image)
    im_H_ori, im_W_ori = image.shape[:2]

    # currently only replace bg for train ###############################
    if self.split == "train":
        # some synthetic data already has bg, img_type should be real or
        # something else but not syn
        img_type = dataset_dict.get("img_type", "real")
        if img_type == "syn":
            log_first_n(logging.WARNING, "replace bg", n=10)
            assert "segmentation" in dataset_dict["inst_infos"]
            mask = cocosegm2mask(dataset_dict["inst_infos"]["segmentation"],
                                 im_H_ori, im_W_ori)
            image, mask_trunc = self.replace_bg(image.copy(), mask,
                                                return_mask=True)
        else:  # real image
            if np.random.rand() < cfg.INPUT.CHANGE_BG_PROB:
                log_first_n(logging.WARNING, "replace bg for real", n=10)
                assert "segmentation" in dataset_dict["inst_infos"]
                mask = cocosegm2mask(
                    dataset_dict["inst_infos"]["segmentation"], im_H_ori,
                    im_W_ori)
                image, mask_trunc = self.replace_bg(image.copy(), mask,
                                                    return_mask=True)
            else:
                mask_trunc = None

    # NOTE: maybe add or change color augment here ===================================
    if self.split == "train" and self.color_aug_prob > 0 and self.color_augmentor is not None:
        if np.random.rand() < self.color_aug_prob:
            if cfg.INPUT.COLOR_AUG_SYN_ONLY and img_type not in ["real"]:
                image = self._color_aug(image, self.color_aug_type)
            else:
                image = self._color_aug(image, self.color_aug_type)

    # other transforms (mainly geometric ones);
    # for the 6d pose task, flip is not allowed in general, except for some
    # 2d keypoints methods
    image, transforms = T.apply_augmentations(self.augmentation, image)
    im_H, im_W = image_shape = image.shape[:2]  # h, w

    # NOTE: scale camera intrinsic if necessary ================================
    scale_x = im_W / im_W_ori
    scale_y = im_H / im_H_ori  # NOTE: generally scale_x should be equal to scale_y
    if "cam" in dataset_dict:
        if im_W != im_W_ori or im_H != im_H_ori:
            dataset_dict["cam"][0] *= scale_x
            dataset_dict["cam"][1] *= scale_y
        K = dataset_dict["cam"].astype("float32")
        dataset_dict["cam"] = torch.as_tensor(K)

    input_res = cfg.MODEL.CDPN.BACKBONE.INPUT_RES
    out_res = cfg.MODEL.CDPN.BACKBONE.OUTPUT_RES

    # CHW -> HWC
    coord_2d = get_2d_coord_np(im_W, im_H, low=0, high=1).transpose(1, 2, 0)

    #################################################################################
    if self.split != "train":
        # don't load annotations at test time
        test_bbox_type = cfg.TEST.TEST_BBOX_TYPE
        if test_bbox_type == "gt":
            bbox_key = "bbox"
        else:
            bbox_key = f"bbox_{test_bbox_type}"
        assert not self.flatten, "Do not use flattened dicts for test!"
        # here get batched rois
        roi_infos = {}
        # yapf: disable
        roi_keys = ["scene_im_id", "file_name", "cam", "im_H", "im_W",
                    "roi_img", "inst_id", "roi_coord_2d", "roi_cls", "score",
                    "roi_extent", bbox_key, "bbox_mode", "bbox_center",
                    "roi_wh", "scale", "resize_ratio", "model_info",
                    ]
        for _key in roi_keys:
            roi_infos[_key] = []
        # yapf: enable
        # TODO: how to handle image without detections;
        #   filter those when loading annotations or detections,
        #   implement a function for this
        # "annotations" means detections
        for inst_i, inst_infos in enumerate(dataset_dict["annotations"]):
            # inherent image-level infos
            roi_infos["scene_im_id"].append(dataset_dict["scene_im_id"])
            roi_infos["file_name"].append(dataset_dict["file_name"])
            roi_infos["im_H"].append(im_H)
            roi_infos["im_W"].append(im_W)
            roi_infos["cam"].append(dataset_dict["cam"].cpu().numpy())

            # roi-level infos
            roi_infos["inst_id"].append(inst_i)
            roi_infos["model_info"].append(inst_infos["model_info"])

            roi_cls = inst_infos["category_id"]
            roi_infos["roi_cls"].append(roi_cls)
            roi_infos["score"].append(inst_infos["score"])

            # extent
            roi_extent = self._get_extents(dataset_name)[roi_cls]
            roi_infos["roi_extent"].append(roi_extent)

            bbox = BoxMode.convert(inst_infos[bbox_key],
                                   inst_infos["bbox_mode"], BoxMode.XYXY_ABS)
            bbox = np.array(transforms.apply_box([bbox])[0])
            roi_infos[bbox_key].append(bbox)
            roi_infos["bbox_mode"].append(BoxMode.XYXY_ABS)
            x1, y1, x2, y2 = bbox
            bbox_center = np.array([0.5 * (x1 + x2), 0.5 * (y1 + y2)])
            bw = max(x2 - x1, 1)
            bh = max(y2 - y1, 1)
            scale = max(bh, bw) * cfg.INPUT.DZI_PAD_SCALE
            scale = min(scale, max(im_H, im_W)) * 1.0
            roi_infos["bbox_center"].append(bbox_center.astype("float32"))
            roi_infos["scale"].append(scale)
            roi_infos["roi_wh"].append(np.array([bw, bh], dtype=np.float32))
            roi_infos["resize_ratio"].append(out_res / scale)

            # CHW, float32 tensor
            # roi_image
            roi_img = crop_resize_by_warp_affine(
                image, bbox_center, scale, input_res,
                interpolation=cv2.INTER_LINEAR).transpose(2, 0, 1)
            roi_img = self.normalize_image(cfg, roi_img)
            roi_infos["roi_img"].append(roi_img.astype("float32"))

            # roi_coord_2d
            roi_coord_2d = crop_resize_by_warp_affine(
                coord_2d, bbox_center, scale, out_res,
                interpolation=cv2.INTER_LINEAR).transpose(2, 0, 1)  # HWC -> CHW
            roi_infos["roi_coord_2d"].append(roi_coord_2d.astype("float32"))

        for _key in roi_keys:
            if _key in ["roi_img", "roi_coord_2d"]:
                dataset_dict[_key] = torch.as_tensor(roi_infos[_key]).contiguous()
            elif _key in ["model_info", "scene_im_id", "file_name"]:
                # can not convert to tensor
                dataset_dict[_key] = roi_infos[_key]
            else:
                dataset_dict[_key] = torch.tensor(roi_infos[_key])

        return dataset_dict
    #######################################################################################

    # NOTE: currently assume flattened dicts for train
    assert self.flatten, "Only support flattened dicts for train now"
    inst_infos = dataset_dict.pop("inst_infos")
    dataset_dict["roi_cls"] = roi_cls = inst_infos["category_id"]

    # extent
    roi_extent = self._get_extents(dataset_name)[roi_cls]
    dataset_dict["roi_extent"] = torch.tensor(roi_extent, dtype=torch.float32)

    # load xyz =======================================================
    xyz_info = mmcv.load(inst_infos["xyz_path"])
    x1, y1, x2, y2 = xyz_info["xyxy"]
    # float16 does not affect performance (classification/regression)
    xyz_crop = xyz_info["xyz_crop"]
    xyz = np.zeros((im_H, im_W, 3), dtype=np.float32)
    xyz[y1:y2 + 1, x1:x2 + 1, :] = xyz_crop
    # NOTE: full mask
    mask_obj = ((xyz[:, :, 0] != 0) | (xyz[:, :, 1] != 0) |
                (xyz[:, :, 2] != 0)).astype(bool).astype(np.float32)
    if cfg.INPUT.SMOOTH_XYZ:
        xyz = self.smooth_xyz(xyz)

    if cfg.TRAIN.VIS:
        xyz = self.smooth_xyz(xyz)

    # override bbox info using xyz_infos
    inst_infos["bbox"] = [x1, y1, x2, y2]
    inst_infos["bbox_mode"] = BoxMode.XYXY_ABS

    # USER: Implement additional transformations if you have other types of data
    # inst_infos.pop("segmentation")  # NOTE: use mask from xyz
    anno = transform_instance_annotations(inst_infos, transforms, image_shape,
                                          keypoint_hflip_indices=None)

    # augment bbox ===================================================
    bbox_xyxy = anno["bbox"]
    bbox_center, scale = self.aug_bbox(cfg, bbox_xyxy, im_H, im_W)
    bw = max(bbox_xyxy[2] - bbox_xyxy[0], 1)
    bh = max(bbox_xyxy[3] - bbox_xyxy[1], 1)

    # CHW, float32 tensor
    # roi_image ------------------------------------
    roi_img = crop_resize_by_warp_affine(
        image, bbox_center, scale, input_res,
        interpolation=cv2.INTER_LINEAR).transpose(2, 0, 1)
    roi_img = self.normalize_image(cfg, roi_img)

    # roi_coord_2d ----------------------------------------------------
    roi_coord_2d = crop_resize_by_warp_affine(
        coord_2d, bbox_center, scale, out_res,
        interpolation=cv2.INTER_LINEAR).transpose(2, 0, 1)

    # roi_mask ---------------------------------------
    # (mask_trunc < mask_visib < mask_obj)
    mask_visib = anno["segmentation"].astype("float32") * mask_obj
    if mask_trunc is None:
        mask_trunc = mask_visib
    else:
        mask_trunc = mask_visib * mask_trunc.astype("float32")

    if cfg.TRAIN.VIS:
        mask_xyz_interp = cv2.INTER_LINEAR
    else:
        mask_xyz_interp = cv2.INTER_NEAREST

    # maybe truncated mask (true mask for rgb)
    roi_mask_trunc = crop_resize_by_warp_affine(
        mask_trunc[:, :, None], bbox_center, scale, out_res,
        interpolation=mask_xyz_interp)

    # use original visible mask to calculate xyz loss (try full obj mask?)
    roi_mask_visib = crop_resize_by_warp_affine(
        mask_visib[:, :, None], bbox_center, scale, out_res,
        interpolation=mask_xyz_interp)

    roi_mask_obj = crop_resize_by_warp_affine(
        mask_obj[:, :, None], bbox_center, scale, out_res,
        interpolation=mask_xyz_interp)

    # roi_xyz ----------------------------------------------------
    roi_xyz = crop_resize_by_warp_affine(xyz, bbox_center, scale, out_res,
                                         interpolation=mask_xyz_interp)

    # region label
    if r_head_cfg.NUM_REGIONS > 1:
        fps_points = self._get_fps_points(dataset_name)[roi_cls]
        roi_region = xyz_to_region(roi_xyz, fps_points)  # HW
        dataset_dict["roi_region"] = torch.as_tensor(
            roi_region.astype(np.int32)).contiguous()

    roi_xyz = roi_xyz.transpose(2, 0, 1)  # HWC --> CHW
    # normalize xyz to [0, 1] using extent
    roi_xyz[0] = roi_xyz[0] / roi_extent[0] + 0.5
    roi_xyz[1] = roi_xyz[1] / roi_extent[1] + 0.5
    roi_xyz[2] = roi_xyz[2] / roi_extent[2] + 0.5

    if ("CE" in r_head_cfg.XYZ_LOSS_TYPE) or ("cls" in cfg.MODEL.CDPN.NAME):
        # convert target to int for cls
        # assume roi_xyz has been normalized to [0, 1]
        roi_xyz_bin = np.zeros_like(roi_xyz)
        roi_x_norm = roi_xyz[0]
        roi_x_norm[roi_x_norm < 0] = 0  # clip
        roi_x_norm[roi_x_norm > 0.999999] = 0.999999
        # [0, BIN-1]
        roi_xyz_bin[0] = np.asarray(roi_x_norm * r_head_cfg.XYZ_BIN, dtype=np.uint8)
        roi_y_norm = roi_xyz[1]
        roi_y_norm[roi_y_norm < 0] = 0
        roi_y_norm[roi_y_norm > 0.999999] = 0.999999
        roi_xyz_bin[1] = np.asarray(roi_y_norm * r_head_cfg.XYZ_BIN, dtype=np.uint8)
        roi_z_norm = roi_xyz[2]
        roi_z_norm[roi_z_norm < 0] = 0
        roi_z_norm[roi_z_norm > 0.999999] = 0.999999
        roi_xyz_bin[2] = np.asarray(roi_z_norm * r_head_cfg.XYZ_BIN, dtype=np.uint8)

        # the last bin is for bg
        roi_masks = {
            "trunc": roi_mask_trunc,
            "visib": roi_mask_visib,
            "obj": roi_mask_obj
        }
        roi_mask_xyz = roi_masks[r_head_cfg.XYZ_LOSS_MASK_GT]
        roi_xyz_bin[0][roi_mask_xyz == 0] = r_head_cfg.XYZ_BIN
        roi_xyz_bin[1][roi_mask_xyz == 0] = r_head_cfg.XYZ_BIN
        roi_xyz_bin[2][roi_mask_xyz == 0] = r_head_cfg.XYZ_BIN

        if "CE" in r_head_cfg.XYZ_LOSS_TYPE:
            dataset_dict["roi_xyz_bin"] = torch.as_tensor(
                roi_xyz_bin.astype("uint8")).contiguous()
        if "/" in r_head_cfg.XYZ_LOSS_TYPE and len(
                r_head_cfg.XYZ_LOSS_TYPE.split("/")[1]) > 0:
            dataset_dict["roi_xyz"] = torch.as_tensor(
                roi_xyz.astype("float32")).contiguous()
    else:
        dataset_dict["roi_xyz"] = torch.as_tensor(
            roi_xyz.astype("float32")).contiguous()

    # pose targets ----------------------------------------------------------------------
    pose = inst_infos["pose"]
    allo_pose = egocentric_to_allocentric(pose)
    quat = inst_infos["quat"]
    allo_quat = mat2quat(allo_pose[:3, :3])

    # ====== actually not needed ==========
    if pnp_net_cfg.ROT_TYPE == "allo_quat":
        dataset_dict["allo_quat"] = torch.as_tensor(allo_quat.astype("float32"))
    elif pnp_net_cfg.ROT_TYPE == "ego_quat":
        dataset_dict["ego_quat"] = torch.as_tensor(quat.astype("float32"))
    # rot6d
    elif pnp_net_cfg.ROT_TYPE == "ego_rot6d":
        dataset_dict["ego_rot6d"] = torch.as_tensor(
            mat_to_ortho6d_np(pose[:3, :3].astype("float32")))
    elif pnp_net_cfg.ROT_TYPE == "allo_rot6d":
        dataset_dict["allo_rot6d"] = torch.as_tensor(
            mat_to_ortho6d_np(allo_pose[:3, :3].astype("float32")))
    # log quat
    elif pnp_net_cfg.ROT_TYPE == "ego_log_quat":
        dataset_dict["ego_log_quat"] = quaternion_lf.qlog(
            torch.as_tensor(quat.astype("float32"))[None])[0]
    elif pnp_net_cfg.ROT_TYPE == "allo_log_quat":
        dataset_dict["allo_log_quat"] = quaternion_lf.qlog(
            torch.as_tensor(allo_quat.astype("float32"))[None])[0]
    # lie vec
    elif pnp_net_cfg.ROT_TYPE == "ego_lie_vec":
        dataset_dict["ego_lie_vec"] = lie_algebra.rot_to_lie_vec(
            torch.as_tensor(pose[:3, :3].astype("float32")[None]))[0]
    elif pnp_net_cfg.ROT_TYPE == "allo_lie_vec":
        dataset_dict["allo_lie_vec"] = lie_algebra.rot_to_lie_vec(
            torch.as_tensor(allo_pose[:3, :3].astype("float32"))[None])[0]
    else:
        raise ValueError(f"Unknown rot type: {pnp_net_cfg.ROT_TYPE}")
    dataset_dict["ego_rot"] = torch.as_tensor(pose[:3, :3].astype("float32"))
    dataset_dict["trans"] = torch.as_tensor(inst_infos["trans"].astype("float32"))

    dataset_dict["roi_points"] = torch.as_tensor(
        self._get_model_points(dataset_name)[roi_cls].astype("float32"))
    dataset_dict["sym_info"] = self._get_sym_infos(dataset_name)[roi_cls]

    dataset_dict["roi_img"] = torch.as_tensor(
        roi_img.astype("float32")).contiguous()
    dataset_dict["roi_coord_2d"] = torch.as_tensor(
        roi_coord_2d.astype("float32")).contiguous()

    dataset_dict["roi_mask_trunc"] = torch.as_tensor(
        roi_mask_trunc.astype("float32")).contiguous()
    dataset_dict["roi_mask_visib"] = torch.as_tensor(
        roi_mask_visib.astype("float32")).contiguous()
    dataset_dict["roi_mask_obj"] = torch.as_tensor(
        roi_mask_obj.astype("float32")).contiguous()

    dataset_dict["bbox_center"] = torch.as_tensor(bbox_center, dtype=torch.float32)
    dataset_dict["scale"] = scale
    dataset_dict["bbox"] = anno["bbox"]  # NOTE: original bbox
    dataset_dict["roi_wh"] = torch.as_tensor(np.array([bw, bh], dtype=np.float32))
    dataset_dict["resize_ratio"] = resize_ratio = out_res / scale
    z_ratio = inst_infos["trans"][2] / resize_ratio
    obj_center = anno["centroid_2d"]
    delta_c = obj_center - bbox_center
    dataset_dict["trans_ratio"] = torch.as_tensor(
        [delta_c[0] / bw, delta_c[1] / bh, z_ratio]).to(torch.float32)
    return dataset_dict
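# `crop_resize_by_warp_affine` above cuts out a square (bbox_center, scale)
# window and resamples it to a fixed resolution with a single warp. A simplified
# sketch of such a helper (uniform scale, no rotation; assumes OpenCV and is
# not this project's exact implementation):
import cv2
import numpy as np

def crop_resize_by_warp_affine_sketch(img, center, scale, output_size,
                                      interpolation=cv2.INTER_LINEAR):
    """Map the square window of side `scale` centered at `center` onto an
    output_size x output_size patch with one affine warp."""
    f = output_size / float(scale)  # zoom factor from source window to patch
    cx, cy = float(center[0]), float(center[1])
    # forward affine map: the source point (cx, cy) lands at the patch center
    M = np.array([[f, 0.0, output_size / 2.0 - f * cx],
                  [0.0, f, output_size / 2.0 - f * cy]], dtype=np.float32)
    return cv2.warpAffine(img, M, (output_size, output_size), flags=interpolation)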
def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DetectionTransform` . Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: image: Tensor, image in (C, H, W) format. instances: Instances Other information that's included in the original dicts, such as: "height", "width" (int): the output resolution of the model, used in inference. See :meth:`postprocess` for details. Returns: losses (dict[str: Tensor]): mapping from a named loss to a tensor storing the loss. Used during training only. """ images = self.preprocess_image(batched_inputs) if "instances" in batched_inputs[0]: gt_instances = [x["instances"].to(self.device) for x in batched_inputs] elif "targets" in batched_inputs[0]: log_first_n( logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10 ) gt_instances = [x["targets"].to(self.device) for x in batched_inputs] else: gt_instances = None features = self.backbone(images.tensor) features = [features[f] for f in self.in_features] # apply the TensorMask head pred_logits, pred_deltas, pred_masks = self.head(features) # generate anchors based on features, is it image specific? anchors, unit_lengths, indexes = self.anchor_generator(features) if self.training: # get ground truths for class labels and box targets, it will label each anchor gt_class_info, gt_delta_info, gt_mask_info, num_fg = self.get_ground_truth( anchors, unit_lengths, indexes, gt_instances ) # compute the loss return self.losses( gt_class_info, gt_delta_info, gt_mask_info, num_fg, pred_logits, pred_deltas, pred_masks, ) else: # do inference to get the output results = self.inference(pred_logits, pred_deltas, pred_masks, anchors, indexes, images) processed_results = [] for results_im, input_im, image_size in zip( results, batched_inputs, images.image_sizes ): height = input_im.get("height", image_size[0]) width = input_im.get("width", image_size[1]) # this is to do post-processing with the image size result_box, result_mask = results_im r = _postprocess(result_box, result_mask, height, width) processed_results.append({"instances": r}) return processed_results
def forward(self, batched_inputs): """ Args: Same as in :class:`GeneralizedRCNN.forward` Returns: list[dict]: Each dict is the output for one input image. The dict contains one key "proposals" whose value is a :class:`Instances` with keys "proposal_boxes" and "objectness_logits". """ if not self.training and not self.visualize_path: return self.single_test(batched_inputs) with timer.env("preprocess"): images = [x["image"].to(self.device) for x in batched_inputs] images = [self.normalizer(x) for x in images] images = ImageList.from_tensors(images, self.backbone.size_divisibility) with timer.env("backbone"): features = self.backbone(images.tensor) if "instances" in batched_inputs[0]: gt_instances = [ x["instances"].to(self.device) for x in batched_inputs ] elif "targets" in batched_inputs[0]: log_first_n( logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10, ) gt_instances = [ x["targets"].to(self.device) for x in batched_inputs ] else: gt_instances = None if "sem_seg" in batched_inputs[0]: gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs] gt_sem_seg = ImageList.from_tensors( gt_sem_seg, self.backbone.size_divisibility, self.refinement_head.ignore_value, ).tensor else: gt_sem_seg = None with timer.env("fcose"): proposals, proposal_losses = self.proposal_generator( images, features, gt_instances) edge_map, head_losses, proposals = self.refinement_head( features, proposals, (gt_sem_seg, [gt_instances, images.image_sizes])) # In training, the proposals are not useful at all in RPN models; but not here # This makes RPN-only models about 5% slower. if self.training: timer.reset() proposal_losses.update(head_losses) return proposal_losses processed_results = [] with timer.env("postprocess"): for per_edge_map, results_per_image, input_per_image, image_size in zip( edge_map, proposals, batched_inputs, images.image_sizes): height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) # TODO (OPT): NO need for interpolate then back for real speed test with timer.env("extra"): edge_map_r = edge_map_postprocess(per_edge_map, image_size, height, width) instance_r = detector_postprocess( self.semantic_filter, self.semantic_filter_th, self.mask_result_src, results_per_image, height, width, self.roi_size, self.need_concave_hull, self.re_compute_box, ) processed_results.append( { "instances": instance_r, "edge_map": edge_map_r }, ) return processed_results
def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DatasetMapper` . Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: * image: Tensor, image in (C, H, W) format. * instances: Instances Other information that's included in the original dicts, such as: * "height", "width" (int): the output resolution of the model, used in inference. See :meth:`postprocess` for details. Returns: dict[str: Tensor]: mapping from a named loss to a tensor storing the loss. Used during training only. """ images = self.preprocess_image(batched_inputs) if "instances" in batched_inputs[0]: gt_instances = [ x["instances"].to(self.device) for x in batched_inputs ] elif "targets" in batched_inputs[0]: log_first_n( logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10) gt_instances = [ x["targets"].to(self.device) for x in batched_inputs ] else: gt_instances = None features = self.backbone(images.tensor) features = [features[f] for f in self.in_features] box_cls, box_delta = self.head(features) anchors = self.anchor_generator(features) if self.training: gt_classes, gt_anchors_reg_deltas = self.get_ground_truth( anchors, gt_instances) losses = self.losses(gt_classes, gt_anchors_reg_deltas, box_cls, box_delta) if self.vis_period > 0: storage = get_event_storage() if storage.iter % self.vis_period == 0: results = self.inference(box_cls, box_delta, anchors, images.image_sizes) self.visualize_training(batched_inputs, results) return losses else: results = self.inference(box_cls, box_delta, anchors, images.image_sizes) processed_results = [] for results_per_image, input_per_image, image_size in zip( results, batched_inputs, images.image_sizes): offset_x = input_per_image.get("offset_x", 0) offset_y = input_per_image.get("offset_y", 0) real_w = input_per_image.get("real_w", image_size[1]) real_h = input_per_image.get("real_h", image_size[0]) results_per_image.pred_boxes.tensor[:, 0::2] -= offset_x results_per_image.pred_boxes.tensor[:, 1::2] -= offset_y results_per_image.pred_boxes.clip((real_h, real_w)) results_per_image._image_size = (real_h, real_w) height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) r = detector_postprocess(results_per_image, height, width) processed_results.append({"instances": r}) return processed_results
def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DatasetMapper` . Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: * image: Tensor, image in (C, H, W) format. * instances (optional): groundtruth :class:`Instances` * proposals (optional): :class:`Instances`, precomputed proposals. Other information that's included in the original dicts, such as: * "height", "width" (int): the output resolution of the model, used in inference. See :meth:`postprocess` for details. Returns: list[dict]: Each dict is the output for one input image. The dict contains one key "instances" whose value is a :class:`Instances`. The :class:`Instances` object has the following keys: "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints" """ if not self.training: return self.inference(batched_inputs) # preprocess data # images for input images = self.preprocess_image(batched_inputs) # instance for object detection if "instances" in batched_inputs[0]: gt_instances = [x["instances"].to(self.device) for x in batched_inputs] elif "targets" in batched_inputs[0]: log_first_n( logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10 ) gt_instances = [x["targets"].to(self.device) for x in batched_inputs] else: gt_instances = None # classification data if "multi_labels" in batched_inputs[0]: classifier_targets = [o['multi_labels'] for o in batched_inputs] else: classifier_targets = None # segmentation data if 'sem_seg' in batched_inputs[0]: segmentation_targets = self.preprocess_semseg_image(batched_inputs) else: segmentation_targets = None # backbone extract features features = self.backbone(images.tensor) # task: object detection if self.proposal_generator: proposals, proposal_losses = self.proposal_generator(images, features, gt_instances) else: assert "proposals" in batched_inputs[0] proposals = [x["proposals"].to(self.device) for x in batched_inputs] proposal_losses = {} _, detector_losses = self.roi_heads(images, features, proposals, gt_instances) # task: multi-label classification if self.classifier is not None: # classifier_features = [features[f] for f in self.classifier_in_features] classifier_features = features[self.classifier_in_features[0]] _, multi_label_losses = self.classifier(classifier_features, classifier_targets) else: multi_label_losses = {} # task: metal segmentation model if self.metal_segmentation is not None: segmentation_features = features[self.metal_segmentation_in_features[0]] _, segmentation_losses = self.metal_segmentation(segmentation_features, segmentation_targets) else: segmentation_losses = {} # visualize if self.vis_period > 0: # vis_period > 0, 就添加图像可视化 storage = get_event_storage() if storage.iter % self.vis_period == 0: self.visualize_training(batched_inputs, proposals) # loss dict losses = {} losses.update(detector_losses) losses.update(proposal_losses) losses.update(multi_label_losses) losses.update(segmentation_losses) multi_loss = self.multi_loss_layer(losses) return losses, multi_loss
def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DatasetMapper` . Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: * image: Tensor, image in (C, H, W) format. * instances (optional): groundtruth :class:`Instances` * proposals (optional): :class:`Instances`, precomputed proposals. Other information that's included in the original dicts, such as: * "height", "width" (int): the output resolution of the model, used in inference. See :meth:`postprocess` for details. Returns: list[dict]: Each dict is the output for one input image. The dict contains one key "instances" whose value is a :class:`Instances`. The :class:`Instances` object has the following keys: "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints" """ if not self.training: return self.inference(batched_inputs) images = self.preprocess_image(batched_inputs) if "instances" in batched_inputs[0]: gt_instances = [ x["instances"].to(self.device) for x in batched_inputs ] elif "targets" in batched_inputs[0]: log_first_n( logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10) gt_instances = [ x["targets"].to(self.device) for x in batched_inputs ] else: gt_instances = None if self.backbone_2: RGB_tensor = images.tensor[:, :3, :, :] thermal_tensor = images.tensor[:, 3:, :, :] features_RGB = self.backbone(RGB_tensor) if self.blur_rgb: features_RGB = self.apply_Gaussian_blur(features_RGB) features_thermal = self.backbone(thermal_tensor) features = {} for key in features_RGB.keys(): if self.max_pool_rgb: max_pooling = nn.MaxPool2d(3, stride=1, padding=1) features_RGB[key] = max_pooling(features_RGB[key]) features[key] = torch.cat( (features_RGB[key], features_thermal[key]), 1) del features_RGB, features_thermal else: features = self.backbone(images.tensor) if self.proposal_generator: proposals, proposal_losses = self.proposal_generator( images, features, gt_instances) else: assert "proposals" in batched_inputs[0] proposals = [ x["proposals"].to(self.device) for x in batched_inputs ] proposal_losses = {} """ if len(gt_instances[0].gt_classes) == 0 or len(gt_instances[1].gt_classes) == 0: import pdb; pdb.set_trace() """ _, detector_losses = self.roi_heads(images, features, proposals, gt_instances) if self.vis_period > 0: storage = get_event_storage() if storage.iter % self.vis_period == 0: self.visualize_training(batched_inputs, proposals) losses = {} losses.update(detector_losses) losses.update(proposal_losses) return losses
def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DatasetMapper` . Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: * image: Tensor, image in (C, H, W) format. * instances: Instances Other information that's included in the original dicts, such as: * "height", "width" (int): the output resolution of the model, used in inference. See :meth:`postprocess` for details. Returns: dict[str: Tensor]: mapping from a named loss to a tensor storing the loss. Used during training only. """ if self.export_onnx: # skip the preprocess net_input = batched_inputs else: # do preprocess images = self.preprocess_image(batched_inputs) if "instances" in batched_inputs[0]: gt_instances = [ x["instances"].to(self.device) for x in batched_inputs ] elif "targets" in batched_inputs[0]: log_first_n( logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10) gt_instances = [ x["targets"].to(self.device) for x in batched_inputs ] else: gt_instances = None # for fashion classification task if "classification" in batched_inputs[0]: gt_classification = [ x["classification"].to(self.device) for x in batched_inputs ] else: gt_classification = None net_input = images.tensor features = self.backbone(net_input) features = [features[f] for f in self.in_features] detection_logits, detection_bbox_reg, classification_logits = self.head( features) if self.export_onnx: # skip the postprocess and return tuple of tensor(onnx needed!) return detection_logits, detection_bbox_reg, classification_logits anchors = self.anchor_generator(features) if self.training: losses = {} gt_classes, gt_anchors_reg_deltas = self.get_ground_truth( anchors, gt_instances, gt_classification) losses.update( self.detection_losses(gt_classes, gt_anchors_reg_deltas, detection_logits, detection_bbox_reg)) gt_classification_classes = self.get_classification_ground_truth( gt_classification) losses.update( self.classification_losses(gt_classification_classes, classification_logits)) if self.vis_period > 0: storage = get_event_storage() if storage.iter % self.vis_period == 0: results = self.inference(detection_logits, detection_bbox_reg, anchors, images.image_sizes) self.visualize_training(batched_inputs, results) return losses else: results = self.inference(detection_logits, detection_bbox_reg, anchors, images.image_sizes) category2_results = self.inference_classification( classification_logits) processed_results = [] for results_per_image, input_per_image, image_size, category2 in zip( results, batched_inputs, images.image_sizes, category2_results): height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) r = detector_postprocess(results_per_image, height, width) # if category is not commodity or model # r = self.filter_output_objects(category2, r) processed_results.append({ "instances": r, "classification": category2 }) return processed_results
def forward(self, batched_inputs): images = [x["image"].to(self.device) for x in batched_inputs] images = [self.normalizer(x) for x in images] images = ImageList.from_tensors(images, self.backbone.size_divisibility) features = self.backbone(images.tensor) if "instances" in batched_inputs[0] : gt_instances = [x["instances"].to(self.device) for x in batched_inputs] elif "targets" in batched_inputs[0]: log_first_n( logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10 ) gt_instances = [x["targets"].to(self.device) for x in batched_inputs] else: gt_instances = None proposals, proposal_losses = self.proposal_generator(images, features, gt_instances) if not self.training: if 'instance' in self.gt_input: assert gt_instances is not None for im_i in range(len(gt_instances)): gt_instances_per_im = gt_instances[im_i] bboxes = gt_instances_per_im.gt_boxes.tensor instances_per_im = Instances(proposals[im_i]._image_size) instances_per_im.pred_boxes = Boxes(bboxes) instances_per_im.pred_classes = gt_instances_per_im.gt_classes instances_per_im.scores = torch.ones_like(gt_instances_per_im.gt_classes).to(bboxes.device) if gt_instances_per_im.has("gt_masks"): gt_masks = gt_instances_per_im.gt_masks ext_pts_off = self.refinement_head.refine_head.get_simple_extreme_points( gt_masks.polygons).to(bboxes.device) ex_t = torch.stack([ext_pts_off[:, None, 0], bboxes[:, None, 1]], dim=2) ex_l = torch.stack([bboxes[:, None, 0], ext_pts_off[:, None, 1]], dim=2) ex_b = torch.stack([ext_pts_off[:, None, 2], bboxes[:, None, 3]], dim=2) ex_r = torch.stack([bboxes[:, None, 2], ext_pts_off[:, None, 3]], dim=2) instances_per_im.ext_points = ExtremePoints( torch.cat([ex_t, ex_l, ex_b, ex_r], dim=1)) else: quad = self.refinement_head.refine_head.get_quadrangle(bboxes).view(-1, 4, 2) instances_per_im.ext_points = ExtremePoints(quad) proposals[im_i] = instances_per_im head_losses, proposals = self.refinement_head(features, proposals, gt_instances) # In training, the proposals are not useful at all in RPN models; but not here # This makes RPN-only models about 5% slower. if self.training: proposal_losses.update(head_losses) return proposal_losses processed_results = [] for results_per_image, input_per_image, image_size in zip( proposals, batched_inputs, images.image_sizes ): height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) instance_r = detector_postprocess(results_per_image, height, width) processed_results.append( {"instances": instance_r} ) return processed_results
def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DetectionTransform` . Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: image: Tensor, image in (C, H, W) format. instances: Instances Other information that's included in the original dicts, such as: "height", "width" (int): the output resolution of the model, used in inference. See :meth:`postprocess` for details. Returns: losses (dict[str: Tensor]): mapping from a named loss to a tensor storing the loss. Used during training only. """ images = self.preprocess_image(batched_inputs) if "instances" in batched_inputs[0]: gt_instances = [x["instances"].to(self.device) for x in batched_inputs] elif "targets" in batched_inputs[0]: log_first_n( logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10 ) gt_instances = [x["targets"].to(self.device) for x in batched_inputs] else: gt_instances = None # print(images.image_sizes) # print(images.tensor.size()) features = self.backbone(images.tensor) # for k,v in features.items(): # plt.imshow(v[0].squeeze().mean(0).cpu().numpy()) # plt.show() classify_features = [features[f][0] for f in self.cin_features] points_features =[features[f][1] for f in self.pin_features] ins_features=[features[f][0] for f in self.ins_features] # apply the head # print(classify_features[0].size()) pf_b,pf_c,pf_h,pf_w=points_features[-1].size() target_points=points_features[-1].new_zeros(pf_b,2,pf_h,pf_w,requires_grad=False) pred_digits=self.cls_head(classify_features) pred_points=self.pc_head(target_points,points_features) if self.training: # get ground truths for class labels and box targets, it will label each anchor output_size=classify_features[-1].size() gt_clses, gt_belongs, gt_masks, gt_ins= self.get_ground_truth(gt_instances,output_size) # compute the loss return self.losses( gt_clses, gt_belongs, gt_masks, gt_ins, pred_digits, pred_points, ins_features[0] ) else: # do inference to get the output results = self.inference(pred_digits, pred_points,ins_features[0],images) # plt.imshow(np.max(pred_digits[0].cpu().numpy(),0)) # plt.show() # self.visualize_training(batched_inputs,results) processed_results = [] for results_im, input_im, image_size in zip( results, batched_inputs, images.image_sizes ): height = input_im.get("height", image_size[0]) width = input_im.get("width", image_size[1]) # this is to do post-processing with the image size # print(height,width,image_size) result= results_im r = _postprocess(result,height, width) processed_results.append({"instances": r}) return processed_results