class DetectionLayer(KE.Layer):
    """
    Takes classified proposal boxes and their bounding box deltas and
    returns the final detection boxes.

    Returns:
        [batch, num_detections, (y1, x1, y2, x2, class_id, class_score)]
        where coordinates are normalized.
    """

    def __init__(self, batch_size, **kwargs):
        super(DetectionLayer, self).__init__(**kwargs)
        self.batch_size = batch_size
        self.detection_max_instances = cfg.TEST.DETECTION_MAX_INSTANCES
        self.image_utils = ImageUtils()
        self.bbox_utils = BboxUtil()
        self.misc_utils = MiscUtils()

    def call(self, inputs):
        rois = inputs[0]
        mrcnn_class = inputs[1]
        mrcnn_bbox = inputs[2]
        image_meta = inputs[3]

        # Get windows of images in normalized coordinates. The window is the
        # area in the image that excludes the padding.
        # Use the shape of the first image in the batch to normalize the
        # window, because all images get resized to the same size.
        m = self.image_utils.parse_image_meta_graph(image_meta)
        image_shape = m['image_shape'][0]
        window = self.bbox_utils.norm_boxes_graph(m['window'], image_shape[:2])

        # Run the detection refinement graph on each item in the batch.
        detections_batch = self.misc_utils.batch_slice(
            [rois, mrcnn_class, mrcnn_bbox, window],
            lambda x, y, w, z: refine_detections_graph(x, y, w, z),
            self.batch_size)

        # Reshape output to
        # [batch, num_detections, (y1, x1, y2, x2, class_id, class_score)]
        # in normalized coordinates.
        return tf.reshape(
            detections_batch,
            [self.batch_size, self.detection_max_instances, 6])

    def compute_output_shape(self, input_shape):
        return (None, self.detection_max_instances, 6)
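# ----------------------------------------------------------------------
# Illustrative sketch (not part of the original model). The
# refine_detections_graph referenced above is defined elsewhere in this
# repo; the standalone function below (hypothetical name, TF 1.x-style
# ops, relying on the module-level `tensorflow as tf` import) shows the
# per-image refinement that DetectionLayer runs through batch_slice. The
# real graph also scales deltas by a BBOX_STD_DEV config value and
# applies NMS per class; a single NMS pass is used here for brevity.

def refine_detections_sketch(rois, probs, deltas, window,
                             score_thresh=0.7, nms_thresh=0.3,
                             max_instances=100):
    """rois: [N, (y1, x1, y2, x2)], probs: [N, num_classes],
    deltas: [N, num_classes, 4], window: [4] -- all normalized."""
    # Best class per ROI and its score/deltas
    num_rois = tf.shape(probs)[0]
    class_ids = tf.argmax(probs, axis=1, output_type=tf.int32)
    idx = tf.stack([tf.range(num_rois), class_ids], axis=1)
    class_scores = tf.gather_nd(probs, idx)
    deltas_specific = tf.gather_nd(deltas, idx)

    # Apply (dy, dx, log(dh), log(dw)) deltas to the ROIs
    h = rois[:, 2] - rois[:, 0]
    w = rois[:, 3] - rois[:, 1]
    cy = rois[:, 0] + 0.5 * h + deltas_specific[:, 0] * h
    cx = rois[:, 1] + 0.5 * w + deltas_specific[:, 1] * w
    h = h * tf.exp(deltas_specific[:, 2])
    w = w * tf.exp(deltas_specific[:, 3])
    boxes = tf.stack([cy - 0.5 * h, cx - 0.5 * w,
                      cy + 0.5 * h, cx + 0.5 * w], axis=1)

    # Clip to the window that excludes image padding
    wy1, wx1, wy2, wx2 = tf.unstack(window)
    boxes = tf.stack([tf.clip_by_value(boxes[:, 0], wy1, wy2),
                      tf.clip_by_value(boxes[:, 1], wx1, wx2),
                      tf.clip_by_value(boxes[:, 2], wy1, wy2),
                      tf.clip_by_value(boxes[:, 3], wx1, wx2)], axis=1)

    # Drop background (class 0) and low-confidence boxes, then NMS
    keep = tf.where(tf.logical_and(class_ids > 0,
                                   class_scores >= score_thresh))[:, 0]
    boxes = tf.gather(boxes, keep)
    scores = tf.gather(class_scores, keep)
    ids = tf.to_float(tf.gather(class_ids, keep))
    nms_idx = tf.image.non_max_suppression(boxes, scores,
                                           max_instances, nms_thresh)

    # Assemble [n, (y1, x1, y2, x2, class_id, score)] and zero-pad so
    # every image in the batch yields a fixed-size output.
    detections = tf.concat([tf.gather(boxes, nms_idx),
                            tf.gather(ids, nms_idx)[:, None],
                            tf.gather(scores, nms_idx)[:, None]], axis=1)
    pad = tf.maximum(max_instances - tf.shape(detections)[0], 0)
    return tf.pad(detections, [(0, pad), (0, 0)])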
class DetectionTargetLayer(KE.Layer):
    """
    Subsamples proposals and generates target box refinements, class_ids,
    and masks for each.

    Inputs:
        proposals: [batch, N, (y1, x1, y2, x2)] in normalized coordinates.
            Might be zero padded if there are not enough proposals.
        gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs.
        gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in
            normalized coordinates.
        gt_masks: [batch, height, width, MAX_GT_INSTANCES] of boolean type.

    Returns: Target ROIs and corresponding class IDs, bounding box shifts,
    and masks.
        rois: [batch, TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in
            normalized coordinates
        target_class_ids: [batch, TRAIN_ROIS_PER_IMAGE]. Integer class IDs.
        target_deltas: [batch, TRAIN_ROIS_PER_IMAGE, (dy, dx, log(dh), log(dw))]
        target_mask: [batch, TRAIN_ROIS_PER_IMAGE, height, width]
            Masks cropped to bbox boundaries and resized to neural
            network output size.

    Note: Returned arrays might be zero padded if not enough target ROIs.
    """

    def __init__(self, batch_size, **kwargs):
        super(DetectionTargetLayer, self).__init__(**kwargs)
        self.batch_size = batch_size
        self.misc_utils = MiscUtils()
        self.rois_per_image = cfg.TRAIN.ROIS_PER_IMAGE
        self.mask_shape = cfg.TRAIN.MASK_SHAPE

    def call(self, inputs):
        """
        Invoked by Keras when the layer is applied to its input tensors.
        :param inputs: [proposals, gt_class_ids, gt_boxes, gt_masks]
        :return: [rois, target_class_ids, target_bbox, target_mask]
        """
        proposals = inputs[0]
        gt_class_ids = inputs[1]
        gt_boxes = inputs[2]
        gt_masks = inputs[3]

        # Slice the batch and run a graph for each slice.
        # TODO: Rename target_bbox to target_deltas for clarity
        names = ["rois", "target_class_ids", "target_bbox", "target_mask"]
        outputs = self.misc_utils.batch_slice(
            [proposals, gt_class_ids, gt_boxes, gt_masks],
            lambda w, x, y, z: self.misc_utils.detection_targets_graph(
                w, x, y, z),
            self.batch_size, names=names)
        return outputs

    def compute_output_shape(self, input_shape):
        return [
            (None, self.rois_per_image, 4),  # rois
            (None, self.rois_per_image),  # class_ids
            (None, self.rois_per_image, 4),  # deltas
            (None, self.rois_per_image, self.mask_shape[0],
             self.mask_shape[1])  # masks
        ]

    def compute_mask(self, inputs, mask=None):
        return [None, None, None, None]
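# ----------------------------------------------------------------------
# Illustrative sketch (not part of the original model). The heavy lifting
# above happens in MiscUtils.detection_targets_graph, defined elsewhere
# in this repo. The per-image function below (hypothetical name, TF 1.x
# ops, module-level `tensorflow as tf` import assumed) shows its core
# idea -- IoU matching, positive/negative subsampling, and regression
# targets -- while omitting zero-padding trimming, crowd boxes, delta
# normalization, and mask targets.

def detection_targets_sketch(proposals, gt_class_ids, gt_boxes,
                             rois_per_image=200, positive_ratio=0.33):
    """proposals: [N, 4], gt_class_ids: [M], gt_boxes: [M, 4],
    all boxes normalized (y1, x1, y2, x2)."""
    # Pairwise IoU between proposals and GT boxes, [N, M]
    b1 = tf.expand_dims(proposals, 1)
    b2 = tf.expand_dims(gt_boxes, 0)
    y1 = tf.maximum(b1[..., 0], b2[..., 0])
    x1 = tf.maximum(b1[..., 1], b2[..., 1])
    y2 = tf.minimum(b1[..., 2], b2[..., 2])
    x2 = tf.minimum(b1[..., 3], b2[..., 3])
    inter = tf.maximum(y2 - y1, 0.) * tf.maximum(x2 - x1, 0.)
    area1 = (b1[..., 2] - b1[..., 0]) * (b1[..., 3] - b1[..., 1])
    area2 = (b2[..., 2] - b2[..., 0]) * (b2[..., 3] - b2[..., 1])
    overlaps = inter / (area1 + area2 - inter)

    # Positives overlap some GT box with IoU >= 0.5; the rest are negatives.
    roi_iou_max = tf.reduce_max(overlaps, axis=1)
    positive_idx = tf.where(roi_iou_max >= 0.5)[:, 0]
    negative_idx = tf.where(roi_iou_max < 0.5)[:, 0]

    # Subsample so positives make up roughly positive_ratio of the ROIs.
    positive_count = int(rois_per_image * positive_ratio)
    positive_idx = tf.random_shuffle(positive_idx)[:positive_count]
    negative_count = rois_per_image - tf.shape(positive_idx)[0]
    negative_idx = tf.random_shuffle(negative_idx)[:negative_count]
    positive_rois = tf.gather(proposals, positive_idx)
    negative_rois = tf.gather(proposals, negative_idx)

    # Assign each positive ROI to the GT box it overlaps most.
    positive_overlaps = tf.gather(overlaps, positive_idx)
    gt_assignment = tf.argmax(positive_overlaps, axis=1)
    roi_gt_boxes = tf.gather(gt_boxes, gt_assignment)
    roi_gt_class_ids = tf.gather(gt_class_ids, gt_assignment)

    # Regression targets (dy, dx, log(dh), log(dw)) from ROI to GT box.
    h = positive_rois[:, 2] - positive_rois[:, 0]
    w = positive_rois[:, 3] - positive_rois[:, 1]
    gt_h = roi_gt_boxes[:, 2] - roi_gt_boxes[:, 0]
    gt_w = roi_gt_boxes[:, 3] - roi_gt_boxes[:, 1]
    dy = ((roi_gt_boxes[:, 0] + 0.5 * gt_h) - (positive_rois[:, 0] + 0.5 * h)) / h
    dx = ((roi_gt_boxes[:, 1] + 0.5 * gt_w) - (positive_rois[:, 1] + 0.5 * w)) / w
    deltas = tf.stack([dy, dx, tf.log(gt_h / h), tf.log(gt_w / w)], axis=1)

    # Negatives get class 0 (background) and contribute no box targets.
    rois = tf.concat([positive_rois, negative_rois], axis=0)
    class_ids = tf.concat(
        [roi_gt_class_ids,
         tf.zeros([tf.shape(negative_rois)[0]], dtype=roi_gt_class_ids.dtype)],
        axis=0)
    return rois, class_ids, deltas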
class ProposalLayer(KE.Layer):
    """
    Receives anchor scores and selects a subset to pass as proposals to
    the second stage. Filtering is done based on anchor scores and
    non-max suppression to remove overlaps. It also applies bounding box
    refinement deltas to anchors.

    Inputs:
        rpn_probs: [batch, num_anchors, (bg prob, fg prob)]
        rpn_bbox: [batch, num_anchors, (dy, dx, log(dh), log(dw))]
        anchors: [batch, num_anchors, (y1, x1, y2, x2)] anchors in
            normalized coordinates

    Returns:
        Proposals in normalized coordinates
        [batch, rois, (y1, x1, y2, x2)]
    """

    def __init__(self, proposal_count, nms_threshold, batch_size, **kwargs):
        super(ProposalLayer, self).__init__(**kwargs)
        self.proposal_count = proposal_count
        self.nms_threshold = nms_threshold
        self.batch_size = batch_size
        self.misc_utils = MiscUtils()
        self.bbox_utils = BboxUtil()

    def call(self, inputs):
        """
        Invoked by Keras when the layer is applied to its input tensors.
        :param inputs: [rpn_probs, rpn_bbox, anchors]
        :return: proposals in normalized coordinates
        """
        # Box scores. Use the foreground class confidence.
        # [batch, num_anchors]
        scores = inputs[0][:, :, 1]
        # Box deltas. [batch, num_anchors, 4]
        deltas = inputs[1]
        rpn_bbox_std_dev = np.array(cfg.COMMON.RPN_BBOX_STD_DEV)
        deltas = deltas * np.reshape(rpn_bbox_std_dev, [1, 1, 4])
        # Anchors
        anchors = inputs[2]

        # Improve performance by trimming to the top anchors by score
        # and doing the rest on the smaller subset.
        pre_nms_limit = tf.minimum(cfg.COMMON.PRE_NMS_LIMIT,
                                   tf.shape(anchors)[1])
        ix = tf.nn.top_k(scores, pre_nms_limit, sorted=True,
                         name="top_anchors").indices
        scores = self.misc_utils.batch_slice([scores, ix],
                                             lambda x, y: tf.gather(x, y),
                                             self.batch_size)
        deltas = self.misc_utils.batch_slice([deltas, ix],
                                             lambda x, y: tf.gather(x, y),
                                             self.batch_size)
        pre_nms_anchors = self.misc_utils.batch_slice(
            [anchors, ix], lambda a, x: tf.gather(a, x),
            self.batch_size, names=["pre_nms_anchors"])

        # Apply deltas to anchors to get refined anchors.
        # [batch, N, (y1, x1, y2, x2)]
        boxes = self.misc_utils.batch_slice(
            [pre_nms_anchors, deltas],
            lambda x, y: self.bbox_utils.apply_box_deltas_graph(x, y),
            self.batch_size, names=["refined_anchors"])

        # Clip to image boundaries. Since we're in normalized coordinates,
        # clip to the 0..1 range. [batch, N, (y1, x1, y2, x2)]
        window = np.array([0, 0, 1, 1], dtype=np.float32)
        boxes = self.misc_utils.batch_slice(
            boxes,
            lambda x: self.bbox_utils.clip_boxes_graph(x, window),
            self.batch_size, names=["refined_anchors_clipped"])

        # Filter out small boxes?
        # According to Xinlei Chen's paper, this reduces detection accuracy
        # for small objects, so we're skipping it.

        # Non-max suppression
        def nms(boxes, scores):
            indices = tf.image.non_max_suppression(
                boxes, scores, self.proposal_count, self.nms_threshold,
                name="rpn_non_max_suppression")
            proposals = tf.gather(boxes, indices)
            # Pad with zeros if needed to keep a fixed proposal count.
            padding = tf.maximum(
                self.proposal_count - tf.shape(proposals)[0], 0)
            proposals = tf.pad(proposals, [(0, padding), (0, 0)])
            return proposals

        proposals = self.misc_utils.batch_slice([boxes, scores], nms,
                                                self.batch_size)
        return proposals

    def compute_output_shape(self, input_shape):
        return (None, self.proposal_count, 4)
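# ----------------------------------------------------------------------
# Illustrative sketch (not part of the original model). All three layers
# above lean on MiscUtils.batch_slice, which is defined elsewhere in this
# repo. The function below (hypothetical name) shows the core trick under
# the assumption that it behaves like the classic Mask R-CNN utility:
# ops such as tf.image.non_max_suppression only accept a single image, so
# the batch is split, graph_fn is applied per item, and the per-item
# results are stacked back into batched tensors.

def batch_slice_sketch(inputs, graph_fn, batch_size, names=None):
    """Split inputs along the batch axis, apply graph_fn to each slice,
    and stack the per-slice results back into batched tensors."""
    if not isinstance(inputs, list):
        inputs = [inputs]
    outputs = []
    for i in range(batch_size):
        inputs_slice = [x[i] for x in inputs]
        output_slice = graph_fn(*inputs_slice)
        if not isinstance(output_slice, (tuple, list)):
            output_slice = [output_slice]
        outputs.append(output_slice)
    # Regroup from per-item lists to per-output lists, then stack each.
    outputs = list(zip(*outputs))
    if names is None:
        names = [None] * len(outputs)
    result = [tf.stack(o, axis=0, name=n) for o, n in zip(outputs, names)]
    if len(result) == 1:
        result = result[0]
    return result

# Usage mirrors the calls above, e.g. running the per-image nms closure
# over each item of a batch of boxes and scores:
#   proposals = batch_slice_sketch([boxes, scores], nms, batch_size)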