class DetectionLayer(KE.Layer):
    """
    Takes classified proposal boxes and their bounding box deltas and
    returns the final detection boxes.

    Returns:
        [batch, num_detections, (y1, x1, y2, x2, class_id, class_score)]
        where coordinates are normalized.
    """

    def __init__(self, batch_size, **kwargs):
        super(DetectionLayer, self).__init__(**kwargs)
        self.batch_size = batch_size
        self.detection_max_instances = cfg.TEST.DETECTION_MAX_INSTANCES
        self.image_utils = ImageUtils()
        self.bbox_utils = BboxUtil()
        self.misc_utils = MiscUtils()

    def call(self, inputs):
        rois = inputs[0]
        mrcnn_class = inputs[1]
        mrcnn_bbox = inputs[2]
        image_meta = inputs[3]

        # Get windows of images in normalized coordinates. Windows are the area
        # in the image that excludes the padding.
        # Use the shape of the first image in the batch to normalize the window
        # because we know that all images get resized to the same size.
        m = self.image_utils.parse_image_meta_graph(image_meta)
        image_shape = m['image_shape'][0]
        window = self.bbox_utils.norm_boxes_graph(m['window'], image_shape[:2])

        # Run the detection refinement graph on each item in the batch.
        detections_batch = self.misc_utils.batch_slice(
            [rois, mrcnn_class, mrcnn_bbox, window],
            lambda x, y, w, z: refine_detections_graph(x, y, w, z),
            self.batch_size)

        # Reshape output to
        # [batch, num_detections, (y1, x1, y2, x2, class_id, class_score)]
        # in normalized coordinates.
        return tf.reshape(
            detections_batch,
            [self.batch_size, self.detection_max_instances, 6])

    def compute_output_shape(self, input_shape):
        return (None, self.detection_max_instances, 6)
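
# A usage sketch (illustrative; it assumes the tensors built in
# MaskRCNN.build() below — rpn_rois, mrcnn_class, mrcnn_bbox and
# input_image_meta):
#
#   detections = DetectionLayer(batch_size, name="mrcnn_detection")(
#       [rpn_rois, mrcnn_class, mrcnn_bbox, input_image_meta])
#   # -> [batch, DETECTION_MAX_INSTANCES, 6] in normalized coordinates
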
class PyramidROIAlign(KE.Layer):
    """
    Implements ROI Pooling on multiple levels of the feature pyramid.

    Inputs:
        - boxes: [batch, num_boxes, (y1, x1, y2, x2)] in normalized coordinates.
          Possibly padded with zeros if there are not enough boxes to fill the array.
        - image_meta: [batch, (meta data)] Image details. See compose_image_meta()
        - feature_maps: List of feature maps from different levels of the pyramid.
          Each is [batch, height, width, channels]

    Output:
        Pooled regions in the shape:
        [batch, num_boxes, pool_height, pool_width, channels].
        The width and height are those specified in pool_shape in the layer constructor.
    """

    def __init__(self, pool_shape, **kwargs):
        super(PyramidROIAlign, self).__init__(**kwargs)
        self.pool_shape = tuple(pool_shape)
        self.image_utils = ImageUtils()

    def call(self, inputs):
        # Crop boxes [batch, num_boxes, (y1, x1, y2, x2)] in normalized coords
        boxes = inputs[0]

        # Image meta: holds details about the image. See compose_image_meta()
        image_meta = inputs[1]

        # Feature maps: list of feature maps from different levels of the
        # feature pyramid. Each is [batch, height, width, channels]
        feature_maps = inputs[2:]

        # Assign each ROI to a level in the pyramid based on the ROI area.
        y1, x1, y2, x2 = tf.split(boxes, 4, axis=2)
        h = y2 - y1
        w = x2 - x1

        # Use the shape of the first image. Images in a batch must have the same size.
        image_shape = self.image_utils.parse_image_meta_graph(
            image_meta)['image_shape'][0]

        # Equation 1 in the Feature Pyramid Networks paper. Account for
        # the fact that our coordinates are normalized here.
        # e.g. a 224x224 ROI (in pixels) maps to P4.
        image_area = tf.cast(image_shape[0] * image_shape[1], tf.float32)
        roi_level = log2_graph(tf.sqrt(h * w) / (224.0 / tf.sqrt(image_area)))
        roi_level = tf.minimum(
            5, tf.maximum(2, 4 + tf.cast(tf.round(roi_level), tf.int32)))
        roi_level = tf.squeeze(roi_level, 2)

        # Loop through levels and apply ROI pooling to each. P2 to P5.
        pooled = []
        box_to_level = []
        for i, level in enumerate(range(2, 6)):
            ix = tf.where(tf.equal(roi_level, level))
            level_boxes = tf.gather_nd(boxes, ix)

            # Box indices for crop_and_resize.
            box_indices = tf.cast(ix[:, 0], tf.int32)

            # Keep track of which box is mapped to which level.
            box_to_level.append(ix)

            # Stop gradient propagation to ROI proposals.
            level_boxes = tf.stop_gradient(level_boxes)
            box_indices = tf.stop_gradient(box_indices)

            # Crop and Resize
            # From the Mask R-CNN paper: "We sample four regular locations, so
            # that we can evaluate either max or average pooling. In fact,
            # interpolating only a single value at each bin center (without
            # pooling) is nearly as effective."
            #
            # Here we use the simplified approach of a single value per bin,
            # which is how it's done in tf.crop_and_resize().
            # Result: [batch * num_boxes, pool_height, pool_width, channels]
            pooled.append(
                tf.image.crop_and_resize(feature_maps[i], level_boxes,
                                         box_indices, self.pool_shape,
                                         method="bilinear"))

        # Pack pooled features into one tensor.
        pooled = tf.concat(pooled, axis=0)

        # Pack the box_to_level mapping into one array and add another
        # column representing the order of pooled boxes.
        box_to_level = tf.concat(box_to_level, axis=0)
        box_range = tf.expand_dims(tf.range(tf.shape(box_to_level)[0]), 1)
        box_to_level = tf.concat([tf.cast(box_to_level, tf.int32), box_range],
                                 axis=1)

        # Rearrange pooled features to match the order of the original boxes.
        # Sort box_to_level by batch index, then by box index.
        # TF doesn't have a way to sort by two columns, so merge them and sort.
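        # For example (illustrative values): batch index 1, box index 3 merge
        # into the single key 1 * 100000 + 3 = 100003, so sorting this one
        # column orders the boxes by batch first, then by box index.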
        sorting_tensor = box_to_level[:, 0] * 100000 + box_to_level[:, 1]
        ix = tf.nn.top_k(sorting_tensor,
                         k=tf.shape(box_to_level)[0]).indices[::-1]
        ix = tf.gather(box_to_level[:, 2], ix)
        pooled = tf.gather(pooled, ix)

        # Re-add the batch dimension.
        shape = tf.concat([tf.shape(boxes)[:2], tf.shape(pooled)[1:]], axis=0)
        pooled = tf.reshape(pooled, shape)
        return pooled

    def compute_output_shape(self, input_shape):
        return input_shape[0][:2] + self.pool_shape + (input_shape[2][-1], )
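
# A minimal NumPy restatement of the level-assignment rule above (Eq. 1 of
# the FPN paper), for intuition only. The layer computes it on normalized
# boxes inside the graph; this pixel-space helper and its name are
# illustrative, not part of the model.
def fpn_roi_level_example(roi_h_px, roi_w_px, k0=4):
    """Map an ROI given in pixels to a pyramid level, clipped to [2, 5]."""
    level = k0 + np.log2(np.sqrt(roi_h_px * roi_w_px) / 224.0)
    return int(np.clip(np.round(level), 2, 5))

# e.g. fpn_roi_level_example(224, 224) -> 4 (P4)
#      fpn_roi_level_example(112, 112) -> 3 (P3)
#      fpn_roi_level_example(448, 448) -> 5 (P5)
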
class MaskRCNN(object):

    def __init__(self, train_flag=True):
        """
        :param train_flag: whether this is training; True for training, False for testing
        """
        self.train_flag = train_flag
        self.bbox_util = BboxUtil()
        self.anchor_utils = AnchorUtils()
        self.image_utils = ImageUtils()
        self.mask_util = MaskUtil()

        # model path
        self.model_path = cfg.TRAIN.MODEL_PATH if self.train_flag else cfg.TEST.COCO_MODEL_PATH
        # batch size
        self.batch_size = cfg.TRAIN.BATCH_SIZE if self.train_flag else cfg.TEST.BATCH_SIZE
        # path to save the trained model
        self.save_model_path = cfg.TRAIN.SAVE_MODEL_PATH

        self.backbone = cfg.COMMON.BACKBONE
        self.backbone_strides = cfg.COMMON.BACKBONE_STRIDES
        # input image
        self.image_shape = np.array(cfg.COMMON.IMAGE_SHAPE)
        # size of the top-down layers used to build the feature pyramid
        self.top_down_pyramid_size = cfg.COMMON.TOP_DOWN_PYRAMID_SIZE

        self.rpn_anchor_stride = cfg.COMMON.RPN_ANCHOR_STRIDE
        self.rpn_anchor_ratios = cfg.COMMON.RPN_ANCHOR_RATIOS
        self.rpn_nms_threshold = cfg.COMMON.RPN_NMS_THRESHOLD
        self.class_num = cfg.COMMON.CLASS_NUM

        self.rois_per_image = cfg.TRAIN.ROIS_PER_IMAGE
        self.roi_positive_ratio = cfg.TRAIN.ROI_POSITIVE_RATIO

        self.keras_model = self.build()
        pass

    def build(self):
        # image shape
        h, w, c = self.image_shape[:]
        print("image_shape: {}".format(self.image_shape))

        if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6):
            raise Exception("Image size must be divisible by 2 at least 6 times "
                            "to avoid fractions when downscaling and upscaling. "
                            "For example, use 256, 320, 384, 448, 512, ... etc.")

        # Inputs
        input_image = kl.Input(shape=[None, None, c], name="input_image")
        input_image_meta = kl.Input(shape=[cfg.COMMON.IMAGE_META_SIZE],
                                    name="input_image_meta")

        # training
        if self.train_flag:
            # RPN GT
            input_rpn_match = kl.Input(shape=[None, 1], name="input_rpn_match", dtype=tf.int32)
            input_rpn_bbox = kl.Input(shape=[None, 4], name="input_rpn_bbox", dtype=tf.float32)

            # Detection GT (class IDs, bounding boxes, and masks)
            # 1. GT Class IDs (zero padded)
            input_gt_class_ids = kl.Input(shape=[None], name="input_gt_class_ids", dtype=tf.int32)

            # 2. GT Boxes in pixels (zero padded)
            # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates
            input_gt_boxes = kl.Input(shape=[None, 4], name="input_gt_boxes", dtype=tf.float32)

            # Normalize coordinates
            gt_boxes = kl.Lambda(lambda x: self.bbox_util.norm_boxes_graph(
                x, k.shape(input_image)[1:3]))(input_gt_boxes)

            # 3. GT Masks (zero padded)
            # [batch, height, width, MAX_GT_INSTANCES]
            if cfg.TRAIN.USE_MINI_MASK:
                min_h, min_w = cfg.TRAIN.MINI_MASK_SHAPE[:]
                input_gt_masks = kl.Input(shape=[min_h, min_w, None],
                                          name="input_gt_masks", dtype=bool)
            else:
                input_gt_masks = kl.Input(shape=[h, w, None],
                                          name="input_gt_masks", dtype=bool)
                pass

            # anchors
            anchors = self.anchor_utils.get_anchors(self.image_shape)
            # Duplicate across the batch dimension because Keras requires it
            # TODO: can this be optimized to avoid duplicating the anchors?
            anchors = np.broadcast_to(anchors, (self.batch_size, ) + anchors.shape)
            # A hack to get around Keras's bad support for constants
            anchors = kl.Lambda(lambda x: tf.Variable(anchors), name="anchors")(input_image)
            pass
        else:
            # Anchors in normalized coordinates
            anchors = kl.Input(shape=[None, 4], name="input_anchors")

            # The training-only inputs above are not needed at test time, but
            # define them in this branch to avoid undefined names later on.
            input_rpn_match = None
            input_rpn_bbox = None
            input_gt_class_ids = None
            gt_boxes = None
            input_gt_boxes = None
            input_gt_masks = None
            pass
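        # For orientation (illustrative numbers, assuming a 1024x1024 input,
        # the usual strides [4, 8, 16, 32, 64] and 3 anchor ratios): the FPN
        # maps built below are P2 256x256, P3 128x128, P4 64x64, P5 32x32 and
        # P6 16x16, which gives 3 * (256^2 + 128^2 + 64^2 + 32^2 + 16^2)
        # = 261,888 anchors per image.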
        # Build the shared convolutional layers.
        # Bottom-up layers: returns the last layer of each stage, 5 in total.
        # Stage 5 is created here because the FPN top-down path uses C5.
        _, c2, c3, c4, c5 = backbone.resnet_graph(input_image, self.backbone, stage5=True)

        # Top-down Layers
        # TODO: add assert to verify feature map sizes match what's in config
        p5 = kl.Conv2D(self.top_down_pyramid_size, (1, 1), name='fpn_c5p5')(c5)
        p4 = kl.Add(name="fpn_p4add")([
            kl.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(p5),
            kl.Conv2D(self.top_down_pyramid_size, (1, 1), name='fpn_c4p4')(c4)
        ])
        p3 = kl.Add(name="fpn_p3add")([
            kl.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(p4),
            kl.Conv2D(self.top_down_pyramid_size, (1, 1), name='fpn_c3p3')(c3)
        ])
        p2 = kl.Add(name="fpn_p2add")([
            kl.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(p3),
            kl.Conv2D(self.top_down_pyramid_size, (1, 1), name='fpn_c2p2')(c2)
        ])

        # Attach a 3x3 conv to all P layers to get the final feature maps.
        p2 = kl.Conv2D(self.top_down_pyramid_size, (3, 3), padding="SAME", name="fpn_p2")(p2)
        p3 = kl.Conv2D(self.top_down_pyramid_size, (3, 3), padding="SAME", name="fpn_p3")(p3)
        p4 = kl.Conv2D(self.top_down_pyramid_size, (3, 3), padding="SAME", name="fpn_p4")(p4)
        p5 = kl.Conv2D(self.top_down_pyramid_size, (3, 3), padding="SAME", name="fpn_p5")(p5)

        # P6 is used for the 5th anchor scale in RPN. Generated by
        # subsampling from P5 with a stride of 2.
        p6 = kl.MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(p5)

        # Note that P6 is used in RPN, but not in the classifier heads.
        rpn_feature_maps = [p2, p3, p4, p5, p6]
        mrcnn_feature_maps = [p2, p3, p4, p5]

        # RPN Model
        rpn = common.build_rpn_model(self.rpn_anchor_stride,
                                     len(self.rpn_anchor_ratios),
                                     self.top_down_pyramid_size)

        # Loop through pyramid layers
        layer_outputs = []  # list of lists
        for p in rpn_feature_maps:
            layer_outputs.append(rpn([p]))
            pass

        # Concatenate layer outputs
        # Convert from list of lists of level outputs to list of lists
        # of outputs across levels.
        # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
        output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"]
        outputs = list(zip(*layer_outputs))
        outputs = [
            kl.Concatenate(axis=1, name=n)(list(o))
            for o, n in zip(outputs, output_names)
        ]

        rpn_class_logits, rpn_class, rpn_bbox = outputs

        # Generate proposals
        # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
        # and zero padded.
        proposal_count = cfg.TRAIN.POST_NMS_ROIS if self.train_flag else cfg.TEST.POST_NMS_ROIS

        rpn_rois = common.ProposalLayer(proposal_count=proposal_count,
                                        nms_threshold=self.rpn_nms_threshold,
                                        batch_size=self.batch_size,
                                        name="ROI")([rpn_class, rpn_bbox, anchors])

        fc_layer_size = cfg.COMMON.FPN_CLASS_FC_LAYERS_SIZE
        pool_size = cfg.COMMON.POOL_SIZE
        mask_pool_size = cfg.COMMON.MASK_POOL_SIZE
        train_or_freeze = cfg.COMMON.TRAIN_FLAG

        if self.train_flag:
            # Class ID mask to mark class IDs supported by the dataset the
            # image came from.
            active_class_ids = kl.Lambda(
                lambda x: self.image_utils.parse_image_meta_graph(x)[
                    "active_class_ids"])(input_image_meta)

            if not cfg.TRAIN.USE_RPN_ROIS:
                # Ignore predicted ROIs and use ROIs provided as an input.
                input_rois = kl.Input(shape=[proposal_count, 4],
                                      name="input_roi", dtype=np.int32)
                # Normalize coordinates
                target_rois = kl.Lambda(
                    lambda x: self.bbox_util.norm_boxes_graph(
                        x, k.shape(input_image)[1:3]))(input_rois)
            else:
                target_rois = rpn_rois
                input_rois = None

            # Generate detection targets
            # Subsamples proposals and generates target outputs for training.
            # Note that proposal class IDs, gt_boxes, and gt_masks are zero
            # padded. Equally, the returned rois and targets are zero padded.
            rois, target_class_ids, target_bbox, target_mask = \
                common.DetectionTargetLayer(self.batch_size, name="proposal_targets")([
                    target_rois, input_gt_class_ids, gt_boxes, input_gt_masks])

            # Network Heads
            # TODO: verify that this handles zero padded ROIs
            mrcnn_class_logits, mrcnn_class, mrcnn_bbox = common.fpn_classifier_graph(
                rois, mrcnn_feature_maps, input_image_meta, pool_size,
                self.class_num, train_flag=train_or_freeze,
                fc_layers_size=fc_layer_size)

            mrcnn_mask = common.build_fpn_mask_graph(rois, mrcnn_feature_maps,
                                                     input_image_meta,
                                                     mask_pool_size,
                                                     self.class_num,
                                                     train_flag=train_or_freeze)

            # TODO: clean up (use tf.identity if necessary)
            output_rois = kl.Lambda(lambda x: x * 1, name="output_rois")(rois)

            # Losses
            rpn_class_loss = kl.Lambda(
                lambda x: common.rpn_class_loss_graph(*x),
                name="rpn_class_loss")([input_rpn_match, rpn_class_logits])

            rpn_bbox_loss = kl.Lambda(
                lambda x: common.rpn_bbox_loss_graph(self.batch_size, *x),
                name="rpn_bbox_loss")(
                    [input_rpn_bbox, input_rpn_match, rpn_bbox])

            class_loss = kl.Lambda(
                lambda x: common.mrcnn_class_loss_graph(*x),
                name="mrcnn_class_loss")([
                    target_class_ids, mrcnn_class_logits, active_class_ids
                ])

            bbox_loss = kl.Lambda(
                lambda x: common.mrcnn_bbox_loss_graph(*x),
                name="mrcnn_bbox_loss")([
                    target_bbox, target_class_ids, mrcnn_bbox
                ])

            mask_loss = kl.Lambda(
                lambda x: common.mrcnn_mask_loss_graph(*x),
                name="mrcnn_mask_loss")([
                    target_mask, target_class_ids, mrcnn_mask
                ])

            # Model
            inputs = [
                input_image, input_image_meta, input_rpn_match, input_rpn_bbox,
                input_gt_class_ids, input_gt_boxes, input_gt_masks
            ]

            if not cfg.TRAIN.USE_RPN_ROIS:
                inputs.append(input_rois)

            outputs = [
                rpn_class_logits, rpn_class, rpn_bbox, mrcnn_class_logits,
                mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois, output_rois,
                rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss
            ]

            model = km.Model(inputs, outputs, name='mask_rcnn')
            pass
        else:
            # Network Heads
            # Proposal classifier and BBox regressor heads
            mrcnn_class_logits, mrcnn_class, mrcnn_bbox = common.fpn_classifier_graph(
                rpn_rois, mrcnn_feature_maps, input_image_meta, pool_size,
                self.class_num, train_flag=train_or_freeze,
                fc_layers_size=fc_layer_size)

            # Detections
            # Output is [batch, num_detections, (y1, x1, y2, x2, class_id, score)]
            # in normalized coordinates.
            detections = common.DetectionLayer(self.batch_size,
                                               name="mrcnn_detection")([
                                                   rpn_rois, mrcnn_class,
                                                   mrcnn_bbox, input_image_meta
                                               ])

            # Create masks for detections
            detection_boxes = kl.Lambda(lambda x: x[..., :4])(detections)
            mrcnn_mask = common.build_fpn_mask_graph(detection_boxes,
                                                     mrcnn_feature_maps,
                                                     input_image_meta,
                                                     mask_pool_size,
                                                     self.class_num,
                                                     train_flag=train_or_freeze)

            model = km.Model([input_image, input_image_meta, anchors], [
                detections, mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois,
                rpn_class, rpn_bbox
            ], name='mask_rcnn')
            pass

        # Add multi-GPU support.
        gpu_count = cfg.COMMON.GPU_COUNT
        if gpu_count > 1:
            from m_rcnn.parallel_model import ParallelModel
            model = ParallelModel(model, gpu_count)

        return model
        pass
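
    # When fine-tuning on a dataset with a different number of classes, the
    # class-specific head layers are typically excluded when loading COCO
    # weights. A sketch (illustrative; the layer names follow the usual
    # Mask R-CNN naming convention and are an assumption for this repo):
    #
    #   model.load_weights(cfg.TEST.COCO_MODEL_PATH, by_name=True,
    #                      exclude=["mrcnn_class_logits", "mrcnn_bbox_fc",
    #                               "mrcnn_bbox", "mrcnn_mask"])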
    def load_weights(self, model_path, by_name=False, exclude=None):
        """
        Modified version of the corresponding Keras function, with the
        addition of multi-GPU support and the ability to exclude some
        layers from loading.

        :param model_path:
        :param by_name:
        :param exclude: list of layer names to exclude
        :return:
        """
        if exclude:
            by_name = True
            pass

        if h5py is None:
            raise ImportError('`load_weights` requires h5py.')
            pass

        model_file = h5py.File(model_path, mode='r')

        if 'layer_names' not in model_file.attrs and 'model_weights' in model_file:
            model_file = model_file['model_weights']

        # In multi-GPU training, we wrap the model. Get the layers
        # of the inner model because they have the weights.
        keras_model = self.keras_model
        layers = keras_model.inner_model.layers if hasattr(
            keras_model, "inner_model") else keras_model.layers

        print("layers: {}".format(layers))

        # Exclude some layers
        if exclude:
            layers = filter(lambda l: l.name not in exclude, layers)

        if by_name:
            saving.load_weights_from_hdf5_group_by_name(model_file, layers)
        else:
            saving.load_weights_from_hdf5_group(model_file, layers)

        if hasattr(model_file, 'close'):
            model_file.close()
        pass

    def generate_random_rois(self, image_shape, count, gt_boxes):
        """
        Generates ROI proposals similar to what a region proposal network
        would generate.

        :param image_shape: [Height, Width, Depth]
        :param count: Number of ROIs to generate
        :param gt_boxes: [N, (y1, x1, y2, x2)] Ground truth boxes in pixels.
        :return:
        """
        # placeholder
        rois = np.zeros((count, 4), dtype=np.int32)

        # Generate random ROIs around GT boxes (90% of count)
        rois_per_box = int(0.9 * count / gt_boxes.shape[0])

        for i in range(gt_boxes.shape[0]):
            gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[i]
            h = gt_y2 - gt_y1
            w = gt_x2 - gt_x1

            # random boundaries
            r_y1 = max(gt_y1 - h, 0)
            r_y2 = min(gt_y2 + h, image_shape[0])
            r_x1 = max(gt_x1 - w, 0)
            r_x2 = min(gt_x2 + w, image_shape[1])

            # To avoid generating boxes with zero area, we generate double what
            # we need and filter out the extra. If we get fewer valid boxes
            # than we need, we loop and try again.
            while True:
                y1y2 = np.random.randint(r_y1, r_y2, (rois_per_box * 2, 2))
                x1x2 = np.random.randint(r_x1, r_x2, (rois_per_box * 2, 2))

                # Filter out zero area boxes
                threshold = 1
                y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >= threshold][:rois_per_box]
                x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >= threshold][:rois_per_box]

                if y1y2.shape[0] == rois_per_box and x1x2.shape[0] == rois_per_box:
                    break

            # Sort on axis 1 to ensure x1 <= x2 and y1 <= y2, then reshape
            # into x1, y1, x2, y2 order
            x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)
            y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)
            box_rois = np.hstack([y1, x1, y2, x2])
            rois[rois_per_box * i:rois_per_box * (i + 1)] = box_rois

        # Generate random ROIs anywhere in the image (10% of count)
        remaining_count = count - (rois_per_box * gt_boxes.shape[0])

        # To avoid generating boxes with zero area, we generate double what
        # we need and filter out the extra. If we get fewer valid boxes
        # than we need, we loop and try again.
        while True:
            y1y2 = np.random.randint(0, image_shape[0], (remaining_count * 2, 2))
            x1x2 = np.random.randint(0, image_shape[1], (remaining_count * 2, 2))

            # Filter out zero area boxes
            threshold = 1
            y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >= threshold][:remaining_count]
            x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >= threshold][:remaining_count]

            if y1y2.shape[0] == remaining_count and x1x2.shape[0] == remaining_count:
                break

        # Sort on axis 1 to ensure x1 <= x2 and y1 <= y2, then reshape
        # into x1, y1, x2, y2 order
        x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)
        y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)
        global_rois = np.hstack([y1, x1, y2, x2])
        rois[-remaining_count:] = global_rois

        return rois
        pass
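
    # For example (illustrative numbers): with count=200 and 3 GT boxes,
    # rois_per_box = int(0.9 * 200 / 3) = 60, so 180 ROIs are sampled around
    # the GT boxes and the remaining 20 are sampled anywhere in the image.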
    def build_detection_targets(self, rpn_rois, gt_class_ids, gt_boxes, gt_masks):
        """
        Generate targets for training Stage 2 classifier and mask heads.
        This is not used in normal training. It's useful for debugging or to
        train the Mask R-CNN heads without using the RPN head.

        :param rpn_rois: [N, (y1, x1, y2, x2)] proposal boxes.
        :param gt_class_ids: [instance count] Integer class IDs
        :param gt_boxes: [instance count, (y1, x1, y2, x2)]
        :param gt_masks: [height, width, instance count] Ground truth masks.
                         Can be full size or mini-masks.
        :return:
            rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)]
            class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs.
            bboxes: [TRAIN_ROIS_PER_IMAGE, NUM_CLASSES, (y, x, log(h), log(w))].
                    Class-specific bbox refinements.
            masks: [TRAIN_ROIS_PER_IMAGE, height, width, NUM_CLASSES].
                   Class-specific masks cropped to bbox boundaries and resized
                   to the neural network output size.
        """
        assert rpn_rois.shape[0] > 0
        assert gt_class_ids.dtype == np.int32, "Expected int but got {}".format(gt_class_ids.dtype)
        assert gt_boxes.dtype == np.int32, "Expected int but got {}".format(gt_boxes.dtype)
        assert gt_masks.dtype == np.bool_, "Expected bool but got {}".format(gt_masks.dtype)

        # It's common to add GT boxes to ROIs, but we don't do that here
        # because, according to Xinlei Chen's paper, it doesn't help.

        # Trim empty padding in the gt_boxes and gt_masks parts
        instance_ids = np.where(gt_class_ids > 0)[0]
        assert instance_ids.shape[0] > 0, "Image must contain instances."

        gt_class_ids = gt_class_ids[instance_ids]
        gt_boxes = gt_boxes[instance_ids]
        gt_masks = gt_masks[:, :, instance_ids]

        # Compute areas of ROIs and ground truth boxes.
        # rpn_roi_area = (rpn_rois[:, 2] - rpn_rois[:, 0]) * (rpn_rois[:, 3] - rpn_rois[:, 1])
        # gt_box_area = (gt_boxes[:, 2] - gt_boxes[:, 0]) * (gt_boxes[:, 3] - gt_boxes[:, 1])

        # Compute overlaps [rpn_rois, gt_boxes]
        overlaps = np.zeros((rpn_rois.shape[0], gt_boxes.shape[0]))
        for i in range(overlaps.shape[1]):
            gt = gt_boxes[i]
            overlaps[:, i] = self.bbox_util.compute_iou(gt, rpn_rois)
            pass

        # Assign ROIs to GT boxes
        rpn_roi_iou_argmax = np.argmax(overlaps, axis=1)
        rpn_roi_iou_max = overlaps[np.arange(overlaps.shape[0]), rpn_roi_iou_argmax]

        # GT box assigned to each ROI
        rpn_roi_gt_boxes = gt_boxes[rpn_roi_iou_argmax]
        rpn_roi_gt_class_ids = gt_class_ids[rpn_roi_iou_argmax]

        # Positive ROIs are those with >= 0.5 IoU with a GT box.
        fg_ids = np.where(rpn_roi_iou_max > 0.5)[0]

        # Negative ROIs are those with max IoU 0.1-0.5 (hard example mining)
        # TODO: To hard example mine or not to hard example mine, that's the question
        # bg_ids = np.where((rpn_roi_iou_max >= 0.1) & (rpn_roi_iou_max < 0.5))[0]
        bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]

        # Subsample ROIs. Aim for 33% foreground.
        # FG
        fg_roi_count = int(self.rois_per_image * self.roi_positive_ratio)
        if fg_ids.shape[0] > fg_roi_count:
            keep_fg_ids = np.random.choice(fg_ids, fg_roi_count, replace=False)
        else:
            keep_fg_ids = fg_ids

        # BG
        remaining = self.rois_per_image - keep_fg_ids.shape[0]
        if bg_ids.shape[0] > remaining:
            keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
        else:
            keep_bg_ids = bg_ids

        # Combine indices of ROIs to keep
        keep = np.concatenate([keep_fg_ids, keep_bg_ids])

        # Need more?
        remaining = self.rois_per_image - keep.shape[0]

        if remaining > 0:
            # Looks like we don't have enough samples to maintain the desired
            # balance. Reduce requirements and fill in the rest. This is
            # likely different from the Mask R-CNN paper.

            # There is a small chance we have neither fg nor bg samples.
            if keep.shape[0] == 0:
                # Pick bg regions with an easier IoU threshold
                bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]
                assert bg_ids.shape[0] >= remaining
                keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
                assert keep_bg_ids.shape[0] == remaining
                keep = np.concatenate([keep, keep_bg_ids])
            else:
                # Fill the rest with repeated bg rois.
                keep_extra_ids = np.random.choice(keep_bg_ids, remaining, replace=True)
                keep = np.concatenate([keep, keep_extra_ids])

        assert keep.shape[0] == self.rois_per_image, \
            "keep doesn't match ROI batch size {}, {}".format(keep.shape[0], self.rois_per_image)

        # Reset the gt boxes assigned to BG ROIs.
        rpn_roi_gt_boxes[keep_bg_ids, :] = 0
        rpn_roi_gt_class_ids[keep_bg_ids] = 0

        # For each kept ROI, assign a class_id, and for FG ROIs also add bbox refinement.
        rois = rpn_rois[keep]
        roi_gt_boxes = rpn_roi_gt_boxes[keep]
        roi_gt_class_ids = rpn_roi_gt_class_ids[keep]
        roi_gt_assignment = rpn_roi_iou_argmax[keep]

        # Class-aware bbox deltas. [y, x, log(h), log(w)]
        bboxes = np.zeros((self.rois_per_image, self.class_num, 4), dtype=np.float32)

        pos_ids = np.where(roi_gt_class_ids > 0)[0]
        bboxes[pos_ids, roi_gt_class_ids[pos_ids]] = self.bbox_util.box_refinement(
            rois[pos_ids], roi_gt_boxes[pos_ids, :4])

        # Normalize bbox refinements
        bbox_std_dev = np.array(cfg.COMMON.BBOX_STD_DEV)
        bboxes /= bbox_std_dev

        # Generate class-specific target masks
        masks = np.zeros((self.rois_per_image, self.image_shape[0],
                          self.image_shape[1], self.class_num),
                         dtype=np.float32)

        for i in pos_ids:
            class_id = roi_gt_class_ids[i]
            assert class_id > 0, "class id must be greater than 0"
            gt_id = roi_gt_assignment[i]
            class_mask = gt_masks[:, :, gt_id]

            if cfg.TRAIN.USE_MINI_MASK:
                # Create a mask placeholder the size of the image
                placeholder = np.zeros(self.image_shape[:2], dtype=bool)
                # GT box
                gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[gt_id]
                gt_w = gt_x2 - gt_x1
                gt_h = gt_y2 - gt_y1
                # Resize the mini mask to the size of the GT box
                placeholder[gt_y1:gt_y2, gt_x1:gt_x2] = \
                    np.round(self.image_utils.resize(class_mask, (gt_h, gt_w))).astype(bool)
                # Place the mini mask in the placeholder
                class_mask = placeholder

            # Pick the part of the mask inside the ROI and resize it
            y1, x1, y2, x2 = rois[i].astype(np.int32)
            m = class_mask[y1:y2, x1:x2]
            mask = self.image_utils.resize(m, self.image_shape)
            masks[i, :, :, class_id] = mask

        return rois, roi_gt_class_ids, bboxes, masks
        pass

    # #############################################################################################
    # test
    # #############################################################################################
    def detect(self, images_info_list, verbose=0):
        """
        Runs the detection pipeline.

        :param images_info_list: List of images, potentially of different sizes.
        :param verbose:
        :return: a list of dicts, one dict per image.
            The dict contains:
                rois: [N, (y1, x1, y2, x2)] detection bounding boxes
                class_ids: [N] int class IDs
                scores: [N] float probability scores for the class IDs
                masks: [H, W, N] instance binary masks
        """
        if verbose:
            print("processing {} image_info.".format(len(images_info_list)))
            for image_info in images_info_list:
                print("image_info: {}".format(image_info))
                pass
            pass

        # Mold inputs to the format expected by the neural network
        molded_images_list, image_metas_list, windows_list = self.image_utils.mode_input(images_info_list)

        # Validate image sizes
        # All images in a batch MUST be of the same size
        image_shape = molded_images_list[0].shape
        for g in molded_images_list[1:]:
            assert g.shape == image_shape, \
                "After resizing, all images must have the same size. Check IMAGE_RESIZE_MODE and image sizes."
            pass

        # Anchors
        anchors = self.anchor_utils.get_anchors(image_shape)
        # Duplicate across the batch dimension because Keras requires it
        # TODO: can this be optimized to avoid duplicating the anchors?
        anchors = np.broadcast_to(anchors, (cfg.TEST.BATCH_SIZE, ) + anchors.shape)

        if verbose:
            print("molded_images_list: ", molded_images_list)
            print("image_metas_list: ", image_metas_list)
            print("anchors: ", anchors)
            pass

        # Run object detection
        detections, _, _, mrcnn_mask, _, _, _ = \
            self.keras_model.predict([molded_images_list, image_metas_list, anchors], verbose=0)

        # Process detections
        results_list = []

        for i, image_info in enumerate(images_info_list):
            molded_image_shape = molded_images_list[i].shape
            final_rois, final_class_ids, final_scores, final_masks = self.un_mold_detections(
                detections[i], mrcnn_mask[i], image_info.shape,
                molded_image_shape, windows_list[i])

            results_list.append({
                "rois": final_rois,
                "class_ids": final_class_ids,
                "scores": final_scores,
                "masks": final_masks,
            })

        return results_list
        pass

    def un_mold_detections(self, detections, mrcnn_mask, original_image_shape, image_shape, window):
        """
        Reformats the detections of one image from the format of the neural
        network output to a format suitable for use in the rest of the
        application.

        :param detections: [N, (y1, x1, y2, x2, class_id, score)] in normalized coordinates
        :param mrcnn_mask: [N, height, width, num_classes]
        :param original_image_shape: [H, W, C] Original image shape before resizing
        :param image_shape: [H, W, C] Shape of the image after resizing and padding
        :param window: [y1, x1, y2, x2] Pixel coordinates of the box in the image
                       where the real image is, excluding the padding.
        :return:
            boxes: [N, (y1, x1, y2, x2)] Bounding boxes in pixels
            class_ids: [N] Integer class IDs for each bounding box
            scores: [N] Float probability scores of the class_id
            masks: [height, width, num_instances] Instance masks
        """
        # How many detections do we have?
        # The detections array is padded with zeros. Find the first class_id == 0.
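        # For example (illustrative values): if detections holds two real
        # detections followed by zero padding,
        #   [[.1, .2, .5, .6, 17, 0.98],
        #    [.3, .1, .9, .7,  5, 0.91],
        #    [ 0,  0,  0,  0,  0, 0.00], ...]
        # then the first class_id == 0 is at index 2, so n = 2 below.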
        zero_ix = np.where(detections[:, 4] == 0)[0]
        n = zero_ix[0] if zero_ix.shape[0] > 0 else detections.shape[0]

        # Extract boxes, class_ids, scores, and class-specific masks
        boxes = detections[:n, :4]
        class_ids = detections[:n, 4].astype(np.int32)
        scores = detections[:n, 5]
        masks = mrcnn_mask[np.arange(n), :, :, class_ids]

        # Translate normalized coordinates in the resized image to pixel
        # coordinates in the original image before resizing
        window = self.bbox_util.norm_boxes(window, image_shape[:2])
        wy1, wx1, wy2, wx2 = window
        shift = np.array([wy1, wx1, wy1, wx1])
        wh = wy2 - wy1  # window height
        ww = wx2 - wx1  # window width
        scale = np.array([wh, ww, wh, ww])

        # Convert boxes to normalized coordinates on the window
        boxes = np.divide(boxes - shift, scale)

        # Convert boxes to pixel coordinates on the original image
        boxes = self.bbox_util.denorm_boxes(boxes, original_image_shape[:2])

        # Filter out detections with zero area. Happens in early training when
        # the network weights are still random.
        exclude_ix = np.where((boxes[:, 2] - boxes[:, 0]) *
                              (boxes[:, 3] - boxes[:, 1]) <= 0)[0]

        if exclude_ix.shape[0] > 0:
            boxes = np.delete(boxes, exclude_ix, axis=0)
            class_ids = np.delete(class_ids, exclude_ix, axis=0)
            scores = np.delete(scores, exclude_ix, axis=0)
            masks = np.delete(masks, exclude_ix, axis=0)
            n = class_ids.shape[0]

        # Resize masks to the original image size and set the boundary threshold.
        full_masks = []
        for i in range(n):
            # Convert the neural network mask to a full size mask
            full_mask = self.mask_util.unmold_mask(masks[i], boxes[i], original_image_shape)
            full_masks.append(full_mask)
            pass

        full_masks = np.stack(full_masks, axis=-1) if full_masks \
            else np.empty(original_image_shape[:2] + (0, ))

        return boxes, class_ids, scores, full_masks
        pass
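
# A minimal inference smoke-test sketch (illustrative only; it feeds a dummy
# image and assumes cfg.TEST.BATCH_SIZE == 1 and that the configured COCO
# weights exist at cfg.TEST.COCO_MODEL_PATH):
if __name__ == "__main__":
    mrcnn = MaskRCNN(train_flag=False)
    mrcnn.load_weights(cfg.TEST.COCO_MODEL_PATH, by_name=True)

    # Any HxWx3 uint8 RGB image works; detect() molds and resizes it internally.
    dummy_image = np.random.randint(0, 255, (512, 512, 3), dtype=np.uint8)
    result = mrcnn.detect([dummy_image], verbose=1)[0]
    print(result["rois"].shape, result["class_ids"], result["scores"])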