def call(self, inputs):
    rois = inputs[0]
    mrcnn_class = inputs[1]
    mrcnn_bbox = inputs[2]
    image_meta = inputs[3]

    # Get windows of images in normalized coordinates. Windows are the area
    # in the image that excludes the padding.
    # Use the shape of the first image in the batch to normalize the window
    # because we know that all images get resized to the same size.
    m = utils.parse_image_meta_graph(image_meta)
    image_shape = m['image_shape'][0]
    window = utils.norm_boxes_graph(m['window'], image_shape[:2])

    # Run detection refinement graph on each item in the batch
    detections_batch = utils.batch_slice(
        [rois, mrcnn_class, mrcnn_bbox, window],
        lambda x, y, w, z: refine_detections_graph(
            x, y, w, z, self.bbox_std_dev, self.detection_min_confidence,
            self.detection_max_instance, self.detection_nms_threshold),
        self.count_image_per_gpu)

    # Reshape output
    # [batch, num_detections, (y1, x1, y2, x2, class_id, class_score)] in
    # normalized coordinates
    return tf.reshape(
        detections_batch,
        [self.size_batch, self.detection_max_instance, 6])
def call(self, inputs):
    rois = inputs[0]
    mrcnn_class = inputs[1]
    mrcnn_bbox = inputs[2]
    image_meta = inputs[3]

    # Get windows of images in normalized coordinates. Windows are the area
    # in the image that excludes the padding.
    # Use the shape of the first image in the batch to normalize the window
    # because we know that all images get resized to the same size.
    m = parse_image_meta_graph(image_meta)
    image_shape = m['image_shape'][0]
    window = norm_boxes_graph(m['window'], image_shape[:2])

    # Run detection refinement graph on each item in the batch
    detections_batch = utils.batch_slice(
        [rois, mrcnn_class, mrcnn_bbox, window],
        lambda x, y, w, z: refine_detections_graph(x, y, w, z, self.config),
        self.config.IMAGES_PER_GPU)

    # Reshape output
    # [batch, num_detections, (y1, x1, y2, x2, class_id, class_score)] in
    # normalized coordinates
    return tf.reshape(
        detections_batch,
        [self.config.BATCH_SIZE, self.config.DETECTION_MAX_INSTANCES, 6])
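# Both call() variants above hand the per-image work to utils.batch_slice,
# which applies refine_detections_graph to one batch item at a time and
# re-stacks the results. Below is a minimal sketch of such a helper, assuming
# every input tensor has the batch as its first dimension; it is illustrative
# and not necessarily the repo's exact implementation.
import tensorflow as tf


def batch_slice_sketch(inputs, graph_fn, batch_size):
    """Slice `inputs` along the batch dimension, apply `graph_fn` to each
    item, then stack the per-item outputs back into batch tensors."""
    if not isinstance(inputs, list):
        inputs = [inputs]
    outputs = []
    for i in range(batch_size):
        inputs_slice = [x[i] for x in inputs]
        output_slice = graph_fn(*inputs_slice)
        if not isinstance(output_slice, (tuple, list)):
            output_slice = [output_slice]
        outputs.append(output_slice)
    # Transpose: list of per-item outputs -> list of per-output batches
    outputs = list(zip(*outputs))
    result = [tf.stack(o, axis=0) for o in outputs]
    if len(result) == 1:
        result = result[0]
    return result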
def build(self, mode, config):
    """Build Mask R-CNN architecture.
    input_shape: The shape of the input image.
    mode: Either "training" or "inference". The inputs and outputs of the
        model differ accordingly.
    """
    assert mode in ['training', 'inference']

    # Image size must be divisible by 2 multiple times
    h, w = config.IMAGE_SHAPE[:2]
    if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6):
        raise Exception(
            "Image size must be divisible by 2 at least 6 times "
            "to avoid fractions when downscaling and upscaling. "
            "For example, use 256, 320, 384, 448, 512, ... etc.")

    # Inputs
    input_image = Input(shape=[None, None, config.IMAGE_SHAPE[2]],
                        name="input_image")
    input_image_meta = Input(shape=[config.IMAGE_META_SIZE],
                             name="input_image_meta")
    if mode == "training":
        # RPN GT
        input_rpn_match = Input(shape=[None, 1],
                                name="input_rpn_match",
                                dtype=tf.int32)
        input_rpn_bbox = Input(shape=[None, 4],
                               name="input_rpn_bbox",
                               dtype=tf.float32)

        # Detection GT (class IDs, bounding boxes, and masks)
        # 1. GT Class IDs (zero padded)
        input_gt_class_ids = Input(shape=[None],
                                   name="input_gt_class_ids",
                                   dtype=tf.int32)
        # 2. GT Boxes in pixels (zero padded)
        # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates
        input_gt_boxes = Input(shape=[None, 4],
                               name="input_gt_boxes",
                               dtype=tf.float32)
        # Normalize coordinates
        gt_boxes = Lambda(lambda x: norm_boxes_graph(
            x, K.shape(input_image)[1:3]))(input_gt_boxes)
        # 3. GT Masks (zero padded)
        # [batch, height, width, MAX_GT_INSTANCES]
        if config.USE_MINI_MASK:
            input_gt_masks = Input(shape=[
                config.MINI_MASK_SHAPE[0], config.MINI_MASK_SHAPE[1], None
            ],
                                   name="input_gt_masks",
                                   dtype=bool)
        else:
            input_gt_masks = Input(
                shape=[config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1], None],
                name="input_gt_masks",
                dtype=bool)
    elif mode == "inference":
        # Anchors in normalized coordinates
        input_anchors = Input(shape=[None, 4], name="input_anchors")

    # Build the shared convolutional layers.
    # Bottom-up Layers
    # Returns a list of the last layers of each stage, 5 in total
    # (stage5=True, so C5 is included).
    if callable(config.BACKBONE):
        _, C2, C3, C4, C5 = config.BACKBONE(input_image,
                                            stage5=True,
                                            train_bn=config.TRAIN_BN)
    else:
        _, C2, C3, C4, C5 = resnet_graph(input_image,
                                         config.BACKBONE,
                                         stage5=True,
                                         train_bn=config.TRAIN_BN)
    # Top-down Layers
    # TODO: add assert to verify feature map sizes match what's in config
    P5 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c5p5')(C5)
    P4 = add([
        UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5),
        Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c4p4')(C4)
    ])
    P3 = add([
        UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4),
        Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c3p3')(C3)
    ])
    P2 = add([
        UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3),
        Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c2p2')(C2)
    ])
    # Attach 3x3 conv to all P layers to get the final feature maps.
    P2 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3),
                padding="SAME", name="fpn_p2")(P2)
    P3 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3),
                padding="SAME", name="fpn_p3")(P3)
    P4 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3),
                padding="SAME", name="fpn_p4")(P4)
    P5 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3),
                padding="SAME", name="fpn_p5")(P5)
    # P6 is used for the 5th anchor scale in RPN. Generated by
    # subsampling from P5 with stride of 2.
    P6 = MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(P5)

    # Note that P6 is used in RPN, but not in the classifier heads.
    rpn_feature_maps = [P2, P3, P4, P5, P6]
    mrcnn_feature_maps = [P2, P3, P4, P5]

    # Anchors
    if mode == "training":
        anchors = self.get_anchors(config.IMAGE_SHAPE)
        # Duplicate across the batch dimension because Keras requires it
        # TODO: can this be optimized to avoid duplicating the anchors?
        anchors = np.broadcast_to(anchors,
                                  (config.BATCH_SIZE, ) + anchors.shape)
        # A hack to get around Keras's bad support for constants
        anchors = Lambda(lambda x: tf.Variable(anchors),
                         name="anchors")(input_image)
    else:
        anchors = input_anchors

    # RPN Model
    rpn = build_rpn_model(config.RPN_ANCHOR_STRIDE,
                          len(config.RPN_ANCHOR_RATIOS),
                          config.TOP_DOWN_PYRAMID_SIZE)
    # Loop through pyramid layers
    layer_outputs = []  # list of lists
    for p in rpn_feature_maps:
        layer_outputs.append(rpn([p]))
    # Concatenate layer outputs
    # Convert from list of lists of level outputs to list of lists
    # of outputs across levels.
    # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
    output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"]
    outputs = list(zip(*layer_outputs))
    outputs = [
        Concatenate(axis=1, name=n)(list(o))
        for o, n in zip(outputs, output_names)
    ]
    rpn_class_logits, rpn_class, rpn_bbox = outputs

    # Generate proposals
    # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
    # and zero padded.
    proposal_count = config.POST_NMS_ROIS_TRAINING if mode == "training" \
        else config.POST_NMS_ROIS_INFERENCE
    rpn_rois = ProposalLayer(proposal_count=proposal_count,
                             nms_threshold=config.RPN_NMS_THRESHOLD,
                             name="ROI",
                             config=config)([rpn_class, rpn_bbox, anchors])

    if mode == "training":
        # Class ID mask to mark class IDs supported by the dataset the
        # image came from.
        active_class_ids = Lambda(lambda x: parse_image_meta_graph(x)[
            "active_class_ids"])(input_image_meta)

        if not config.USE_RPN_ROIS:
            # Ignore predicted ROIs and use ROIs provided as an input.
            input_rois = Input(shape=[config.POST_NMS_ROIS_TRAINING, 4],
                               name="input_roi",
                               dtype=np.int32)
            # Normalize coordinates
            target_rois = Lambda(lambda x: norm_boxes_graph(
                x, K.shape(input_image)[1:3]))(input_rois)
        else:
            target_rois = rpn_rois

        # Generate detection targets
        # Subsamples proposals and generates target outputs for training.
        # Note that proposal class IDs, gt_boxes, and gt_masks are zero
        # padded. Equally, returned rois and targets are zero padded.
        rois, target_class_ids, target_bbox, target_mask = \
            DetectionTargetLayer(config, name="proposal_targets")([
                target_rois, input_gt_class_ids, gt_boxes, input_gt_masks])

        # Network Heads
        # TODO: verify that this handles zero padded ROIs
        mrcnn_class_logits, mrcnn_class, mrcnn_bbox = \
            fpn_classifier_graph(rois, mrcnn_feature_maps, input_image_meta,
                                 config.POOL_SIZE, config.NUM_CLASSES,
                                 train_bn=config.TRAIN_BN,
                                 fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)

        mrcnn_mask = build_fpn_mask_graph(rois,
                                          mrcnn_feature_maps,
                                          input_image_meta,
                                          config.MASK_POOL_SIZE,
                                          config.NUM_CLASSES,
                                          train_bn=config.TRAIN_BN)

        # TODO: clean up (use tf.identity if necessary)
        output_rois = Lambda(lambda x: x * 1, name="output_rois")(rois)

        # Losses
        rpn_class_loss = Lambda(lambda x: rpn_class_loss_graph(*x),
                                name="rpn_class_loss")(
                                    [input_rpn_match, rpn_class_logits])
        rpn_bbox_loss = Lambda(lambda x: rpn_bbox_loss_graph(config, *x),
                               name="rpn_bbox_loss")(
                                   [input_rpn_bbox, input_rpn_match, rpn_bbox])
        class_loss = Lambda(lambda x: mrcnn_class_loss_graph(*x),
                            name="mrcnn_class_loss")([
                                target_class_ids, mrcnn_class_logits,
                                active_class_ids
                            ])
        bbox_loss = Lambda(lambda x: mrcnn_bbox_loss_graph(*x),
                           name="mrcnn_bbox_loss")(
                               [target_bbox, target_class_ids, mrcnn_bbox])
        mask_loss = Lambda(lambda x: mrcnn_mask_loss_graph(*x),
                           name="mrcnn_mask_loss")(
                               [target_mask, target_class_ids, mrcnn_mask])

        # Model
        inputs = [
            input_image, input_image_meta, input_rpn_match, input_rpn_bbox,
            input_gt_class_ids, input_gt_boxes, input_gt_masks
        ]
        if not config.USE_RPN_ROIS:
            inputs.append(input_rois)
        outputs = [
            rpn_class_logits, rpn_class, rpn_bbox, mrcnn_class_logits,
            mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois, output_rois,
            rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss
        ]
        model = Model(inputs, outputs, name='mask_rcnn')
    else:
        # Network Heads
        # Proposal classifier and BBox regressor heads
        mrcnn_class_logits, mrcnn_class, mrcnn_bbox = \
            fpn_classifier_graph(rpn_rois, mrcnn_feature_maps, input_image_meta,
                                 config.POOL_SIZE, config.NUM_CLASSES,
                                 train_bn=config.TRAIN_BN,
                                 fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)

        # Detections
        # Output is [batch, num_detections, (y1, x1, y2, x2, class_id, score)]
        # in normalized coordinates
        detections = DetectionLayer(config, name="mrcnn_detection")(
            [rpn_rois, mrcnn_class, mrcnn_bbox, input_image_meta])

        # Create masks for detections
        detection_boxes = Lambda(lambda x: x[..., :4])(detections)
        mrcnn_mask = build_fpn_mask_graph(detection_boxes,
                                          mrcnn_feature_maps,
                                          input_image_meta,
                                          config.MASK_POOL_SIZE,
                                          config.NUM_CLASSES,
                                          train_bn=config.TRAIN_BN)

        model = Model([input_image, input_image_meta, input_anchors], [
            detections, mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois,
            rpn_class, rpn_bbox
        ],
                      name='mask_rcnn')

    # Add multi-GPU support.
    # if config.GPU_COUNT > 1:
    #     from mrcnn.parallel_model import ParallelModel
    #     model = ParallelModel(model, config.GPU_COUNT)

    return model
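# A hedged sketch of driving the inference graph built above. It assumes a
# `config` object, a Keras `model` returned by build(mode="inference", ...),
# and helper callables (mold_image, compose_image_meta, get_anchors) similar
# to the ones this codebase uses; their names and signatures are assumptions,
# not guaranteed to match the repo exactly.
import numpy as np


def detect_single_image_sketch(model, config, image, get_anchors, mold_image,
                               compose_image_meta):
    # Mold the image (assumed: resize/pad to config.IMAGE_SHAPE and subtract
    # the mean pixel), then add the batch dimension.
    molded = mold_image(image, config)[np.newaxis, ...]
    # Image meta encodes id, original/resized shapes, window, scale, and the
    # active class map. Window here assumes the molded image has no padding.
    image_meta = compose_image_meta(
        0, image.shape, molded.shape[1:],
        (0, 0) + tuple(config.IMAGE_SHAPE[:2]), 1.0,
        np.zeros([config.NUM_CLASSES], dtype=np.int32))[np.newaxis, ...]
    # Anchors in normalized coordinates, duplicated over the batch dimension.
    anchors = get_anchors(config.IMAGE_SHAPE)
    anchors = np.broadcast_to(anchors, (config.BATCH_SIZE,) + anchors.shape)
    # Outputs follow the order returned by build() in inference mode.
    detections, _, _, mrcnn_mask, _, _, _ = model.predict(
        [molded, image_meta, anchors], verbose=0)
    return detections, mrcnn_mask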
def build_SPC(inputs, config, is_training, backbone='resnet50'):
    # Parse the inputs
    input_image = inputs['input_image']
    input_image = input_image - config.MEAN_PIXEL
    image_shape = config.IMAGE_SHAPE
    if is_training:
        # RPN GT
        input_rpn_match = inputs['input_rpn_match']
        input_rpn_bbox = inputs['input_rpn_bbox']

        # Detection GT (class IDs, bounding boxes, and masks)
        # 1. GT Class IDs (zero padded)
        input_gt_class_ids = inputs['input_gt_class_ids']
        # 2. GT Boxes in pixels (zero padded)
        # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)]; converted to
        # normalized coordinates below
        input_gt_boxes = inputs["input_gt_boxes"]
        input_gt_boxes = utils.norm_boxes_graph(input_gt_boxes,
                                                tf.shape(input_image)[1:3])
        # 3. GT Masks (zero padded)
        # [batch, MAX_GT_INSTANCES, height, width]
        input_gt_masks = inputs['input_gt_masks']
        # SPCNET global text segmentation
        input_gt_global_masks = inputs['input_gt_global_masks']

    # pyramid_feature: Dict{P2, P3, P4, P5} of feature maps from different
    # levels of the feature pyramid. Each is [batch, height, width, channels]
    pyramid_feature = build_FPN(input_image, config, is_training, backbone)
    # Get the pyramid feature map shapes
    fpn_shapes = []
    for i in range(2, 6, 1):
        p = 'P%d' % i
        shape = pyramid_feature[p].shape
        fpn_shapes.append([shape[1], shape[2]])
    fpn_shapes = np.array(fpn_shapes)
    # Get the global text segmentation map and saliency map from each
    # pyramid feature
    print('image_shape : ', image_shape)
    gts, tcm_outputs = build_TCM(pyramid_feature, image_shape, config)
    # Get all anchors
    anchors = generate_all_anchors(fpn_shapes, image_shape, config)
    # Number of anchors per pixel in the feature map
    anchors_num = len(config.RPN_ANCHOR_RATIOS)
    # Build the RPN model and get its outputs
    rpn_class_logits, rpn_prob, rpn_bbox = build_RPN(tcm_outputs, image_shape,
                                                     anchors_num, is_training,
                                                     config)
    # Generate proposals
    # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
    # and zero padded.
    proposal_count = config.POST_NMS_ROIS_TRAINING if is_training \
        else config.POST_NMS_ROIS_INFERENCE
    rpn_rois = generate_proposal(rpn_prob, rpn_bbox, anchors, proposal_count,
                                 config)
    assert config.USE_RPN_ROIS, \
        "Training on externally provided ROIs (USE_RPN_ROIS=False) is not implemented"
    if is_training:
        # Generate detection targets
        # Subsamples proposals and generates target outputs for training.
        # Note that proposal class IDs, gt_boxes, and gt_masks are zero
        # padded. Equally, returned rois and targets are zero padded.
        rois, target_class_ids, target_bbox, target_mask = \
            generate_detect_target(rpn_rois, input_gt_class_ids,
                                   input_gt_boxes, input_gt_masks, config)
        # Network Heads
        # TODO: verify that this handles zero padded ROIs
        mrcnn_class_logits, mrcnn_prob, mrcnn_bbox = \
            build_mrcnn_head(rois, tcm_outputs, image_shape, is_training, config)
        mrcnn_mask_logits, mrcnn_mask = \
            build_mrcnn_mask(rois, tcm_outputs, image_shape, is_training, config)
        # Losses
        rpn_class_loss = build_rpn_class_loss(input_rpn_match,
                                              rpn_class_logits, config)
        rpn_bbox_loss = build_rpn_bbox_loss(input_rpn_bbox, input_rpn_match,
                                            rpn_bbox, config)
        mrcnn_class_loss = build_mrcnn_class_loss(target_class_ids,
                                                  mrcnn_class_logits, config)
        mrcnn_bbox_loss = build_mrcnn_bbox_loss(target_bbox, target_class_ids,
                                                mrcnn_bbox, config)
        mrcnn_mask_loss = build_mrcnn_mask_loss(target_mask, target_class_ids,
                                                mrcnn_mask_logits, config)
        global_mask_loss = build_global_mask_loss(input_gt_global_masks, gts,
                                                  config)
        losses = {}
        losses['rpn_class_loss'] = rpn_class_loss * config.LOSS_WEIGHTS['rpn_class_loss']
        losses['rpn_bbox_loss'] = rpn_bbox_loss * config.LOSS_WEIGHTS['rpn_bbox_loss']
        losses['mrcnn_class_loss'] = mrcnn_class_loss * config.LOSS_WEIGHTS['mrcnn_class_loss']
        losses['mrcnn_bbox_loss'] = mrcnn_bbox_loss * config.LOSS_WEIGHTS['mrcnn_bbox_loss']
        losses['mrcnn_mask_loss'] = mrcnn_mask_loss * config.LOSS_WEIGHTS['mrcnn_mask_loss']
        losses['global_mask_loss'] = global_mask_loss * config.LOSS_WEIGHTS['global_mask_loss']
        losses['total_loss'] = tf.add_n(
            [losses[k] for k in losses.keys()] +
            tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
        return losses
    else:
        # Network Heads
        # Proposal classifier and BBox regressor heads
        mrcnn_class_logits, mrcnn_prob, mrcnn_bbox = \
            build_mrcnn_head(rpn_rois, tcm_outputs, image_shape, is_training, config)
        # Create masks for detections
        mrcnn_mask_logits, mrcnn_mask = \
            build_mrcnn_mask(rpn_rois, tcm_outputs, image_shape, is_training, config)
        # Reshape the global text segmentation maps
        # Original: Dict{P2, P3, P4, P5}, each [batch, H, W, NUM_CLASSES]
        # Reshaped: [batch, 4, H, W, NUM_CLASSES]
        stack_gts = []
        for i in range(config.BATCH_SIZE):
            stack_gts.append(
                tf.stack([gts[P][i, ...] for P in ['P2', 'P3', 'P4', 'P5']],
                         axis=0))
        gts = tf.stack(stack_gts, axis=0)
        # Detections
        # Outputs:
        #   detections: [batch, num_detections, (y1, x1, y2, x2, class_id, score)]
        #               in normalized coordinates
        #   masks: [batch, num_detections, MASK_H, MASK_W, class_num]
        detections, masks = get_detect_results(rpn_rois, mrcnn_prob,
                                               mrcnn_bbox, mrcnn_mask, gts,
                                               config)
        return detections, masks
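# A hedged sketch of wiring build_SPC into a TF1-style training setup: build a
# placeholder dict with the keys the function reads, fetch the loss dict, and
# minimize the total loss. The placeholder shapes and dtypes below (and
# config.LEARNING_RATE) are assumptions based on the comments above; the real
# input pipeline may differ.
import tensorflow as tf


def build_training_op_sketch(config, backbone='resnet50'):
    h, w, c = config.IMAGE_SHAPE
    inputs = {
        'input_image': tf.placeholder(tf.float32, [None, h, w, c]),
        'input_rpn_match': tf.placeholder(tf.int32, [None, None, 1]),
        'input_rpn_bbox': tf.placeholder(tf.float32, [None, None, 4]),
        'input_gt_class_ids': tf.placeholder(tf.int32, [None, None]),
        'input_gt_boxes': tf.placeholder(tf.float32, [None, None, 4]),
        'input_gt_masks': tf.placeholder(tf.bool, [None, None, None, None]),
        'input_gt_global_masks': tf.placeholder(tf.float32, [None, h, w]),
    }
    # build_SPC returns the weighted loss dict (including 'total_loss')
    # when is_training=True.
    losses = build_SPC(inputs, config, is_training=True, backbone=backbone)
    train_op = tf.train.AdamOptimizer(config.LEARNING_RATE).minimize(
        losses['total_loss'])
    return inputs, losses, train_op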
def __init__(self,
             mode,
             rpn_anchor_ratios,
             rpn_anchor_scales,
             mask_shape,
             pool_size,
             image_shape,
             mini_mask_shape,
             backbone_strides,
             mean_pixel,
             roi_size=7,
             backbone='resnet50',
             stage5=True,
             norm='batch',
             use_bias=True,
             rpn_anchor_stride=1,
             image_per_gpu=1,
             gpu_count=1,
             detection_max_instances=100,
             train_rois_per_image=200,
             num_classes=1,
             use_mini_mask=True,
             use_pretrained_model=True,
             top_down_pyramid_size=256,
             post_nms_rois_training=2000,
             post_nms_rois_inference=1000,
             pre_nms_limit=6000,
             rpn_nms_threshold=0.7,
             use_rpn_rois=True,
             model_dir=None,
             optimizer_method='Adam',
             learning_rate=0.001,
             momentum=0.9,
             weight_decay=0.0001,
             image_min_dim=800,
             image_max_dim=1024,
             image_min_scale=0.0,
             image_resize_mode='square',
             max_gt_instances=100,
             rpn_train_anchors_per_image=256):
    assert mode in ['training', 'inference']
    assert optimizer_method in ['Adam', 'SGD']
    tf.reset_default_graph()
    self.graph = tf.Graph()
    self.mode = mode
    self.rpn_anchor_ratios = rpn_anchor_ratios
    self.rpn_anchor_scales = rpn_anchor_scales
    self.mask_shape = mask_shape
    self.pool_size = pool_size
    self.image_shape = np.array(image_shape)
    self.mini_mask_shape = mini_mask_shape
    self.backbone_strides = backbone_strides
    self.mean_pixel = mean_pixel
    self.roi_size = roi_size
    self.backbone = backbone
    self.stage5 = stage5
    self.norm = norm
    self.use_bias = use_bias
    self.rpn_anchor_stride = rpn_anchor_stride
    self.image_per_gpu = image_per_gpu
    self.gpu_count = gpu_count
    self.detection_max_instances = detection_max_instances
    self.train_rois_per_image = train_rois_per_image
    self.num_classes = num_classes
    self.use_mini_mask = use_mini_mask
    self.use_pretrained_model = use_pretrained_model
    self.top_down_pyramid_size = top_down_pyramid_size
    self.post_nms_rois_training = post_nms_rois_training
    self.post_nms_rois_inference = post_nms_rois_inference
    self.pre_nms_limit = pre_nms_limit
    self.rpn_nms_threshold = rpn_nms_threshold
    self.use_rpn_rois = use_rpn_rois
    self.model_dir = model_dir
    self.optimizer_method = optimizer_method
    self.learning_rate = learning_rate
    self.momentum = momentum
    self.weight_decay = weight_decay
    self.image_min_dim = image_min_dim
    self.image_max_dim = image_max_dim
    self.image_min_scale = image_min_scale
    self.image_resize_mode = image_resize_mode
    self.max_gt_instances = max_gt_instances
    self.rpn_train_anchors_per_image = rpn_train_anchors_per_image
    # image_meta = image_id (1) + original_image_shape (3) + image_shape (3)
    #              + window (4) + scale (1) + active_class_ids (num_classes)
    self.image_meta_size = 1 + 3 + 3 + 4 + 1 + self.num_classes
    self.reuse = False
    self._anchor_cache = {}
    self.batch_size = self.gpu_count * self.image_per_gpu
    self.backbone_shape = utils.compute_backbone_shapes(
        self.backbone, self.backbone_strides, self.image_shape)
    # Total anchor count over all pyramid levels
    # (assumes square feature maps at each level)
    self.num_anchors_per_image = len(self.rpn_anchor_ratios) * (
        self.backbone_shape[0][0] * self.backbone_shape[0][0] +
        self.backbone_shape[1][0] * self.backbone_shape[1][0] +
        self.backbone_shape[2][0] * self.backbone_shape[2][0] +
        self.backbone_shape[3][0] * self.backbone_shape[3][0] +
        self.backbone_shape[4][0] * self.backbone_shape[4][0])
    with self.graph.as_default():
        self.is_training = tf.placeholder_with_default(False, [])
        self.input_image = tf.placeholder(dtype=tf.float32,
                                          shape=[
                                              None, self.image_shape[0],
                                              self.image_shape[1],
                                              self.image_shape[2]
                                          ],
                                          name='input_image')
        self.input_image_meta = tf.placeholder(
            dtype=tf.int32,
            shape=[None, self.image_meta_size],
            name='input_image_meta')
        if mode == 'training':
            self.input_rpn_match = tf.placeholder(
                dtype=tf.int32,
                shape=[None, self.num_anchors_per_image, 1],
                name='input_rpn_match')
            self.input_rpn_boxes = tf.placeholder(
                dtype=tf.float32,
                shape=[None, self.rpn_train_anchors_per_image, 4],
                name='input_rpn_boxes')
            self.input_gt_class_ids = tf.placeholder(
                dtype=tf.int32,
                shape=[None, self.max_gt_instances],
                name='input_gt_class_ids')
            self.input_gt_boxes = tf.placeholder(
                dtype=tf.float32,
                shape=[None, self.max_gt_instances, 4],
                name='input_gt_boxes')
            self.input_gt_boxes_normalized = utils.norm_boxes_graph(
                self.input_gt_boxes, tf.shape(self.input_image)[1:3])
            self.proposal_count = self.post_nms_rois_training
            if self.use_mini_mask:
                self.input_gt_masks = tf.placeholder(
                    dtype=tf.bool,
                    shape=[
                        None, self.mini_mask_shape[0],
                        self.mini_mask_shape[1], self.max_gt_instances
                    ],
                    name='input_gt_mask')
            else:
                self.input_gt_masks = tf.placeholder(
                    dtype=tf.bool,
                    shape=[
                        None, self.image_shape[0], self.image_shape[1],
                        self.max_gt_instances
                    ],
                    name='input_gt_mask')
        elif mode == 'inference':
            self.input_anchors = tf.placeholder(dtype=tf.float32,
                                                shape=[None, None, 4],
                                                name='input_anchors')
            self.proposal_count = self.post_nms_rois_inference
        self.resnet = Resnet(name='resnet',
                             architecture=self.backbone,
                             is_training=self.is_training,
                             stage5=self.stage5,
                             use_bias=self.use_bias)
        arg_scope = nets.resnet_v2.resnet_arg_scope()
        with slim.arg_scope(arg_scope):
            _, self.end_points = nets.resnet_v2.resnet_v2_50(
                self.input_image,
                num_classes=None,
                is_training=self.is_training)
        self.fpn = FPN(name='fpn',
                       top_down_pyramid_size=self.top_down_pyramid_size,
                       use_bias=self.use_bias)
        self.rpn = RPN(name='rpn',
                       anchors_per_location=len(self.rpn_anchor_ratios),
                       anchor_stride=self.rpn_anchor_stride,
                       is_training=self.is_training,
                       use_bias=self.use_bias)
        self.proposal = ProposalLayer(self.pre_nms_limit, self.proposal_count,
                                      self.rpn_nms_threshold,
                                      self.image_per_gpu)
        self.pyramidRoiPooling = PyramidRoiPooling(name='PyramidRoiPooling',
                                                   roi_size=self.roi_size)
        self.objDetection = ObjDetection(
            image_per_gpu=self.image_per_gpu,
            gpu_count=self.gpu_count,
            detection_max_instances=self.detection_max_instances)
        self.targetDetection = TargetDetection(
            mask_shape=self.mask_shape,
            image_per_gpu=self.image_per_gpu,
            train_rois_per_image=self.train_rois_per_image)
        self.fpnClassifier = FpnClassifier('FpnClassifier',
                                           pool_size=self.pool_size,
                                           num_classes=self.num_classes,
                                           is_training=self.is_training)
        self.fpnMask = FpnMask('FpnMask',
                               num_classes=self.num_classes,
                               is_training=self.is_training)
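# A quick worked check of the anchor count computed above. For a square
# 1024x1024 input with backbone strides [4, 8, 16, 32, 64] and 3 anchor ratios
# per location (anchor stride 1), the per-level feature maps are 256, 128, 64,
# 32, and 16 on a side, so:
#   3 * (256**2 + 128**2 + 64**2 + 32**2 + 16**2) = 3 * 87296 = 261888 anchors.
# The sketch below reproduces that arithmetic; like the formula for
# self.num_anchors_per_image, it assumes square feature maps at each level.
import numpy as np


def count_anchors_sketch(image_side=1024, strides=(4, 8, 16, 32, 64), ratios=3):
    # One feature-map side per pyramid level, then ratios anchors per cell
    sides = [int(np.ceil(image_side / s)) for s in strides]
    return ratios * sum(side * side for side in sides)


assert count_anchors_sketch() == 261888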
def model(self):
    h, w = self.image_shape[:2]
    if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6):
        raise Exception(
            "Image size must be divisible by 2 at least 6 times "
            "to avoid fractions when downscaling and upscaling. "
            "For example, use 256, 320, 384, 448, 512, ... etc.")
    if self.use_pretrained_model:
        # Take the last layer of each ResNet stage from the slim end points
        c2, c3, c4, c5 = \
            self.end_points['resnet_v2_50/block1/unit_2/bottleneck_v2'], \
            self.end_points['resnet_v2_50/block2/unit_3/bottleneck_v2'], \
            self.end_points['resnet_v2_50/block3/unit_4/bottleneck_v2'], \
            self.end_points['resnet_v2_50/block4']
    else:
        if callable(self.backbone):
            _, c2, c3, c4, c5 = self.backbone(self.input_image,
                                              stage5=self.stage5,
                                              is_training=self.is_training)
        else:
            _, c2, c3, c4, c5 = self.resnet(self.input_image)
    p2, p3, p4, p5, p6 = self.fpn([c2, c3, c4, c5])
    rpn_feature_maps = [p2, p3, p4, p5, p6]
    mrcnn_feature_maps = [p2, p3, p4, p5]
    if self.mode == 'training':
        anchors = self.get_anchors(self.image_shape)
        anchors = np.broadcast_to(anchors,
                                  (self.batch_size, ) + anchors.shape)
        anchors = tf.constant(anchors)
    else:
        anchors = self.input_anchors
    layer_outputs = []
    for p in rpn_feature_maps:
        layer_outputs.append(self.rpn(p))
    output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"]
    outputs = list(zip(*layer_outputs))
    outputs = [
        tf.concat(list(o), name=n, axis=1)
        for o, n in zip(outputs, output_names)
    ]
    rpn_class_logits, rpn_class, rpn_bbox = outputs
    rpn_rois = self.proposal([rpn_class, rpn_bbox, anchors])
    if self.mode == 'training':
        active_class_ids = utils.parse_image_meta_graph(
            self.input_image_meta)['active_class_ids']
        if not self.use_rpn_rois:
            input_rois = tf.placeholder(
                dtype=tf.int32,
                shape=[None, self.post_nms_rois_training, 4],
                name='input_rois')
            target_rois = utils.norm_boxes_graph(
                input_rois, tf.shape(self.input_image)[1:3])
        else:
            target_rois = rpn_rois
        rois, target_class_ids, target_bbox, target_mask = \
            self.targetDetection([target_rois, self.input_gt_class_ids,
                                  self.input_gt_boxes_normalized,
                                  self.input_gt_masks])
        pooled = self.pyramidRoiPooling([rois, self.input_image_meta] +
                                        mrcnn_feature_maps)
        pooled_mask = self.pyramidRoiPooling(
            [rois, self.input_image_meta] + mrcnn_feature_maps, pool_size=14)
        mrcnn_class_logits, mrcnn_class, mrcnn_bbox = self.fpnClassifier(
            pooled)
        mrcnn_mask = self.fpnMask(pooled_mask)
        output_rois = tf.identity(rois, name='output_rois')
        rpn_class_loss = layer.rpn_loss(self.input_rpn_match,
                                        rpn_class_logits)
        rpn_bbox_loss = layer.rpn_bbox_loss(self.input_rpn_boxes,
                                            self.input_rpn_match, rpn_bbox)
        class_loss = layer.mrcnn_class_loss(target_class_ids,
                                            mrcnn_class_logits,
                                            active_class_ids)
        bbox_loss = layer.mrcnn_bbox_loss(target_bbox, target_class_ids,
                                          mrcnn_bbox)
        mask_loss = layer.mrcnn_mask_loss(target_mask, target_class_ids,
                                          mrcnn_mask)
        tf.summary.scalar('rpn_class_loss', rpn_class_loss)
        tf.summary.scalar('rpn_bbox_loss', rpn_bbox_loss)
        tf.summary.scalar('mrcnn_class_loss', class_loss)
        tf.summary.scalar('mrcnn_bbox_loss', bbox_loss)
        tf.summary.scalar('mrcnn_mask_loss', mask_loss)
        outputs = [
            rpn_class_logits, rpn_class, rpn_bbox, mrcnn_class_logits,
            mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois, output_rois,
            rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss
        ]
    else:
        pooled = self.pyramidRoiPooling([rpn_rois, self.input_image_meta] +
                                        mrcnn_feature_maps)
        mrcnn_class_logits, mrcnn_class, mrcnn_bbox = self.fpnClassifier(
            pooled)
        detections = self.objDetection(
            [rpn_rois, mrcnn_class, mrcnn_bbox, self.input_image_meta])
        detections_bbox = detections[..., :4]
        pooled = self.pyramidRoiPooling(
            [detections_bbox, self.input_image_meta] + mrcnn_feature_maps,
            pool_size=14)
        mrcnn_mask = self.fpnMask(pooled)
        outputs = [
            detections, mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois,
            rpn_class, rpn_bbox
        ]
    return outputs
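# A hedged sketch of running the inference graph assembled by model() with a
# plain TF1 session. It assumes an instance `net` of the class above built
# with mode='inference', plus externally prepared molded_images, image_metas,
# and anchors arrays; preprocessing helpers and checkpoint restoring are
# omitted (in practice weights would be restored rather than randomly
# initialized).
import tensorflow as tf


def run_inference_sketch(net, molded_images, image_metas, anchors):
    with net.graph.as_default():
        # Build the graph; outputs are [detections, mrcnn_class, mrcnn_bbox,
        # mrcnn_mask, rpn_rois, rpn_class, rpn_bbox]
        outputs = net.model()
        with tf.Session(graph=net.graph) as sess:
            sess.run(tf.global_variables_initializer())
            feed_dict = {
                net.input_image: molded_images,
                net.input_image_meta: image_metas,
                net.input_anchors: anchors,
            }
            detections, mrcnn_mask = sess.run([outputs[0], outputs[3]],
                                              feed_dict=feed_dict)
    return detections, mrcnn_mask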