def call(self, inputs):
        rois = inputs[0]
        mrcnn_class = inputs[1]
        mrcnn_bbox = inputs[2]
        image_meta = inputs[3]

        # Get windows of images in normalized coordinates. Windows are the area
        # in the image that excludes the padding.
        # Use the shape of the first image in the batch to normalize the window
        # because we know that all images get resized to the same size.
        m = utils.parse_image_meta_graph(image_meta)
        image_shape = m['image_shape'][0]
        window = utils.norm_boxes_graph(m['window'], image_shape[:2])

        # Run detection refinement graph on each item in the batch
        detections_batch = utils.batch_slice(
            [rois, mrcnn_class, mrcnn_bbox, window],
            lambda x, y, w, z: refine_detections_graph(
                x, y, w, z,
                self.bbox_std_dev,
                self.detection_min_confidence,
                self.detection_max_instance,
                self.detection_nms_threshold),
            self.count_image_per_gpu)

        # Reshape output
        # [batch, num_detections, (y1, x1, y2, x2, class_id, class_score)] in
        # normalized coordinates
        return tf.reshape(
            detections_batch,
            [self.size_batch, self.detection_max_instance, 6])
Example #2
    def call(self, inputs):
        rois = inputs[0]
        mrcnn_class = inputs[1]
        mrcnn_bbox = inputs[2]
        image_meta = inputs[3]

        # Get windows of images in normalized coordinates. Windows are the area
        # in the image that excludes the padding.
        # Use the shape of the first image in the batch to normalize the window
        # because we know that all images get resized to the same size.
        m = parse_image_meta_graph(image_meta)
        image_shape = m['image_shape'][0]
        window = norm_boxes_graph(m['window'], image_shape[:2])

        # Run detection refinement graph on each item in the batch
        detections_batch = utils.batch_slice(
            [rois, mrcnn_class, mrcnn_bbox, window],
            lambda x, y, w, z: refine_detections_graph(x, y, w, z, self.config),
            self.config.IMAGES_PER_GPU)

        # Reshape output
        # [batch, num_detections, (y1, x1, y2, x2, class_id, class_score)] in
        # normalized coordinates
        return tf.reshape(
            detections_batch,
            [self.config.BATCH_SIZE, self.config.DETECTION_MAX_INSTANCES, 6])
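The reshaped output above is [batch, DETECTION_MAX_INSTANCES, 6] in normalized coordinates, zero-padded at the end. A minimal post-processing sketch (an assumed helper, not part of either example) for unpacking one image's detections:

import numpy as np

def unpack_detections(detections):
    # detections: [DETECTION_MAX_INSTANCES, (y1, x1, y2, x2, class_id, score)],
    # zero-padded; class_id 0 marks padding/background rows.
    valid = np.where(detections[:, 4] > 0)[0]
    boxes = detections[valid, :4]                 # still in normalized coordinates
    class_ids = detections[valid, 4].astype(np.int32)
    scores = detections[valid, 5]
    return boxes, class_ids, scores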
Example #3
    def build(self, mode, config):
        """Build Mask R-CNN architecture.
            input_shape: The shape of the input image.
            mode: Either "training" or "inference". The inputs and
                outputs of the model differ accordingly.
        """
        assert mode in ['training', 'inference']

        # Image size must be divisible by 2 multiple times
        h, w = config.IMAGE_SHAPE[:2]
        if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6):
            raise Exception(
                "Image size must be dividable by 2 at least 6 times "
                "to avoid fractions when downscaling and upscaling."
                "For example, use 256, 320, 384, 448, 512, ... etc. ")

        # Inputs
        input_image = Input(shape=[None, None, config.IMAGE_SHAPE[2]],
                            name="input_image")
        input_image_meta = Input(shape=[config.IMAGE_META_SIZE],
                                 name="input_image_meta")
        if mode == "training":
            # RPN GT
            input_rpn_match = Input(shape=[None, 1],
                                    name="input_rpn_match",
                                    dtype=tf.int32)
            input_rpn_bbox = Input(shape=[None, 4],
                                   name="input_rpn_bbox",
                                   dtype=tf.float32)

            # Detection GT (class IDs, bounding boxes, and masks)
            # 1. GT Class IDs (zero padded)
            input_gt_class_ids = Input(shape=[None],
                                       name="input_gt_class_ids",
                                       dtype=tf.int32)
            # 2. GT Boxes in pixels (zero padded)
            # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates
            input_gt_boxes = Input(shape=[None, 4],
                                   name="input_gt_boxes",
                                   dtype=tf.float32)
            # Normalize coordinates
            gt_boxes = Lambda(lambda x: norm_boxes_graph(
                x,
                K.shape(input_image)[1:3]))(input_gt_boxes)
            # 3. GT Masks (zero padded)
            # [batch, height, width, MAX_GT_INSTANCES]
            if config.USE_MINI_MASK:
                input_gt_masks = Input(shape=[
                    config.MINI_MASK_SHAPE[0], config.MINI_MASK_SHAPE[1], None
                ],
                                       name="input_gt_masks",
                                       dtype=bool)
            else:
                input_gt_masks = Input(
                    shape=[config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1], None],
                    name="input_gt_masks",
                    dtype=bool)
        elif mode == "inference":
            # Anchors in normalized coordinates
            input_anchors = Input(shape=[None, 4], name="input_anchors")

        # Build the shared convolutional layers.
        # Bottom-up Layers
        # Returns a list of the last layers of each stage, 5 in total.
        # The first stage (C1) is not used by the FPN, so it is discarded.
        if callable(config.BACKBONE):
            _, C2, C3, C4, C5 = config.BACKBONE(input_image,
                                                stage5=True,
                                                train_bn=config.TRAIN_BN)
        else:
            _, C2, C3, C4, C5 = resnet_graph(input_image,
                                             config.BACKBONE,
                                             stage5=True,
                                             train_bn=config.TRAIN_BN)
        # Top-down Layers
        # TODO: add assert to verify feature map sizes match what's in config
        P5 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c5p5')(C5)
        P4 = add([
            UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5),
            Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c4p4')(C4)
        ])
        P3 = add([
            UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4),
            Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c3p3')(C3)
        ])
        P2 = add([
            UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3),
            Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c2p2')(C2)
        ])
        # Attach 3x3 conv to all P layers to get the final feature maps.
        P2 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3),
                    padding="SAME",
                    name="fpn_p2")(P2)
        P3 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3),
                    padding="SAME",
                    name="fpn_p3")(P3)
        P4 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3),
                    padding="SAME",
                    name="fpn_p4")(P4)
        P5 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3),
                    padding="SAME",
                    name="fpn_p5")(P5)
        # P6 is used for the 5th anchor scale in RPN. Generated by
        # subsampling from P5 with stride of 2.
        P6 = MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(P5)

        # Note that P6 is used in RPN, but not in the classifier heads.
        rpn_feature_maps = [P2, P3, P4, P5, P6]
        mrcnn_feature_maps = [P2, P3, P4, P5]

        # Anchors
        if mode == "training":
            anchors = self.get_anchors(config.IMAGE_SHAPE)
            # Duplicate across the batch dimension because Keras requires it
            # TODO: can this be optimized to avoid duplicating the anchors?
            anchors = np.broadcast_to(anchors,
                                      (config.BATCH_SIZE, ) + anchors.shape)
            # A hack to get around Keras's bad support for constants
            anchors = Lambda(lambda x: tf.Variable(anchors),
                             name="anchors")(input_image)
        else:
            anchors = input_anchors

        # RPN Model
        rpn = build_rpn_model(config.RPN_ANCHOR_STRIDE,
                              len(config.RPN_ANCHOR_RATIOS),
                              config.TOP_DOWN_PYRAMID_SIZE)
        # Loop through pyramid layers
        layer_outputs = []  # list of lists
        for p in rpn_feature_maps:
            layer_outputs.append(rpn([p]))
        # Concatenate layer outputs
        # Convert from list of lists of level outputs to list of lists
        # of outputs across levels.
        # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
        output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"]
        outputs = list(zip(*layer_outputs))
        outputs = [
            Concatenate(axis=1, name=n)(list(o))
            for o, n in zip(outputs, output_names)
        ]

        rpn_class_logits, rpn_class, rpn_bbox = outputs

        # Generate proposals
        # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
        # and zero padded.
        proposal_count = config.POST_NMS_ROIS_TRAINING if mode == "training" \
            else config.POST_NMS_ROIS_INFERENCE
        rpn_rois = ProposalLayer(proposal_count=proposal_count,
                                 nms_threshold=config.RPN_NMS_THRESHOLD,
                                 name="ROI",
                                 config=config)([rpn_class, rpn_bbox, anchors])

        if mode == "training":
            # Class ID mask to mark class IDs supported by the dataset the image
            # came from.
            active_class_ids = Lambda(lambda x: parse_image_meta_graph(x)[
                "active_class_ids"])(input_image_meta)

            if not config.USE_RPN_ROIS:
                # Ignore predicted ROIs and use ROIs provided as an input.
                input_rois = Input(shape=[config.POST_NMS_ROIS_TRAINING, 4],
                                   name="input_roi",
                                   dtype=np.int32)
                # Normalize coordinates
                target_rois = Lambda(lambda x: norm_boxes_graph(
                    x,
                    K.shape(input_image)[1:3]))(input_rois)
            else:
                target_rois = rpn_rois

            # Generate detection targets
            # Subsamples proposals and generates target outputs for training
            # Note that proposal class IDs, gt_boxes, and gt_masks are zero
            # padded. Equally, returned rois and targets are zero padded.
            rois, target_class_ids, target_bbox, target_mask = \
                DetectionTargetLayer(config, name="proposal_targets")([
                    target_rois, input_gt_class_ids, gt_boxes, input_gt_masks])

            # Network Heads
            # TODO: verify that this handles zero padded ROIs
            mrcnn_class_logits, mrcnn_class, mrcnn_bbox = \
                fpn_classifier_graph(rois, mrcnn_feature_maps, input_image_meta,
                                     config.POOL_SIZE, config.NUM_CLASSES,
                                     train_bn=config.TRAIN_BN,
                                     fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)

            mrcnn_mask = build_fpn_mask_graph(rois,
                                              mrcnn_feature_maps,
                                              input_image_meta,
                                              config.MASK_POOL_SIZE,
                                              config.NUM_CLASSES,
                                              train_bn=config.TRAIN_BN)

            # TODO: clean up (use tf.identity if necessary)
            output_rois = Lambda(lambda x: x * 1, name="output_rois")(rois)

            # Losses
            rpn_class_loss = Lambda(lambda x: rpn_class_loss_graph(*x),
                                    name="rpn_class_loss")(
                                        [input_rpn_match, rpn_class_logits])
            rpn_bbox_loss = Lambda(lambda x: rpn_bbox_loss_graph(config, *x),
                                   name="rpn_bbox_loss")([
                                       input_rpn_bbox, input_rpn_match,
                                       rpn_bbox
                                   ])
            class_loss = Lambda(lambda x: mrcnn_class_loss_graph(*x),
                                name="mrcnn_class_loss")([
                                    target_class_ids, mrcnn_class_logits,
                                    active_class_ids
                                ])
            bbox_loss = Lambda(lambda x: mrcnn_bbox_loss_graph(*x),
                               name="mrcnn_bbox_loss")(
                                   [target_bbox, target_class_ids, mrcnn_bbox])
            mask_loss = Lambda(lambda x: mrcnn_mask_loss_graph(*x),
                               name="mrcnn_mask_loss")(
                                   [target_mask, target_class_ids, mrcnn_mask])

            # Model
            inputs = [
                input_image, input_image_meta, input_rpn_match, input_rpn_bbox,
                input_gt_class_ids, input_gt_boxes, input_gt_masks
            ]
            if not config.USE_RPN_ROIS:
                inputs.append(input_rois)
            outputs = [
                rpn_class_logits, rpn_class, rpn_bbox, mrcnn_class_logits,
                mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois, output_rois,
                rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss
            ]
            model = Model(inputs, outputs, name='mask_rcnn')
        else:
            # Network Heads
            # Proposal classifier and BBox regressor heads
            mrcnn_class_logits, mrcnn_class, mrcnn_bbox = \
                fpn_classifier_graph(rpn_rois, mrcnn_feature_maps, input_image_meta,
                                     config.POOL_SIZE, config.NUM_CLASSES,
                                     train_bn=config.TRAIN_BN,
                                     fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)

            # Detections
            # output is [batch, num_detections, (y1, x1, y2, x2, class_id, score)] in
            # normalized coordinates
            detections = DetectionLayer(config, name="mrcnn_detection")(
                [rpn_rois, mrcnn_class, mrcnn_bbox, input_image_meta])

            # Create masks for detections
            detection_boxes = Lambda(lambda x: x[..., :4])(detections)
            mrcnn_mask = build_fpn_mask_graph(detection_boxes,
                                              mrcnn_feature_maps,
                                              input_image_meta,
                                              config.MASK_POOL_SIZE,
                                              config.NUM_CLASSES,
                                              train_bn=config.TRAIN_BN)

            model = Model([input_image, input_image_meta, input_anchors], [
                detections, mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois,
                rpn_class, rpn_bbox
            ],
                          name='mask_rcnn')

        # Add multi-GPU support.
        # if config.GPU_COUNT > 1:
        #     from mrcnn.parallel_model import ParallelModel
        #     model = ParallelModel(model, config.GPU_COUNT)

        return model
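The per-level RPN outputs in build() are merged with a zip(*layer_outputs) transpose before concatenation (see the comment above the Concatenate calls). A self-contained illustration of that transpose:

layer_outputs = [["logits_p2", "class_p2", "bbox_p2"],   # outputs for P2
                 ["logits_p3", "class_p3", "bbox_p3"]]   # outputs for P3
print(list(zip(*layer_outputs)))
# [('logits_p2', 'logits_p3'), ('class_p2', 'class_p3'), ('bbox_p2', 'bbox_p3')]
# Each tuple is then concatenated along axis 1 into a single rpn_class_logits,
# rpn_class, or rpn_bbox tensor covering all pyramid levels.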
Example #4
def build_SPC(inputs, config, is_training, backbone='resnet50'):
    # Parse the inputs
    input_image = inputs['input_image']
    input_image = input_image - config.MEAN_PIXEL
    image_shape = config.IMAGE_SHAPE
    if is_training:
        # RPN GT
        input_rpn_match = inputs['input_rpn_match']
        input_rpn_bbox = inputs['input_rpn_bbox']
        # Detection GT (class IDs, bounding boxes, and masks)
        # 1. GT Class IDs (zero padded)
        input_gt_class_ids = inputs['input_gt_class_ids']
        # 2. GT Boxes in pixels (zero padded)
        # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates
        input_gt_boxes = inputs["input_gt_boxes"]
        input_gt_boxes = utils.norm_boxes_graph(input_gt_boxes,
                                                tf.shape(input_image)[1:3])
        # 3. GT Masks (zero padded)
        # [batch, MAX_GT_INSTANCES, height, width]
        input_gt_masks = inputs['input_gt_masks']
        # SPCNet global text segmentation
        input_gt_global_masks = inputs['input_gt_global_masks']

    # pyramid_feature: Dict{P2, P3, P4, P5} of feature maps from different levels of
    # the feature pyramid. Each is [batch, height, width, channels]
    pyramid_feature = build_FPN(input_image, config, is_training, backbone)
    # get the pyramid feature map shapes
    fpn_shapes = []
    for i in range(2, 6, 1):
        p = 'P%d' % i
        shape = pyramid_feature[p].shape
        fpn_shapes.append([shape[1], shape[2]])
    fpn_shapes = np.array(fpn_shapes)
    # get the global text segmentation map and saliency map from each pyramid feature
    print('image_shape : ', image_shape)
    gts, tcm_outputs = build_TCM(pyramid_feature, image_shape, config)

    # get all anchors
    anchors = generate_all_anchors(fpn_shapes, image_shape, config)
    # number of anchors per location in the feature map
    anchors_num = len(config.RPN_ANCHOR_RATIOS)
    # build rpn model and get outputs
    rpn_class_logits, rpn_prob, rpn_bbox = build_RPN(tcm_outputs, image_shape,
                                                     anchors_num, is_training,
                                                     config)
    # Generate proposals
    # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
    # and zero padded.
    proposal_count = config.POST_NMS_ROIS_TRAINING if is_training \
        else config.POST_NMS_ROIS_INFERENCE
    rpn_rois = generate_proposal(rpn_prob, rpn_bbox, anchors, proposal_count,
                                 config)
    assert config.USE_RPN_ROIS, "USE_RPN_ROIS=False is not implemented"

    if is_training:
        # Generate detection targets
        # Subsamples proposals and generates target outputs for training
        # Note that proposal class IDs, gt_boxes, and gt_masks are zero
        # padded. Equally, returned rois and targets are zero padded.
        rois, target_class_ids, target_bbox, target_mask = generate_detect_target(
            rpn_rois, input_gt_class_ids, input_gt_boxes, input_gt_masks, config)
        # Network Heads
        # TODO: verify that this handles zero padded ROIs
        mrcnn_class_logits, mrcnn_prob, mrcnn_bbox = build_mrcnn_head(
            rois, tcm_outputs, image_shape, is_training, config)
        mrcnn_mask_logits, mrcnn_mask = build_mrcnn_mask(
            rois, tcm_outputs, image_shape, is_training, config)
        # loss
        rpn_class_loss = build_rpn_class_loss(input_rpn_match,
                                              rpn_class_logits, config)
        rpn_bbox_loss = build_rpn_bbox_loss(input_rpn_bbox, input_rpn_match,
                                            rpn_bbox, config)
        mrcnn_class_loss = build_mrcnn_class_loss(target_class_ids,
                                                  mrcnn_class_logits, config)
        mrcnn_bbox_loss = build_mrcnn_bbox_loss(target_bbox, target_class_ids,
                                                mrcnn_bbox, config)
        mrcnn_mask_loss = build_mrcnn_mask_loss(target_mask, target_class_ids,
                                                mrcnn_mask_logits, config)
        global_mask_loss = build_global_mask_loss(input_gt_global_masks, gts,
                                                  config)

        losses = {}
        losses['rpn_class_loss'] = rpn_class_loss * config.LOSS_WEIGHTS[
            'rpn_class_loss']
        losses['rpn_bbox_loss'] = rpn_bbox_loss * config.LOSS_WEIGHTS[
            'rpn_bbox_loss']
        losses['mrcnn_class_loss'] = mrcnn_class_loss * config.LOSS_WEIGHTS[
            'mrcnn_class_loss']
        losses['mrcnn_bbox_loss'] = mrcnn_bbox_loss * config.LOSS_WEIGHTS[
            'mrcnn_bbox_loss']
        losses['mrcnn_mask_loss'] = mrcnn_mask_loss * config.LOSS_WEIGHTS[
            'mrcnn_mask_loss']
        losses['global_mask_loss'] = global_mask_loss * config.LOSS_WEIGHTS[
            'global_mask_loss']
        losses['total_loss'] = tf.add_n(
            [losses[k] for k in losses.keys()] +
            tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
        return losses
    else:
        # Network Heads
        # Proposal classifier and BBox regressor heads
        mrcnn_class_logits, mrcnn_prob, mrcnn_bbox = build_mrcnn_head(
            rpn_rois, tcm_outputs, image_shape, is_training, config)
        # Create masks for detections
        mrcnn_mask_logits, mrcnn_mask = build_mrcnn_mask(
            rpn_rois, tcm_outputs, image_shape, is_training, config)
        # Reshape global text segmentation map
        # Original: Dict{P2, P3, P4, P5} each is [batch, H, W, NUM_CLASSES]
        # Reshape:  [Batch, 4, H, W, NUM_CLASSES]
        stack_gts = []
        for i in range(config.BATCH_SIZE):
            stack_gts.append(tf.stack(
                [gts[P][i, ...] for P in ['P2', 'P3', 'P4', 'P5']], axis=0))
        gts = tf.stack(stack_gts, axis=0)
        # Detections
        # output is
        # detection :[batch, num_detections, (y1, x1, y2, x2, class_id, score)] in normalized coordinates
        # mask: [batch, num_detections, MASK_H, MASK_W, class_num]
        detections, masks = get_detect_results(rpn_rois, mrcnn_prob,
                                               mrcnn_bbox, mrcnn_mask, gts,
                                               config)

        return detections, masks
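build_SPC weights each loss term with config.LOSS_WEIGHTS before summing them into total_loss. A hypothetical LOSS_WEIGHTS dictionary matching the keys used above (the actual values live in the project's config and may differ):

LOSS_WEIGHTS = {
    'rpn_class_loss': 1.0,
    'rpn_bbox_loss': 1.0,
    'mrcnn_class_loss': 1.0,
    'mrcnn_bbox_loss': 1.0,
    'mrcnn_mask_loss': 1.0,
    'global_mask_loss': 1.0,
}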
Example #5
    def __init__(self,
                 mode,
                 rpn_anchor_ratios,
                 rpn_anchor_scales,
                 mask_shape,
                 pool_size,
                 image_shape,
                 mini_mask_shape,
                 backbone_strides,
                 mean_pixel,
                 roi_size=7,
                 backbone='resnet50',
                 stage5=True,
                 norm='batch',
                 use_bias=True,
                 rpn_anchor_stride=1,
                 image_per_gpu=1,
                 gpu_count=1,
                 detection_max_instances=100,
                 train_rois_per_image=200,
                 num_classes=1,
                 use_mini_mask=True,
                 use_pretrained_model=True,
                 top_down_pyramid_size=256,
                 post_nms_rois_training=2000,
                 post_nms_rois_inference=1000,
                 pre_nms_limit=6000,
                 rpn_nms_threshold=0.7,
                 use_rpn_rois=True,
                 model_dir=None,
                 optimizer_method='Adam',
                 learning_rate=0.001,
                 momentum=0.9,
                 weight_decay=0.0001,
                 image_min_dim=800,
                 image_max_dim=1024,
                 image_min_scale=0.0,
                 image_resize_mode='square',
                 max_gt_instances=100,
                 rpn_train_anchors_per_image=256):

        assert mode in ['training', 'inference']
        assert optimizer_method in ['Adam', 'SGD']

        tf.reset_default_graph()
        self.graph = tf.Graph()

        self.mode = mode
        self.rpn_anchor_ratios = rpn_anchor_ratios
        self.rpn_anchor_scales = rpn_anchor_scales
        self.mask_shape = mask_shape
        self.pool_size = pool_size
        self.image_shape = np.array(image_shape)
        self.mini_mask_shape = mini_mask_shape
        self.backbone_strides = backbone_strides
        self.mean_pixel = mean_pixel

        self.roi_size = roi_size
        self.backbone = backbone
        self.stage5 = stage5
        self.norm = norm
        self.use_bias = use_bias
        self.rpn_anchor_stride = rpn_anchor_stride
        self.image_per_gpu = image_per_gpu
        self.gpu_count = gpu_count
        self.detection_max_instances = detection_max_instances
        self.train_rois_per_image = train_rois_per_image
        self.num_classes = num_classes
        self.use_mini_mask = use_mini_mask
        self.use_pretrained_model = use_pretrained_model
        self.top_down_pyramid_size = top_down_pyramid_size
        self.post_nms_rois_training = post_nms_rois_training
        self.post_nms_rois_inference = post_nms_rois_inference
        self.pre_nms_limit = pre_nms_limit
        self.rpn_nms_threshold = rpn_nms_threshold
        self.use_rpn_rois = use_rpn_rois
        self.model_dir = model_dir
        self.optimizer_method = optimizer_method
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.image_min_dim = image_min_dim
        self.image_max_dim = image_max_dim
        self.image_min_scale = image_min_scale
        self.image_resize_mode = image_resize_mode
        self.max_gt_instances = max_gt_instances
        self.rpn_train_anchors_per_image = rpn_train_anchors_per_image

        # image_id (1) + original_shape (3) + image_shape (3) + window (4) +
        # scale (1) + active class IDs (num_classes)
        self.image_meta_size = 1 + 3 + 3 + 4 + 1 + self.num_classes
        self.reuse = False
        self._anchor_cache = {}
        self.batch_size = self.gpu_count * self.image_per_gpu
        self.backbone_shape = utils.compute_backbone_shapes(
            self.backbone, self.backbone_strides, self.image_shape)
        # Total number of anchors over all pyramid levels:
        # anchor locations per level (height * width) * anchors per location.
        self.num_anchors_per_image = len(self.rpn_anchor_ratios) * (
            self.backbone_shape[0][0] * self.backbone_shape[0][1] +
            self.backbone_shape[1][0] * self.backbone_shape[1][1] +
            self.backbone_shape[2][0] * self.backbone_shape[2][1] +
            self.backbone_shape[3][0] * self.backbone_shape[3][1] +
            self.backbone_shape[4][0] * self.backbone_shape[4][1])

        with self.graph.as_default():

            self.is_training = tf.placeholder_with_default(False, [])
            self.input_image = tf.placeholder(dtype=tf.float32,
                                              shape=[
                                                  None, self.image_shape[0],
                                                  self.image_shape[1],
                                                  self.image_shape[2]
                                              ],
                                              name='input_image')
            self.input_image_meta = tf.placeholder(
                dtype=tf.int32,
                shape=[None, self.image_meta_size],
                name='input_image_meta')

            if mode == 'training':
                self.input_rpn_match = tf.placeholder(
                    dtype=tf.int32,
                    shape=[None, self.num_anchors_per_image, 1],
                    name='input_rpn_match')
                self.input_rpn_boxes = tf.placeholder(
                    dtype=tf.float32,
                    shape=[None, self.rpn_train_anchors_per_image, 4],
                    name='input_rpn_boxes')
                self.input_gt_class_ids = tf.placeholder(
                    dtype=tf.int32,
                    shape=[None, self.max_gt_instances],
                    name='input_gt_class_ids')
                self.input_gt_boxes = tf.placeholder(
                    dtype=tf.float32,
                    shape=[None, self.max_gt_instances, 4],
                    name='input_gt_boxes')
                self.input_gt_boxes_normalized = utils.norm_boxes_graph(
                    self.input_gt_boxes,
                    tf.shape(self.input_image)[1:3])
                self.proposal_count = self.post_nms_rois_training
                if self.use_mini_mask:
                    self.input_gt_masks = tf.placeholder(
                        dtype=tf.bool,
                        shape=[
                            None, self.mini_mask_shape[0],
                            self.mini_mask_shape[1], self.max_gt_instances
                        ],
                        name='input_gt_mask')
                else:
                    self.input_gt_masks = tf.placeholder(
                        dtype=tf.bool,
                        shape=[
                            None, self.image_shape[0], self.image_shape[1],
                            self.max_gt_instances
                        ],
                        name='input_gt_mask')

            elif mode == 'inference':
                self.input_anchors = tf.placeholder(dtype=tf.float32,
                                                    shape=[None, None, 4],
                                                    name='input_anchors')
                self.proposal_count = self.post_nms_rois_inference

            self.resnet = Resnet(name='resnet',
                                 architecture=self.backbone,
                                 is_training=self.is_training,
                                 stage5=self.stage5,
                                 use_bias=self.use_bias)

            arg_scope = nets.resnet_v2.resnet_arg_scope()
            with slim.arg_scope(arg_scope):
                _, self.end_points = nets.resnet_v2.resnet_v2_50(
                    self.input_image,
                    num_classes=None,
                    is_training=self.is_training)

            self.fpn = FPN(name='fpn',
                           top_down_pyramid_size=self.top_down_pyramid_size,
                           use_bias=self.use_bias)

            self.rpn = RPN(name='rpn',
                           anchors_per_location=len(self.rpn_anchor_ratios),
                           anchor_stride=self.rpn_anchor_stride,
                           is_training=self.is_training,
                           use_bias=self.use_bias)
            self.proposal = ProposalLayer(self.pre_nms_limit,
                                          self.proposal_count,
                                          self.rpn_nms_threshold,
                                          self.image_per_gpu)
            self.pyramidRoiPooling = PyramidRoiPooling(
                name='PyramidRoiPooling', roi_size=self.roi_size)
            self.objDetection = ObjDetection(
                image_per_gpu=self.image_per_gpu,
                gpu_count=self.gpu_count,
                detection_max_instances=self.detection_max_instances)
            self.targetDetection = TargetDetection(
                mask_shape=self.mask_shape,
                image_per_gpu=self.image_per_gpu,
                train_rois_per_image=self.train_rois_per_image)
            self.fpnClassifier = FpnClassifier('FpnClassifier',
                                               pool_size=self.pool_size,
                                               num_classes=self.num_classes,
                                               is_training=self.is_training)
            self.fpnMask = FpnMask('FpnMask',
                                   num_classes=self.num_classes,
                                   is_training=self.is_training)
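image_meta_size in __init__ above is 1 + 3 + 3 + 4 + 1 + num_classes. A minimal sketch of how such a meta vector could be composed (field names are illustrative; the project's own compose helper may differ):

import numpy as np

def compose_image_meta(image_id, original_shape, image_shape, window, scale,
                       active_class_ids):
    # 1 (image_id) + 3 (original H, W, C) + 3 (resized H, W, C) +
    # 4 (window y1, x1, y2, x2 of the un-padded area) + 1 (scale) + num_classes
    return np.array([image_id] + list(original_shape) + list(image_shape) +
                    list(window) + [scale] + list(active_class_ids),
                    dtype=np.float32)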
Example #6
    def model(self):

        h, w = self.image_shape[:2]
        if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6):
            raise Exception(
                "Image size must be dividable by 2 at least 6 times "
                "to avoid fractions when downscaling and upscaling."
                "For example, use 256, 320, 384, 448, 512, ... etc. ")
        if self.use_pretrained_model:

            c2, c3, c4, c5 = \
                self.end_points['resnet_v2_50/block1/unit_2/bottleneck_v2'], \
                self.end_points['resnet_v2_50/block2/unit_3/bottleneck_v2'], \
                self.end_points['resnet_v2_50/block3/unit_4/bottleneck_v2'], \
                self.end_points['resnet_v2_50/block4']

        else:

            if callable(self.backbone):
                _, c2, c3, c4, c5 = self.backbone(self.input_image,
                                                  stage5=self.stage5,
                                                  is_training=self.is_training)

            else:
                _, c2, c3, c4, c5 = self.resnet(self.input_image)

        p2, p3, p4, p5, p6 = self.fpn([c2, c3, c4, c5])

        rpn_feature_maps = [p2, p3, p4, p5, p6]
        mrcnn_feature_maps = [p2, p3, p4, p5]

        if self.mode == 'training':
            anchors = self.get_anchors(self.image_shape)
            anchors = np.broadcast_to(anchors,
                                      (self.batch_size, ) + anchors.shape)
            anchors = tf.constant(anchors)
        else:
            anchors = self.input_anchors

        layer_outputs = []
        for p in rpn_feature_maps:
            layer_outputs.append(self.rpn(p))

        output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"]
        outputs = list(zip(*layer_outputs))
        outputs = [
            tf.concat(list(o), name=n, axis=1)
            for o, n in zip(outputs, output_names)
        ]

        rpn_class_logits, rpn_class, rpn_bbox = outputs
        rpn_rois = self.proposal([rpn_class, rpn_bbox, anchors])

        if self.mode == 'training':
            active_class_ids = utils.parse_image_meta_graph(
                self.input_image_meta)['active_class_ids']

            if not self.use_rpn_rois:
                input_rois = tf.placeholder(
                    dtype=tf.int32,
                    shape=[None, self.post_nms_rois_training, 4],
                    name='input_rois')
                target_rois = utils.norm_boxes_graph(
                    input_rois,
                    tf.shape(self.input_image)[1:3])

            else:

                target_rois = rpn_rois

            rois, target_class_ids, target_bbox, target_mask = \
                self.targetDetection([target_rois, self.input_gt_class_ids,
                                      self.input_gt_boxes_normalized, self.input_gt_masks])

            pooled = self.pyramidRoiPooling([rois, self.input_image_meta] +
                                            mrcnn_feature_maps)
            pooled_mask = self.pyramidRoiPooling(
                [rois, self.input_image_meta] + mrcnn_feature_maps,
                pool_size=14)

            mrcnn_class_logits, mrcnn_class, mrcnn_bbox = self.fpnClassifier(
                pooled)
            mrcnn_mask = self.fpnMask(pooled_mask)

            output_rois = tf.identity(rois, name='output_rois')

            rpn_class_loss = layer.rpn_loss(self.input_rpn_match,
                                            rpn_class_logits)
            rpn_bbox_loss = layer.rpn_bbox_loss(self.input_rpn_boxes,
                                                self.input_rpn_match, rpn_bbox)
            class_loss = layer.mrcnn_class_loss(target_class_ids,
                                                mrcnn_class_logits,
                                                active_class_ids)
            bbox_loss = layer.mrcnn_bbox_loss(target_bbox, target_class_ids,
                                              mrcnn_bbox)
            mask_loss = layer.mrcnn_mask_loss(target_mask, target_class_ids,
                                              mrcnn_mask)

            tf.summary.scalar('rpn_class_loss', rpn_class_loss)
            tf.summary.scalar('rpn_bbox_loss', rpn_bbox_loss)
            tf.summary.scalar('mrcnn_class_loss', class_loss)
            tf.summary.scalar('mrcnn_bbox_loss', bbox_loss)
            tf.summary.scalar('mrcnn_mask_loss', mask_loss)

            outputs = [
                rpn_class_logits, rpn_class, rpn_bbox, mrcnn_class_logits,
                mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois, output_rois,
                rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss
            ]
        else:
            pooled = self.pyramidRoiPooling([rpn_rois, self.input_image_meta] +
                                            mrcnn_feature_maps)
            mrcnn_class_logits, mrcnn_class, mrcnn_bbox = self.fpnClassifier(
                pooled)

            detections = self.objDetection(
                [rpn_rois, mrcnn_class, mrcnn_bbox, self.input_image_meta])

            detections_bbox = detections[..., :4]

            pooled = self.pyramidRoiPooling(
                [detections_bbox, self.input_image_meta] + mrcnn_feature_maps,
                pool_size=14)

            mrcnn_mask = self.fpnMask(pooled)

            outputs = [
                detections, mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois,
                rpn_class, rpn_bbox
            ]

        return outputs
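model() starts with the same size check as Example #3: height and width must be divisible by 2**6 = 64 so that six rounds of stride-2 downsampling, and the matching upsampling, keep integer feature-map sizes. A standalone sketch of an equivalent check:

def check_image_shape(h, w, num_stride2_stages=6):
    factor = 2 ** num_stride2_stages  # 64
    if h % factor or w % factor:
        raise ValueError("Image size must be a multiple of %d, "
                         "e.g. 256, 320, 384, 448, 512." % factor)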