Пример #1
0
    def call(self, inputs):
        # Crop boxes [batch, num_boxes, (y1, x1, y2, x2)] in normalized coords
        boxes = inputs[0]

        # Image meta
        # Holds details about the image. See compose_image_meta()
        image_meta = inputs[1]

        # Feature Maps. List of feature maps from different level of the
        # feature pyramid. Each is [batch, height, width, channels]
        feature_maps = inputs[2:]

        # Assign each ROI to a level in the pyramid based on the ROI area.
        y1, x1, y2, x2 = tf.split(boxes, 4, axis=2)
        h = y2 - y1
        w = x2 - x1
        # Use shape of first image. Images in a batch must have the same size.
        image_shape = parse_image_meta_graph(image_meta)['image_shape'][0]
        # Equation 1 in the Feature Pyramid Networks paper. Account for
        # the fact that our coordinates are normalized here.
        # e.g. a 224x224 ROI (in pixels) maps to P4
        image_area = tf.cast(image_shape[0] * image_shape[1], tf.float32)
        roi_level = log2_graph(tf.sqrt(h * w) / (224.0 / tf.sqrt(image_area)))
        roi_level = tf.minimum(
            5, tf.maximum(2, 4 + tf.cast(tf.round(roi_level), tf.int32)))
        roi_level = tf.squeeze(roi_level, 2)

        # Loop through levels and apply ROI pooling to each. P2 to P5.
        pooled = []
        box_to_level = []
        for i, level in enumerate(range(2, 6)):
            ix = tf.where(tf.equal(roi_level, level))
            level_boxes = tf.gather_nd(boxes, ix)

            # Box indices for crop_and_resize.
            box_indices = tf.cast(ix[:, 0], tf.int32)

            # Keep track of which box is mapped to which level
            box_to_level.append(ix)

            # Stop gradient propogation to ROI proposals
            level_boxes = tf.stop_gradient(level_boxes)
            box_indices = tf.stop_gradient(box_indices)

            # Crop and Resize
            # From Mask R-CNN paper: "We sample four regular locations, so
            # that we can evaluate either max or average pooling. In fact,
            # interpolating only a single value at each bin center (without
            # pooling) is nearly as effective."
            #
            # Here we use the simplified approach of a single value per bin,
            # which is how it's done in tf.crop_and_resize()
            # Result: [batch * num_boxes, pool_height, pool_width, channels]
            pooled.append(
                tf.image.crop_and_resize(feature_maps[i],
                                         level_boxes,
                                         box_indices,
                                         self.pool_shape,
                                         method="bilinear"))

        # Pack pooled features into one tensor
        pooled = tf.concat(pooled, axis=0)

        # Pack box_to_level mapping into one array and add another
        # column representing the order of pooled boxes
        box_to_level = tf.concat(box_to_level, axis=0)
        box_range = tf.expand_dims(tf.range(tf.shape(box_to_level)[0]), 1)
        box_to_level = tf.concat([tf.cast(box_to_level, tf.int32), box_range],
                                 axis=1)

        # Rearrange pooled features to match the order of the original boxes
        # Sort box_to_level by batch then box index
        # TF doesn't have a way to sort by two columns, so merge them and sort.
        sorting_tensor = box_to_level[:, 0] * 100000 + box_to_level[:, 1]
        ix = tf.nn.top_k(sorting_tensor,
                         k=tf.shape(box_to_level)[0]).indices[::-1]
        ix = tf.gather(box_to_level[:, 2], ix)
        pooled = tf.gather(pooled, ix)

        # Re-add the batch dimension
        shape = tf.concat([tf.shape(boxes)[:2], tf.shape(pooled)[1:]], axis=0)
        pooled = tf.reshape(pooled, shape)
        return pooled
Пример #2
0
    def build(self, mode: str, config: ShapesConfig):
        """Build Mask R-CNN architecture.
            input_shape: The shape of the input image.
            mode: Either "training" or "inference". The inputs and
                outputs of the model differ accordingly.
        """
        assert mode in ['training', 'inference']

        # Image size must be dividable by 2 multiple times
        h, w = config.IMAGE_SHAPE[:2]
        if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6):
            raise Exception(
                "Image size must be dividable by 2 at least 6 times "
                "to avoid fractions when downscaling and upscaling."
                "For example, use 256, 320, 384, 448, 512, ... etc. ")

        # Inputs
        input_image = KL.Input(shape=[None, None, config.IMAGE_SHAPE[2]],
                               name="input_image")

        input_image_meta = KL.Input(shape=[config.IMAGE_META_SIZE],
                                    name="input_image_meta")

        if mode == "training":
            # RPN GT
            input_rpn_match = KL.Input(shape=[None, 1],
                                       name="input_rpn_match",
                                       dtype=tf.int32)
            input_rpn_bbox = KL.Input(shape=[None, 4],
                                      name="input_rpn_bbox",
                                      dtype=tf.float32)

            # Detection GT (class IDs, bounding boxes, and masks)
            # 1. GT Class IDs (zero padded)
            input_gt_class_ids = KL.Input(shape=[None],
                                          name="input_gt_class_ids",
                                          dtype=tf.int32)
            # 2. GT Boxes in pixels (zero padded)
            # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates
            input_gt_boxes = KL.Input(shape=[None, 4],
                                      name="input_gt_boxes",
                                      dtype=tf.float32)
            # Normalize coordinates
            gt_boxes = KL.Lambda(lambda x: norm_boxes_graph(
                x,
                K.shape(input_image)[1:3]))(input_gt_boxes)
            # 3. GT Masks (zero padded)
            # [batch, height, width, MAX_GT_INSTANCES]
            if config.USE_MINI_MASK:
                input_gt_masks = KL.Input(shape=[
                    config.MINI_MASK_SHAPE[0], config.MINI_MASK_SHAPE[1], None
                ],
                                          name="input_gt_masks",
                                          dtype=bool)
            else:
                input_gt_masks = KL.Input(
                    shape=[config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1], None],
                    name="input_gt_masks",
                    dtype=bool)
        elif mode == "inference":
            # Anchors in normalized coordinates
            input_anchors = KL.Input(shape=[None, 4], name="input_anchors")
            anchors = input_anchors

        # Build the shared convolutional layers.
        # Bottom-up Layers
        # Returns a list of the last layers of each stage, 5 in total.
        # Don't create the thead (stage 5), so we pick the 4th item in the list.
        if callable(config.BACKBONE):
            _, C2, C3, C4, C5 = config.BACKBONE(input_image,
                                                stage5=True,
                                                train_bn=config.TRAIN_BN)
        else:
            _, C2, C3, C4, C5 = resnet_graph(input_image,
                                             config.BACKBONE,
                                             stage5=True,
                                             train_bn=config.TRAIN_BN)
        # Top-down Layers
        # TODO: add assert to varify feature map sizes match what's in config
        P5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1),
                       name='fpn_c5p5')(C5)
        P4 = KL.Add(name="fpn_p4add")([
            KL.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5),
            KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1),
                      name='fpn_c4p4')(C4)
        ])
        P3 = KL.Add(name="fpn_p3add")([
            KL.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4),
            KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1),
                      name='fpn_c3p3')(C3)
        ])
        P2 = KL.Add(name="fpn_p2add")([
            KL.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3),
            KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1),
                      name='fpn_c2p2')(C2)
        ])
        # Attach 3x3 conv to all P layers to get the final feature maps.
        P2 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3),
                       padding="SAME",
                       name="fpn_p2")(P2)
        P3 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3),
                       padding="SAME",
                       name="fpn_p3")(P3)
        P4 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3),
                       padding="SAME",
                       name="fpn_p4")(P4)
        P5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3),
                       padding="SAME",
                       name="fpn_p5")(P5)
        # P6 is used for the 5th anchor scale in RPN. Generated by
        # subsampling from P5 with stride of 2.
        P6 = KL.MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(P5)

        # Note that P6 is used in RPN, but not in the classifier heads.
        rpn_feature_maps = [P2, P3, P4, P5, P6]
        mrcnn_feature_maps = [P2, P3, P4, P5]

        # Anchors
        if mode == "training":
            anchors = self.get_anchors(config.IMAGE_SHAPE)
            # Duplicate across the batch dimension because Keras requires it
            # TODO: can this be optimized to avoid duplicating the anchors?
            anchors = np.broadcast_to(anchors,
                                      (config.BATCH_SIZE, ) + anchors.shape)
            # A hack to get around Keras's bad support for constants
            anchors = KL.Lambda(lambda x: tf.Variable(anchors),
                                name="anchors")(input_image)

        # RPN Model
        rpn = build_rpn_model(config.RPN_ANCHOR_STRIDE,
                              len(config.RPN_ANCHOR_RATIOS),
                              config.TOP_DOWN_PYRAMID_SIZE)
        # Loop through pyramid layers
        layer_outputs = []  # list of lists
        for p in rpn_feature_maps:
            layer_outputs.append(rpn([p]))
        # Concatenate layer outputs
        # Convert from list of lists of level outputs to list of lists
        # of outputs across levels.
        # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
        output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"]
        outputs = list(zip(*layer_outputs))
        outputs = [
            KL.Concatenate(axis=1, name=n)(list(o))
            for o, n in zip(outputs, output_names)
        ]

        rpn_class_logits, rpn_class, rpn_bbox = outputs

        # Generate proposals
        # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
        # and zero padded.
        proposal_count = config.POST_NMS_ROIS_TRAINING if mode == "training" \
            else config.POST_NMS_ROIS_INFERENCE
        rpn_rois = ProposalLayer(proposal_count=proposal_count,
                                 nms_threshold=config.RPN_NMS_THRESHOLD,
                                 name="ROI",
                                 config=config)([rpn_class, rpn_bbox, anchors])

        if mode == "training":
            # Class ID mask to mark class IDs supported by the dataset the image
            # came from.
            active_class_ids = KL.Lambda(lambda x: parse_image_meta_graph(x)[
                "active_class_ids"])(input_image_meta)

            if not config.USE_RPN_ROIS:
                # Ignore predicted ROIs and use ROIs provided as an input.
                input_rois = KL.Input(shape=[config.POST_NMS_ROIS_TRAINING, 4],
                                      name="input_roi",
                                      dtype=np.int32)
                # Normalize coordinates
                target_rois = KL.Lambda(lambda x: norm_boxes_graph(
                    x,
                    K.shape(input_image)[1:3]))(input_rois)
            else:
                target_rois = rpn_rois

            # Generate detection targets
            # Subsamples proposals and generates target outputs for training
            # Note that proposal class IDs, gt_boxes, and gt_masks are zero
            # padded. Equally, returned rois and targets are zero padded.
            rois, target_class_ids, target_bbox, target_mask = \
                DetectionTargetLayer(config, name="proposal_targets")([
                    target_rois, input_gt_class_ids, gt_boxes, input_gt_masks])

            # Network Heads
            # TODO: verify that this handles zero padded ROIs
            mrcnn_class_logits, mrcnn_class, mrcnn_bbox = \
                fpn_classifier_graph(rois, mrcnn_feature_maps, input_image_meta,
                                     config.POOL_SIZE, config.NUM_CLASSES,
                                     train_bn=config.TRAIN_BN,
                                     fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)

            mrcnn_mask = build_fpn_mask_graph(rois,
                                              mrcnn_feature_maps,
                                              input_image_meta,
                                              config.MASK_POOL_SIZE,
                                              config.NUM_CLASSES,
                                              train_bn=config.TRAIN_BN)

            # TODO: clean up (use tf.identify if necessary)
            output_rois = KL.Lambda(lambda x: x * 1, name="output_rois")(rois)

            # Losses
            rpn_class_loss = KL.Lambda(lambda x: rpn_class_loss_graph(*x),
                                       name="rpn_class_loss")(
                                           [input_rpn_match, rpn_class_logits])
            rpn_bbox_loss = KL.Lambda(
                lambda x: rpn_bbox_loss_graph(config, *x),
                name="rpn_bbox_loss")(
                    [input_rpn_bbox, input_rpn_match, rpn_bbox])
            class_loss = KL.Lambda(lambda x: mrcnn_class_loss_graph(*x),
                                   name="mrcnn_class_loss")([
                                       target_class_ids, mrcnn_class_logits,
                                       active_class_ids
                                   ])
            bbox_loss = KL.Lambda(lambda x: mrcnn_bbox_loss_graph(*x),
                                  name="mrcnn_bbox_loss")([
                                      target_bbox, target_class_ids, mrcnn_bbox
                                  ])
            mask_loss = KL.Lambda(lambda x: mrcnn_mask_loss_graph(*x),
                                  name="mrcnn_mask_loss")([
                                      target_mask, target_class_ids, mrcnn_mask
                                  ])

            # Model
            inputs = [
                input_image, input_image_meta, input_rpn_match, input_rpn_bbox,
                input_gt_class_ids, input_gt_boxes, input_gt_masks
            ]
            if not config.USE_RPN_ROIS:
                inputs.append(input_rois)
            outputs = [
                rpn_class_logits, rpn_class, rpn_bbox, mrcnn_class_logits,
                mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois, output_rois,
                rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss
            ]
            model = KM.Model(inputs, outputs, name='mask_rcnn')
        else:
            # Network Heads
            # Proposal classifier and BBox regressor heads
            mrcnn_class_logits, mrcnn_class, mrcnn_bbox = \
                fpn_classifier_graph(rpn_rois, mrcnn_feature_maps, input_image_meta,
                                     config.POOL_SIZE, config.NUM_CLASSES,
                                     train_bn=config.TRAIN_BN,
                                     fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)

            # Detections
            # output is [batch, num_detections, (y1, x1, y2, x2, class_id, score)] in
            # normalized coordinates
            detections = DetectionLayer(config, name="mrcnn_detection")(
                [rpn_rois, mrcnn_class, mrcnn_bbox, input_image_meta])

            # Create masks for detections
            detection_boxes = KL.Lambda(lambda x: x[..., :4])(detections)
            mrcnn_mask = build_fpn_mask_graph(detection_boxes,
                                              mrcnn_feature_maps,
                                              input_image_meta,
                                              config.MASK_POOL_SIZE,
                                              config.NUM_CLASSES,
                                              train_bn=config.TRAIN_BN)

            model = KM.Model([input_image, input_image_meta, input_anchors], [
                detections, mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois,
                rpn_class, rpn_bbox
            ],
                             name='mask_rcnn')

        # Add multi-GPU support.
        if config.GPU_COUNT > 1:
            from mrcnn.parallel_model import ParallelModel
            model = ParallelModel(model, config.GPU_COUNT)

        return model
Пример #3
0
    def build(self, mode, config):
        """Build the Mask R-CNN teacher-student architecture for mimic training.
        """
        assert mode in ['training', 'inference']

        # Image size must be dividable by 2 multiple times
        h, w = config.IMAGE_SHAPE[:2]
        if h / 2 ** 6 != int(h / 2 ** 6) or w / 2 ** 6 != int(w / 2 ** 6):
            raise Exception("Image size must be dividable by 2 at least 6 times "
                            "to avoid fractions when downscaling and upscaling."
                            "For example, use 256, 320, 384, 448, 512, ... etc. ")

        # Input
        input_image = KL.Input(shape=[None, None, config.IMAGE_SHAPE[2]], name="input_image")
        input_image_meta = KL.Input(shape=[config.IMAGE_META_SIZE], name="input_image_meta")

        if mode == "training":
            # RPN GT
            input_rpn_match = KL.Input(shape=[None, 1], name="input_rpn_match", dtype=tf.int32)
            input_rpn_bbox = KL.Input(shape=[None, 4], name="input_rpn_bbox", dtype=tf.float32)

            # Detection GT (class IDs, bounding boxes, and masks)
            # 1. GT Class IDs (zero padded)
            input_gt_class_ids = KL.Input(shape=[None], name="input_gt_class_ids", dtype=tf.int32)

            # 2. GT Boxes in pixels (zero padded)
            # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates
            input_gt_boxes = KL.Input(shape=[None, 4], name="input_gt_boxes", dtype=tf.float32)
            # Normalize coordinates
            gt_boxes = KL.Lambda(lambda x: modellib.norm_boxes_graph(x, K.shape(input_image)[1:3]))(input_gt_boxes)

            # 3. GT Masks (zero padded)
            # [batch, height, width, MAX_GT_INSTANCES]
            if config.USE_MINI_MASK:
                input_gt_masks = KL.Input(
                    shape=[config.MINI_MASK_SHAPE[0],
                           config.MINI_MASK_SHAPE[1], None],
                    name="input_gt_masks", dtype=bool)
            else:
                input_gt_masks = KL.Input(
                    shape=[config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1], None],
                    name="input_gt_masks", dtype=bool)

            # Class ID mask to mark class IDs supported by the dataset the image came from.
            active_class_ids = KL.Lambda(
                lambda x: modellib.parse_image_meta_graph(x)["active_class_ids"]
            )(input_image_meta)

        elif mode == "inference":
            pass

        # Build the architecture of the teacher model
        # Backbone
        _, C2, C3, C4, C5 = modellib.resnet_graph(input_image, config.TEACHER_BACKBONE, stage5=True, train_bn=False)
        # Top-down Layers
        P5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c5p5')(C5)
        P4 = KL.Add(name="fpn_p4add")([
            KL.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5),
            KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c4p4')(C4)])
        P3 = KL.Add(name="fpn_p3add")([
            KL.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4),
            KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c3p3')(C3)])
        P2 = KL.Add(name="fpn_p2add")([
            KL.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3),
            KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c2p2')(C2)])
        # Attach 3x3 conv to all P layers to get the final feature maps.
        P2 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p2")(P2)  # N x 256 x 256 x 256
        P3 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p3")(P3)  # N x 128 x 128 x 256
        P4 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p4")(P4)  # N x 64 x 64 x 256
        P5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p5")(P5)  # N x 32 x 32 x 256

        # Note that P6 is used in RPN, but not in the classifier heads.
        t_mrcnn_feature_maps = [P2, P3, P4, P5]

        # Build the architecture of the student model
        s_prefix = 's_'
        # Backbone
        _, S2, S3, S4, S5 = s_resnet_graph(input_image, config.STUDENT_BACKBONE, prefix=s_prefix, train_bn=None)
        # Top-down Layers
        Q5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name=s_prefix + 'fpn_s5q5')(S5)
        Q4 = KL.Add(name=s_prefix + "fpn_q4add")([
            KL.UpSampling2D(size=(2, 2), name=s_prefix + "fpn_q5upsampled")(Q5),
            KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name=s_prefix + 'fpn_s4q4')(S4)])
        Q3 = KL.Add(name=s_prefix + "fpn_q3add")([
            KL.UpSampling2D(size=(2, 2), name=s_prefix + "fpn_q4upsampled")(Q4),
            KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name=s_prefix + 'fpn_s3q3')(S3)])
        Q2 = KL.Add(name=s_prefix + "fpn_q2add")([
            KL.UpSampling2D(size=(2, 2), name=s_prefix + "fpn_q3upsampled")(Q3),
            KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name=s_prefix + 'fpn_s2q2')(S2)])
        # Attach 3x3 conv to all Q layers to get the final feature maps.
        Q2 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name=s_prefix + "fpn_q2")(
            Q2)  # N x 256 x 256 x 256
        Q3 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name=s_prefix + "fpn_q3")(
            Q3)  # N x 128 x 128 x 256
        Q4 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name=s_prefix + "fpn_q4")(
            Q4)  # N x 64 x 64 x 256
        Q5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name=s_prefix + "fpn_q5")(
            Q5)  # N x 32 x 32 x 256
        # Q6 is used for the 5th anchor scale in RPN. Generated by
        # subsampling from P5 with stride of 2.
        Q6 = KL.MaxPooling2D(pool_size=(1, 1), strides=2, name=s_prefix + "fpn_q6")(Q5)

        # Note that P6 is used in RPN, but not in the classifier heads.
        s_rpn_feature_maps = [Q2, Q3, Q4, Q5, Q6]
        s_mrcnn_feature_maps = [Q2, Q3, Q4, Q5]

        # RPN Model
        s_rpn = s_build_rpn_model(config.RPN_ANCHOR_STRIDE,
                                  len(config.RPN_ANCHOR_RATIOS), config.TOP_DOWN_PYRAMID_SIZE, prefix=s_prefix)
        # Loop through pyramid layers
        s_layer_outputs = []  # list of lists
        for p in s_rpn_feature_maps:
            s_layer_outputs.append(s_rpn([p]))
        # Concatenate layer outputs
        # Convert from list of lists of level outputs to list of lists of outputs across levels.
        # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
        s_output_names = ["s_rpn_class_logits", "s_rpn_class", "s_rpn_bbox"]
        s_outputs = list(zip(*s_layer_outputs))
        s_outputs = [KL.Concatenate(axis=1, name=n)(list(o))
                     for o, n in zip(s_outputs, s_output_names)]

        s_rpn_class_logits, s_rpn_class, s_rpn_bbox = s_outputs

        # Proposals
        anchors = self.get_anchors(config.IMAGE_SHAPE)
        # Duplicate across the batch dimension because Keras requires it
        # TODO: can this be optimized to avoid duplicating the anchors?
        anchors = np.broadcast_to(anchors, (config.BATCH_SIZE,) + anchors.shape)
        # A hack to get around Keras's bad support for constants
        anchors = KL.Lambda(lambda x: tf.Variable(anchors), name="anchors")(input_image)

        # Generate proposals
        # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
        # and zero padded.
        proposal_count = config.POST_NMS_ROIS_TRAINING if mode == "training" \
            else config.POST_NMS_ROIS_INFERENCE
        s_rpn_rois = s_ProposalLayer(
            proposal_count=proposal_count,
            nms_threshold=config.RPN_NMS_THRESHOLD,
            name="s_ROI",
            config=config)([s_rpn_class, s_rpn_bbox, anchors])

        # Generate detection targets
        # Subsamples proposals and generates target outputs for training
        # Note that proposal class IDs, gt_boxes, and gt_masks are zero
        # padded. Equally, returned rois and targets are zero padded.
        s_rois, target_class_ids, target_bbox, target_mask = \
            modellib.DetectionTargetLayer(config, name="proposal_targets")([
                s_rpn_rois, input_gt_class_ids, gt_boxes, input_gt_masks])
        # s_rois: [batch, TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized coordinates

        # Add a transformer layer to the feature maps from the student model before comparison with the teacher model
        transformer = build_transformer_layer(config.TOP_DOWN_PYRAMID_SIZE)
        s_transformed_feature_maps = [transformer([p]) for p in s_mrcnn_feature_maps]

        # Losses
        rpn_class_loss = KL.Lambda(lambda x: modellib.rpn_class_loss_graph(*x), name="rpn_class_loss")(
            [input_rpn_match, s_rpn_class_logits])
        rpn_bbox_loss = KL.Lambda(lambda x: modellib.rpn_bbox_loss_graph(config, *x), name="rpn_bbox_loss")(
            [input_rpn_bbox, input_rpn_match, s_rpn_bbox])
        rpn_mimic_loss = KL.Lambda(lambda x: rpn_mimic_loss_graph(config, *x), name="rpn_mimic_loss")(
            [s_transformed_feature_maps[0], s_transformed_feature_maps[1],
             s_transformed_feature_maps[2], s_transformed_feature_maps[3],
             t_mrcnn_feature_maps[0], t_mrcnn_feature_maps[1],
             t_mrcnn_feature_maps[2], t_mrcnn_feature_maps[3],
             s_rois, input_image_meta])

        # Model
        inputs = [input_image, input_image_meta, input_rpn_match, input_rpn_bbox,
                  input_gt_class_ids, input_gt_boxes, input_gt_masks]
        outputs = [s_rpn_class_logits, s_rpn_class, s_rpn_bbox,
                   s_rpn_rois, rpn_class_loss, rpn_bbox_loss, rpn_mimic_loss]
        model = KM.Model(inputs, outputs, name='mimic_mask_rcnn')

        # Add multi-GPU support.
        if config.GPU_COUNT > 1:
            from mrcnn.parallel_model import ParallelModel
            model = ParallelModel(model, config.GPU_COUNT)

        return model
Пример #4
0
    def build(self, mode, config):
        """Build Mask R-CNN architecture.
            input_shape: The shape of the input image.
            mode: Either "training" or "inference". The inputs and
                outputs of the model differ accordingly.
        """
        assert mode in ['training', 'inference']

        # Image size must be dividable by 2 multiple times
        h, w = config.IMAGE_SHAPE[:2]
        if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6):
            raise Exception("Image size must be dividable by 2 at least 6 times "
                            "to avoid fractions when downscaling and upscaling."
                            "For example, use 256, 320, 384, 448, 512, ... etc. ")

        # Inputs
        input_image = KL.Input(
            shape=config.IMAGE_SHAPE.tolist(), name="input_image")
        # CHANGE: add target input
        if not config.NUM_TARGETS:
            config.NUM_TARGETS = 1
        input_target = KL.Input(
            shape=[config.NUM_TARGETS] + config.TARGET_SHAPE.tolist(), name="input_target")
        input_image_meta = KL.Input(shape=[config.IMAGE_META_SIZE],
                                    name="input_image_meta")
        if mode == "training":
            # RPN GT
            input_rpn_match = KL.Input(
                shape=[None, 1], name="input_rpn_match", dtype=tf.int32)
            input_rpn_bbox = KL.Input(
                shape=[None, 4], name="input_rpn_bbox", dtype=tf.float32)

            # Detection GT (class IDs, bounding boxes, and masks)
            # 1. GT Class IDs (zero padded)
            input_gt_class_ids = KL.Input(
                shape=[None], name="input_gt_class_ids", dtype=tf.int32)
            # 2. GT Boxes in pixels (zero padded)
            # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates
            input_gt_boxes = KL.Input(
                shape=[None, 4], name="input_gt_boxes", dtype=tf.float32)
            # Normalize coordinates
            gt_boxes = KL.Lambda(lambda x: modellib.norm_boxes_graph(
                x, K.shape(input_image)[1:3]))(input_gt_boxes)
            # 3. GT Masks (zero padded)
            # [batch, height, width, MAX_GT_INSTANCES]
            if config.USE_MINI_MASK:
                input_gt_masks = KL.Input(
                    shape=[config.MINI_MASK_SHAPE[0],
                           config.MINI_MASK_SHAPE[1], None],
                    name="input_gt_masks", dtype=bool)
            else:
                input_gt_masks = KL.Input(
                    shape=[config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1], None],
                    name="input_gt_masks", dtype=bool)
        elif mode == "inference":
            # Anchors in normalized coordinates
            input_anchors = KL.Input(shape=[None, 4], name="input_anchors")

        # Build the shared convolutional layers.
        # CHANGE: Use weightshared FPN model for image and target
        # Create FPN Model
        resnet = build_resnet_model(self.config)
        fpn = build_fpn_model(feature_maps=self.config.FPN_FEATUREMAPS)
        # Create Image FP
        _, IC2, IC3, IC4, IC5 = resnet(input_image)
        IP2, IP3, IP4, IP5, IP6 = fpn([IC2, IC3, IC4, IC5])
        # Create Target FR
        input_targets = [KL.Lambda(lambda x: x[:,idx,...])(input_target) for idx in range(input_target.shape[1])]
        for k, one_target in enumerate(input_targets):
            _, TC2, TC3, TC4, TC5 = resnet(one_target)
            out = fpn([TC2, TC3, TC4, TC5])
            if k == 0:
                target_pyramid = out
            else:
                target_pyramid = [KL.Add(name="target_adding_{}_{}".format(k, i))(
                    [target_pyramid[i], out[i]]) for i in range(len(out))]
                
        TP2, TP3, TP4, TP5, TP6 = [KL.Lambda(lambda x: x / config.NUM_TARGETS)(
            target_pyramid[i]) for i in range(len(target_pyramid))]
#        one_target = KL.Lambda(lambda x: x[:,0,...])(input_target)
#        one_target = input_target[:,0,...]
#         _, TC2, TC3, TC4, TC5 = resnet(one_target)
#         TP2, TP3, TP4, TP5, TP6 = fpn([TC2, TC3, TC4, TC5])
    
        
        # CHANGE: add siamese distance copmputation
        # Combine FPs using L1 distance
        P2 = l1_distance_graph(IP2, TP2, feature_maps = 3*self.config.FPN_FEATUREMAPS//2, name='P2')
        P3 = l1_distance_graph(IP3, TP3, feature_maps = 3*self.config.FPN_FEATUREMAPS//2, name='P3')
        P4 = l1_distance_graph(IP4, TP4, feature_maps = 3*self.config.FPN_FEATUREMAPS//2, name='P4')
        P5 = l1_distance_graph(IP5, TP5, feature_maps = 3*self.config.FPN_FEATUREMAPS//2, name='P5')
        P6 = l1_distance_graph(IP6, TP6, feature_maps = 3*self.config.FPN_FEATUREMAPS//2, name='P6')
        

        # Note that P6 is used in RPN, but not in the classifier heads.
        rpn_feature_maps = [P2, P3, P4, P5, P6]
        mrcnn_feature_maps = [P2, P3, P4, P5]

        # Anchors
        if mode == "training":
            anchors = self.get_anchors(config.IMAGE_SHAPE)
            # Duplicate across the batch dimension because Keras requires it
            # TODO: can this be optimized to avoid duplicating the anchors?
            anchors = np.broadcast_to(anchors, (config.BATCH_SIZE,) + anchors.shape)
            # A hack to get around Keras's bad support for constants
            anchors = KL.Lambda(lambda x: tf.Variable(anchors), name="anchors")(input_image)
        else:
            anchors = input_anchors

        # RPN Model
        # CHANGE: Set number of filters to [3*self.config.FPN_FEATUREMAPS//2]
        rpn = modellib.build_rpn_model(config.RPN_ANCHOR_STRIDE,
                              len(config.RPN_ANCHOR_RATIOS), 3*self.config.FPN_FEATUREMAPS//2)
        # Loop through pyramid layers
        layer_outputs = []  # list of lists
        for p in rpn_feature_maps:
            layer_outputs.append(rpn([p]))
        # Concatenate layer outputs
        # Convert from list of lists of level outputs to list of lists
        # of outputs across levels.
        # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
        output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"]
        outputs = list(zip(*layer_outputs))
        outputs = [KL.Concatenate(axis=1, name=n)(list(o))
                   for o, n in zip(outputs, output_names)]

        rpn_class_logits, rpn_class, rpn_bbox = outputs

        # Generate proposals
        # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
        # and zero padded.
        proposal_count = config.POST_NMS_ROIS_TRAINING if mode == "training"\
            else config.POST_NMS_ROIS_INFERENCE
        rpn_rois = modellib.ProposalLayer(
            proposal_count=proposal_count,
            nms_threshold=config.RPN_NMS_THRESHOLD,
            name="ROI",
            config=config)([rpn_class, rpn_bbox, anchors])

        if mode == "training":
            # Class ID mask to mark class IDs supported by the dataset the image
            # came from.
            active_class_ids = KL.Lambda(
                lambda x: modellib.parse_image_meta_graph(x)["active_class_ids"]
                )(input_image_meta)

            if not config.USE_RPN_ROIS:
                # Ignore predicted ROIs and use ROIs provided as an input.
                input_rois = KL.Input(shape=[config.POST_NMS_ROIS_TRAINING, 4],
                                      name="input_roi", dtype=np.int32)
                # Normalize coordinates
                target_rois = KL.Lambda(lambda x: modellig.norm_boxes_graph(
                    x, K.shape(input_image)[1:3]))(input_rois)
            else:
                target_rois = rpn_rois

            # Generate detection targets
            # Subsamples proposals and generates target outputs for training
            # Note that proposal class IDs, gt_boxes, and gt_masks are zero
            # padded. Equally, returned rois and targets are zero padded.
            rois, target_class_ids, target_bbox, target_mask =\
                modellib.DetectionTargetLayer(config, name="proposal_targets")([
                    target_rois, input_gt_class_ids, gt_boxes, input_gt_masks])

            # Network Heads
            # TODO: verify that this handles zero padded ROIs
            # CHANGE: reduce number of classes to 2
            # CHANGE: replaced with custom 2 class function
            mrcnn_class_logits, mrcnn_class, mrcnn_bbox =\
                fpn_classifier_graph(rois, mrcnn_feature_maps, input_image_meta,
                                     config.POOL_SIZE, num_classes=2,
                                     train_bn=config.TRAIN_BN,
                                     fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)
            # CHANGE: reduce number of classes to 2
            # CHANGE: replaced with custom 2 class function
            if config.MODEL == 'mrcnn':
                mrcnn_mask = fpn_mask_graph(rois, mrcnn_feature_maps,
                                                  input_image_meta,
                                                  config.MASK_POOL_SIZE,
                                                  num_classes=2,
                                                  train_bn=config.TRAIN_BN)

            # TODO: clean up (use tf.identify if necessary)
            output_rois = KL.Lambda(lambda x: x * 1, name="output_rois")(rois)

            # Losses
            rpn_class_loss = KL.Lambda(lambda x: modellib.rpn_class_loss_graph(*x), name="rpn_class_loss")(
                [input_rpn_match, rpn_class_logits])
            rpn_bbox_loss = KL.Lambda(lambda x: modellib.rpn_bbox_loss_graph(config, *x), name="rpn_bbox_loss")(
                [input_rpn_bbox, input_rpn_match, rpn_bbox])
            # CHANGE: use custom class loss without using active_class_ids
            class_loss = KL.Lambda(lambda x: mrcnn_class_loss_graph(*x), name="mrcnn_class_loss")(
                [target_class_ids, mrcnn_class_logits, active_class_ids])
            bbox_loss = KL.Lambda(lambda x: modellib.mrcnn_bbox_loss_graph(*x), name="mrcnn_bbox_loss")(
                [target_bbox, target_class_ids, mrcnn_bbox])
            if config.MODEL == 'mrcnn':
                mask_loss = KL.Lambda(lambda x: modellib.mrcnn_mask_loss_graph(*x), name="mrcnn_mask_loss")(
                    [target_mask, target_class_ids, mrcnn_mask])

            # Model
            # CHANGE: Added target to inputs
            inputs = [input_image, input_image_meta, input_target,
                      input_rpn_match, input_rpn_bbox, input_gt_class_ids, input_gt_boxes, input_gt_masks]
            if not config.USE_RPN_ROIS:
                inputs.append(input_rois)
            if config.MODEL == 'mrcnn':
                outputs = [rpn_class_logits, rpn_class, rpn_bbox,
                           mrcnn_class_logits, mrcnn_class, mrcnn_bbox, mrcnn_mask,
                           rpn_rois, output_rois,
                           rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss]
            elif config.MODEL =='frcnn':
                outputs = [rpn_class_logits, rpn_class, rpn_bbox,
                           mrcnn_class_logits, mrcnn_class, mrcnn_bbox,
                           rpn_rois, output_rois,
                           rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss]
            model = KM.Model(inputs, outputs, name='mask_rcnn')
        else:
            # Network Heads
            # Proposal classifier and BBox regressor heads
            # CHANGE: reduce number of classes to 2
            # CHANGE: replaced with custom 2 class function
            mrcnn_class_logits, mrcnn_class, mrcnn_bbox =\
                fpn_classifier_graph(rpn_rois, mrcnn_feature_maps, input_image_meta,
                                     config.POOL_SIZE, num_classes=2,
                                     train_bn=config.TRAIN_BN, 
                                     fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)

            # Detections
            # output is [batch, num_detections, (y1, x1, y2, x2, class_id, score)] in 
            # normalized coordinates
            detections = modellib.DetectionLayer(config, name="mrcnn_detection")(
                [rpn_rois, mrcnn_class, mrcnn_bbox, input_image_meta])

            # Create masks for detections
            detection_boxes = KL.Lambda(lambda x: x[..., :4])(detections)
            # CHANGE: reduce number of classes to 2
            # CHANGE: replaced with custom 2 class function
            if config.MODEL == 'mrcnn':
                mrcnn_mask = fpn_mask_graph(detection_boxes, mrcnn_feature_maps,
                                                  input_image_meta,
                                                  config.MASK_POOL_SIZE,
                                                  num_classes=2,
                                                  train_bn=config.TRAIN_BN)
            
            # CHANGE: Added target to the input
            inputs = [input_image, input_image_meta, input_target, input_anchors]
            if config.MODEL == 'mrcnn':
                outputs = [detections, mrcnn_class, mrcnn_bbox,
                           mrcnn_mask, rpn_rois, rpn_class, rpn_bbox]
            elif config.MODEL =='frcnn':
                outputs = [detections, mrcnn_class, mrcnn_bbox,
                           rpn_rois, rpn_class, rpn_bbox]
            model = KM.Model(inputs, outputs, name='mask_rcnn')
            

        # Add multi-GPU support.
        if config.GPU_COUNT > 1:
            from mrcnn.parallel_model import ParallelModel
            model = ParallelModel(model, config.GPU_COUNT)

        return model
Пример #5
0
def rpn_mimic_loss_graph(config, Q2, Q3, Q4, Q5, P2, P3, P4, P5, boxes, image_meta):
    student_maps = [Q2, Q3, Q4, Q5]
    teacher_maps = [P2, P3, P4, P5]
    num_boxes_batch = config.BATCH_SIZE * config.TRAIN_ROIS_PER_IMAGE

    # Assign each ROI to a level in the pyramid based on the ROI area.
    y1, x1, y2, x2 = tf.split(boxes, 4, axis=2)  # [BATCH_SIZE, TRAIN_ROIS_PER_IMAGE, 1]
    h = y2 - y1
    w = x2 - x1
    # Use shape of first image. Images in a batch must have the same size.
    image_shape = modellib.parse_image_meta_graph(image_meta)['image_shape'][0]
    # Equation 1 in the Feature Pyramid Networks paper. Account for
    # the fact that our coordinates are normalized here.
    # e.g. a 224x224 ROI (in pixels) maps to P4
    image_area = tf.cast(image_shape[0] * image_shape[1], tf.float32)
    roi_level = modellib.log2_graph(tf.sqrt(h * w) / (224.0 / tf.sqrt(image_area)))
    roi_level = tf.minimum(5, tf.maximum(2, 4 + tf.cast(tf.round(roi_level), tf.int32)))  # 4 + log2(roi_h * roi_w)
    roi_level = tf.squeeze(roi_level, 2)  # [BATCH_SIZE, TRAIN_ROIS_PER_IMAGE]

    # init the loss for later accumulation of every pyramid level
    loss = 0.

    for i, level in enumerate(range(2, 6)):
        ix = tf.where(tf.equal(roi_level, level))  # [BATCH_SIZE, TRAIN_ROIS_PER_IMAGE]
        level_boxes = tf.gather_nd(boxes, ix)  # [num_boxes_level, 4]

        # Box indices for crop_and_resize (specify which image the bbox refers to)
        box_indices = tf.cast(ix[:, 0], tf.int32)  # [num_boxes_level]

        P = num_boxes_batch - tf.shape(level_boxes)[0]
        level_boxes = tf.pad(level_boxes, [(0, P), (0, 0)])  # padded with zero
        box_indices = tf.pad(box_indices, [(0, P)])  # padded with zero

        level_y1, level_x1, level_y2, level_x2 = tf.split(level_boxes, 4, axis=-1)
        map_h = image_shape[0] / config.BACKBONE_STRIDES[i]
        map_w = image_shape[1] / config.BACKBONE_STRIDES[i]

        # calculates the coordinate of bounding boxes at the pyramid level
        level_y1_p = tf.cast(tf.squeeze(tf.floor(level_y1 * map_h), axis=-1), tf.int32)  # [num_boxes_batch]
        level_x1_p = tf.cast(tf.squeeze(tf.floor(level_x1 * map_w), axis=-1), tf.int32)
        level_y2_p = tf.cast(tf.squeeze(tf.ceil(level_y2 * map_h), axis=-1), tf.int32)
        level_x2_p = tf.cast(tf.squeeze(tf.ceil(level_x2 * map_w), axis=-1), tf.int32)
        level_h_p = level_y2_p - level_y1_p
        level_w_p = level_x2_p - level_x1_p

        # iterate over bounding boxes and accumulate the l2-distance loss
        def cond(idx, _):
            # return tf.less(idx, level_boxes.shape[0])
            return tf.less(idx, num_boxes_batch)

        def body(idx, loss):
            s_cropped = tf.slice(student_maps[i], [box_indices[idx], level_y1_p[idx], level_x1_p[idx], 0],
                                 [1, level_h_p[idx], level_w_p[idx], -1])
            t_cropped = tf.slice(teacher_maps[i], [box_indices[idx], level_y1_p[idx], level_x1_p[idx], 0],
                                 [1, level_h_p[idx], level_w_p[idx], -1])
            loss += tf.reduce_mean(tf.square(s_cropped - t_cropped))
            idx = tf.add(idx, 1)
            return [idx, loss]

        loss_temp = tf.while_loop(cond, body, [0, 0.])
        loss += loss_temp[1]

    loss = loss / (2.0 * num_boxes_batch)

    return loss