def call(self, inputs):
        rois = inputs[0]
        mrcnn_class = inputs[1]
        mrcnn_bbox = inputs[2]
        image_meta = inputs[3]

        # Get windows of images in normalized coordinates. Windows are the area
        # in the image that excludes the padding.
        # Use the shape of the first image in the batch to normalize the window
        # because we know that all images get resized to the same size.
        m = utils.parse_image_meta_graph(image_meta)
        image_shape = m['image_shape'][0]
        window = utils.norm_boxes_graph(m['window'], image_shape[:2])

        # Run detection refinement graph on each item in the batch
        detections_batch = utils.batch_slice(
            [rois, mrcnn_class, mrcnn_bbox, window],
            lambda x, y, w, z: refine_detections_graph(
                x, y, w, z,
                self.bbox_std_dev,
                self.detection_min_confidence,
                self.detection_max_instance,
                self.detection_nms_threshold),
            self.count_image_per_gpu)

        # Reshape output
        # [batch, num_detections, (y1, x1, y2, x2, class_id, class_score)] in
        # normalized coordinates
        return tf.reshape(
            detections_batch,
            [self.size_batch, self.detection_max_instance, 6])
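This snippet and the next rely on `utils.batch_slice` to run `refine_detections_graph` on one image at a time, because the refinement ops do not handle a batch dimension. A minimal sketch of what that helper is assumed to do, modeled on the usual Mask R-CNN utility (names are illustrative):

import tensorflow as tf

def batch_slice(inputs, graph_fn, batch_size):
    """Split each input along the batch dimension, apply graph_fn to every
    slice, then stack the per-item results back into batched tensors."""
    if not isinstance(inputs, list):
        inputs = [inputs]
    outputs = []
    for i in range(batch_size):
        inputs_slice = [x[i] for x in inputs]
        output_slice = graph_fn(*inputs_slice)
        if not isinstance(output_slice, (tuple, list)):
            output_slice = [output_slice]
        outputs.append(output_slice)
    # Regroup from per-item lists of outputs to per-output lists of items
    outputs = list(zip(*outputs))
    result = [tf.stack(o, axis=0) for o in outputs]
    if len(result) == 1:
        result = result[0]
    return result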
Example #2
    def call(self, inputs):
        rois = inputs[0]
        mrcnn_class = inputs[1]
        mrcnn_bbox = inputs[2]
        image_meta = inputs[3]

        # Get windows of images in normalized coordinates. Windows are the area
        # in the image that excludes the padding.
        # Use the shape of the first image in the batch to normalize the window
        # because we know that all images get resized to the same size.
        m = parse_image_meta_graph(image_meta)
        image_shape = m['image_shape'][0]
        window = norm_boxes_graph(m['window'], image_shape[:2])

        # Run detection refinement graph on each item in the batch
        detections_batch = utils.batch_slice(
            [rois, mrcnn_class, mrcnn_bbox, window],
            lambda x, y, w, z: refine_detections_graph(x, y, w, z, self.config),
            self.config.IMAGES_PER_GPU)

        # Reshape output
        # [batch, num_detections, (y1, x1, y2, x2, class_id, class_score)] in
        # normalized coordinates
        return tf.reshape(
            detections_batch,
            [self.config.BATCH_SIZE, self.config.DETECTION_MAX_INSTANCES, 6])
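The `parse_image_meta_graph` and `norm_boxes_graph` helpers used above are not shown in these snippets; a hedged sketch of the conventional Mask R-CNN versions (the field layout and names are assumptions based on that convention):

import tensorflow as tf

def parse_image_meta_graph(meta):
    """Split a flat image-meta tensor [batch, meta_size] into named parts."""
    return {
        "image_id": meta[:, 0],
        "original_image_shape": meta[:, 1:4],
        "image_shape": meta[:, 4:7],
        "window": meta[:, 7:11],          # (y1, x1, y2, x2) in image pixels
        "scale": meta[:, 11],
        "active_class_ids": meta[:, 12:],
    }

def norm_boxes_graph(boxes, shape):
    """Convert boxes from pixel coordinates to normalized [0, 1] coordinates."""
    h, w = tf.split(tf.cast(shape, tf.float32), 2)
    scale = tf.concat([h, w, h, w], axis=-1) - tf.constant(1.0)
    shift = tf.constant([0., 0., 1., 1.])
    return tf.divide(boxes - shift, scale)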
Example #3
    def call(self, inputs):
        # Crop boxes [batch, num_boxes, (y1, x1, y2, x2)] in normalized coords
        boxes = inputs[0]
        # Image meta with details about the image
        image_meta = inputs[1]
        # Feature maps from the pyramid levels, each [batch, height, width, channels]
        feature_maps = inputs[2:]

        # Assign each ROI to a pyramid level based on its area
        y1, x1, y2, x2 = tf.split(boxes, 4, axis=2)
        h = y2 - y1
        w = x2 - x1
        # Use the shape of the first image; all images in a batch have the same size
        image_shape = parse_image_meta_graph(image_meta)['image_shape'][0]
        image_area = tf.cast(image_shape[0] * image_shape[1], tf.float32)
        # Equation 1 of the FPN paper, adjusted for normalized coordinates
        roi_level = log2_graph(tf.sqrt(h * w) / (224.0 / tf.sqrt(image_area)))
        roi_level = tf.minimum(5, tf.maximum(2, 4 + tf.cast(tf.round(roi_level), tf.int32)))
        roi_level = tf.squeeze(roi_level, 2)

        # Loop through levels P2..P5 and apply ROI pooling to each
        pooled = []
        box_to_level = []
        for i, level in enumerate(range(2, 6)):
            ix = tf.where(tf.equal(roi_level, level))
            level_boxes = tf.gather_nd(boxes, ix)

            # Box indices for crop_and_resize
            box_indices = tf.cast(ix[:, 0], tf.int32)

            # Keep track of which box is mapped to which level
            box_to_level.append(ix)

            # Stop gradient propagation to ROI proposals
            level_boxes = tf.stop_gradient(level_boxes)
            box_indices = tf.stop_gradient(box_indices)

            # Result: [batch * num_boxes, pool_height, pool_width, channels]
            pooled.append(
                tf.image.crop_and_resize(
                    feature_maps[i],
                    level_boxes, box_indices, self.pool_shape, method="bilinear"))

        # Pack pooled features into one tensor
        pooled = tf.concat(pooled, axis=0)

        # Pack the box-to-level mapping into one array and add a column
        # with the order of the pooled boxes
        box_to_level = tf.concat(box_to_level, axis=0)
        box_range = tf.expand_dims(tf.range(tf.shape(box_to_level)[0]), 1)
        box_to_level = tf.concat([tf.cast(box_to_level, tf.int32), box_range], axis=1)

        # Rearrange pooled features to match the order of the original boxes:
        # sort by batch index, then box index (merged into one sort key)
        sorting_tensor = box_to_level[:, 0] * 100000 + box_to_level[:, 1]
        ix = tf.nn.top_k(sorting_tensor, k=tf.shape(box_to_level)[0]).indices[::-1]
        pooled = tf.gather(pooled, ix)

        # Re-add the batch dimension
        shape = tf.concat([tf.shape(boxes)[:2], tf.shape(pooled)[1:]], axis=0)
        pooled = tf.reshape(pooled, shape)
        return pooled
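The `roi_level` computation above implements Equation 1 of the Feature Pyramid Networks paper, adapted to normalized box coordinates: k = 4 + log2(sqrt(w*h) / (224 / sqrt(image_area))), clamped to pyramid levels 2 through 5. A small worked sketch, with `log2_graph` assumed to be the usual base-2 log helper:

import tensorflow as tf

def log2_graph(x):
    """Base-2 logarithm; assumed implementation of the helper used above."""
    return tf.math.log(x) / tf.math.log(2.0)

# Illustrative numbers: a 224x224 ROI inside a 1024x1024 image,
# expressed in normalized coordinates (so h = w = 224 / 1024).
image_area = tf.constant(1024.0 * 1024.0)
h = tf.constant(224.0 / 1024.0)
w = tf.constant(224.0 / 1024.0)

roi_level = log2_graph(tf.sqrt(h * w) / (224.0 / tf.sqrt(image_area)))
roi_level = tf.minimum(5, tf.maximum(2, 4 + tf.cast(tf.round(roi_level), tf.int32)))
# roi_level == 4: a 224x224 ROI is pooled from P4, as in the FPN paper.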
Example #4
    def __call__(self, ipt):
        rois = ipt[0]
        mrcnn_class = ipt[1]
        mrcnn_bbox = ipt[2]
        image_meta = ipt[3]

        m = utils.parse_image_meta_graph(image_meta)
        image_shape = m['image_shape'][0]
        window = utils.norm_boxes(m['window'], image_shape[:2])

        detections_batch = utils.batch_slice(
            [rois, mrcnn_class, mrcnn_bbox, window],
            lambda w, x, y, z: layer.refine_detections(w, x, y, z),
            self.image_per_gpu)

        return tf.reshape(
            detections_batch,
            [self.image_per_gpu, self.detection_max_instances, 6])
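Note that this variant calls `utils.norm_boxes` rather than the graph version used in the other snippets; a sketch of that NumPy helper, assuming the conventional implementation:

import numpy as np

def norm_boxes(boxes, shape):
    """Convert boxes from pixel coordinates to normalized [0, 1] coordinates.
    boxes: [N, (y1, x1, y2, x2)] in pixels; shape: (height, width)."""
    h, w = shape
    scale = np.array([h - 1, w - 1, h - 1, w - 1])
    shift = np.array([0, 0, 1, 1])
    return np.divide((boxes - shift), scale).astype(np.float32)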
    def call(self, inputs):
        # Crop boxes [batch, num_boxes, (y1, x1, y2, x2)] in normalized coords
        boxes = inputs[0]

        # Image meta
        # Holds details about the image. See compose_image_meta()
        image_meta = inputs[1]

        # Feature Maps. List of feature maps from different level of the
        # feature pyramid. Each is [batch, height, width, channels]
        feature_maps = inputs[2:]

        # Assign each ROI to a level in the pyramid based on the ROI area.
        y1, x1, y2, x2 = tf.split(boxes, 4, axis=2)
        h = y2 - y1
        w = x2 - x1
        # Use shape of first image. Images in a batch must have the same size.
        image_shape = utils.parse_image_meta_graph(image_meta)['image_shape'][0]
        # Equation 1 in the Feature Pyramid Networks paper. Account for
        # the fact that our coordinates are normalized here.
        # e.g. a 224x224 ROI (in pixels) maps to P4
        image_area = tf.cast(image_shape[0] * image_shape[1], tf.float32)
        roi_level = self.log2_graph(tf.sqrt(h * w) / (224.0 / tf.sqrt(image_area)))
        roi_level = tf.minimum(5, tf.maximum(
            2, 4 + tf.cast(tf.round(roi_level), tf.int32)))
        roi_level = tf.squeeze(roi_level, 2)

        # Loop through levels and apply ROI pooling to each. P2 to P5.
        pooled = []
        box_to_level = []
        for i, level in enumerate(range(2, 6)):
            ix = tf.where(tf.equal(roi_level, level))
            level_boxes = tf.gather_nd(boxes, ix)

            # Box indices for crop_and_resize.
            box_indices = tf.cast(ix[:, 0], tf.int32)

            # Keep track of which box is mapped to which level
            box_to_level.append(ix)

            # Stop gradient propagation to ROI proposals
            level_boxes = tf.stop_gradient(level_boxes)
            box_indices = tf.stop_gradient(box_indices)

            # Crop and Resize
            # From Mask R-CNN paper: "We sample four regular locations, so
            # that we can evaluate either max or average pooling. In fact,
            # interpolating only a single value at each bin center (without
            # pooling) is nearly as effective."
            #
            # Here we use the simplified approach of a single value per bin,
            # which is how it's done in tf.image.crop_and_resize()
            # Result: [batch * num_boxes, pool_height, pool_width, channels]
            pooled.append(tf.image.crop_and_resize(
                feature_maps[i], level_boxes, box_indices, self.pool_shape,
                method="bilinear"))

        # Pack pooled features into one tensor
        pooled = tf.concat(pooled, axis=0)

        # Pack box_to_level mapping into one array and add another
        # column representing the order of pooled boxes
        box_to_level = tf.concat(box_to_level, axis=0)
        box_range = tf.expand_dims(tf.range(tf.shape(box_to_level)[0]), 1)
        box_to_level = tf.concat([tf.cast(box_to_level, tf.int32), box_range],
                                 axis=1)

        # Rearrange pooled features to match the order of the original boxes
        # Sort box_to_level by batch then box index
        # TF doesn't have a way to sort by two columns, so merge them and sort.
        sorting_tensor = box_to_level[:, 0] * 100000 + box_to_level[:, 1]
        ix = tf.nn.top_k(sorting_tensor, k=tf.shape(
            box_to_level)[0]).indices[::-1]
        ix = tf.gather(box_to_level[:, 2], ix)
        pooled = tf.gather(pooled, ix)

        # Re-add the batch dimension
        shape = tf.concat([tf.shape(boxes)[:2], tf.shape(pooled)[1:]], axis=0)
        pooled = tf.reshape(pooled, shape)
        return pooled
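The sorting step above works around the lack of a two-column (lexicographic) sort in TensorFlow: the batch index and box index are merged into a single integer key, sorted descending with `tf.nn.top_k`, and the indices reversed to get ascending order. A tiny standalone sketch with made-up rows:

import tensorflow as tf

# Rows are (batch index, box index); the goal is ascending lexicographic order.
box_to_level = tf.constant([[1, 0], [0, 2], [0, 0], [1, 1]], dtype=tf.int32)
sorting_tensor = box_to_level[:, 0] * 100000 + box_to_level[:, 1]
ix = tf.nn.top_k(sorting_tensor, k=tf.shape(box_to_level)[0]).indices[::-1]
# ix == [2, 1, 0, 3], i.e. rows reordered as (0,0), (0,2), (1,0), (1,1)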
Example #6
    def build(self, mode, config):
        """Build Mask R-CNN architecture.
            input_shape: The shape of the input image.
            mode: Either "training" or "inference". The inputs and
                outputs of the model differ accordingly.
        """
        assert mode in ['training', 'inference']

        # Image size must be divisible by 2 multiple times
        h, w = config.IMAGE_SHAPE[:2]
        if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6):
            raise Exception(
                "Image size must be dividable by 2 at least 6 times "
                "to avoid fractions when downscaling and upscaling."
                "For example, use 256, 320, 384, 448, 512, ... etc. ")

        # Inputs
        input_image = Input(shape=[None, None, config.IMAGE_SHAPE[2]],
                            name="input_image")
        input_image_meta = Input(shape=[config.IMAGE_META_SIZE],
                                 name="input_image_meta")
        if mode == "training":
            # RPN GT
            input_rpn_match = Input(shape=[None, 1],
                                    name="input_rpn_match",
                                    dtype=tf.int32)
            input_rpn_bbox = Input(shape=[None, 4],
                                   name="input_rpn_bbox",
                                   dtype=tf.float32)

            # Detection GT (class IDs, bounding boxes, and masks)
            # 1. GT Class IDs (zero padded)
            input_gt_class_ids = Input(shape=[None],
                                       name="input_gt_class_ids",
                                       dtype=tf.int32)
            # 2. GT Boxes in pixels (zero padded)
            # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates
            input_gt_boxes = Input(shape=[None, 4],
                                   name="input_gt_boxes",
                                   dtype=tf.float32)
            # Normalize coordinates
            gt_boxes = Lambda(lambda x: norm_boxes_graph(
                x,
                K.shape(input_image)[1:3]))(input_gt_boxes)
            # 3. GT Masks (zero padded)
            # [batch, height, width, MAX_GT_INSTANCES]
            if config.USE_MINI_MASK:
                input_gt_masks = Input(shape=[
                    config.MINI_MASK_SHAPE[0], config.MINI_MASK_SHAPE[1], None
                ],
                                       name="input_gt_masks",
                                       dtype=bool)
            else:
                input_gt_masks = Input(
                    shape=[config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1], None],
                    name="input_gt_masks",
                    dtype=bool)
        elif mode == "inference":
            # Anchors in normalized coordinates
            input_anchors = Input(shape=[None, 4], name="input_anchors")

        # Build the shared convolutional layers.
        # Bottom-up Layers
        # Returns a list of the last layers of each stage, 5 in total.
        # The first (stage 1) is discarded; C2-C5 feed the FPN below.
        if callable(config.BACKBONE):
            _, C2, C3, C4, C5 = config.BACKBONE(input_image,
                                                stage5=True,
                                                train_bn=config.TRAIN_BN)
        else:
            _, C2, C3, C4, C5 = resnet_graph(input_image,
                                             config.BACKBONE,
                                             stage5=True,
                                             train_bn=config.TRAIN_BN)
        # Top-down Layers
        # TODO: add assert to verify feature map sizes match what's in config
        P5 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c5p5')(C5)
        P4 = add([
            UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5),
            Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c4p4')(C4)
        ])
        P3 = add([
            UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4),
            Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c3p3')(C3)
        ])
        P2 = add([
            UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3),
            Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c2p2')(C2)
        ])
        # Attach 3x3 conv to all P layers to get the final feature maps.
        P2 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3),
                    padding="SAME",
                    name="fpn_p2")(P2)
        P3 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3),
                    padding="SAME",
                    name="fpn_p3")(P3)
        P4 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3),
                    padding="SAME",
                    name="fpn_p4")(P4)
        P5 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3),
                    padding="SAME",
                    name="fpn_p5")(P5)
        # P6 is used for the 5th anchor scale in RPN. Generated by
        # subsampling from P5 with stride of 2.
        P6 = MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(P5)

        # Note that P6 is used in RPN, but not in the classifier heads.
        rpn_feature_maps = [P2, P3, P4, P5, P6]
        mrcnn_feature_maps = [P2, P3, P4, P5]

        # Anchors
        if mode == "training":
            anchors = self.get_anchors(config.IMAGE_SHAPE)
            # Duplicate across the batch dimension because Keras requires it
            # TODO: can this be optimized to avoid duplicating the anchors?
            anchors = np.broadcast_to(anchors,
                                      (config.BATCH_SIZE, ) + anchors.shape)
            # A hack to get around Keras's bad support for constants
            anchors = Lambda(lambda x: tf.Variable(anchors),
                             name="anchors")(input_image)
        else:
            anchors = input_anchors

        # RPN Model
        rpn = build_rpn_model(config.RPN_ANCHOR_STRIDE,
                              len(config.RPN_ANCHOR_RATIOS),
                              config.TOP_DOWN_PYRAMID_SIZE)
        # Loop through pyramid layers
        layer_outputs = []  # list of lists
        for p in rpn_feature_maps:
            layer_outputs.append(rpn([p]))
        # Concatenate layer outputs
        # Convert from list of lists of level outputs to list of lists
        # of outputs across levels.
        # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
        output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"]
        outputs = list(zip(*layer_outputs))
        outputs = [
            Concatenate(axis=1, name=n)(list(o))
            for o, n in zip(outputs, output_names)
        ]

        rpn_class_logits, rpn_class, rpn_bbox = outputs

        # Generate proposals
        # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
        # and zero padded.
        proposal_count = config.POST_NMS_ROIS_TRAINING if mode == "training" \
            else config.POST_NMS_ROIS_INFERENCE
        rpn_rois = ProposalLayer(proposal_count=proposal_count,
                                 nms_threshold=config.RPN_NMS_THRESHOLD,
                                 name="ROI",
                                 config=config)([rpn_class, rpn_bbox, anchors])

        if mode == "training":
            # Class ID mask to mark class IDs supported by the dataset the image
            # came from.
            active_class_ids = Lambda(lambda x: parse_image_meta_graph(x)[
                "active_class_ids"])(input_image_meta)

            if not config.USE_RPN_ROIS:
                # Ignore predicted ROIs and use ROIs provided as an input.
                input_rois = Input(shape=[config.POST_NMS_ROIS_TRAINING, 4],
                                   name="input_roi",
                                   dtype=np.int32)
                # Normalize coordinates
                target_rois = Lambda(lambda x: norm_boxes_graph(
                    x,
                    K.shape(input_image)[1:3]))(input_rois)
            else:
                target_rois = rpn_rois

            # Generate detection targets
            # Subsamples proposals and generates target outputs for training
            # Note that proposal class IDs, gt_boxes, and gt_masks are zero
            # padded. Equally, returned rois and targets are zero padded.
            rois, target_class_ids, target_bbox, target_mask = \
                DetectionTargetLayer(config, name="proposal_targets")([
                    target_rois, input_gt_class_ids, gt_boxes, input_gt_masks])

            # Network Heads
            # TODO: verify that this handles zero padded ROIs
            mrcnn_class_logits, mrcnn_class, mrcnn_bbox = \
                fpn_classifier_graph(rois, mrcnn_feature_maps, input_image_meta,
                                     config.POOL_SIZE, config.NUM_CLASSES,
                                     train_bn=config.TRAIN_BN,
                                     fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)

            mrcnn_mask = build_fpn_mask_graph(rois,
                                              mrcnn_feature_maps,
                                              input_image_meta,
                                              config.MASK_POOL_SIZE,
                                              config.NUM_CLASSES,
                                              train_bn=config.TRAIN_BN)

            # TODO: clean up (use tf.identity if necessary)
            output_rois = Lambda(lambda x: x * 1, name="output_rois")(rois)

            # Losses
            rpn_class_loss = Lambda(lambda x: rpn_class_loss_graph(*x),
                                    name="rpn_class_loss")(
                                        [input_rpn_match, rpn_class_logits])
            rpn_bbox_loss = Lambda(lambda x: rpn_bbox_loss_graph(config, *x),
                                   name="rpn_bbox_loss")([
                                       input_rpn_bbox, input_rpn_match,
                                       rpn_bbox
                                   ])
            class_loss = Lambda(lambda x: mrcnn_class_loss_graph(*x),
                                name="mrcnn_class_loss")([
                                    target_class_ids, mrcnn_class_logits,
                                    active_class_ids
                                ])
            bbox_loss = Lambda(lambda x: mrcnn_bbox_loss_graph(*x),
                               name="mrcnn_bbox_loss")(
                                   [target_bbox, target_class_ids, mrcnn_bbox])
            mask_loss = Lambda(lambda x: mrcnn_mask_loss_graph(*x),
                               name="mrcnn_mask_loss")(
                                   [target_mask, target_class_ids, mrcnn_mask])

            # Model
            inputs = [
                input_image, input_image_meta, input_rpn_match, input_rpn_bbox,
                input_gt_class_ids, input_gt_boxes, input_gt_masks
            ]
            if not config.USE_RPN_ROIS:
                inputs.append(input_rois)
            outputs = [
                rpn_class_logits, rpn_class, rpn_bbox, mrcnn_class_logits,
                mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois, output_rois,
                rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss
            ]
            model = Model(inputs, outputs, name='mask_rcnn')
        else:
            # Network Heads
            # Proposal classifier and BBox regressor heads
            mrcnn_class_logits, mrcnn_class, mrcnn_bbox = \
                fpn_classifier_graph(rpn_rois, mrcnn_feature_maps, input_image_meta,
                                     config.POOL_SIZE, config.NUM_CLASSES,
                                     train_bn=config.TRAIN_BN,
                                     fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)

            # Detections
            # output is [batch, num_detections, (y1, x1, y2, x2, class_id, score)] in
            # normalized coordinates
            detections = DetectionLayer(config, name="mrcnn_detection")(
                [rpn_rois, mrcnn_class, mrcnn_bbox, input_image_meta])

            # Create masks for detections
            detection_boxes = Lambda(lambda x: x[..., :4])(detections)
            mrcnn_mask = build_fpn_mask_graph(detection_boxes,
                                              mrcnn_feature_maps,
                                              input_image_meta,
                                              config.MASK_POOL_SIZE,
                                              config.NUM_CLASSES,
                                              train_bn=config.TRAIN_BN)

            model = Model([input_image, input_image_meta, input_anchors], [
                detections, mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois,
                rpn_class, rpn_bbox
            ],
                          name='mask_rcnn')

        # Add multi-GPU support.
        # if config.GPU_COUNT > 1:
        #     from mrcnn.parallel_model import ParallelModel
        #     model = ParallelModel(model, config.GPU_COUNT)

        return model
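The `input_image_meta` tensor above (and the `parse_image_meta_graph` calls in the earlier snippets) assume a flat meta vector packed per image; a hedged sketch of the packing side, following the usual compose_image_meta convention:

import numpy as np

def compose_image_meta(image_id, original_image_shape, image_shape,
                       window, scale, active_class_ids):
    """Pack image attributes into one 1D array of length IMAGE_META_SIZE."""
    return np.array(
        [image_id] +                    # 1 value
        list(original_image_shape) +    # 3 values: (H, W, C) before resizing
        list(image_shape) +             # 3 values: (H, W, C) after resizing
        list(window) +                  # 4 values: (y1, x1, y2, x2) in pixels
        [scale] +                       # 1 value: resize scale factor
        list(active_class_ids)          # NUM_CLASSES values
    )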
Example #7
    def model(self):

        h, w = self.image_shape[:2]
        if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6):
            raise Exception(
                "Image size must be dividable by 2 at least 6 times "
                "to avoid fractions when downscaling and upscaling."
                "For example, use 256, 320, 384, 448, 512, ... etc. ")
        if self.use_pretrained_model:

            c2, c3, c4, c5 = \
                self.end_points['resnet_v2_50/block1/unit_2/bottleneck_v2'], \
                self.end_points['resnet_v2_50/block2/unit_3/bottleneck_v2'], \
                self.end_points['resnet_v2_50/block3/unit_4/bottleneck_v2'], \
                self.end_points['resnet_v2_50/block4']

        else:

            if callable(self.backbone):
                _, c2, c3, c4, c5 = self.backbone(self.input_image,
                                                  stage5=self.stage5,
                                                  is_training=self.is_training)

            else:
                _, c2, c3, c4, c5 = self.resnet(self.input_image)

        p2, p3, p4, p5, p6 = self.fpn([c2, c3, c4, c5])

        rpn_feature_maps = [p2, p3, p4, p5, p6]
        mrcnn_feature_maps = [p2, p3, p4, p5]

        if self.mode == 'training':
            anchors = self.get_anchors(self.image_shape)
            anchors = np.broadcast_to(anchors,
                                      (self.batch_size, ) + anchors.shape)
            anchors = tf.constant(anchors)
        else:
            anchors = self.input_anchors

        layer_outputs = []
        for p in rpn_feature_maps:
            layer_outputs.append(self.rpn(p))

        output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"]
        outputs = list(zip(*layer_outputs))
        outputs = [
            tf.concat(list(o), name=n, axis=1)
            for o, n in zip(outputs, output_names)
        ]

        rpn_class_logits, rpn_class, rpn_bbox = outputs
        rpn_rois = self.proposal([rpn_class, rpn_bbox, anchors])

        if self.mode == 'training':
            active_class_ids = utils.parse_image_meta_graph(
                self.input_image_meta)['active_class_ids']

            if not self.use_rpn_rois:
                input_rois = tf.placeholder(
                    dtype=tf.int32,
                    shape=[None, self.post_nms_rois_training, 4],
                    name='input_rois')
                target_rois = utils.norm_boxes_graph(
                    input_rois,
                    tf.shape(self.input_image)[1:3])

            else:

                target_rois = rpn_rois

            rois, target_class_ids, target_bbox, target_mask = \
                self.targetDetection([target_rois, self.input_gt_class_ids,
                                      self.input_gt_boxes_normalized, self.input_gt_masks])

            pooled = self.pyramidRoiPooling([rois, self.input_image_meta] +
                                            mrcnn_feature_maps)
            pooled_mask = self.pyramidRoiPooling(
                [rois, self.input_image_meta] + mrcnn_feature_maps,
                pool_size=14)

            mrcnn_class_logits, mrcnn_class, mrcnn_bbox = self.fpnClassifier(
                pooled)
            mrcnn_mask = self.fpnMask(pooled_mask)

            output_rois = tf.identity(rois, name='output_rois')

            rpn_class_loss = layer.rpn_loss(self.input_rpn_match,
                                            rpn_class_logits)
            rpn_bbox_loss = layer.rpn_bbox_loss(self.input_rpn_boxes,
                                                self.input_rpn_match, rpn_bbox)
            class_loss = layer.mrcnn_class_loss(target_class_ids,
                                                mrcnn_class_logits,
                                                active_class_ids)
            bbox_loss = layer.mrcnn_bbox_loss(target_bbox, target_class_ids,
                                              mrcnn_bbox)
            mask_loss = layer.mrcnn_mask_loss(target_mask, target_class_ids,
                                              mrcnn_mask)

            tf.summary.scalar('rpn_class_loss', rpn_class_loss)
            tf.summary.scalar('rpn_bbox_loss', rpn_bbox_loss)
            tf.summary.scalar('mrcnn_class_loss', class_loss)
            tf.summary.scalar('mrcnn_bbox_loss', bbox_loss)
            tf.summary.scalar('mrcnn_mask_loss', mask_loss)

            outputs = [
                rpn_class_logits, rpn_class, rpn_bbox, mrcnn_class_logits,
                mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois, output_rois,
                rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss
            ]
        else:
            pooled = self.pyramidRoiPooling([rpn_rois, self.input_image_meta] +
                                            mrcnn_feature_maps)
            mrcnn_class_logits, mrcnn_class, mrcnn_bbox = self.fpnClassifier(
                pooled)

            detections = self.objDetection(
                [rpn_rois, mrcnn_class, mrcnn_bbox, self.input_image_meta])

            detections_bbox = detections[..., :4]

            pooled = self.pyramidRoiPooling(
                [detections_bbox, self.input_image_meta] + mrcnn_feature_maps,
                pool_size=14)

            mrcnn_mask = self.fpnMask(pooled)

            outputs = [
                detections, mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois,
                rpn_class, rpn_bbox
            ]

        return outputs
    def __call__(self, ipt, pool_size=None):

        if pool_size is not None:
            self.roi_size = pool_size

        boxes = ipt[0]
        image_meta = ipt[1]
        feature_maps = ipt[2:]

        y1, x1, y2, x2 = tf.split(boxes, [1, 1, 1, 1], axis=2)

        w = x2 - x1
        h = y2 - y1

        image_shape = utils.parse_image_meta_graph(image_meta)['image_shape'][0]
        image_area = tf.cast(image_shape[0] * image_shape[1], tf.float32)

        roi_level = ops.log2_graph(tf.sqrt(w * h) / (224.0 / tf.sqrt(image_area)))
        roi_level = tf.minimum(5, tf.maximum(
            2, 4 + tf.cast(tf.round(roi_level), tf.int32)))

        roi_level = tf.squeeze(roi_level, 2)

        pooled = []
        box_to_level = []

        for i, level in enumerate(range(2, 6)):
            ix = tf.where(tf.equal(roi_level, level))
            level_boxes = tf.gather_nd(boxes, ix)

            # Box indices for crop_and_resize.
            box_indices = tf.cast(ix[:, 0], tf.int32)

            # Keep track of which box is mapped to which level
            box_to_level.append(ix)

            # Stop gradient propagation to ROI proposals
            level_boxes = tf.stop_gradient(level_boxes)
            box_indices = tf.stop_gradient(box_indices)

            # Crop and Resize
            # From Mask R-CNN paper: "We sample four regular locations, so
            # that we can evaluate either max or average pooling. In fact,
            # interpolating only a single value at each bin center (without
            # pooling) is nearly as effective."
            #
            # Here we use the simplified approach of a single value per bin,
            # which is how it's done in tf.image.crop_and_resize()
            # Result: [batch * num_boxes, pool_height, pool_width, channels]
            pooled.append(tf.image.crop_and_resize(
                feature_maps[i], level_boxes, box_indices, [self.roi_size, self.roi_size],
                method="bilinear"))

        pooled = tf.concat(pooled, axis=0)

        box_to_level = tf.concat(box_to_level, axis=0)
        box_range = tf.expand_dims(tf.range(tf.shape(box_to_level)[0]), 1)
        box_to_level = tf.concat([tf.cast(box_to_level, tf.int32), box_range],
                                 axis=1)

        sorting_tensor = box_to_level[:, 0] * 100000 + box_to_level[:, 1]
        ix = tf.nn.top_k(sorting_tensor, k=tf.shape(
            box_to_level)[0]).indices[::-1]
        ix = tf.gather(box_to_level[:, 2], ix)
        pooled = tf.gather(pooled, ix)

        shape = tf.concat([tf.shape(boxes)[:2], tf.shape(pooled)[1:]], axis=0)
        pooled = tf.reshape(pooled, shape)
        return pooled
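All of the ROIAlign variants above ultimately reduce to `tf.image.crop_and_resize`, which samples a single bilinear value per output bin. A minimal standalone example of the call (shapes and values are illustrative):

import tensorflow as tf

feature_map = tf.random.normal([2, 32, 32, 256])      # [batch, height, width, channels]
boxes = tf.constant([[0.0, 0.0, 0.5, 0.5],
                     [0.25, 0.25, 1.0, 1.0]])         # normalized (y1, x1, y2, x2)
box_indices = tf.constant([0, 1], dtype=tf.int32)     # which batch item each box crops
pooled = tf.image.crop_and_resize(feature_map, boxes, box_indices,
                                  crop_size=[7, 7], method="bilinear")
# pooled: [num_boxes, 7, 7, 256], one bilinear sample per output bin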