Example #1
    def __init__(self):

        self.anchor_utils = AnchorUtils()
        self.bbox_utils = BboxUtil()
        self.image_utils = ImageUtils()

        # Log save path
        self.log_path = self.log_file_path(cfg.TRAIN.LOGS_PATH,
                                           cfg.TRAIN.DATA_SOURCE)
        # Model save path
        self.model_save_path = cfg.TRAIN.SAVE_MODEL_PATH

        # Training data
        self.train_data = CocoDataset(cfg.TRAIN.COCO_TRAIN_ANN_PATH,
                                      cfg.TRAIN.COCO_TRAIN_IMAGE_PATH)
        # Validation data
        self.val_data = CocoDataset(cfg.TRAIN.COCO_VAL_ANN_PATH,
                                    cfg.TRAIN.COCO_VAL_IMAGE_PATH)

        # Load the mask network model
        self.mask_model = MaskRCNN(train_flag=True)
        # Use the original author's data with 1 + 80 classes
        # self.mask_model.load_weights(cfg.TEST.COCO_MODEL_PATH, by_name=True)

        # Load the model pre-trained on MS COCO, skipping layers whose number of classes differs
        self.mask_model.load_weights(cfg.TRAIN.MODEL_PATH,
                                     by_name=True,
                                     exclude=[
                                         "mrcnn_class_logits", "mrcnn_bbox_fc",
                                         "mrcnn_bbox", "mrcnn_mask"
                                     ])

        self.epoch = 0
        pass
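
The log path set above comes from log_file_path (shown in full in Example #8 below), which simply stamps the data-source name with the current time. A minimal sketch of what the resulting path looks like; the directory and source names are assumptions standing in for cfg.TRAIN.LOGS_PATH and cfg.TRAIN.DATA_SOURCE:

import os
from datetime import datetime

log_dir = "logs"                  # assumed cfg.TRAIN.LOGS_PATH
data_source = "coco"              # assumed cfg.TRAIN.DATA_SOURCE
log_file_name = "{}_{:%Y%m%dT%H%M}".format(data_source.lower(), datetime.now())
print(os.path.join(log_dir, log_file_name))   # e.g. logs/coco_20240131T1542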
Example #2
    def __init__(self, batch_size, **kwargs):
        super(DetectionLayer, self).__init__(**kwargs)
        self.batch_size = batch_size
        self.detection_max_instances = cfg.TEST.DETECTION_MAX_INSTANCES
        self.image_utils = ImageUtils()
        self.bbox_utils = BboxUtil()
        self.misc_utils = MiscUtils()
Example #3
    def __init__(self):
        self.misc_utils = MiscUtils()
        self.bbox_utils = BboxUtil()

        # Cache anchors and reuse if image shape is the same
        self._anchor_cache = {}
        # self.anchors = None
        pass
Example #4
    def __init__(self, proposal_count, nms_threshold, batch_size, **kwargs):
        super(ProposalLayer, self).__init__(**kwargs)

        self.proposal_count = proposal_count
        self.nms_threshold = nms_threshold
        self.batch_size = batch_size

        self.misc_utils = MiscUtils()
        self.bbox_utils = BboxUtil()

        pass
Example #5
    def __init__(self, train_flag=True):
        """
        :param train_flag: whether this is training: True for training, False for testing
        """
        self.train_flag = train_flag
        self.bbox_util = BboxUtil()
        self.anchor_utils = AnchorUtils()
        self.image_utils = ImageUtils()
        self.mask_util = MaskUtil()

        # Model path
        self.model_path = cfg.TRAIN.MODEL_PATH if self.train_flag else cfg.TEST.COCO_MODEL_PATH
        # batch size
        self.batch_size = cfg.TRAIN.BATCH_SIZE if self.train_flag else cfg.TEST.BATCH_SIZE
        # Model save path
        self.save_model_path = cfg.TRAIN.SAVE_MODEL_PATH

        self.backbone = cfg.COMMON.BACKBONE
        self.backbone_strides = cfg.COMMON.BACKBONE_STRIDES
        # Input image shape
        self.image_shape = np.array(cfg.COMMON.IMAGE_SHAPE)
        # Size of the top-down layers used to build the feature pyramid
        self.top_down_pyramid_size = cfg.COMMON.TOP_DOWN_PYRAMID_SIZE

        self.rpn_anchor_stride = cfg.COMMON.RPN_ANCHOR_STRIDE
        self.rpn_anchor_ratios = cfg.COMMON.RPN_ANCHOR_RATIOS
        self.rpn_nms_threshold = cfg.COMMON.RPN_NMS_THRESHOLD

        self.class_num = cfg.COMMON.CLASS_NUM

        self.rois_per_image = cfg.TRAIN.ROIS_PER_IMAGE
        self.roi_positive_ratio = cfg.TRAIN.ROI_POSITIVE_RATIO

        self.keras_model = self.build()

        pass
Example #6
class DetectionLayer(KE.Layer):
    """
        Takes classified proposal boxes and their bounding box deltas and
        returns the final detection boxes.
        Returns:
            [batch, num_detections, (y1, x1, y2, x2, class_id, class_score)] where
            coordinates are normalized.
    """
    def __init__(self, batch_size, **kwargs):
        super(DetectionLayer, self).__init__(**kwargs)
        self.batch_size = batch_size
        self.detection_max_instances = cfg.TEST.DETECTION_MAX_INSTANCES
        self.image_utils = ImageUtils()
        self.bbox_utils = BboxUtil()
        self.misc_utils = MiscUtils()

    def call(self, inputs):
        rois = inputs[0]
        mrcnn_class = inputs[1]
        mrcnn_bbox = inputs[2]
        image_meta = inputs[3]

        # Get windows of images in normalized coordinates. Windows are the area
        # in the image that excludes the padding.
        # Use the shape of the first image in the batch to normalize the window
        # because we know that all images get resized to the same size.
        m = self.image_utils.parse_image_meta_graph(image_meta)
        image_shape = m['image_shape'][0]
        window = self.bbox_utils.norm_boxes_graph(m['window'], image_shape[:2])

        # Run detection refinement graph on each item in the batch
        detections_batch = self.misc_utils.batch_slice(
            [rois, mrcnn_class, mrcnn_bbox, window],
            lambda x, y, w, z: refine_detections_graph(x, y, w, z),
            self.batch_size)

        # Reshape output
        # [batch, num_detections, (y1, x1, y2, x2, class_id, class_score)] in
        # normalized coordinates
        return tf.reshape(detections_batch,
                          [self.batch_size, self.detection_max_instances, 6])

    def compute_output_shape(self, input_shape):
        return (None, self.detection_max_instances, 6)
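
The layer always returns DETECTION_MAX_INSTANCES rows per image, padding with zero rows when there are fewer detections (see refine_detections_graph in Example #12 below). A minimal NumPy sketch of how downstream code could unpack that padded [num_detections, 6] output, assuming class_id 0 (background) marks a padded row as the graph above guarantees:

import numpy as np

# Hypothetical output for one image: 3 real detections, padded to 5 rows.
detections = np.array([
    [0.10, 0.10, 0.40, 0.50, 3, 0.98],   # (y1, x1, y2, x2, class_id, score)
    [0.20, 0.55, 0.60, 0.90, 1, 0.91],
    [0.05, 0.05, 0.15, 0.20, 7, 0.76],
    [0.00, 0.00, 0.00, 0.00, 0, 0.00],   # zero padding
    [0.00, 0.00, 0.00, 0.00, 0, 0.00],
], dtype=np.float32)

valid = detections[:, 4] > 0              # keep rows with a real class id
boxes = detections[valid, :4]             # normalized (y1, x1, y2, x2)
class_ids = detections[valid, 4].astype(np.int32)
scores = detections[valid, 5]
print(boxes.shape, class_ids, scores)     # (3, 4) [3 1 7] [0.98 0.91 0.76]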
Example #7
class AnchorUtils(object):
    def __init__(self):
        self.misc_utils = MiscUtils()
        self.bbox_utils = BboxUtil()

        # Cache anchors and reuse if image shape is the same
        self._anchor_cache = {}
        # self.anchors = None
        pass

    def get_anchors(self, image_shape):
        """
        :return: Returns anchor pyramid for the given image size
        """
        if tuple(image_shape) not in self._anchor_cache:
            # Generate Anchors
            anchor = self.generate_pyramid_anchors(image_shape)

            # Keep a copy of the latest anchors in pixel coordinates because
            # it's used in inspect_model notebooks.
            # TODO: Remove this after the notebooks are refactored to not use it
            # self.anchors = anchor

            self._anchor_cache[tuple(
                image_shape)] = self.bbox_utils.norm_boxes(
                    anchor, image_shape[:2])
            pass

        return self._anchor_cache[tuple(image_shape)]
        pass

    def generate_pyramid_anchors(self, image_shape):
        """
            Generate anchors at different levels of a feature pyramid.
            Each scale is associated with a level of the pyramid,
            but each ratio is used in all levels of the pyramid.
        :param image_shape: [h, w, c]
        :return: anchors: [N, (y1, x1, y2, x2)]
            All generated anchors in one array.
            Sorted with the same order of the given scales.
            So, anchors of scale[0] come first, then anchors of scale[1], and so on.
        """

        backbone_strides = cfg.COMMON.BACKBONE_STRIDES
        # [N, (height, width)]. Where N is the number of stages
        backbone_shape = self.misc_utils.compute_backbone_shapes(
            image_shape, backbone_strides)

        # Anchors
        # [anchor_count, (y1, x1, y2, x2)]
        anchors = []
        scales = cfg.COMMON.RPN_ANCHOR_SCALES
        scales_len = len(scales)

        for i in range(scales_len):
            anchor_box = self.generate_anchors(scales[i], backbone_shape[i],
                                               backbone_strides[i])
            anchors.append(anchor_box)
            pass

        return np.concatenate(anchors, axis=0)
        pass

    # generate anchor box
    def generate_anchors(self, scales, backbone_shape, backbone_strides):
        """
        :param scales: 1D array of anchor sizes in pixels. Example: [32, 64, 128]
        :param backbone_shape: [height, width] spatial shape of the feature map over which to generate anchors.
        :param backbone_strides: Stride of the feature map relative to the image in pixels.
        :return: anchor box: Convert to corner coordinates (y1, x1, y2, x2)
        """
        # 1D array of anchor ratios of width/height. Example: [0.5, 1, 2]
        ratios = cfg.COMMON.RPN_ANCHOR_RATIOS

        # Stride of anchors on the feature map. For example,
        # if the value is 2 then generate anchors for every other feature map pixel.
        anchor_stride = cfg.COMMON.RPN_ANCHOR_STRIDE

        # Get all combinations of scales and ratios
        scales, ratios = np.meshgrid(np.array(scales), np.array(ratios))
        scales = scales.flatten()
        ratios = ratios.flatten()

        # Enumerate heights and widths from scales and ratios
        heights = scales / np.sqrt(ratios)
        widths = scales * np.sqrt(ratios)

        # Enumerate shifts in feature space
        shifts_y = np.arange(0, backbone_shape[0],
                             anchor_stride) * backbone_strides
        shifts_x = np.arange(0, backbone_shape[1],
                             anchor_stride) * backbone_strides
        shifts_x, shifts_y = np.meshgrid(shifts_x, shifts_y)

        # Enumerate combinations of shifts, widths, and heights
        box_widths, box_centers_x = np.meshgrid(widths, shifts_x)
        box_heights, box_centers_y = np.meshgrid(heights, shifts_y)

        # Reshape to get a list of (y, x) and a list of (h, w)
        box_centers = np.stack([box_centers_y, box_centers_x],
                               axis=2).reshape([-1, 2])
        box_sizes = np.stack([box_heights, box_widths],
                             axis=2).reshape([-1, 2])

        # Convert to corner coordinates (y1, x1, y2, x2)
        boxes = np.concatenate(
            [box_centers - 0.5 * box_sizes, box_centers + 0.5 * box_sizes],
            axis=1)
        return boxes
        pass
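
Because anchors are laid out densely over every feature-map cell (with the anchor stride applied per level), the total anchor count is the sum over pyramid levels of cells_per_level * number_of_ratios. A quick sanity check of that arithmetic; the image size, strides, ratio count and anchor stride below are assumptions standing in for the cfg.COMMON values, not read from the config:

import math

image_hw = (512, 512)                   # assumed cfg.COMMON.IMAGE_SHAPE[:2]
backbone_strides = [4, 8, 16, 32, 64]   # assumed cfg.COMMON.BACKBONE_STRIDES
num_ratios = 3                          # assumed len(cfg.COMMON.RPN_ANCHOR_RATIOS)
anchor_stride = 1                       # assumed cfg.COMMON.RPN_ANCHOR_STRIDE

total = 0
for stride in backbone_strides:
    # same arithmetic as compute_backbone_shapes / generate_anchors above
    h = int(math.ceil(image_hw[0] / stride))
    w = int(math.ceil(image_hw[1] / stride))
    cells = len(range(0, h, anchor_stride)) * len(range(0, w, anchor_stride))
    level = cells * num_ratios
    print("stride {:3d}: {:3d}x{:3d} cells -> {} anchors".format(stride, h, w, level))
    total += level
print("total anchors:", total)          # 65472 for these assumptions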
Example #8
class MaskTrain(object):
    def __init__(self):

        self.anchor_utils = AnchorUtils()
        self.bbox_utils = BboxUtil()
        self.image_utils = ImageUtils()

        # Log save path
        self.log_path = self.log_file_path(cfg.TRAIN.LOGS_PATH,
                                           cfg.TRAIN.DATA_SOURCE)
        # Model save path
        self.model_save_path = cfg.TRAIN.SAVE_MODEL_PATH

        # Training data
        self.train_data = CocoDataset(cfg.TRAIN.COCO_TRAIN_ANN_PATH,
                                      cfg.TRAIN.COCO_TRAIN_IMAGE_PATH)
        # Validation data
        self.val_data = CocoDataset(cfg.TRAIN.COCO_VAL_ANN_PATH,
                                    cfg.TRAIN.COCO_VAL_IMAGE_PATH)

        # Load the mask network model
        self.mask_model = MaskRCNN(train_flag=True)
        # Use the original author's data with 1 + 80 classes
        # self.mask_model.load_weights(cfg.TEST.COCO_MODEL_PATH, by_name=True)

        # Load the model pre-trained on MS COCO, skipping layers whose number of classes differs
        self.mask_model.load_weights(cfg.TRAIN.MODEL_PATH,
                                     by_name=True,
                                     exclude=[
                                         "mrcnn_class_logits", "mrcnn_bbox_fc",
                                         "mrcnn_bbox", "mrcnn_mask"
                                     ])

        self.epoch = 0
        pass

    # Set up the log directory
    def log_file_path(self, log_dir, data_source="coco"):
        log_start_time = datetime.now()
        log_file_name = "{}_{:%Y%m%dT%H%M}".format(data_source.lower(),
                                                   log_start_time)
        log_path = os.path.join(log_dir, log_file_name)

        return log_path
        pass

    def do_mask_train(self):
        # image augmentation
        augmentation = imgaug.augmenters.Fliplr(0.5)

        print("training - stage 1")
        # training - stage 1
        self.train_details(self.train_data,
                           self.val_data,
                           learning_rate=cfg.TRAIN.ROUGH_LEARNING_RATE,
                           epochs=cfg.TRAIN.FIRST_STAGE_N_EPOCH,
                           layers=cfg.TRAIN.HEADS_LAYERS,
                           augmentation=augmentation)

        print("training - stage 2")
        # training - stage 2
        self.train_details(self.train_data,
                           self.val_data,
                           learning_rate=cfg.TRAIN.ROUGH_LEARNING_RATE,
                           epochs=cfg.TRAIN.MIDDLE_STAGE_N_EPOCH,
                           layers=cfg.TRAIN.FOUR_MORE_LAYERS,
                           augmentation=augmentation)

        print("training - stage 3")
        # training - stage 3
        self.train_details(self.train_data,
                           self.val_data,
                           learning_rate=cfg.TRAIN.FINE_LEARNING_RATE,
                           epochs=cfg.TRAIN.LAST_STAGE_N_EPOCH,
                           layers=cfg.TRAIN.ALL_LAYERS,
                           augmentation=augmentation)
        pass

    def train_details(self,
                      train_data,
                      val_data,
                      learning_rate,
                      epochs,
                      layers,
                      augmentation=None,
                      custom_callbacks=None,
                      no_augmentation_sources=None):
        """
            Train the model.
        :param train_data: Training data object
        :param val_data: val data object
        :param learning_rate: The learning rate to train with
        :param epochs: Number of training epochs. Note that previous training epochs
                        are considered to be done already, so this actually determines
                        the total number of epochs to train rather than the epochs in
                        this particular call.
        :param layers: Allows selecting which layers to train. It can be:
                        - A regular expression to match layer names to train
                        - One of these predefined values:
                          heads: The RPN, classifier and mask heads of the network
                          all: All the layers
                          3+: Train Resnet stage 3 and up
                          4+: Train Resnet stage 4 and up
                          5+: Train Resnet stage 5 and up
        :param augmentation: Optional. An imgaug (https://github.com/aleju/imgaug)
                            augmentation. For example, passing imgaug.augmenters.Fliplr(0.5)
                            flips images right/left 50% of the time. You can pass complex
                            augmentations as well. This augmentation applies 50% of the
                            time, and when it does it flips images right/left half the time
                            and adds a Gaussian blur with a random sigma in range 0 to 5.
        :param custom_callbacks: Optional. Add custom callbacks to be called
                                with the keras fit_generator method. Must be list of type keras.callbacks.
        :param no_augmentation_sources: Optional. List of sources to exclude from
                                        augmentation. A source is a string that identifies a dataset and is
                                        defined in the Dataset class.
        :return:
        """

        # Pre-defined layer regular expressions
        layer_regex = cfg.TRAIN.LAYER_REGEX

        if layers in layer_regex:
            layers = layer_regex[layers]
            pass
        self.set_trainable(layers)

        if not os.path.exists(self.log_path):
            os.makedirs(self.log_path)
            pass

        # Callbacks
        callbacks = [
            keras.callbacks.TensorBoard(log_dir=self.log_path,
                                        histogram_freq=0,
                                        write_graph=True,
                                        write_images=False),
            keras.callbacks.ModelCheckpoint(self.model_save_path,
                                            verbose=0,
                                            save_weights_only=True)
        ]

        # Add custom callbacks to the list
        if custom_callbacks:
            callbacks += custom_callbacks
            pass

        # Data generators
        train_generator = self.data_generator(
            train_data,
            augmentation=augmentation,
            batch_size=self.mask_model.batch_size,
            no_augmentation_sources=no_augmentation_sources)
        val_generator = self.data_generator(
            val_data, batch_size=self.mask_model.batch_size)

        self.compile(learning_rate, cfg.TRAIN.LEARNING_MOMENTUM)

        print("learning_rate: {}, checkpoint path: {}".format(
            learning_rate, self.model_save_path))

        # Work-around for Windows: Keras fails on Windows when using
        # multiprocessing workers. See discussion here:
        # https://github.com/matterport/Mask_RCNN/issues/13#issuecomment-353124009
        if os.name == 'nt':
            workers = 0
            pass
        else:
            workers = multiprocessing.cpu_count()
            pass

        self.mask_model.keras_model.fit_generator(
            generator=train_generator,
            initial_epoch=self.epoch,
            epochs=epochs,
            steps_per_epoch=cfg.TRAIN.STEPS_PER_EPOCH,
            callbacks=callbacks,
            validation_data=val_generator,
            validation_steps=cfg.TRAIN.VALIDATION_STEPS,
            max_queue_size=100,
            workers=workers,
            use_multiprocessing=True,
        )

        self.epoch = max(self.epoch, epochs)
        pass

    def data_generator(self,
                       data,
                       augmentation=None,
                       batch_size=1,
                       random_rois=0,
                       detection_targets=False,
                       no_augmentation_sources=None):
        """
            A generator that returns images and corresponding target class ids,
            bounding box deltas, and masks.
        :param data: The Dataset object to pick data from
        :param augmentation: Optional. An imgaug (https://github.com/aleju/imgaug) augmentation.
                            For example, passing imgaug.augmenters.Fliplr(0.5) flips images
                            right/left 50% of the time.
        :param batch_size: How many images to return in each call
        :param random_rois: If > 0 then generate proposals to be used to train the
                             network classifier and mask heads. Useful if training
                             the Mask RCNN part without the RPN.
        :param detection_targets: If True, generate detection targets (class IDs, bbox
                                deltas, and masks). Typically for debugging or visualizations because
                                in training detection targets are generated by DetectionTargetLayer.
        :param no_augmentation_sources: Optional. List of sources to exclude from
                                        augmentation. A source is a string that identifies a dataset and is
                                        defined in the Dataset class.
        :return: Returns a Python generator. Upon calling next() on it, the
                generator returns two lists, inputs and outputs. The contents
                of the lists differ depending on the received arguments:
            inputs list:
                - images: [batch, H, W, C]
                - image_meta: [batch, (meta data)] Image details. See compose_image_meta()
                - rpn_match: [batch, N] Integer (1=positive anchor, -1=negative, 0=neutral)
                - rpn_bbox: [batch, N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas.
                - gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs
                - gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)]
                - gt_masks: [batch, height, width, MAX_GT_INSTANCES]. The height and width
                            are those of the image unless use_mini_mask is True, in which
                            case they are defined in MINI_MASK_SHAPE.

            outputs list: Usually empty in regular training. But if detection_targets
                        is True then the outputs list contains target class_ids, bbox deltas,
                        and masks.
        """
        # batch item index
        batch_index = 0
        image_index = -1
        image_ids = np.copy(data.image_ids_list)
        error_count = 0
        no_augmentation_sources = no_augmentation_sources or []

        # Anchors
        # [anchor_count, (y1, x1, y2, x2)]
        # Generate Anchors
        anchors = self.anchor_utils.generate_pyramid_anchors(
            image_shape=cfg.COMMON.IMAGE_SHAPE)

        image_id = ""

        mini_mask = cfg.TRAIN.USE_MINI_MASK
        max_gt_instances = cfg.TRAIN.MAX_GT_INSTANCES
        mean_pixel = np.array(cfg.COMMON.MEAN_PIXEL)

        # Keras requires a generator to run indefinitely.
        while True:
            try:
                # Increment index to pick next image. Shuffle if at the start of an epoch.
                image_index = (image_index + 1) % len(image_ids)
                if image_index == 0:
                    np.random.shuffle(image_ids)

                # Get GT bounding boxes and masks for image.
                image_id = image_ids[image_index]

                # If the image source is not to be augmented pass None as augmentation
                if data.image_info_list[image_id][
                        'source'] in no_augmentation_sources:

                    image, image_meta, gt_class_ids, gt_boxes, gt_masks = self.bbox_utils.load_image_gt(
                        data, image_id, None, mini_mask)
                else:
                    image, image_meta, gt_class_ids, gt_boxes, gt_masks = self.bbox_utils.load_image_gt(
                        data, image_id, augmentation, mini_mask)

                # Skip images that have no instances. This can happen in cases
                # where we train on a subset of classes and the image doesn't
                # have any of the classes we care about.
                if not np.any(gt_class_ids > 0):
                    continue
                    pass

                # RPN Targets
                rpn_match, rpn_bbox = common.build_rpn_targets(
                    anchors, gt_class_ids, gt_boxes)

                # Define these variables here so they are not undefined when used below
                rpn_rois = None
                rois = None
                mrcnn_class_ids = None
                mrcnn_bbox = None
                mrcnn_mask = None

                # Mask R-CNN Targets
                if random_rois:
                    rpn_rois = self.mask_model.generate_random_rois(
                        image.shape, random_rois, gt_boxes)

                    if detection_targets:
                        rois, mrcnn_class_ids, mrcnn_bbox, mrcnn_mask = \
                            self.mask_model.build_detection_targets(rpn_rois, gt_class_ids, gt_boxes, gt_masks)
                        pass
                    pass

                # Init batch arrays
                if batch_index == 0:
                    batch_image_meta = np.zeros(
                        (batch_size, ) + image_meta.shape,
                        dtype=image_meta.dtype)
                    batch_rpn_match = np.zeros(
                        [batch_size, anchors.shape[0], 1],
                        dtype=rpn_match.dtype)
                    batch_rpn_bbox = np.zeros(
                        [batch_size, cfg.TRAIN.ANCHORS_PER_IMAGE, 4],
                        dtype=rpn_bbox.dtype)
                    batch_images = np.zeros((batch_size, ) + image.shape,
                                            dtype=np.float32)
                    batch_gt_class_ids = np.zeros(
                        (batch_size, max_gt_instances), dtype=np.int32)
                    batch_gt_boxes = np.zeros(
                        (batch_size, max_gt_instances, 4), dtype=np.int32)
                    batch_gt_masks = np.zeros(
                        (batch_size, gt_masks.shape[0], gt_masks.shape[1],
                         max_gt_instances),
                        dtype=gt_masks.dtype)

                    if random_rois:
                        batch_rpn_rois = np.zeros(
                            (batch_size, rpn_rois.shape[0], 4),
                            dtype=rpn_rois.dtype)

                        if detection_targets:
                            batch_rois = np.zeros((batch_size, ) + rois.shape,
                                                  dtype=rois.dtype)
                            batch_mrcnn_class_ids = np.zeros(
                                (batch_size, ) + mrcnn_class_ids.shape,
                                dtype=mrcnn_class_ids.dtype)
                            batch_mrcnn_bbox = np.zeros(
                                (batch_size, ) + mrcnn_bbox.shape,
                                dtype=mrcnn_bbox.dtype)
                            batch_mrcnn_mask = np.zeros(
                                (batch_size, ) + mrcnn_mask.shape,
                                dtype=mrcnn_mask.dtype)
                            pass
                        pass
                    pass

                # If more instances than fits in the array, sub-sample from them.
                if gt_boxes.shape[0] > max_gt_instances:
                    ids = np.random.choice(np.arange(gt_boxes.shape[0]),
                                           max_gt_instances,
                                           replace=False)
                    gt_class_ids = gt_class_ids[ids]
                    gt_boxes = gt_boxes[ids]
                    gt_masks = gt_masks[:, :, ids]

                # Add to batch
                batch_image_meta[batch_index] = image_meta
                batch_rpn_match[batch_index] = rpn_match[:, np.newaxis]
                batch_rpn_bbox[batch_index] = rpn_bbox
                batch_images[batch_index] = self.image_utils.mold_image(
                    image.astype(np.float32), mean_pixel)
                batch_gt_class_ids[
                    batch_index, :gt_class_ids.shape[0]] = gt_class_ids
                batch_gt_boxes[batch_index, :gt_boxes.shape[0]] = gt_boxes
                batch_gt_masks[
                    batch_index, :, :, :gt_masks.shape[-1]] = gt_masks

                if random_rois:
                    batch_rpn_rois[batch_index] = rpn_rois
                    if detection_targets:
                        batch_rois[batch_index] = rois
                        batch_mrcnn_class_ids[batch_index] = mrcnn_class_ids
                        batch_mrcnn_bbox[batch_index] = mrcnn_bbox
                        batch_mrcnn_mask[batch_index] = mrcnn_mask
                        pass
                    pass
                batch_index += 1

                # Batch full?
                if batch_index >= batch_size:
                    inputs = [
                        batch_images, batch_image_meta, batch_rpn_match,
                        batch_rpn_bbox, batch_gt_class_ids, batch_gt_boxes,
                        batch_gt_masks
                    ]
                    outputs = []

                    if random_rois:
                        inputs.extend([batch_rpn_rois])
                        if detection_targets:
                            inputs.extend([batch_rois])
                            # Keras requires that output and targets have the same number of dimensions
                            batch_mrcnn_class_ids = np.expand_dims(
                                batch_mrcnn_class_ids, -1)
                            outputs.extend([
                                batch_mrcnn_class_ids, batch_mrcnn_bbox,
                                batch_mrcnn_mask
                            ])

                    yield inputs, outputs

                    # start a new batch
                    batch_index = 0
                pass
            except (GeneratorExit, KeyboardInterrupt):
                raise
            except:
                # Log it and skip the image
                logging.exception("Error processing image {}".format(
                    data.image_info_list[image_id]))
                error_count += 1
                if error_count > 5:
                    raise

            pass
        pass

    def set_trainable(self, layer_regex, mask_model=None, indent=0, verbose=1):
        """
            Sets model layers as trainable if their names match
            the given regular expression.
        :param layer_regex:
        :param mask_model:
        :param indent:
        :param verbose:
        :return:
        """

        # Print message on the first call (but not on recursive calls)
        if verbose > 0 and mask_model is None:
            print("Selecting layers to train")
            pass

        mask_model = mask_model or self.mask_model.keras_model

        # In multi-GPU training, we wrap the model. Get layers
        # of the inner model because they have the weights.
        layers = mask_model.inner_model.layers if hasattr(
            mask_model, "inner_model") else mask_model.layers

        for layer in layers:
            # Is the layer a model?
            if layer.__class__.__name__ == 'Model':
                print("In model: ", layer.name)
                self.set_trainable(layer_regex,
                                   mask_model=layer,
                                   indent=indent + 4)
                continue

            if not layer.weights:
                continue
            # Is it trainable?
            trainable = bool(re.fullmatch(layer_regex, layer.name))

            # Update layer. If layer is a container, update inner layer.
            if layer.__class__.__name__ == 'TimeDistributed':
                layer.layer.trainable = trainable
            else:
                layer.trainable = trainable

            # Print trainable layer names
            if trainable and verbose > 0:
                print("{}{:20}   ({})".format(" " * indent, layer.name,
                                              layer.__class__.__name__))
        pass

    def compile(self, learning_rate, momentum_param):
        """
            Gets the model ready for training. Adds losses, regularization, and
            metrics. Then calls the Keras compile() function.
        :param learning_rate:
        :param momentum_param:
        :return:
        """

        # Optimizer object
        optimizer = keras.optimizers.SGD(lr=learning_rate,
                                         momentum=momentum_param,
                                         clipnorm=cfg.TRAIN.GRADIENT_CLIP_NORM)

        self.mask_model.keras_model._losses = []
        self.mask_model.keras_model._per_input_losses = {}

        loss_names = [
            "rpn_class_loss", "rpn_bbox_loss", "mrcnn_class_loss",
            "mrcnn_bbox_loss", "mrcnn_mask_loss"
        ]

        for name in loss_names:
            layer = self.mask_model.keras_model.get_layer(name)
            if layer.output in self.mask_model.keras_model.losses:
                continue
            loss = (tf.reduce_mean(layer.output, keepdims=True) *
                    cfg.COMMON.LOSS_WEIGHTS.get(name, 1.))
            self.mask_model.keras_model.add_loss(loss)
            pass

        # Add L2 Regularization
        # Skip gamma and beta weights of batch normalization layers.
        reg_losses = [
            keras.regularizers.l2(cfg.TRAIN.WEIGHT_DECAY)(w) /
            tf.cast(tf.size(w), tf.float32)
            for w in self.mask_model.keras_model.trainable_weights
            if 'gamma' not in w.name and 'beta' not in w.name
        ]

        self.mask_model.keras_model.add_loss(tf.add_n(reg_losses))

        # Compile
        self.mask_model.keras_model.compile(
            optimizer=optimizer,
            loss=[None] * len(self.mask_model.keras_model.outputs))

        # Add metrics for losses
        for name in loss_names:
            if name in self.mask_model.keras_model.metrics_names:
                continue
                pass
            layer = self.mask_model.keras_model.get_layer(name)

            self.mask_model.keras_model.metrics_names.append(name)

            loss = (tf.reduce_mean(layer.output, keepdims=True) *
                    cfg.COMMON.LOSS_WEIGHTS.get(name, 1.))

            self.mask_model.keras_model.metrics_tensors.append(loss)
        pass
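
set_trainable above decides layer by layer with re.fullmatch(layer_regex, layer.name), so the predefined values in cfg.TRAIN.LAYER_REGEX are just regular expressions over layer names. A standalone illustration with hypothetical layer names and a heads-style pattern; the pattern below is an assumption about what the config maps "heads" to, modeled on matterport's Mask R-CNN, and may differ from the actual values:

import re

heads_regex = r"(mrcnn\_.*)|(rpn\_.*)|(fpn\_.*)"   # assumed heads-style pattern

layer_names = [
    "conv1", "res2a_branch2a",           # backbone layers -> stay frozen
    "fpn_c5p5", "rpn_model",             # FPN / RPN -> trainable
    "mrcnn_class_logits", "mrcnn_mask",  # heads -> trainable
]

for name in layer_names:
    trainable = bool(re.fullmatch(heads_regex, name))
    print("{:20s} trainable={}".format(name, trainable))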
Example #9
    def __init__(self):
        self.bbox_util = BboxUtil()
        pass
Example #10
class MiscUtils(object):
    def __init__(self):
        self.bbox_util = BboxUtil()
        pass

    def compute_backbone_shapes(self, image_shape, backbone_strides):
        """
            Computes the width and height of each stage of the backbone network
        :param image_shape: [h, w, c]
        :param backbone_strides: The strides of each layer of the FPN Pyramid.
                                These values are based on a resNet101 backbone.
        :return: [N, (height, width)]. Where N is the number of stages
        """
        return np.array([[
            int(math.ceil(image_shape[0] / stride)),
            int(math.ceil(image_shape[1] / stride))
        ] for stride in backbone_strides])
        pass

    def batch_slice(self, inputs, graph_fn, batch_size, names=None):
        """
            Splits inputs into slices and feeds each slice to a copy of the given
            computation graph and then combines the results. It allows you to run a
            graph on a batch of inputs even if the graph is written to support one
            instance only.
        :param inputs: list of tensors. All must have the same first dimension length
        :param graph_fn: A function that returns a TF tensor that's part of a graph.
        :param batch_size: number of slices to divide the data into.
        :param names: If provided, assigns names to the resulting tensors.
        :return:
        """

        if not isinstance(inputs, list):
            inputs = [inputs]

        outputs = []
        for i in range(batch_size):
            inputs_slice = [x[i] for x in inputs]
            output_slice = graph_fn(*inputs_slice)
            if not isinstance(output_slice, (tuple, list)):
                output_slice = [output_slice]
            outputs.append(output_slice)
        # Change outputs from a list of slices where each is
        # a list of outputs to a list of outputs and each has
        # a list of slices
        outputs = list(zip(*outputs))

        if names is None:
            names = [None] * len(outputs)

        result = [tf.stack(o, axis=0, name=n) for o, n in zip(outputs, names)]
        if len(result) == 1:
            result = result[0]

        return result
        pass

    def trim_zeros_graph(self, boxes, name='trim_zeros'):
        """
            Often boxes are represented with matrices of shape [N, 4] and
            are padded with zeros. This removes zero boxes.
        :param boxes: [N, 4] matrix of boxes.
        :param name:
        :return: non_zeros: [N] a 1D boolean mask identifying the rows to keep
        """

        non_zeros = tf.cast(tf.reduce_sum(tf.abs(boxes), axis=1), tf.bool)
        boxes = tf.boolean_mask(boxes, non_zeros, name=name)
        return boxes, non_zeros
        pass

    def detection_targets_graph(self, proposals, gt_class_ids, gt_boxes,
                                gt_masks):
        """
            Generates detection targets for one image. Subsamples proposals and
            generates target class IDs, bounding box deltas, and masks for each.
        :param proposals: [POST_NMS_ROIS_TRAINING, (y1, x1, y2, x2)] in normalized coordinates.
                          Might be zero padded if there are not enough proposals.
        :param gt_class_ids: [MAX_GT_INSTANCES] int class IDs
        :param gt_boxes: [MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized coordinates.
        :param gt_masks: [height, width, MAX_GT_INSTANCES] of boolean type.
        :return: Target ROIs and corresponding class IDs, bounding box shifts, and masks.
            rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized coordinates
            class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs. Zero padded.
            deltas: [TRAIN_ROIS_PER_IMAGE, (dy, dx, log(dh), log(dw))]
            masks: [TRAIN_ROIS_PER_IMAGE, height, width]. Masks cropped to bbox
                   boundaries and resized to neural network output size.

            Note: Returned arrays might be zero padded if not enough target ROIs.
        """

        # Assertions
        asserts = [
            tf.Assert(tf.greater(tf.shape(proposals)[0], 0), [proposals],
                      name="roi_assertion"),
        ]

        with tf.control_dependencies(asserts):
            proposals = tf.identity(proposals)
            pass

        # Remove zero padding
        proposals, _ = self.trim_zeros_graph(proposals, name="trim_proposals")
        gt_boxes, non_zeros = self.trim_zeros_graph(gt_boxes,
                                                    name="trim_gt_boxes")
        gt_class_ids = tf.boolean_mask(gt_class_ids,
                                       non_zeros,
                                       name="trim_gt_class_ids")
        gt_masks = tf.gather(gt_masks,
                             tf.where(non_zeros)[:, 0],
                             axis=2,
                             name="trim_gt_masks")

        # Handle COCO crowds
        # A crowd box in COCO is a bounding box around several instances. Exclude
        # them from training. A crowd box is given a negative class ID.
        crowd_ix = tf.where(gt_class_ids < 0)[:, 0]
        non_crowd_ix = tf.where(gt_class_ids > 0)[:, 0]
        crowd_boxes = tf.gather(gt_boxes, crowd_ix)
        gt_class_ids = tf.gather(gt_class_ids, non_crowd_ix)
        gt_boxes = tf.gather(gt_boxes, non_crowd_ix)
        gt_masks = tf.gather(gt_masks, non_crowd_ix, axis=2)

        # Compute overlaps matrix [proposals, gt_boxes]
        overlaps = self.bbox_util.overlaps_graph(proposals, gt_boxes)

        # Compute overlaps with crowd boxes [proposals, crowd_boxes]
        crowd_overlaps = self.bbox_util.overlaps_graph(proposals, crowd_boxes)
        crowd_iou_max = tf.reduce_max(crowd_overlaps, axis=1)
        no_crowd_bool = (crowd_iou_max < 0.001)

        # Determine positive and negative ROIs
        roi_iou_max = tf.reduce_max(overlaps, axis=1)
        # 1. Positive ROIs are those with >= 0.5 IoU with a GT box
        positive_roi_bool = (roi_iou_max >= 0.5)
        positive_indices = tf.where(positive_roi_bool)[:, 0]
        # 2. Negative ROIs are those with < 0.5 with every GT box. Skip crowds.
        negative_indices = tf.where(
            tf.logical_and(roi_iou_max < 0.5, no_crowd_bool))[:, 0]

        # Subsample ROIs. Aim for 33% positive
        # Positive ROIs
        positive_count = int(cfg.TRAIN.ROIS_PER_IMAGE *
                             cfg.TRAIN.ROI_POSITIVE_RATIO)
        positive_indices = tf.random_shuffle(positive_indices)[:positive_count]
        positive_count = tf.shape(positive_indices)[0]
        # Negative ROIs. Add enough to maintain positive:negative ratio.
        r = 1.0 / cfg.TRAIN.ROI_POSITIVE_RATIO
        negative_count = tf.cast(r * tf.cast(positive_count, tf.float32),
                                 tf.int32) - positive_count
        negative_indices = tf.random_shuffle(negative_indices)[:negative_count]
        # Gather selected ROIs
        positive_rois = tf.gather(proposals, positive_indices)
        negative_rois = tf.gather(proposals, negative_indices)

        # Assign positive ROIs to GT boxes.
        positive_overlaps = tf.gather(overlaps, positive_indices)
        roi_gt_box_assignment = tf.cond(
            tf.greater(tf.shape(positive_overlaps)[1], 0),
            true_fn=lambda: tf.argmax(positive_overlaps, axis=1),
            false_fn=lambda: tf.cast(tf.constant([]), tf.int64))
        roi_gt_boxes = tf.gather(gt_boxes, roi_gt_box_assignment)
        roi_gt_class_ids = tf.gather(gt_class_ids, roi_gt_box_assignment)

        # Compute bbox refinement for positive ROIs
        deltas = self.bbox_util.box_refinement_graph(positive_rois,
                                                     roi_gt_boxes)
        deltas /= np.array(cfg.COMMON.BBOX_STD_DEV)

        # Assign positive ROIs to GT masks
        # Permute masks to [N, height, width, 1]
        transposed_masks = tf.expand_dims(tf.transpose(gt_masks, [2, 0, 1]),
                                          -1)
        # Pick the right mask for each ROI
        roi_masks = tf.gather(transposed_masks, roi_gt_box_assignment)

        # Compute mask targets
        boxes = positive_rois
        if cfg.TRAIN.USE_MINI_MASK:
            # Transform ROI coordinates from normalized image space
            # to normalized mini-mask space.
            y1, x1, y2, x2 = tf.split(positive_rois, 4, axis=1)
            gt_y1, gt_x1, gt_y2, gt_x2 = tf.split(roi_gt_boxes, 4, axis=1)
            gt_h = gt_y2 - gt_y1
            gt_w = gt_x2 - gt_x1
            y1 = (y1 - gt_y1) / gt_h
            x1 = (x1 - gt_x1) / gt_w
            y2 = (y2 - gt_y1) / gt_h
            x2 = (x2 - gt_x1) / gt_w
            boxes = tf.concat([y1, x1, y2, x2], 1)
        box_ids = tf.range(0, tf.shape(roi_masks)[0])
        masks = tf.image.crop_and_resize(tf.cast(roi_masks, tf.float32), boxes,
                                         box_ids, cfg.TRAIN.MASK_SHAPE)
        # Remove the extra dimension from masks.
        masks = tf.squeeze(masks, axis=3)

        # Threshold mask pixels at 0.5 to have GT masks be 0 or 1 to use with
        # binary cross entropy loss.
        masks = tf.round(masks)

        # Append negative ROIs and pad bbox deltas and masks that
        # are not used for negative ROIs with zeros.
        rois = tf.concat([positive_rois, negative_rois], axis=0)
        N = tf.shape(negative_rois)[0]
        P = tf.maximum(cfg.TRAIN.ROIS_PER_IMAGE - tf.shape(rois)[0], 0)
        rois = tf.pad(rois, [(0, P), (0, 0)])
        # roi_gt_boxes = tf.pad(roi_gt_boxes, [(0, N + P), (0, 0)])
        roi_gt_class_ids = tf.pad(roi_gt_class_ids, [(0, N + P)])
        deltas = tf.pad(deltas, [(0, N + P), (0, 0)])
        masks = tf.pad(masks, [[0, N + P], (0, 0), (0, 0)])

        return rois, roi_gt_class_ids, deltas, masks
        pass
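
batch_slice above statically unrolls a per-image graph function over the batch dimension and re-stacks the results. A minimal NumPy analogue of the same slice / apply / regroup / stack pattern, with a toy per-item function standing in for graph_fn (this is an illustration of the idea, not the TF graph code):

import numpy as np

def batch_slice_np(inputs, item_fn, batch_size):
    """NumPy analogue of MiscUtils.batch_slice: apply item_fn to each batch item."""
    if not isinstance(inputs, list):
        inputs = [inputs]
    outputs = []
    for i in range(batch_size):
        slice_i = [x[i] for x in inputs]      # one slice from every input
        out = item_fn(*slice_i)
        if not isinstance(out, (tuple, list)):
            out = [out]
        outputs.append(out)
    outputs = list(zip(*outputs))             # regroup: per output, a list of slices
    results = [np.stack(o, axis=0) for o in outputs]
    return results[0] if len(results) == 1 else results

scores = np.random.rand(2, 5)                 # [batch, num_rois]
ix = np.argsort(-scores, axis=1)[:, :3]       # top-3 indices per item
top = batch_slice_np([scores, ix], lambda s, i: s[i], batch_size=2)
print(top.shape)                              # (2, 3)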
Example #11
class ProposalLayer(KE.Layer):
    """
        Receives anchor scores and selects a subset to pass as proposals
        to the second stage. Filtering is done based on anchor scores and
        non-max suppression to remove overlaps. It also applies bounding
        box refinement deltas to anchors.

        Inputs:
            rpn_probs: [batch, num_anchors, (bg prob, fg prob)]
            rpn_bbox: [batch, num_anchors, (dy, dx, log(dh), log(dw))]
            anchors: [batch, num_anchors, (y1, x1, y2, x2)] anchors in normalized coordinates

        Returns:
            Proposals in normalized coordinates [batch, rois, (y1, x1, y2, x2)]
    """
    def __init__(self, proposal_count, nms_threshold, batch_size, **kwargs):
        super(ProposalLayer, self).__init__(**kwargs)

        self.proposal_count = proposal_count
        self.nms_threshold = nms_threshold
        self.batch_size = batch_size

        self.misc_utils = MiscUtils()
        self.bbox_utils = BboxUtil()

        pass

    def call(self, inputs):
        """
            call() is invoked by Keras when the layer is applied to its inputs
        :param inputs:
        :return:
        """
        # Box Scores. Use the foreground class confidence. [Batch, num_rois, 1]
        scores = inputs[0][:, :, 1]
        # Box deltas [batch, num_rois, 4]
        deltas = inputs[1]
        rpn_bbox_std_dev = np.array(cfg.COMMON.RPN_BBOX_STD_DEV)
        deltas = deltas * np.reshape(rpn_bbox_std_dev, [1, 1, 4])
        # Anchors
        anchors = inputs[2]

        # Improve performance by trimming to top anchors by score
        # and doing the rest on the smaller subset.
        pre_nms_limit = tf.minimum(cfg.COMMON.PRE_NMS_LIMIT,
                                   tf.shape(anchors)[1])
        ix = tf.nn.top_k(scores,
                         pre_nms_limit,
                         sorted=True,
                         name="top_anchors").indices

        scores = self.misc_utils.batch_slice([scores, ix],
                                             lambda x, y: tf.gather(x, y),
                                             self.batch_size)
        deltas = self.misc_utils.batch_slice([deltas, ix],
                                             lambda x, y: tf.gather(x, y),
                                             self.batch_size)
        pre_nms_anchors = self.misc_utils.batch_slice(
            [anchors, ix],
            lambda a, x: tf.gather(a, x),
            self.batch_size,
            names=["pre_nms_anchors"])

        # Apply deltas to anchors to get refined anchors.
        # [batch, N, (y1, x1, y2, x2)]
        boxes = self.misc_utils.batch_slice(
            [pre_nms_anchors, deltas],
            lambda x, y: self.bbox_utils.apply_box_deltas_graph(x, y),
            self.batch_size,
            names=["refined_anchors"])

        # Clip to image boundaries. Since we're in normalized coordinates,
        # clip to 0..1 range. [batch, N, (y1, x1, y2, x2)]
        window = np.array([0, 0, 1, 1], dtype=np.float32)
        boxes = self.misc_utils.batch_slice(
            boxes,
            lambda x: self.bbox_utils.clip_boxes_graph(x, window),
            self.batch_size,
            names=["refined_anchors_clipped"])

        # Filter out small boxes
        # According to Xinlei Chen's paper, this reduces detection accuracy
        # for small objects, so we're skipping it.

        # Non-max suppression
        def nms(boxes, scores):
            indices = tf.image.non_max_suppression(
                boxes,
                scores,
                self.proposal_count,
                self.nms_threshold,
                name="rpn_non_max_suppression")
            proposals = tf.gather(boxes, indices)
            # Pad if needed
            padding = tf.maximum(self.proposal_count - tf.shape(proposals)[0],
                                 0)
            proposals = tf.pad(proposals, [(0, padding), (0, 0)])
            return proposals

        proposals = self.misc_utils.batch_slice([boxes, scores], nms,
                                                self.batch_size)

        return proposals

    def compute_output_shape(self, input_shape):
        return (None, self.proposal_count, 4)
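
The core of ProposalLayer is: scale the raw RPN deltas by RPN_BBOX_STD_DEV, apply them to the anchors, then clip the refined boxes to the unit window before NMS. A minimal NumPy sketch of the apply-and-clip step for a single normalized anchor; the std-dev values are the common defaults and are assumed here, not read from the config:

import numpy as np

def apply_deltas(box, delta):
    """box: (y1, x1, y2, x2) normalized; delta: (dy, dx, log(dh), log(dw))."""
    h, w = box[2] - box[0], box[3] - box[1]
    cy, cx = box[0] + 0.5 * h, box[1] + 0.5 * w
    cy += delta[0] * h
    cx += delta[1] * w
    h *= np.exp(delta[2])
    w *= np.exp(delta[3])
    return np.array([cy - 0.5 * h, cx - 0.5 * w, cy + 0.5 * h, cx + 0.5 * w])

anchor = np.array([0.40, 0.40, 0.60, 0.60])
rpn_delta = np.array([0.5, -0.3, 0.2, 0.1])        # raw network output
rpn_bbox_std_dev = np.array([0.1, 0.1, 0.2, 0.2])  # assumed cfg.COMMON.RPN_BBOX_STD_DEV
refined = apply_deltas(anchor, rpn_delta * rpn_bbox_std_dev)
clipped = np.clip(refined, 0.0, 1.0)               # window = (0, 0, 1, 1)
print(refined, clipped)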
Example #12
def refine_detections_graph(rois, probs, deltas, window):
    """
        Refine classified proposals and filter overlaps and return final detections.
    :param rois: [N, (y1, x1, y2, x2)] in normalized coordinates
    :param probs: [N, num_classes]. Class probabilities.
    :param deltas: [N, num_classes, (dy, dx, log(dh), log(dw))]. Class-specific bounding box deltas.
    :param window: (y1, x1, y2, x2) in normalized coordinates.
                   The part of the image that contains the actual image content, excluding the padding.
    :return: Returns detections shaped:
            [num_detections, (y1, x1, y2, x2, class_id, score)] where coordinates are normalized.
    """
    # Class IDs per ROI
    class_ids = tf.argmax(probs, axis=1, output_type=tf.int32)
    # Class probability of the top class of each ROI
    indices = tf.stack([tf.range(probs.shape[0]), class_ids], axis=1)
    class_scores = tf.gather_nd(probs, indices)
    # Class-specific bounding box deltas
    deltas_specific = tf.gather_nd(deltas, indices)
    # Apply bounding box deltas
    # Shape: [boxes, (y1, x1, y2, x2)] in normalized coordinates
    bbox_utils = BboxUtil()
    refined_rois = bbox_utils.apply_box_deltas_graph(
        rois, deltas_specific * cfg.COMMON.BBOX_STD_DEV)
    # Clip boxes to image window
    refined_rois = bbox_utils.clip_boxes_graph(refined_rois, window)

    # TODO: Filter out boxes with zero area

    # Filter out background boxes
    keep = tf.where(class_ids > 0)[:, 0]
    # Filter out low confidence boxes
    detection_min_confidence = cfg.COMMON.DETECTION_MIN_CONFIDENCE
    if detection_min_confidence:
        conf_keep = tf.where(class_scores >= detection_min_confidence)[:, 0]
        keep = tf.sets.set_intersection(tf.expand_dims(keep, 0),
                                        tf.expand_dims(conf_keep, 0))
        keep = tf.sparse_tensor_to_dense(keep)[0]

    # Apply per-class NMS
    # 1. Prepare variables
    pre_nms_class_ids = tf.gather(class_ids, keep)
    pre_nms_scores = tf.gather(class_scores, keep)
    pre_nms_rois = tf.gather(refined_rois, keep)
    unique_pre_nms_class_ids = tf.unique(pre_nms_class_ids)[0]

    def nms_keep_map(class_id):
        """Apply Non-Maximum Suppression on ROIs of the given class."""

        detection_max_instances = cfg.TEST.DETECTION_MAX_INSTANCES
        # Indices of ROIs of the given class
        ixs = tf.where(tf.equal(pre_nms_class_ids, class_id))[:, 0]
        # Apply NMS
        class_keep = tf.image.non_max_suppression(
            tf.gather(pre_nms_rois, ixs),
            tf.gather(pre_nms_scores, ixs),
            max_output_size=detection_max_instances,
            iou_threshold=cfg.TEST.DETECTION_NMS_THRESHOLD)
        # Map indices
        class_keep = tf.gather(keep, tf.gather(ixs, class_keep))
        # Pad with -1 so returned tensors have the same shape
        gap = detection_max_instances - tf.shape(class_keep)[0]
        class_keep = tf.pad(class_keep, [(0, gap)],
                            mode='CONSTANT',
                            constant_values=-1)
        # Set shape so map_fn() can infer result shape
        class_keep.set_shape([detection_max_instances])
        return class_keep

    # 2. Map over class IDs
    nms_keep = tf.map_fn(nms_keep_map,
                         unique_pre_nms_class_ids,
                         dtype=tf.int64)
    # 3. Merge results into one list, and remove -1 padding
    nms_keep = tf.reshape(nms_keep, [-1])
    nms_keep = tf.gather(nms_keep, tf.where(nms_keep > -1)[:, 0])
    # 4. Compute intersection between keep and nms_keep
    keep = tf.sets.set_intersection(tf.expand_dims(keep, 0),
                                    tf.expand_dims(nms_keep, 0))
    keep = tf.sparse_tensor_to_dense(keep)[0]
    # Keep top detections
    roi_count = cfg.TEST.DETECTION_MAX_INSTANCES
    class_scores_keep = tf.gather(class_scores, keep)
    num_keep = tf.minimum(tf.shape(class_scores_keep)[0], roi_count)
    top_ids = tf.nn.top_k(class_scores_keep, k=num_keep, sorted=True)[1]
    keep = tf.gather(keep, top_ids)

    # Arrange output as [N, (y1, x1, y2, x2, class_id, score)]
    # Coordinates are normalized.
    detections = tf.concat([
        tf.gather(refined_rois, keep),
        tf.to_float(tf.gather(class_ids, keep))[..., tf.newaxis],
        tf.gather(class_scores, keep)[..., tf.newaxis]
    ],
                           axis=1)

    # Pad with zeros if detections < DETECTION_MAX_INSTANCES
    gap = cfg.TEST.DETECTION_MAX_INSTANCES - tf.shape(detections)[0]
    detections = tf.pad(detections, [(0, gap), (0, 0)], "CONSTANT")
    return detections
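
refine_detections_graph runs NMS separately per class and pads every per-class result with -1 so tf.map_fn gets tensors of a fixed shape, then merges and strips the padding. A small NumPy sketch of that pad-then-merge bookkeeping, using a trivial stand-in for NMS (the real suppression happens in tf.image.non_max_suppression); the max-instances value is an assumption:

import numpy as np

detection_max_instances = 5                        # assumed cfg.TEST.DETECTION_MAX_INSTANCES
class_ids = np.array([1, 1, 2, 3, 3, 3])           # class of each kept ROI
scores = np.array([0.9, 0.8, 0.95, 0.7, 0.6, 0.5])

def fake_nms(ixs):
    """Stand-in for NMS: keep only the highest-scoring ROI of the class."""
    return ixs[np.argsort(-scores[ixs])][:1]

per_class = []
for class_id in np.unique(class_ids):
    ixs = np.where(class_ids == class_id)[0]
    keep = fake_nms(ixs)
    pad = detection_max_instances - len(keep)      # pad with -1 to a fixed length
    per_class.append(np.pad(keep, (0, pad), mode='constant', constant_values=-1))

merged = np.concatenate(per_class)                 # flatten, like tf.reshape(nms_keep, [-1])
merged = merged[merged > -1]                       # drop the -1 padding
print(merged)                                      # [0 2 3]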
Example #13
def build_rpn_targets(anchors, gt_class_ids, gt_boxes):
    """
        Given the anchors and GT boxes, compute overlaps and identify positive
        anchors and deltas to refine them to match their corresponding GT boxes.
    :param anchors: [num_anchors, (y1, x1, y2, x2)]
    :param gt_class_ids: [num_gt_boxes] Integer class IDs.
    :param gt_boxes: [num_gt_boxes, (y1, x1, y2, x2)]
    :return:
        rpn_match: [N] (int32) matches between anchors and GT boxes.
                   1 = positive anchor, -1 = negative anchor, 0 = neutral
        rpn_bbox: [N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas.
    """
    # RPN Match: 1 = positive anchor, -1 = negative anchor, 0 = neutral
    rpn_match = np.zeros([anchors.shape[0]], dtype=np.int32)
    anchor_per_image = cfg.TRAIN.ANCHORS_PER_IMAGE
    # RPN bounding boxes: [max anchors per image, (dy, dx, log(dh), log(dw))]
    rpn_bbox = np.zeros((anchor_per_image, 4))

    bbox_util = BboxUtil()
    # Handle COCO crowds
    # A crowd box in COCO is a bounding box around several instances. Exclude
    # them from training. A crowd box is given a negative class ID.
    crowd_ix = np.where(gt_class_ids < 0)[0]
    if crowd_ix.shape[0] > 0:
        # Filter out crowds from ground truth class IDs and boxes
        non_crowd_ix = np.where(gt_class_ids > 0)[0]
        crowd_boxes = gt_boxes[crowd_ix]
        gt_class_ids = gt_class_ids[non_crowd_ix]
        gt_boxes = gt_boxes[non_crowd_ix]
        # Compute overlaps with crowd boxes [anchors, crowds]
        crowd_overlaps = bbox_util.compute_overlaps(anchors, crowd_boxes)
        crowd_iou_max = np.amax(crowd_overlaps, axis=1)
        no_crowd_bool = (crowd_iou_max < 0.001)
        pass
    else:
        # No anchors intersect a crowd
        no_crowd_bool = np.ones([anchors.shape[0]], dtype=bool)
        pass

    # Compute overlaps [num_anchors, num_gt_boxes]
    overlaps = bbox_util.compute_overlaps(anchors, gt_boxes)

    # Match anchors to GT Boxes
    # If an anchor overlaps a GT box with IoU >= 0.7 then it's positive.
    # If an anchor overlaps a GT box with IoU < 0.3 then it's negative.
    # Neutral anchors are those that don't match the conditions above,
    # and they don't influence the loss function.
    # However, don't keep any GT box unmatched (rare, but happens). Instead,
    # match it to the closest anchor (even if its max IoU is < 0.3).
    #
    # 1. Set negative anchors first. They get overwritten below if a GT box is
    # matched to them. Skip boxes in crowd areas.
    anchor_iou_argmax = np.argmax(overlaps, axis=1)
    anchor_iou_max = overlaps[np.arange(overlaps.shape[0]), anchor_iou_argmax]
    rpn_match[(anchor_iou_max < 0.3) & (no_crowd_bool)] = -1
    # 2. Set an anchor for each GT box (regardless of IoU value).
    # If multiple anchors have the same IoU match all of them
    gt_iou_argmax = np.argwhere(overlaps == np.max(overlaps, axis=0))[:, 0]
    rpn_match[gt_iou_argmax] = 1
    # 3. Set anchors with high overlap as positive.
    rpn_match[anchor_iou_max >= 0.7] = 1

    # Subsample to balance positive and negative anchors
    # Don't let positives be more than half the anchors
    ids = np.where(rpn_match == 1)[0]
    extra = len(ids) - (anchor_per_image // 2)
    if extra > 0:
        # Reset the extra ones to neutral
        ids = np.random.choice(ids, extra, replace=False)
        rpn_match[ids] = 0
    # Same for negative proposals
    ids = np.where(rpn_match == -1)[0]
    extra = len(ids) - (anchor_per_image - np.sum(rpn_match == 1))
    if extra > 0:
        # Reset the extra ones to neutral
        ids = np.random.choice(ids, extra, replace=False)
        rpn_match[ids] = 0
        pass

    # For positive anchors, compute shift and scale needed to transform them
    # to match the corresponding GT boxes.
    ids = np.where(rpn_match == 1)[0]
    ix = 0  # index into rpn_bbox

    # TODO: use box_refinement() rather than duplicating the code here
    for i, a in zip(ids, anchors[ids]):
        # Closest gt box (it might have IoU < 0.7)
        gt = gt_boxes[anchor_iou_argmax[i]]

        # Convert coordinates to center plus width/height.
        # GT Box
        gt_h = gt[2] - gt[0]
        gt_w = gt[3] - gt[1]
        gt_center_y = gt[0] + 0.5 * gt_h
        gt_center_x = gt[1] + 0.5 * gt_w
        # Anchor
        a_h = a[2] - a[0]
        a_w = a[3] - a[1]
        a_center_y = a[0] + 0.5 * a_h
        a_center_x = a[1] + 0.5 * a_w

        # Compute the bbox refinement that the RPN should predict.
        rpn_bbox[ix] = [
            (gt_center_y - a_center_y) / a_h,
            (gt_center_x - a_center_x) / a_w,
            np.log(gt_h / a_h),
            np.log(gt_w / a_w),
        ]
        # Normalize
        rpn_bbox_std_dev = np.array(cfg.COMMON.RPN_BBOX_STD_DEV)
        rpn_bbox[ix] /= rpn_bbox_std_dev
        ix += 1

    return rpn_match, rpn_bbox
    pass
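
The per-anchor regression target computed in the loop above is the standard (dy, dx, log(dh), log(dw)) parameterization, normalized by RPN_BBOX_STD_DEV. A worked NumPy example for one anchor/GT pair; the std-dev values are the common defaults and are assumed here:

import numpy as np

anchor = np.array([20.0, 30.0, 60.0, 90.0])        # (y1, x1, y2, x2) in pixels
gt = np.array([25.0, 35.0, 70.0, 95.0])

a_h, a_w = anchor[2] - anchor[0], anchor[3] - anchor[1]
a_cy, a_cx = anchor[0] + 0.5 * a_h, anchor[1] + 0.5 * a_w
gt_h, gt_w = gt[2] - gt[0], gt[3] - gt[1]
gt_cy, gt_cx = gt[0] + 0.5 * gt_h, gt[1] + 0.5 * gt_w

delta = np.array([(gt_cy - a_cy) / a_h,            # dy = 0.1875
                  (gt_cx - a_cx) / a_w,            # dx ~= 0.0833
                  np.log(gt_h / a_h),              # log(dh) ~= 0.1178
                  np.log(gt_w / a_w)])             # log(dw) = 0.0

rpn_bbox_std_dev = np.array([0.1, 0.1, 0.2, 0.2])  # assumed cfg.COMMON.RPN_BBOX_STD_DEV
print(delta)                                        # raw target
print(delta / rpn_bbox_std_dev)                    # normalized target stored in rpn_bbox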
Example #14
class MaskRCNN(object):
    def __init__(self, train_flag=True):
        """
        :param train_flag: whether this is training: True for training, False for testing
        """
        self.train_flag = train_flag
        self.bbox_util = BboxUtil()
        self.anchor_utils = AnchorUtils()
        self.image_utils = ImageUtils()
        self.mask_util = MaskUtil()

        # Model path
        self.model_path = cfg.TRAIN.MODEL_PATH if self.train_flag else cfg.TEST.COCO_MODEL_PATH
        # batch size
        self.batch_size = cfg.TRAIN.BATCH_SIZE if self.train_flag else cfg.TEST.BATCH_SIZE
        # Model save path
        self.save_model_path = cfg.TRAIN.SAVE_MODEL_PATH

        self.backbone = cfg.COMMON.BACKBONE
        self.backbone_strides = cfg.COMMON.BACKBONE_STRIDES
        # input image shape
        self.image_shape = np.array(cfg.COMMON.IMAGE_SHAPE)
        # size of the top-down layers used to build the feature pyramid
        self.top_down_pyramid_size = cfg.COMMON.TOP_DOWN_PYRAMID_SIZE

        self.rpn_anchor_stride = cfg.COMMON.RPN_ANCHOR_STRIDE
        self.rpn_anchor_ratios = cfg.COMMON.RPN_ANCHOR_RATIOS
        self.rpn_nms_threshold = cfg.COMMON.RPN_NMS_THRESHOLD

        self.class_num = cfg.COMMON.CLASS_NUM

        self.rois_per_image = cfg.TRAIN.ROIS_PER_IMAGE
        self.roi_positive_ratio = cfg.TRAIN.ROI_POSITIVE_RATIO

        self.keras_model = self.build()

        pass

    def build(self):

        # image shape
        h, w, c = self.image_shape[:]
        print("image_shape: {}".format(self.image_shape))

        if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6):
            raise Exception(
                "Image size must be dividable by 2 at least 6 times "
                "to avoid fractions when downscaling and upscaling."
                "For example, use 256, 320, 384, 448, 512, ... etc. ")

        # Inputs
        input_image = kl.Input(shape=[None, None, c], name="input_image")
        input_image_meta = kl.Input(shape=[cfg.COMMON.IMAGE_META_SIZE],
                                    name="input_image_meta")

        # training branch
        if self.train_flag:

            # RPN GT
            input_rpn_match = kl.Input(shape=[None, 1],
                                       name="input_rpn_match",
                                       dtype=tf.int32)
            input_rpn_bbox = kl.Input(shape=[None, 4],
                                      name="input_rpn_bbox",
                                      dtype=tf.float32)

            # Detection GT (class IDs, bounding boxes, and masks)
            # 1. GT Class IDs (zero padded)
            input_gt_class_ids = kl.Input(shape=[None],
                                          name="input_gt_class_ids",
                                          dtype=tf.int32)

            # 2. GT Boxes in pixels (zero padded)
            # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates
            input_gt_boxes = kl.Input(shape=[None, 4],
                                      name="input_gt_boxes",
                                      dtype=tf.float32)

            # Normalize coordinates
            gt_boxes = kl.Lambda(lambda x: self.bbox_util.norm_boxes_graph(
                x,
                k.shape(input_image)[1:3]))(input_gt_boxes)

            # 3. GT Masks (zero padded)
            # [batch, height, width, MAX_GT_INSTANCES]
            if cfg.TRAIN.USE_MINI_MASK:
                min_h, min_w = cfg.TRAIN.MINI_MASK_SHAPE[:]
                input_gt_masks = kl.Input(shape=[min_h, min_w, None],
                                          name="input_gt_masks",
                                          dtype=bool)
            else:
                input_gt_masks = kl.Input(shape=[h, w, None],
                                          name="input_gt_masks",
                                          dtype=bool)
                pass

            # anchor
            anchors = self.anchor_utils.get_anchors(self.image_shape)

            # Duplicate across the batch dimension because Keras requires it
            # TODO: can this be optimized to avoid duplicating the anchors?
            anchors = np.broadcast_to(anchors,
                                      (self.batch_size, ) + anchors.shape)
            # A hack to get around Keras's bad support for constants
            anchors = kl.Lambda(lambda x: tf.Variable(anchors),
                                name="anchors")(input_image)

            pass

        else:
            # Anchors in normalized coordinates
            anchors = kl.Input(shape=[None, 4], name="input_anchors")

            # The inputs above are only needed for training; define them here so
            # they are not undefined in the test branch
            input_rpn_match = None
            input_rpn_bbox = None
            input_gt_class_ids = None
            gt_boxes = None
            input_gt_boxes = None
            input_gt_masks = None
            pass

        # Build the shared convolutional layers.
        # Bottom-up Layers
        # Returns a list of the last layers of each stage, 5 in total.
        # stage5=True, so C5 is included and feeds the top of the FPN below.
        _, c2, c3, c4, c5 = backbone.resnet_graph(input_image,
                                                  self.backbone,
                                                  stage5=True)

        # Top-down Layers
        # TODO: add assert to verify feature map sizes match what's in config
        p5 = kl.Conv2D(self.top_down_pyramid_size, (1, 1), name='fpn_c5p5')(c5)
        p4 = kl.Add(name="fpn_p4add")([
            kl.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(p5),
            kl.Conv2D(self.top_down_pyramid_size, (1, 1), name='fpn_c4p4')(c4)
        ])
        p3 = kl.Add(name="fpn_p3add")([
            kl.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(p4),
            kl.Conv2D(self.top_down_pyramid_size, (1, 1), name='fpn_c3p3')(c3)
        ])
        p2 = kl.Add(name="fpn_p2add")([
            kl.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(p3),
            kl.Conv2D(self.top_down_pyramid_size, (1, 1), name='fpn_c2p2')(c2)
        ])

        # Attach 3x3 conv to all P layers to get the final feature maps.
        p2 = kl.Conv2D(self.top_down_pyramid_size, (3, 3),
                       padding="SAME",
                       name="fpn_p2")(p2)
        p3 = kl.Conv2D(self.top_down_pyramid_size, (3, 3),
                       padding="SAME",
                       name="fpn_p3")(p3)
        p4 = kl.Conv2D(self.top_down_pyramid_size, (3, 3),
                       padding="SAME",
                       name="fpn_p4")(p4)
        p5 = kl.Conv2D(self.top_down_pyramid_size, (3, 3),
                       padding="SAME",
                       name="fpn_p5")(p5)
        # P6 is used for the 5th anchor scale in RPN. Generated by
        # subsampling from P5 with stride of 2.
        p6 = kl.MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(p5)

        # Note that P6 is used in RPN, but not in the classifier heads.
        rpn_feature_maps = [p2, p3, p4, p5, p6]
        mrcnn_feature_maps = [p2, p3, p4, p5]
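        # As a rough guide (assuming the usual BACKBONE_STRIDES of [4, 8, 16, 32, 64]),
        # a 512x512 input yields P2-P6 feature maps of about 128, 64, 32, 16 and 8 pixels
        # per side; the RPN scans all five levels, the ROI heads pool from P2-P5 only.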

        # RPN Model
        rpn = common.build_rpn_model(self.rpn_anchor_stride,
                                     len(self.rpn_anchor_ratios),
                                     self.top_down_pyramid_size)

        # Loop through pyramid layers
        layer_outputs = []  # list of lists
        for p in rpn_feature_maps:
            layer_outputs.append(rpn([p]))
            pass

        # Concatenate layer outputs
        # Convert from list of lists of level outputs to list of lists
        # of outputs across levels.
        # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
        output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"]
        outputs = list(zip(*layer_outputs))
        outputs = [
            kl.Concatenate(axis=1, name=n)(list(o))
            for o, n in zip(outputs, output_names)
        ]

        rpn_class_logits, rpn_class, rpn_bbox = outputs
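        # After concatenation the anchor axis covers every location of every pyramid
        # level: roughly sum(H_l * W_l) * len(rpn_anchor_ratios) entries, e.g. about
        # 65,472 anchors for a 512x512 input with 3 ratios (a back-of-the-envelope
        # figure assuming the usual strides, not a value read from the config).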

        # Generate proposals
        # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
        # and zero padded.
        proposal_count = cfg.TRAIN.POST_NMS_ROIS if self.train_flag else cfg.TEST.POST_NMS_ROIS

        rpn_rois = common.ProposalLayer(
            proposal_count=proposal_count,
            nms_threshold=self.rpn_nms_threshold,
            batch_size=self.batch_size,
            name="ROI")([rpn_class, rpn_bbox, anchors])

        fc_layer_size = cfg.COMMON.FPN_CLASS_FC_LAYERS_SIZE
        pool_size = cfg.COMMON.POOL_SIZE
        mask_pool_size = cfg.COMMON.MASK_POOL_SIZE
        train_or_freeze = cfg.COMMON.TRAIN_FLAG

        if self.train_flag:

            # Class ID mask to mark class IDs supported by the dataset the image
            # came from.
            active_class_ids = kl.Lambda(
                lambda x: self.image_utils.parse_image_meta_graph(x)[
                    "active_class_ids"])(input_image_meta)

            if not cfg.TRAIN.USE_RPN_ROIS:
                # Ignore predicted ROIs and use ROIs provided as an input.
                input_rois = kl.Input(shape=[proposal_count, 4],
                                      name="input_roi",
                                      dtype=np.int32)
                # Normalize coordinates
                target_rois = kl.Lambda(
                    lambda x: self.bbox_util.norm_boxes_graph(
                        x,
                        k.shape(input_image)[1:3]))(input_rois)
            else:
                target_rois = rpn_rois
                input_rois = None

            # Generate detection targets
            # Subsamples proposals and generates target outputs for training
            # Note that proposal class IDs, gt_boxes, and gt_masks are zero
            # padded. Equally, returned rois and targets are zero padded.
            rois, target_class_ids, target_bbox, target_mask = \
                common.DetectionTargetLayer(self.batch_size, name="proposal_targets")([
                    target_rois, input_gt_class_ids, gt_boxes, input_gt_masks])

            # Network Heads
            # TODO: verify that this handles zero padded ROIs
            mrcnn_class_logits, mrcnn_class, mrcnn_bbox = common.fpn_classifier_graph(
                rois,
                mrcnn_feature_maps,
                input_image_meta,
                pool_size,
                self.class_num,
                train_flag=train_or_freeze,
                fc_layers_size=fc_layer_size)

            mrcnn_mask = common.build_fpn_mask_graph(
                rois,
                mrcnn_feature_maps,
                input_image_meta,
                mask_pool_size,
                self.class_num,
                train_flag=train_or_freeze)

            # TODO: clean up (use tf.identity if necessary)
            output_rois = kl.Lambda(lambda x: x * 1, name="output_rois")(rois)

            # Losses
            rpn_class_loss = kl.Lambda(
                lambda x: common.rpn_class_loss_graph(*x),
                name="rpn_class_loss")([input_rpn_match, rpn_class_logits])
            rpn_bbox_loss = kl.Lambda(
                lambda x: common.rpn_bbox_loss_graph(self.batch_size, *x),
                name="rpn_bbox_loss")(
                    [input_rpn_bbox, input_rpn_match, rpn_bbox])
            class_loss = kl.Lambda(lambda x: common.mrcnn_class_loss_graph(*x),
                                   name="mrcnn_class_loss")([
                                       target_class_ids, mrcnn_class_logits,
                                       active_class_ids
                                   ])
            bbox_loss = kl.Lambda(lambda x: common.mrcnn_bbox_loss_graph(*x),
                                  name="mrcnn_bbox_loss")([
                                      target_bbox, target_class_ids, mrcnn_bbox
                                  ])
            mask_loss = kl.Lambda(lambda x: common.mrcnn_mask_loss_graph(*x),
                                  name="mrcnn_mask_loss")([
                                      target_mask, target_class_ids, mrcnn_mask
                                  ])
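            # These loss tensors are exposed as model outputs; the training code is
            # expected to register them with keras_model.add_loss() (or an equivalent
            # mechanism) before compiling, since they have no y_true/y_pred pair.
            # (This describes the surrounding training script, which is not shown here.)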

            # Model
            inputs = [
                input_image, input_image_meta, input_rpn_match, input_rpn_bbox,
                input_gt_class_ids, input_gt_boxes, input_gt_masks
            ]

            if not cfg.TRAIN.USE_RPN_ROIS:
                inputs.append(input_rois)

            outputs = [
                rpn_class_logits, rpn_class, rpn_bbox, mrcnn_class_logits,
                mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois, output_rois,
                rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss
            ]
            model = km.Model(inputs, outputs, name='mask_rcnn')
            pass
        else:
            # Network Heads
            # Proposal classifier and BBox regressor heads
            mrcnn_class_logits, mrcnn_class, mrcnn_bbox = common.fpn_classifier_graph(
                rpn_rois,
                mrcnn_feature_maps,
                input_image_meta,
                pool_size,
                self.class_num,
                train_flag=train_or_freeze,
                fc_layers_size=fc_layer_size)

            # Detections
            # output is [batch, num_detections, (y1, x1, y2, x2, class_id, score)] in
            # normalized coordinates
            detections = common.DetectionLayer(self.batch_size,
                                               name="mrcnn_detection")([
                                                   rpn_rois, mrcnn_class,
                                                   mrcnn_bbox, input_image_meta
                                               ])

            # Create masks for detections
            detection_boxes = kl.Lambda(lambda x: x[..., :4])(detections)
            mrcnn_mask = common.build_fpn_mask_graph(
                detection_boxes,
                mrcnn_feature_maps,
                input_image_meta,
                mask_pool_size,
                self.class_num,
                train_flag=train_or_freeze)

            model = km.Model([input_image, input_image_meta, anchors], [
                detections, mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois,
                rpn_class, rpn_bbox
            ],
                             name='mask_rcnn')
            pass

        # Add multi-GPU support.
        gpu_count = cfg.COMMON.GPU_COUNT
        if gpu_count > 1:
            from m_rcnn.parallel_model import ParallelModel
            model = ParallelModel(model, gpu_count)

        return model
        pass

    def load_weights(self, model_path, by_name=False, exclude=None):
        """
            Modified version of the corresponding Keras function
            with the addition of multi-GPU support and the ability
            to exclude some layers from loading.
        :param model_path:
        :param by_name:
        :param exclude: list of layer names to exclude
        :return:
        """

        if exclude:
            by_name = True
            pass

        if h5py is None:
            raise ImportError('`load_weights` requires h5py.')
            pass

        model_file = h5py.File(model_path, mode='r')

        if 'layer_names' not in model_file.attrs and 'model_weights' in model_file:
            model_file = model_file['model_weights']

        # In multi-GPU training, we wrap the model. Get layers
        # of the inner model because they have the weights.
        keras_model = self.keras_model

        layers = keras_model.inner_model.layers if hasattr(
            keras_model, "inner_model") else keras_model.layers
        print("layers: {}".format(layers))

        # Exclude some layers
        if exclude:
            layers = filter(lambda l: l.name not in exclude, layers)

        if by_name:
            saving.load_weights_from_hdf5_group_by_name(model_file, layers)
        else:
            saving.load_weights_from_hdf5_group(model_file, layers)
        if hasattr(model_file, 'close'):
            model_file.close()
        pass

    def generate_random_rois(self, image_shape, count, gt_boxes):
        """
            Generates ROI proposals similar to what a region proposal network
            would generate.
        :param image_shape: [Height, Width, Depth]
        :param count: Number of ROIs to generate
        :param gt_boxes: [N, (y1, x1, y2, x2)] Ground truth boxes in pixels.
        :return:
        """
        # placeholder
        rois = np.zeros((count, 4), dtype=np.int32)

        # Generate random ROIs around GT boxes (90% of count)
        rois_per_box = int(0.9 * count / gt_boxes.shape[0])
        for i in range(gt_boxes.shape[0]):
            gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[i]
            h = gt_y2 - gt_y1
            w = gt_x2 - gt_x1
            # random boundaries
            r_y1 = max(gt_y1 - h, 0)
            r_y2 = min(gt_y2 + h, image_shape[0])
            r_x1 = max(gt_x1 - w, 0)
            r_x2 = min(gt_x2 + w, image_shape[1])

            # To avoid generating boxes with zero area, we generate double what
            # we need and filter out the extra. If we get fewer valid boxes
            # than we need, we loop and try again.
            while True:
                y1y2 = np.random.randint(r_y1, r_y2, (rois_per_box * 2, 2))
                x1x2 = np.random.randint(r_x1, r_x2, (rois_per_box * 2, 2))
                # Filter out zero area boxes
                threshold = 1
                y1y2 = y1y2[np.abs(y1y2[:, 0] -
                                   y1y2[:, 1]) >= threshold][:rois_per_box]
                x1x2 = x1x2[np.abs(x1x2[:, 0] -
                                   x1x2[:, 1]) >= threshold][:rois_per_box]
                if y1y2.shape[0] == rois_per_box and x1x2.shape[
                        0] == rois_per_box:
                    break

            # Sort on axis 1 to ensure x1 <= x2 and y1 <= y2, then stack
            # into (y1, x1, y2, x2) order
            x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)
            y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)
            box_rois = np.hstack([y1, x1, y2, x2])
            rois[rois_per_box * i:rois_per_box * (i + 1)] = box_rois

        # Generate random ROIs anywhere in the image (10% of count)
        remaining_count = count - (rois_per_box * gt_boxes.shape[0])
        # To avoid generating boxes with zero area, we generate double what
        # we need and filter out the extra. If we get fewer valid boxes
        # than we need, we loop and try again.
        while True:
            y1y2 = np.random.randint(0, image_shape[0],
                                     (remaining_count * 2, 2))
            x1x2 = np.random.randint(0, image_shape[1],
                                     (remaining_count * 2, 2))
            # Filter out zero area boxes
            threshold = 1
            y1y2 = y1y2[np.abs(y1y2[:, 0] -
                               y1y2[:, 1]) >= threshold][:remaining_count]
            x1x2 = x1x2[np.abs(x1x2[:, 0] -
                               x1x2[:, 1]) >= threshold][:remaining_count]
            if y1y2.shape[0] == remaining_count and x1x2.shape[
                    0] == remaining_count:
                break

        # Sort on axis 1 to ensure x1 <= x2 and y1 <= y2, then stack
        # into (y1, x1, y2, x2) order
        x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)
        y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)
        global_rois = np.hstack([y1, x1, y2, x2])
        rois[-remaining_count:] = global_rois

        return rois
        pass
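    # Minimal usage sketch (the shapes and numbers below are made up): with two GT
    # boxes and count=200, 90 ROIs are sampled around each GT box and the remaining
    # 20 anywhere in the image.
    #
    #   gt = np.array([[20, 30, 120, 140], [200, 210, 300, 330]], dtype=np.int32)
    #   rois = model.generate_random_rois((512, 512, 3), 200, gt)
    #   # rois.shape == (200, 4), boxes in (y1, x1, y2, x2) pixel coordinates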

    def build_detection_targets(self, rpn_rois, gt_class_ids, gt_boxes,
                                gt_masks):
        """
            Generate targets for training Stage 2 classifier and mask heads.
            This is not used in normal training. It's useful for debugging or to train
            the Mask RCNN heads without using the RPN head.
        :param rpn_rois: [N, (y1, x1, y2, x2)] proposal boxes.
        :param gt_class_ids: [instance count] Integer class IDs
        :param gt_boxes: [instance count, (y1, x1, y2, x2)]
        :param gt_masks: [height, width, instance count] Ground truth masks. Can be full
                        size or mini-masks.
        :return:
            rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)]
            class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs.
            bboxes: [TRAIN_ROIS_PER_IMAGE, NUM_CLASSES, (y, x, log(h), log(w))]. Class-specific
                    bbox refinements.
            masks: [TRAIN_ROIS_PER_IMAGE, height, width, NUM_CLASSES]. Class specific masks cropped
                   to bbox boundaries and resized to neural network output size.
        """
        assert rpn_rois.shape[0] > 0
        assert gt_class_ids.dtype == np.int32, "Expected int but got {}".format(
            gt_class_ids.dtype)
        assert gt_boxes.dtype == np.int32, "Expected int but got {}".format(
            gt_boxes.dtype)
        assert gt_masks.dtype == np.bool_, "Expected bool but got {}".format(
            gt_masks.dtype)

        # It's common to add GT Boxes to ROIs but we don't do that here because
        # according to XinLei Chen's paper, it doesn't help.

        # Trim empty padding in gt_boxes and gt_masks parts
        instance_ids = np.where(gt_class_ids > 0)[0]
        assert instance_ids.shape[0] > 0, "Image must contain instances."
        gt_class_ids = gt_class_ids[instance_ids]
        gt_boxes = gt_boxes[instance_ids]
        gt_masks = gt_masks[:, :, instance_ids]

        # Compute areas of ROIs and ground truth boxes.
        # rpn_roi_area = (rpn_rois[:, 2] - rpn_rois[:, 0]) * (rpn_rois[:, 3] - rpn_rois[:, 1])
        # gt_box_area = (gt_boxes[:, 2] - gt_boxes[:, 0]) * (gt_boxes[:, 3] - gt_boxes[:, 1])

        # Compute overlaps [rpn_rois, gt_boxes]
        overlaps = np.zeros((rpn_rois.shape[0], gt_boxes.shape[0]))
        for i in range(overlaps.shape[1]):
            gt = gt_boxes[i]
            overlaps[:, i] = self.bbox_util.compute_iou(gt, rpn_rois)
            pass

        # Assign ROIs to GT boxes
        rpn_roi_iou_argmax = np.argmax(overlaps, axis=1)
        rpn_roi_iou_max = overlaps[np.arange(overlaps.shape[0]),
                                   rpn_roi_iou_argmax]

        # GT box assigned to each ROI
        rpn_roi_gt_boxes = gt_boxes[rpn_roi_iou_argmax]
        rpn_roi_gt_class_ids = gt_class_ids[rpn_roi_iou_argmax]

        # Positive ROIs are those with IoU > 0.5 with a GT box.
        fg_ids = np.where(rpn_roi_iou_max > 0.5)[0]

        # Negative ROIs are those with max IoU 0.1-0.5 (hard example mining)
        # TODO: To hard example mine or not to hard example mine, that's the question
        # bg_ids = np.where((rpn_roi_iou_max >= 0.1) & (rpn_roi_iou_max < 0.5))[0]
        bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]

        # Subsample ROIs. Aim for 33% foreground.
        # FG
        fg_roi_count = int(self.rois_per_image * self.roi_positive_ratio)
        if fg_ids.shape[0] > fg_roi_count:
            keep_fg_ids = np.random.choice(fg_ids, fg_roi_count, replace=False)
        else:
            keep_fg_ids = fg_ids
        # BG
        remaining = self.rois_per_image - keep_fg_ids.shape[0]
        if bg_ids.shape[0] > remaining:
            keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
        else:
            keep_bg_ids = bg_ids
        # Combine indices of ROIs to keep
        keep = np.concatenate([keep_fg_ids, keep_bg_ids])
        # Need more?
        remaining = self.rois_per_image - keep.shape[0]

        if remaining > 0:
            # Looks like we don't have enough samples to maintain the desired
            # balance. Reduce requirements and fill in the rest. This is
            # likely different from the Mask RCNN paper.

            # There is a small chance we have neither fg nor bg samples.
            if keep.shape[0] == 0:
                # Pick bg regions with easier IoU threshold
                bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]
                assert bg_ids.shape[0] >= remaining
                keep_bg_ids = np.random.choice(bg_ids,
                                               remaining,
                                               replace=False)
                assert keep_bg_ids.shape[0] == remaining
                keep = np.concatenate([keep, keep_bg_ids])
            else:
                # Fill the rest with repeated bg rois.
                keep_extra_ids = np.random.choice(keep_bg_ids,
                                                  remaining,
                                                  replace=True)
                keep = np.concatenate([keep, keep_extra_ids])
        assert keep.shape[0] == self.rois_per_image, \
            "keep doesn't match ROI batch size {}, {}".format(keep.shape[0], self.rois_per_image)

        # Reset the gt boxes assigned to BG ROIs.
        rpn_roi_gt_boxes[keep_bg_ids, :] = 0
        rpn_roi_gt_class_ids[keep_bg_ids] = 0

        # For each kept ROI, assign a class_id, and for FG ROIs also add bbox refinement.
        rois = rpn_rois[keep]
        roi_gt_boxes = rpn_roi_gt_boxes[keep]
        roi_gt_class_ids = rpn_roi_gt_class_ids[keep]
        roi_gt_assignment = rpn_roi_iou_argmax[keep]

        # Class-aware bbox deltas. [y, x, log(h), log(w)]
        bboxes = np.zeros((self.rois_per_image, self.class_num, 4),
                          dtype=np.float32)
        pos_ids = np.where(roi_gt_class_ids > 0)[0]
        bboxes[pos_ids,
               roi_gt_class_ids[pos_ids]] = self.bbox_util.box_refinement(
                   rois[pos_ids], roi_gt_boxes[pos_ids, :4])
        # Normalize bbox refinements
        bbox_std_dev = np.array(cfg.COMMON.BBOX_STD_DEV)
        bboxes /= bbox_std_dev

        # Generate class-specific target masks
        masks = np.zeros((self.rois_per_image, self.image_shape[0],
                          self.image_shape[1], self.class_num),
                         dtype=np.float32)

        for i in pos_ids:
            class_id = roi_gt_class_ids[i]
            assert class_id > 0, "class id must be greater than 0"
            gt_id = roi_gt_assignment[i]
            class_mask = gt_masks[:, :, gt_id]

            if cfg.TRAIN.USE_MINI_MASK:
                # Create a mask placeholder, the size of the image
                placeholder = np.zeros(self.image_shape[:2], dtype=bool)
                # GT box
                gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[gt_id]
                gt_w = gt_x2 - gt_x1
                gt_h = gt_y2 - gt_y1
                # Resize mini mask to size of GT box
                placeholder[gt_y1:gt_y2, gt_x1:gt_x2] = \
                    np.round(self.image_utils.resize(class_mask, (gt_h, gt_w))).astype(bool)
                # Place the mini mask in the placeholder
                class_mask = placeholder

            # Pick part of the mask and resize it
            y1, x1, y2, x2 = rois[i].astype(np.int32)
            m = class_mask[y1:y2, x1:x2]
            mask = self.image_utils.resize(m, self.image_shape[:2])
            masks[i, :, :, class_id] = mask

        return rois, roi_gt_class_ids, bboxes, masks
        pass

    # #############################################################################################
    # test
    # #############################################################################################

    def detect(self, images_info_list, verbose=0):
        """
            Runs the detection pipeline.
        :param images_info_list: List of images, potentially of different sizes.
        :param verbose:
        :return: a list of dicts, one dict per image. The dict contains:
            rois: [N, (y1, x1, y2, x2)] detection bounding boxes
            class_ids: [N] int class IDs
            scores: [N] float probability scores for the class IDs
            masks: [H, W, N] instance binary masks
        """
        if verbose:
            print("processing {} image_info.".format(len(images_info_list)))
            for image_info in images_info_list:
                print("image_info: {}".format(image_info))
                pass
            pass

        # Mold inputs to format expected by the neural network
        molded_images_list, image_metas_list, windows_list = self.image_utils.mode_input(
            images_info_list)

        # Validate image sizes
        # All images in a batch MUST be of the same size
        image_shape = molded_images_list[0].shape
        for g in molded_images_list[1:]:
            assert g.shape == image_shape, \
                "After resizing, all images must have the same size. Check IMAGE_RESIZE_MODE and image sizes."
            pass

        # Anchors
        anchors = self.anchor_utils.get_anchors(image_shape)
        # Duplicate across the batch dimension because Keras requires it
        # TODO: can this be optimized to avoid duplicating the anchors?
        anchors = np.broadcast_to(anchors,
                                  (cfg.TEST.BATCH_SIZE, ) + anchors.shape)

        if verbose:
            print("molded_images_list: ", molded_images_list)
            print("image_metas_list: ", image_metas_list)
            print("anchors: ", anchors)
            pass

        # Run object detection
        detections, _, _, mrcnn_mask, _, _, _ = \
            self.keras_model.predict([molded_images_list, image_metas_list, anchors], verbose=0)
        # Process detections
        results_list = []
        for i, image_info in enumerate(images_info_list):
            molded_image_shape = molded_images_list[i].shape
            final_rois, final_class_ids, final_scores, final_masks = self.un_mold_detections(
                detections[i], mrcnn_mask[i], image_info.shape,
                molded_image_shape, windows_list[i])
            results_list.append({
                "rois": final_rois,
                "class_ids": final_class_ids,
                "scores": final_scores,
                "masks": final_masks,
            })
        return results_list
        pass
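    # Inference usage sketch (the image path and the skimage call are assumptions,
    # not part of the original example; trained weights should be loaded beforehand):
    #
    #   import skimage.io
    #   model = MaskRCNN(train_flag=False)
    #   image = skimage.io.imread("some_image.jpg")   # any HxWx3 RGB array
    #   result = model.detect([image], verbose=1)[0]
    #   # result["rois"], result["class_ids"], result["scores"], result["masks"]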

    def un_mold_detections(self, detections, mrcnn_mask, original_image_shape,
                           image_shape, window):
        """
            Reformats the detections of one image from the format of the neural
            network output to a format suitable for use in the rest of the
            application.
        :param detections: [N, (y1, x1, y2, x2, class_id, score)] in normalized coordinates
        :param mrcnn_mask: [N, height, width, num_classes]
        :param original_image_shape: [H, W, C] Original image shape before resizing
        :param image_shape: [H, W, C] Shape of the image after resizing and padding
        :param window: [y1, x1, y2, x2] Pixel coordinates of box in the image where the real
                        image is excluding the padding.
        :return:
            boxes: [N, (y1, x1, y2, x2)] Bounding boxes in pixels
            class_ids: [N] Integer class IDs for each bounding box
            scores: [N] Float probability scores of the class_id
            masks: [height, width, num_instances] Instance masks
        """
        # How many detections do we have?
        # Detections array is padded with zeros. Find the first class_id == 0.
        zero_ix = np.where(detections[:, 4] == 0)[0]
        n = zero_ix[0] if zero_ix.shape[0] > 0 else detections.shape[0]

        # Extract boxes, class_ids, scores, and class-specific masks
        boxes = detections[:n, :4]
        class_ids = detections[:n, 4].astype(np.int32)
        scores = detections[:n, 5]
        masks = mrcnn_mask[np.arange(n), :, :, class_ids]

        # Translate normalized coordinates in the resized image to pixel
        # coordinates in the original image before resizing
        window = self.bbox_util.norm_boxes(window, image_shape[:2])
        wy1, wx1, wy2, wx2 = window
        shift = np.array([wy1, wx1, wy1, wx1])
        wh = wy2 - wy1  # window height
        ww = wx2 - wx1  # window width
        scale = np.array([wh, ww, wh, ww])
        # Convert boxes to normalized coordinates on the window
        boxes = np.divide(boxes - shift, scale)
        # Convert boxes to pixel coordinates on the original image
        boxes = self.bbox_util.denorm_boxes(boxes, original_image_shape[:2])

        # Filter out detections with zero area. Happens in early training when
        # network weights are still random
        exclude_ix = np.where((boxes[:, 2] - boxes[:, 0]) *
                              (boxes[:, 3] - boxes[:, 1]) <= 0)[0]
        if exclude_ix.shape[0] > 0:
            boxes = np.delete(boxes, exclude_ix, axis=0)
            class_ids = np.delete(class_ids, exclude_ix, axis=0)
            scores = np.delete(scores, exclude_ix, axis=0)
            masks = np.delete(masks, exclude_ix, axis=0)
            n = class_ids.shape[0]

        # Resize masks to original image size and set boundary threshold.
        full_masks = []
        for i in range(n):
            # Convert neural network mask to full size mask
            full_mask = self.mask_util.unmold_mask(masks[i], boxes[i],
                                                   original_image_shape)
            full_masks.append(full_mask)
            pass
        full_masks = np.stack(
            full_masks,
            axis=-1) if full_masks else np.empty(original_image_shape[:2] +
                                                 (0, ))

        return boxes, class_ids, scores, full_masks
        pass
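
# A minimal standalone sketch (not part of the original class) of the coordinate
# translation done in un_mold_detections: a detection box in normalized coordinates
# of the resized/padded image is mapped back to pixels in the original image. The
# window, box and image size are made-up values, and the final denormalization is
# simplified here (the class delegates it to bbox_util.denorm_boxes).
import numpy as np

def unmold_box_example():
    box = np.array([0.25, 0.25, 0.75, 0.75])     # (y1, x1, y2, x2), normalized
    window = np.array([0.125, 0.0, 0.875, 1.0])  # real image area inside the padding
    original_h, original_w = 480, 640

    wy1, wx1, wy2, wx2 = window
    shift = np.array([wy1, wx1, wy1, wx1])
    scale = np.array([wy2 - wy1, wx2 - wx1, wy2 - wy1, wx2 - wx1])

    # Express the box relative to the window, then scale to original-image pixels
    box = (box - shift) / scale
    box = box * np.array([original_h - 1, original_w - 1,
                          original_h - 1, original_w - 1])
    return np.around(box).astype(np.int32)  # ~[80, 160, 399, 479]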