def resize_and_crop_boxes(boxes, image_scale, output_size, offset):
  """Resizes boxes to output size with scale and offset.

  Args:
    boxes: `Tensor` of shape [N, 4] representing ground truth boxes.
    image_scale: 2D float `Tensor` representing scale factors that apply to
      [height, width] of input image.
    output_size: 2D `Tensor` or `int` representing [height, width] of target
      output image size.
    offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
      boxes.

  Returns:
    boxes: `Tensor` of shape [N, 4] representing the scaled boxes.
  """
  with tf.name_scope('resize_and_crop_boxes'):
    # Adjusts box coordinates based on image_scale and offset.
    boxes *= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
    boxes -= tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
    # Clips the boxes.
    boxes = box_ops.clip_boxes(boxes, output_size)
    return boxes
def resize_and_crop_boxes(boxes, image_scale, output_size, offset,
                          box_history):
  """Resizes and crops the boxes.

  Args:
    boxes: A `Tensor` for the boxes.
    image_scale: A `Tensor` for the scaling factor of the image.
    output_size: A `list` of two integers, a two-element vector or a tensor
      such that all but the last dimensions are `broadcastable` to `boxes`.
      The last dimension is 2, which represents [height, width].
    offset: A `Tensor` for how much translation was applied to the image.
    box_history: A `Tensor` for the box history: boxes that undergo the same
      augmentations as `boxes` but are never clipped, which lets us keep track
      of how much the boxes have been altered.

  Returns:
    clipped_boxes: A `Tensor` representing the augmented boxes.
    box_history: A `Tensor` representing the augmented box_history.
  """
  # Shift and scale the input boxes.
  boxes *= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
  boxes -= tf.tile(tf.expand_dims(offset, axis=0), [1, 2])

  # Apply the same shift and scale to the box history, without clipping.
  box_history *= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
  box_history -= tf.tile(tf.expand_dims(offset, axis=0), [1, 2])

  # Clip the shifted and scaled boxes.
  clipped_boxes = bbox_ops.clip_boxes(boxes, output_size)
  return clipped_boxes, box_history
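# Hedged usage sketch (not from the source): exercising the five-argument
# resize_and_crop_boxes above on toy data. Assumes TF2 eager mode and that
# `bbox_ops.clip_boxes` accepts a [height, width] list; the numbers are
# arbitrary.
import tensorflow as tf

boxes = tf.constant([[10.0, 20.0, 110.0, 220.0]])  # [N, 4] in absolute pixels
box_history = tf.identity(boxes)                   # unclipped running copy
image_scale = tf.constant([0.5, 0.5])              # [height, width] scale
offset = tf.constant([5.0, 5.0])                   # crop top-left [y0, x0]
output_size = [128, 128]

clipped, history = resize_and_crop_boxes(
    boxes, image_scale, output_size, offset, box_history)
# clipped == history == [[0., 5., 50., 105.]]: scaled by 0.5, shifted by -5,
# and (here trivially) clipped to the 128x128 output canvas.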
def _mosaic_crop_image(self, image, boxes, classes, is_crowd, area):
  """Processes a patched image in preparation for final output."""
  if self._mosaic_crop_mode != 'crop':
    shape = tf.cast(preprocessing_ops.get_image_shape(image), tf.float32)
    center = shape * self._mosaic_center

    # Shift the center of the image by applying a translation to the whole
    # image.
    ch = tf.math.round(
        preprocessing_ops.random_uniform_strong(
            -center[0], center[0], seed=self._seed))
    cw = tf.math.round(
        preprocessing_ops.random_uniform_strong(
            -center[1], center[1], seed=self._seed))

    # Translate the image, shift the boxes accordingly, and clip the boxes to
    # those within the image.
    image = tfa.image.translate(image, [cw, ch], fill_value=self._pad_value)
    boxes = box_ops.denormalize_boxes(boxes, shape[:2])
    boxes = boxes + tf.cast([ch, cw, ch, cw], boxes.dtype)
    boxes = box_ops.clip_boxes(boxes, shape[:2])
    inds = box_ops.get_non_empty_box_indices(boxes)

    boxes = box_ops.normalize_boxes(boxes, shape[:2])
    boxes, classes, is_crowd, area = self._select_ind(
        inds, boxes, classes,  # pylint:disable=unbalanced-tuple-unpacking
        is_crowd, area)

  # Warp and scale the fully stitched sample.
  image, _, affine = preprocessing_ops.affine_warp_image(
      image, [self._output_size[0], self._output_size[1]],
      scale_min=self._aug_scale_min,
      scale_max=self._aug_scale_max,
      translate=self._aug_rand_translate,
      degrees=self._aug_rand_angle,
      perspective=self._aug_rand_perspective,
      random_pad=self._random_pad,
      seed=self._seed)
  height, width = self._output_size[0], self._output_size[1]
  image = tf.image.resize(image, (height, width))

  # Clip and clean the boxes.
  boxes, inds = preprocessing_ops.transform_and_clip_boxes(
      boxes,
      None,
      affine=affine,
      area_thresh=self._area_thresh,
      seed=self._seed)
  classes, is_crowd, area = self._select_ind(  # pylint:disable=unbalanced-tuple-unpacking
      inds, classes, is_crowd, area)
  return image, boxes, classes, is_crowd, area
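# Hedged sketch (not from the source): the translate-then-clip step above on
# a single denormalized box. Translating the image by [cw, ch] moves a box by
# the same amount in (x, y); clipping then drops any part outside the canvas.
import tensorflow as tf

shape = tf.constant([100.0, 100.0])            # [height, width]
box = tf.constant([[90.0, 90.0, 120.0, 120.0]])
ch, cw = -10.0, -10.0                          # shift up and left
box = box + tf.cast([ch, cw, ch, cw], box.dtype)
box = tf.clip_by_value(box, 0.0, 100.0)        # stand-in for box_ops.clip_boxes
# box == [[80., 80., 100., 100.]]: the shifted box is cut at the square
# canvas boundary.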
def _reorg_boxes(self, boxes, info, num_detections):
  """Scales and cleans the boxes prior to evaluation."""
  mask = tf.sequence_mask(num_detections, maxlen=tf.shape(boxes)[1])
  mask = tf.cast(tf.expand_dims(mask, axis=-1), boxes.dtype)

  # Denormalize the boxes by the shape of the image.
  inshape = tf.expand_dims(info[:, 1, :], axis=1)
  ogshape = tf.expand_dims(info[:, 0, :], axis=1)
  scale = tf.expand_dims(info[:, 2, :], axis=1)
  offset = tf.expand_dims(info[:, 3, :], axis=1)

  boxes = box_ops.denormalize_boxes(boxes, inshape)
  boxes = box_ops.clip_boxes(boxes, inshape)
  # Undo the preprocessing transform: add the crop offset back, divide by the
  # scale, and clip to the original image bounds.
  boxes += tf.tile(offset, [1, 1, 2])
  boxes /= tf.tile(scale, [1, 1, 2])
  boxes = box_ops.clip_boxes(boxes, ogshape)

  # Mask the boxes for usage: valid boxes keep their coordinates, padded
  # entries become -1.
  boxes *= mask
  boxes += (mask - 1)
  return boxes
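# Hedged sketch (layout inferred from the row indices used above, not stated
# in the source): `info` appears to be a [batch, 4, 2] tensor packing
# per-image preprocessing metadata, one [height, width]-style pair per row.
import tensorflow as tf

info = tf.constant([[
    [480.0, 640.0],   # row 0: original image shape
    [512.0, 512.0],   # row 1: scaled network-input shape
    [0.8, 0.8],       # row 2: scale applied during preprocessing
    [0.0, 64.0],      # row 3: [y, x] offset applied during preprocessing
]])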
def affine_warp_boxes(affine, boxes, output_size, box_history):
  """Applies random rotation, perspective change, translation and scaling
  to the boxes.

  Args:
    affine: A `Tensor` for the augmenting matrix for the boxes.
    boxes: A `Tensor` for the boxes.
    output_size: A `list` of two integers, a two-element vector or a tensor
      such that all but the last dimensions are `broadcastable` to `boxes`.
      The last dimension is 2, which represents [height, width].
    box_history: A `Tensor` for the box history: boxes that undergo the same
      augmentations as `boxes` but are never clipped, which lets us keep track
      of how much the boxes have been altered.

  Returns:
    clipped_boxes: A `Tensor` representing the augmented boxes.
    box_history: A `Tensor` representing the augmented box_history.
  """

  def _get_corners(box):
    """Gets the corners of each box as (x, y) coordinates."""
    ymi, xmi, yma, xma = tf.split(box, 4, axis=-1)
    tl = tf.concat([xmi, ymi], axis=-1)
    bl = tf.concat([xmi, yma], axis=-1)
    tr = tf.concat([xma, ymi], axis=-1)
    br = tf.concat([xma, yma], axis=-1)
    return tf.concat([tl, bl, tr, br], axis=-1)

  def _corners_to_boxes(corner):
    """Converts (x, y) corners back into boxes [ymin, xmin, ymax, xmax]."""
    corner = tf.reshape(corner, [-1, 4, 2])
    y = corner[..., 1]
    x = corner[..., 0]
    y_min = tf.reduce_min(y, axis=-1)
    x_min = tf.reduce_min(x, axis=-1)
    y_max = tf.reduce_max(y, axis=-1)
    x_max = tf.reduce_max(x, axis=-1)
    return tf.stack([y_min, x_min, y_max, x_max], axis=-1)

  def _aug_boxes(affine_matrix, box):
    """Applies an affine transformation matrix to the boxes."""
    corners = _get_corners(box)
    corners = tf.reshape(corners, [-1, 4, 2])
    # Homogeneous coordinates: append a column of ones.
    z = tf.expand_dims(tf.ones_like(corners[..., 1]), axis=-1)
    corners = tf.concat([corners, z], axis=-1)

    corners = tf.transpose(
        tf.matmul(affine_matrix, corners, transpose_b=True), perm=(0, 2, 1))

    corners, p = tf.split(corners, [2, 1], axis=-1)
    corners /= p  # Perspective divide.
    corners = tf.reshape(corners, [-1, 8])
    box = _corners_to_boxes(corners)
    return box

  boxes = _aug_boxes(affine, boxes)
  box_history = _aug_boxes(affine, box_history)
  clipped_boxes = bbox_ops.clip_boxes(boxes, output_size)
  return clipped_boxes, box_history
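# Hedged usage sketch (not from the source): warping one box with a pure
# translation. The 3x3 matrix acts on homogeneous (x, y, 1) corners, matching
# _aug_boxes above; an identity rotation plus a translation column shifts
# every corner by (10, 5).
import tensorflow as tf

affine = tf.constant([[[1.0, 0.0, 10.0],   # x' = x + 10
                       [0.0, 1.0, 5.0],    # y' = y + 5
                       [0.0, 0.0, 1.0]]])  # no perspective
boxes = tf.constant([[0.0, 0.0, 50.0, 40.0]])  # [ymin, xmin, ymax, xmax]
history = tf.identity(boxes)

warped, history = affine_warp_boxes(affine, boxes, [64, 64], history)
# warped == [[5., 10., 55., 50.]]; the box lies inside the 64x64 canvas, so
# clipping leaves it unchanged here.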
def _call_box_outputs(
    self,
    images: tf.Tensor,
    image_shape: tf.Tensor,
    anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
    gt_boxes: Optional[tf.Tensor] = None,
    gt_classes: Optional[tf.Tensor] = None,
    training: Optional[bool] = None
) -> Tuple[Mapping[str, tf.Tensor], Mapping[str, tf.Tensor]]:
  """Implementation of the Faster-RCNN logic for boxes."""
  model_outputs = {}

  # Feature extraction.
  (backbone_features,
   decoder_features) = self._get_backbone_and_decoder_features(images)

  # Region proposal network.
  rpn_scores, rpn_boxes = self.rpn_head(decoder_features)

  model_outputs.update({
      'backbone_features': backbone_features,
      'decoder_features': decoder_features,
      'rpn_boxes': rpn_boxes,
      'rpn_scores': rpn_scores
  })

  # Generate anchor boxes for this batch if not provided.
  if anchor_boxes is None:
    _, image_height, image_width, _ = images.get_shape().as_list()
    anchor_boxes = anchor.Anchor(
        min_level=self._config_dict['min_level'],
        max_level=self._config_dict['max_level'],
        num_scales=self._config_dict['num_scales'],
        aspect_ratios=self._config_dict['aspect_ratios'],
        anchor_size=self._config_dict['anchor_size'],
        image_size=(image_height, image_width)).multilevel_boxes
    for l in anchor_boxes:
      anchor_boxes[l] = tf.tile(
          tf.expand_dims(anchor_boxes[l], axis=0),
          [tf.shape(images)[0], 1, 1, 1])

  # Generate RoIs.
  current_rois, _ = self.roi_generator(rpn_boxes, rpn_scores, anchor_boxes,
                                       image_shape, training)

  next_rois = current_rois
  all_class_outputs = []
  for cascade_num in range(len(self.roi_sampler)):
    # In cascade RCNN we want the higher layers to have different regression
    # weights as the predicted deltas become smaller and smaller.
    regression_weights = self._cascade_layer_to_weights[cascade_num]
    current_rois = next_rois

    (class_outputs, box_outputs, model_outputs, matched_gt_boxes,
     matched_gt_classes, matched_gt_indices,
     current_rois) = self._run_frcnn_head(
         features=decoder_features,
         rois=current_rois,
         gt_boxes=gt_boxes,
         gt_classes=gt_classes,
         training=training,
         model_outputs=model_outputs,
         cascade_num=cascade_num,
         regression_weights=regression_weights)
    all_class_outputs.append(class_outputs)

    # Generate RoIs for the next cascade head if there is any.
    if cascade_num < len(self.roi_sampler) - 1:
      next_rois = box_ops.decode_boxes(
          tf.cast(box_outputs, tf.float32),
          current_rois,
          weights=regression_weights)
      next_rois = box_ops.clip_boxes(next_rois,
                                     tf.expand_dims(image_shape, axis=1))

  if not training:
    if self._config_dict['cascade_class_ensemble']:
      class_outputs = tf.add_n(all_class_outputs) / len(all_class_outputs)

    detections = self.detection_generator(
        box_outputs,
        class_outputs,
        current_rois,
        image_shape,
        regression_weights,
        bbox_per_class=(not self._config_dict['class_agnostic_bbox_pred']))
    model_outputs.update({
        'cls_outputs': class_outputs,
        'box_outputs': box_outputs,
    })
    if self.detection_generator.get_config()['apply_nms']:
      model_outputs.update({
          'detection_boxes': detections['detection_boxes'],
          'detection_scores': detections['detection_scores'],
          'detection_classes': detections['detection_classes'],
          'num_detections': detections['num_detections']
      })
    else:
      model_outputs.update({
          'decoded_boxes': detections['decoded_boxes'],
          'decoded_box_scores': detections['decoded_box_scores']
      })

  intermediate_outputs = {
      'matched_gt_boxes': matched_gt_boxes,
      'matched_gt_indices': matched_gt_indices,
      'matched_gt_classes': matched_gt_classes,
      'current_rois': current_rois,
  }
  return (model_outputs, intermediate_outputs)
def __call__(self,
             raw_boxes: Mapping[str, tf.Tensor],
             raw_scores: Mapping[str, tf.Tensor],
             anchor_boxes: Mapping[str, tf.Tensor],
             image_shape: tf.Tensor,
             raw_attributes: Optional[Mapping[str, tf.Tensor]] = None):
  """Generates final detections.

  Args:
    raw_boxes: A `dict` with keys representing FPN levels and values
      representing box tensors of shape
      `[batch, feature_h, feature_w, num_anchors * 4]`.
    raw_scores: A `dict` with keys representing FPN levels and values
      representing logit tensors of shape
      `[batch, feature_h, feature_w, num_anchors * num_classes]`.
    anchor_boxes: A `dict` with keys representing FPN levels and values
      representing anchor tensors of shape `[batch_size, K, 4]` corresponding
      to `box_outputs`.
    image_shape: A `tf.Tensor` of shape of [batch_size, 2] storing the image
      height and width w.r.t. the scaled image, i.e. the same image space as
      `box_outputs` and `anchor_boxes`.
    raw_attributes: If not None, a `dict` of (attribute_name,
      attribute_prediction) pairs. `attribute_prediction` is a dict that
      contains keys representing FPN levels and values representing tensors
      of shape `[batch, feature_h, feature_w, num_anchors * attribute_size]`.

  Returns:
    If `apply_nms` = True, the return is a dictionary with keys:
      `detection_boxes`: A `float` tf.Tensor of shape
        [batch, max_num_detections, 4] representing top detected boxes in
        [y1, x1, y2, x2].
      `detection_scores`: A `float` tf.Tensor of shape
        [batch, max_num_detections] representing sorted confidence scores for
        detected boxes. The values are between [0, 1].
      `detection_classes`: An `int` tf.Tensor of shape
        [batch, max_num_detections] representing classes for detected boxes.
      `num_detections`: An `int` tf.Tensor of shape [batch]; only the first
        `num_detections` boxes are valid detections.
      `detection_attributes`: A dict. Each value of the dict is a `float`
        tf.Tensor of shape [batch, max_num_detections, attribute_size]
        representing attribute predictions for detected boxes.
    If `apply_nms` = False, the return is a dictionary with keys:
      `decoded_boxes`: A `float` tf.Tensor of shape [batch, num_raw_boxes, 4]
        representing all the decoded boxes.
      `decoded_box_scores`: A `float` tf.Tensor of shape
        [batch, num_raw_boxes] representing scores of all the decoded boxes.
      `decoded_box_attributes`: A dict. Each value of the dict is a `float`
        tf.Tensor of shape [batch, num_raw_boxes, attribute_size]
        representing attribute predictions of all the decoded boxes.
  """
  # Collects outputs from all levels into a list.
  boxes = []
  scores = []
  if raw_attributes:
    attributes = {att_name: [] for att_name in raw_attributes.keys()}
  else:
    attributes = {}

  levels = list(raw_boxes.keys())
  min_level = int(min(levels))
  max_level = int(max(levels))
  for i in range(min_level, max_level + 1):
    raw_boxes_i_shape = tf.shape(raw_boxes[str(i)])
    batch_size = raw_boxes_i_shape[0]
    num_anchors_per_locations = raw_boxes_i_shape[-1] // 4
    num_classes = (
        tf.shape(raw_scores[str(i)])[-1] // num_anchors_per_locations)

    # Applies score transformation and removes the implicit background class.
    scores_i = tf.sigmoid(
        tf.reshape(raw_scores[str(i)], [batch_size, -1, num_classes]))
    scores_i = tf.slice(scores_i, [0, 0, 1], [-1, -1, -1])

    # Box decoding.
    # The anchor boxes are shared for all data in a batch.
    # One stage detector only supports class agnostic box regression.
    anchor_boxes_i = tf.reshape(anchor_boxes[str(i)], [batch_size, -1, 4])
    raw_boxes_i = tf.reshape(raw_boxes[str(i)], [batch_size, -1, 4])
    boxes_i = box_ops.decode_boxes(raw_boxes_i, anchor_boxes_i)

    # Box clipping.
    boxes_i = box_ops.clip_boxes(boxes_i, tf.expand_dims(image_shape, axis=1))

    boxes.append(boxes_i)
    scores.append(scores_i)
    if raw_attributes:
      for att_name, raw_att in raw_attributes.items():
        attribute_size = (
            tf.shape(raw_att[str(i)])[-1] // num_anchors_per_locations)
        att_i = tf.reshape(raw_att[str(i)], [batch_size, -1, attribute_size])
        attributes[att_name].append(att_i)

  boxes = tf.concat(boxes, axis=1)
  boxes = tf.expand_dims(boxes, axis=2)
  scores = tf.concat(scores, axis=1)

  if raw_attributes:
    for att_name in raw_attributes.keys():
      attributes[att_name] = tf.concat(attributes[att_name], axis=1)
      attributes[att_name] = tf.expand_dims(attributes[att_name], axis=2)

  if not self._config_dict['apply_nms']:
    return {
        'decoded_boxes': boxes,
        'decoded_box_scores': scores,
        'decoded_box_attributes': attributes,
    }

  if self._config_dict['use_batched_nms']:
    if raw_attributes:
      raise ValueError('Attribute learning is not supported for batched NMS.')
    nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
        _generate_detections_batched(
            boxes, scores, self._config_dict['pre_nms_score_threshold'],
            self._config_dict['nms_iou_threshold'],
            self._config_dict['max_num_detections']))
    # Set `nmsed_attributes` to an empty dict for batched NMS.
    nmsed_attributes = {}
  else:
    if raw_attributes:
      (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections,
       nmsed_attributes) = _generate_detections_v1(
           boxes,
           scores,
           attributes=attributes if raw_attributes else None,
           pre_nms_top_k=self._config_dict['pre_nms_top_k'],
           pre_nms_score_threshold=self
           ._config_dict['pre_nms_score_threshold'],
           nms_iou_threshold=self._config_dict['nms_iou_threshold'],
           max_num_detections=self._config_dict['max_num_detections'])
    else:
      nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
          _generate_detections_v2(
              boxes, scores, self._config_dict['pre_nms_top_k'],
              self._config_dict['pre_nms_score_threshold'],
              self._config_dict['nms_iou_threshold'],
              self._config_dict['max_num_detections']))
      nmsed_attributes = {}

  # Adds 1 to offset the background class which has index 0.
  nmsed_classes += 1

  return {
      'num_detections': valid_detections,
      'detection_boxes': nmsed_boxes,
      'detection_classes': nmsed_classes,
      'detection_scores': nmsed_scores,
      'detection_attributes': nmsed_attributes,
  }
def __call__(self, raw_boxes: tf.Tensor, raw_scores: tf.Tensor,
             anchor_boxes: tf.Tensor, image_shape: tf.Tensor):
  """Generates final detections.

  Args:
    raw_boxes: A `tf.Tensor` of shape of `[batch_size, K, num_classes * 4]`
      representing the class-specific box coordinates relative to anchors.
    raw_scores: A `tf.Tensor` of shape of `[batch_size, K, num_classes]`
      representing the class logits before applying score activation.
    anchor_boxes: A `tf.Tensor` of shape of `[batch_size, K, 4]` representing
      the corresponding anchor boxes w.r.t `box_outputs`.
    image_shape: A `tf.Tensor` of shape of `[batch_size, 2]` storing the image
      height and width w.r.t. the scaled image, i.e. the same image space as
      `box_outputs` and `anchor_boxes`.

  Returns:
    If `apply_nms` = True, the return is a dictionary with keys:
      `detection_boxes`: A `float` tf.Tensor of shape
        [batch, max_num_detections, 4] representing top detected boxes in
        [y1, x1, y2, x2].
      `detection_scores`: A `float` `tf.Tensor` of shape
        [batch, max_num_detections] representing sorted confidence scores for
        detected boxes. The values are between [0, 1].
      `detection_classes`: An `int` tf.Tensor of shape
        [batch, max_num_detections] representing classes for detected boxes.
      `num_detections`: An `int` tf.Tensor of shape [batch]; only the first
        `num_detections` boxes are valid detections.
    If `apply_nms` = False, the return is a dictionary with keys:
      `decoded_boxes`: A `float` tf.Tensor of shape [batch, num_raw_boxes, 4]
        representing all the decoded boxes.
      `decoded_box_scores`: A `float` tf.Tensor of shape
        [batch, num_raw_boxes] representing scores of all the decoded boxes.
  """
  box_scores = tf.nn.softmax(raw_scores, axis=-1)

  # Removes the background class.
  box_scores_shape = tf.shape(box_scores)
  box_scores_shape_list = box_scores.get_shape().as_list()
  batch_size = box_scores_shape[0]
  num_locations = box_scores_shape_list[1]
  num_classes = box_scores_shape_list[-1]
  num_detections = num_locations * (num_classes - 1)

  box_scores = tf.slice(box_scores, [0, 0, 1], [-1, -1, -1])
  raw_boxes = tf.reshape(raw_boxes,
                         [batch_size, num_locations, num_classes, 4])
  raw_boxes = tf.slice(raw_boxes, [0, 0, 1, 0], [-1, -1, -1, -1])
  anchor_boxes = tf.tile(
      tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_classes - 1, 1])
  raw_boxes = tf.reshape(raw_boxes, [batch_size, num_detections, 4])
  anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])

  # Box decoding.
  decoded_boxes = box_ops.decode_boxes(
      raw_boxes, anchor_boxes, weights=[10.0, 10.0, 5.0, 5.0])

  # Box clipping.
  decoded_boxes = box_ops.clip_boxes(decoded_boxes,
                                     tf.expand_dims(image_shape, axis=1))

  decoded_boxes = tf.reshape(decoded_boxes,
                             [batch_size, num_locations, num_classes - 1, 4])

  if not self._config_dict['apply_nms']:
    return {
        'decoded_boxes': decoded_boxes,
        'decoded_box_scores': box_scores,
    }

  if self._config_dict['use_batched_nms']:
    nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
        _generate_detections_batched(
            decoded_boxes, box_scores,
            self._config_dict['pre_nms_score_threshold'],
            self._config_dict['nms_iou_threshold'],
            self._config_dict['max_num_detections']))
  else:
    nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
        _generate_detections_v2(
            decoded_boxes, box_scores, self._config_dict['pre_nms_top_k'],
            self._config_dict['pre_nms_score_threshold'],
            self._config_dict['nms_iou_threshold'],
            self._config_dict['max_num_detections']))

  # Adds 1 to offset the background class which has index 0.
  nmsed_classes += 1

  return {
      'num_detections': valid_detections,
      'detection_boxes': nmsed_boxes,
      'detection_classes': nmsed_classes,
      'detection_scores': nmsed_scores,
  }
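# Hedged sketch (assumption: box_ops.decode_boxes follows the standard
# Faster R-CNN box coder, which the [10., 10., 5., 5.] weights above suggest).
# Deltas are divided by the weights, then applied to the anchor:
#   cy = dy * h_a + cy_a,  cx = dx * w_a + cx_a
#   h  = exp(dh) * h_a,    w  = exp(dw) * w_a
import math

def decode_one(delta, anchor, weights=(10.0, 10.0, 5.0, 5.0)):
  """Decodes a single [dy, dx, dh, dw] delta against one anchor box."""
  ymin_a, xmin_a, ymax_a, xmax_a = anchor
  h_a, w_a = ymax_a - ymin_a, xmax_a - xmin_a
  cy_a, cx_a = ymin_a + 0.5 * h_a, xmin_a + 0.5 * w_a
  dy, dx, dh, dw = (d / wt for d, wt in zip(delta, weights))
  cy, cx = dy * h_a + cy_a, dx * w_a + cx_a
  h, w = math.exp(dh) * h_a, math.exp(dw) * w_a
  return (cy - 0.5 * h, cx - 0.5 * w, cy + 0.5 * h, cx + 0.5 * w)

# Zero deltas decode back to the anchor itself:
assert decode_one((0.0, 0.0, 0.0, 0.0), (0.0, 0.0, 10.0, 10.0)) == (
    0.0, 0.0, 10.0, 10.0)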
def call(self,
         images: tf.Tensor,
         image_shape: tf.Tensor,
         anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
         gt_boxes: Optional[tf.Tensor] = None,
         gt_classes: Optional[tf.Tensor] = None,
         gt_masks: Optional[tf.Tensor] = None,
         training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
  """Runs the forward pass of the (cascade) Mask R-CNN model."""
  model_outputs = {}

  # Feature extraction.
  features = self.backbone(images)
  if self.decoder:
    features = self.decoder(features)

  # Region proposal network.
  rpn_scores, rpn_boxes = self.rpn_head(features)

  model_outputs.update({
      'rpn_boxes': rpn_boxes,
      'rpn_scores': rpn_scores
  })

  # Generate anchor boxes for this batch if not provided.
  if anchor_boxes is None:
    _, image_height, image_width, _ = images.get_shape().as_list()
    anchor_boxes = anchor.Anchor(
        min_level=self._config_dict['min_level'],
        max_level=self._config_dict['max_level'],
        num_scales=self._config_dict['num_scales'],
        aspect_ratios=self._config_dict['aspect_ratios'],
        anchor_size=self._config_dict['anchor_size'],
        image_size=(image_height, image_width)).multilevel_boxes
    for l in anchor_boxes:
      anchor_boxes[l] = tf.tile(
          tf.expand_dims(anchor_boxes[l], axis=0),
          [tf.shape(images)[0], 1, 1, 1])

  # Generate RoIs.
  current_rois, _ = self.roi_generator(rpn_boxes, rpn_scores, anchor_boxes,
                                       image_shape, training)

  next_rois = current_rois
  all_class_outputs = []
  for cascade_num in range(len(self.roi_sampler)):
    # In cascade RCNN we want the higher layers to have different regression
    # weights as the predicted deltas become smaller and smaller.
    regression_weights = self._cascade_layer_to_weights[cascade_num]
    current_rois = next_rois

    (class_outputs, box_outputs, model_outputs, matched_gt_boxes,
     matched_gt_classes, matched_gt_indices,
     current_rois) = self._run_frcnn_head(
         features=features,
         rois=current_rois,
         gt_boxes=gt_boxes,
         gt_classes=gt_classes,
         training=training,
         model_outputs=model_outputs,
         layer_num=cascade_num,
         regression_weights=regression_weights)
    all_class_outputs.append(class_outputs)

    # Generate RoIs for the next cascade head if there is any.
    if cascade_num < len(self.roi_sampler) - 1:
      next_rois = box_ops.decode_boxes(
          tf.cast(box_outputs, tf.float32),
          current_rois,
          weights=regression_weights)
      next_rois = box_ops.clip_boxes(next_rois,
                                     tf.expand_dims(image_shape, axis=1))

  if not training:
    if self._config_dict['cascade_class_ensemble']:
      class_outputs = tf.add_n(all_class_outputs) / len(all_class_outputs)

    detections = self.detection_generator(
        box_outputs,
        class_outputs,
        current_rois,
        image_shape,
        regression_weights,
        bbox_per_class=(not self._config_dict['class_agnostic_bbox_pred']))
    model_outputs.update({
        'detection_boxes': detections['detection_boxes'],
        'detection_scores': detections['detection_scores'],
        'detection_classes': detections['detection_classes'],
        'num_detections': detections['num_detections'],
    })

  if not self._include_mask:
    return model_outputs

  if training:
    current_rois, roi_classes, roi_masks = self.mask_sampler(
        current_rois, matched_gt_boxes, matched_gt_classes,
        matched_gt_indices, gt_masks)
    roi_masks = tf.stop_gradient(roi_masks)

    model_outputs.update({
        'mask_class_targets': roi_classes,
        'mask_targets': roi_masks,
    })
  else:
    current_rois = model_outputs['detection_boxes']
    roi_classes = model_outputs['detection_classes']

  # Mask RoI align.
  mask_roi_features = self.mask_roi_aligner(features, current_rois)

  # Mask head.
  raw_masks = self.mask_head([mask_roi_features, roi_classes])

  if training:
    model_outputs.update({
        'mask_outputs': raw_masks,
    })
  else:
    model_outputs.update({
        'detection_masks': tf.math.sigmoid(raw_masks),
    })
  return model_outputs
def _multilevel_propose_rois(raw_boxes,
                             raw_scores,
                             anchor_boxes,
                             image_shape,
                             pre_nms_top_k=2000,
                             pre_nms_score_threshold=0.0,
                             pre_nms_min_size_threshold=0.0,
                             nms_iou_threshold=0.7,
                             num_proposals=1000,
                             use_batched_nms=False,
                             decode_boxes=True,
                             clip_boxes=True,
                             apply_sigmoid_to_score=True):
  """Proposes RoIs given a group of candidates from different FPN levels.

  The following describes the steps:
    1. For each individual level:
      a. Apply sigmoid transform if specified.
      b. Decode boxes if specified.
      c. Clip boxes if specified.
      d. Filter small boxes and those that fall outside the image if
         specified.
      e. Apply pre-NMS filtering including pre-NMS top k and score
         thresholding.
      f. Apply NMS.
    2. Aggregate post-NMS boxes from each level.
    3. Apply an overall top k to generate the final selected RoIs.

  Args:
    raw_boxes: A `dict` with keys representing FPN levels and values
      representing box tensors of shape
      [batch_size, feature_h, feature_w, num_anchors * 4].
    raw_scores: A `dict` with keys representing FPN levels and values
      representing logit tensors of shape
      [batch_size, feature_h, feature_w, num_anchors].
    anchor_boxes: A `dict` with keys representing FPN levels and values
      representing anchor box tensors of shape
      [batch_size, feature_h * feature_w * num_anchors, 4].
    image_shape: A `tf.Tensor` of shape [batch_size, 2] where the last
      dimension are [height, width] of the scaled image.
    pre_nms_top_k: An `int` of top scoring RPN proposals *per level* to keep
      before applying NMS. Default: 2000.
    pre_nms_score_threshold: A `float` between 0 and 1 representing the
      minimal box score to keep before applying NMS. This is often used as a
      pre-filtering step for better performance. Default: 0, no filtering is
      applied.
    pre_nms_min_size_threshold: A `float` representing the minimal box size in
      each side (w.r.t. the scaled image) to keep before applying NMS. This is
      often used as a pre-filtering step for better performance. Default: 0,
      no filtering is applied.
    nms_iou_threshold: A `float` between 0 and 1 representing the IoU
      threshold used for NMS. If 0.0, no NMS is applied. Default: 0.7.
    num_proposals: An `int` of top scoring RPN proposals *in total* to keep
      after applying NMS. Default: 1000.
    use_batched_nms: A `bool` indicating whether NMS is applied in batch using
      `tf.image.combined_non_max_suppression`. Currently only available on
      CPU/GPU. Default is False.
    decode_boxes: A `bool` indicating whether `raw_boxes` needs to be decoded
      using `anchor_boxes`. If False, use `raw_boxes` directly and ignore
      `anchor_boxes`. Default is True.
    clip_boxes: A `bool` indicating whether boxes are first clipped to the
      scaled image size before applying NMS. If False, no clipping is applied
      and `image_shape` is ignored. Default is True.
    apply_sigmoid_to_score: A `bool` indicating whether to apply sigmoid to
      `raw_scores` before applying NMS. Default is True.

  Returns:
    selected_rois: A `tf.Tensor` of shape [batch_size, num_proposals, 4],
      representing the box coordinates of the selected proposals w.r.t. the
      scaled image.
    selected_roi_scores: A `tf.Tensor` of shape [batch_size, num_proposals, 1],
      representing the scores of the selected proposals.
""" with tf.name_scope('multilevel_propose_rois'): rois = [] roi_scores = [] image_shape = tf.expand_dims(image_shape, axis=1) for level in sorted(raw_scores.keys()): with tf.name_scope('level_%s' % level): _, feature_h, feature_w, num_anchors_per_location = ( raw_scores[level].get_shape().as_list()) num_boxes = feature_h * feature_w * num_anchors_per_location this_level_scores = tf.reshape(raw_scores[level], [-1, num_boxes]) this_level_boxes = tf.reshape(raw_boxes[level], [-1, num_boxes, 4]) this_level_anchors = tf.cast(tf.reshape( anchor_boxes[level], [-1, num_boxes, 4]), dtype=this_level_scores.dtype) if apply_sigmoid_to_score: this_level_scores = tf.sigmoid(this_level_scores) if decode_boxes: this_level_boxes = box_ops.decode_boxes( this_level_boxes, this_level_anchors) if clip_boxes: this_level_boxes = box_ops.clip_boxes( this_level_boxes, image_shape) if pre_nms_min_size_threshold > 0.0: this_level_boxes, this_level_scores = box_ops.filter_boxes( this_level_boxes, this_level_scores, image_shape, pre_nms_min_size_threshold) this_level_pre_nms_top_k = min(num_boxes, pre_nms_top_k) this_level_post_nms_top_k = min(num_boxes, num_proposals) if nms_iou_threshold > 0.0: if use_batched_nms: this_level_rois, this_level_roi_scores, _, _ = ( tf.image.combined_non_max_suppression( tf.expand_dims(this_level_boxes, axis=2), tf.expand_dims(this_level_scores, axis=-1), max_output_size_per_class= this_level_pre_nms_top_k, max_total_size=this_level_post_nms_top_k, iou_threshold=nms_iou_threshold, score_threshold=pre_nms_score_threshold, pad_per_class=False, clip_boxes=False)) else: if pre_nms_score_threshold > 0.0: this_level_boxes, this_level_scores = ( box_ops.filter_boxes_by_scores( this_level_boxes, this_level_scores, pre_nms_score_threshold)) this_level_boxes, this_level_scores = box_ops.top_k_boxes( this_level_boxes, this_level_scores, k=this_level_pre_nms_top_k) this_level_roi_scores, this_level_rois = ( nms.sorted_non_max_suppression_padded( this_level_scores, this_level_boxes, max_output_size=this_level_post_nms_top_k, iou_threshold=nms_iou_threshold)) else: this_level_rois, this_level_roi_scores = box_ops.top_k_boxes( this_level_boxes, this_level_scores, k=this_level_post_nms_top_k) rois.append(this_level_rois) roi_scores.append(this_level_roi_scores) all_rois = tf.concat(rois, axis=1) all_roi_scores = tf.concat(roi_scores, axis=1) with tf.name_scope('top_k_rois'): _, num_valid_rois = all_roi_scores.get_shape().as_list() overall_top_k = min(num_valid_rois, num_proposals) selected_rois, selected_roi_scores = box_ops.top_k_boxes( all_rois, all_roi_scores, k=overall_top_k) return selected_rois, selected_roi_scores
def _decode_multilevel_outputs(
    self,
    raw_boxes: Mapping[str, tf.Tensor],
    raw_scores: Mapping[str, tf.Tensor],
    anchor_boxes: Mapping[str, tf.Tensor],
    image_shape: tf.Tensor,
    raw_attributes: Optional[Mapping[str, tf.Tensor]] = None):
  """Collects dict of multilevel boxes, scores, attributes into lists."""
  boxes = []
  scores = []
  if raw_attributes:
    attributes = {att_name: [] for att_name in raw_attributes.keys()}
  else:
    attributes = {}

  levels = list(raw_boxes.keys())
  min_level = int(min(levels))
  max_level = int(max(levels))
  for i in range(min_level, max_level + 1):
    raw_boxes_i = raw_boxes[str(i)]
    raw_scores_i = raw_scores[str(i)]
    batch_size = tf.shape(raw_boxes_i)[0]
    (_, feature_h_i, feature_w_i,
     num_anchors_per_locations_times_4) = raw_boxes_i.get_shape().as_list()
    num_locations = feature_h_i * feature_w_i
    num_anchors_per_locations = num_anchors_per_locations_times_4 // 4
    num_classes = raw_scores_i.get_shape().as_list(
    )[-1] // num_anchors_per_locations

    # Applies score transformation and removes the implicit background class.
    scores_i = tf.sigmoid(
        tf.reshape(raw_scores_i, [
            batch_size, num_locations * num_anchors_per_locations, num_classes
        ]))
    scores_i = tf.slice(scores_i, [0, 0, 1], [-1, -1, -1])

    # Box decoding.
    # The anchor boxes are shared for all data in a batch.
    # One stage detector only supports class agnostic box regression.
    anchor_boxes_i = tf.reshape(
        anchor_boxes[str(i)],
        [batch_size, num_locations * num_anchors_per_locations, 4])
    raw_boxes_i = tf.reshape(
        raw_boxes_i,
        [batch_size, num_locations * num_anchors_per_locations, 4])
    boxes_i = box_ops.decode_boxes(raw_boxes_i, anchor_boxes_i)

    # Box clipping.
    boxes_i = box_ops.clip_boxes(boxes_i, tf.expand_dims(image_shape, axis=1))

    boxes.append(boxes_i)
    scores.append(scores_i)

    if raw_attributes:
      for att_name, raw_att in raw_attributes.items():
        attribute_size = raw_att[str(
            i)].get_shape().as_list()[-1] // num_anchors_per_locations
        att_i = tf.reshape(raw_att[str(i)], [
            batch_size, num_locations * num_anchors_per_locations,
            attribute_size
        ])
        attributes[att_name].append(att_i)

  boxes = tf.concat(boxes, axis=1)
  boxes = tf.expand_dims(boxes, axis=2)
  scores = tf.concat(scores, axis=1)

  if raw_attributes:
    for att_name in raw_attributes.keys():
      attributes[att_name] = tf.concat(attributes[att_name], axis=1)
      attributes[att_name] = tf.expand_dims(attributes[att_name], axis=2)

  return boxes, scores, attributes
def __call__(self,
             raw_boxes: tf.Tensor,
             raw_scores: tf.Tensor,
             anchor_boxes: tf.Tensor,
             image_shape: tf.Tensor,
             regression_weights: Optional[List[float]] = None,
             bbox_per_class: bool = True):
  """Generates final detections.

  Args:
    raw_boxes: A `tf.Tensor` of shape of `[batch_size, K, num_classes * 4]`
      representing the class-specific box coordinates relative to anchors.
    raw_scores: A `tf.Tensor` of shape of `[batch_size, K, num_classes]`
      representing the class logits before applying score activation.
    anchor_boxes: A `tf.Tensor` of shape of `[batch_size, K, 4]` representing
      the corresponding anchor boxes w.r.t `box_outputs`.
    image_shape: A `tf.Tensor` of shape of `[batch_size, 2]` storing the image
      height and width w.r.t. the scaled image, i.e. the same image space as
      `box_outputs` and `anchor_boxes`.
    regression_weights: A list of four float numbers to scale coordinates.
    bbox_per_class: A `bool`. If True, perform per-class box regression.

  Returns:
    If `apply_nms` = True, the return is a dictionary with keys:
      `detection_boxes`: A `float` tf.Tensor of shape
        [batch, max_num_detections, 4] representing top detected boxes in
        [y1, x1, y2, x2].
      `detection_scores`: A `float` `tf.Tensor` of shape
        [batch, max_num_detections] representing sorted confidence scores for
        detected boxes. The values are between [0, 1].
      `detection_classes`: An `int` tf.Tensor of shape
        [batch, max_num_detections] representing classes for detected boxes.
      `num_detections`: An `int` tf.Tensor of shape [batch]; only the first
        `num_detections` boxes are valid detections.
    If `apply_nms` = False, the return is a dictionary with keys:
      `decoded_boxes`: A `float` tf.Tensor of shape [batch, num_raw_boxes, 4]
        representing all the decoded boxes.
      `decoded_box_scores`: A `float` tf.Tensor of shape
        [batch, num_raw_boxes] representing scores of all the decoded boxes.
  """
  box_scores = tf.nn.softmax(raw_scores, axis=-1)

  # Removes the background class.
  box_scores_shape = tf.shape(box_scores)
  box_scores_shape_list = box_scores.get_shape().as_list()
  batch_size = box_scores_shape[0]
  num_locations = box_scores_shape_list[1]
  num_classes = box_scores_shape_list[-1]

  box_scores = tf.slice(box_scores, [0, 0, 1], [-1, -1, -1])

  if bbox_per_class:
    num_detections = num_locations * (num_classes - 1)
    raw_boxes = tf.reshape(raw_boxes,
                           [batch_size, num_locations, num_classes, 4])
    raw_boxes = tf.slice(raw_boxes, [0, 0, 1, 0], [-1, -1, -1, -1])
    anchor_boxes = tf.tile(
        tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_classes - 1, 1])
    raw_boxes = tf.reshape(raw_boxes, [batch_size, num_detections, 4])
    anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])

  # Box decoding.
  decoded_boxes = box_ops.decode_boxes(
      raw_boxes, anchor_boxes, weights=regression_weights)

  # Box clipping.
  decoded_boxes = box_ops.clip_boxes(decoded_boxes,
                                     tf.expand_dims(image_shape, axis=1))

  if bbox_per_class:
    decoded_boxes = tf.reshape(
        decoded_boxes, [batch_size, num_locations, num_classes - 1, 4])
  else:
    decoded_boxes = tf.expand_dims(decoded_boxes, axis=2)

  if not self._config_dict['apply_nms']:
    return {
        'decoded_boxes': decoded_boxes,
        'decoded_box_scores': box_scores,
    }

  # Optionally force the NMS to be run on CPU.
  if self._config_dict['use_cpu_nms']:
    nms_context = tf.device('cpu:0')
  else:
    nms_context = contextlib.nullcontext()

  with nms_context:
    if self._config_dict['nms_version'] == 'batched':
      (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = (
          _generate_detections_batched(
              decoded_boxes, box_scores,
              self._config_dict['pre_nms_score_threshold'],
              self._config_dict['nms_iou_threshold'],
              self._config_dict['max_num_detections']))
    elif self._config_dict['nms_version'] == 'v1':
      (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections, _) = (
          _generate_detections_v1(
              decoded_boxes,
              box_scores,
              pre_nms_top_k=self._config_dict['pre_nms_top_k'],
              pre_nms_score_threshold=self
              ._config_dict['pre_nms_score_threshold'],
              nms_iou_threshold=self._config_dict['nms_iou_threshold'],
              max_num_detections=self._config_dict['max_num_detections'],
              soft_nms_sigma=self._config_dict['soft_nms_sigma']))
    elif self._config_dict['nms_version'] == 'v2':
      (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = (
          _generate_detections_v2(
              decoded_boxes,
              box_scores,
              pre_nms_top_k=self._config_dict['pre_nms_top_k'],
              pre_nms_score_threshold=self
              ._config_dict['pre_nms_score_threshold'],
              nms_iou_threshold=self._config_dict['nms_iou_threshold'],
              max_num_detections=self._config_dict['max_num_detections']))
    else:
      raise ValueError('NMS version {} not supported.'.format(
          self._config_dict['nms_version']))

  # Adds 1 to offset the background class which has index 0.
  nmsed_classes += 1

  return {
      'num_detections': valid_detections,
      'detection_boxes': nmsed_boxes,
      'detection_classes': nmsed_classes,
      'detection_scores': nmsed_scores,
  }
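# Hedged sketch (key names taken verbatim from the lookups above; the values
# are illustrative assumptions): the configuration this __call__ consumes.
config_dict = {
    'apply_nms': True,
    'use_cpu_nms': False,         # route NMS through tf.device('cpu:0')
    'nms_version': 'v2',          # one of 'batched', 'v1', 'v2'
    'pre_nms_top_k': 5000,
    'pre_nms_score_threshold': 0.05,
    'nms_iou_threshold': 0.5,
    'max_num_detections': 100,
    'soft_nms_sigma': 0.0,        # only consumed by the 'v1' path
}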
def __call__(self, raw_boxes, raw_scores, anchor_boxes, image_shape):
  """Generates final detections.

  Args:
    raw_boxes: a dict with keys representing FPN levels and values
      representing box tensors of shape
      [batch, feature_h, feature_w, num_anchors * 4].
    raw_scores: a dict with keys representing FPN levels and values
      representing logit tensors of shape
      [batch, feature_h, feature_w, num_anchors * num_classes].
    anchor_boxes: a dict with keys representing FPN levels and values
      representing anchor tensors of shape [batch_size, K, 4] corresponding
      to `box_outputs`.
    image_shape: a tensor of shape of [batch_size, 2] storing the image height
      and width w.r.t. the scaled image, i.e. the same image space as
      `box_outputs` and `anchor_boxes`.

  Returns:
    If `apply_nms` = True, the return is a dictionary with keys:
      `detection_boxes`: float Tensor of shape [batch, max_num_detections, 4]
        representing top detected boxes in [y1, x1, y2, x2].
      `detection_scores`: float Tensor of shape [batch, max_num_detections]
        representing sorted confidence scores for detected boxes. The values
        are between [0, 1].
      `detection_classes`: int Tensor of shape [batch, max_num_detections]
        representing classes for detected boxes.
      `num_detections`: int Tensor of shape [batch]; only the first
        `num_detections` boxes are valid detections.
    If `apply_nms` = False, the return is a dictionary with keys:
      `decoded_boxes`: float Tensor of shape [batch, num_raw_boxes, 4]
        representing all the decoded boxes.
      `decoded_box_scores`: float Tensor of shape [batch, num_raw_boxes]
        representing scores of all the decoded boxes.
  """
  # Collects outputs from all levels into a list.
  boxes = []
  scores = []
  levels = list(raw_boxes.keys())
  min_level = min(levels)
  max_level = max(levels)
  for i in range(min_level, max_level + 1):
    raw_boxes_i_shape = tf.shape(raw_boxes[i])
    batch_size = raw_boxes_i_shape[0]
    num_anchors_per_locations = raw_boxes_i_shape[-1] // 4
    num_classes = tf.shape(raw_scores[i])[-1] // num_anchors_per_locations

    # Applies score transformation and removes the implicit background class.
    scores_i = tf.sigmoid(
        tf.reshape(raw_scores[i], [batch_size, -1, num_classes]))
    scores_i = tf.slice(scores_i, [0, 0, 1], [-1, -1, -1])

    # Box decoding.
    # The anchor boxes are shared for all data in a batch.
    # One stage detector only supports class agnostic box regression.
    anchor_boxes_i = tf.reshape(anchor_boxes[i], [batch_size, -1, 4])
    raw_boxes_i = tf.reshape(raw_boxes[i], [batch_size, -1, 4])
    boxes_i = box_ops.decode_boxes(raw_boxes_i, anchor_boxes_i)

    # Box clipping.
    boxes_i = box_ops.clip_boxes(boxes_i, tf.expand_dims(image_shape, axis=1))

    boxes.append(boxes_i)
    scores.append(scores_i)

  boxes = tf.concat(boxes, axis=1)
  boxes = tf.expand_dims(boxes, axis=2)
  scores = tf.concat(scores, axis=1)

  if not self._config_dict['apply_nms']:
    return {
        'decoded_boxes': boxes,
        'decoded_box_scores': scores,
    }

  if self._config_dict['use_batched_nms']:
    nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
        _generate_detections_batched(
            boxes, scores, self._config_dict['pre_nms_score_threshold'],
            self._config_dict['nms_iou_threshold'],
            self._config_dict['max_num_detections']))
  else:
    nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
        _generate_detections_v2(
            boxes, scores, self._config_dict['pre_nms_top_k'],
            self._config_dict['pre_nms_score_threshold'],
            self._config_dict['nms_iou_threshold'],
            self._config_dict['max_num_detections']))

  # Adds 1 to offset the background class which has index 0.
  nmsed_classes += 1

  return {
      'num_detections': valid_detections,
      'detection_boxes': nmsed_boxes,
      'detection_classes': nmsed_classes,
      'detection_scores': nmsed_scores,
  }