def _propose_rois_tpu(scores,
                      boxes,
                      anchor_boxes,
                      height,
                      width,
                      scale,
                      rpn_pre_nms_topn,
                      rpn_post_nms_topn,
                      rpn_nms_threshold,
                      rpn_min_size,
                      bbox_reg_weights):
  """Proposes RoIs given a group of candidates (TPU version).

  Pipeline: keep the top-scoring candidates, decode them against their
  anchors, clip them to the image, filter out boxes that are too small,
  optionally run sorted padded NMS, and finally keep the top-scoring survivors.

  Args:
    scores: a tensor with a shape of [batch_size, num_boxes].
    boxes: a tensor with a shape of [batch_size, num_boxes, 4], in the encoded
      form.
    anchor_boxes: the anchors with a shape of [batch_size, num_boxes, 4]. The
      caller in this file passes a plain tensor.
    height: a tensor of shape [batch_size, 1, 1] representing the image height.
    width: a tensor of shape [batch_size, 1, 1] representing the image width.
    scale: a tensor of shape [batch_size, 1, 1] representing the image scale.
    rpn_pre_nms_topn: an integer number of top scoring RPN proposals to keep
      before applying NMS. This is *per FPN level* (not total). It is capped
      at `num_boxes`.
    rpn_post_nms_topn: an integer number of top scoring RPN proposals to keep
      after applying NMS. It is capped at the pre-NMS limit.
    rpn_nms_threshold: a float number between 0 and 1 as the NMS threshold
      used on RPN proposals. NMS is skipped unless this is greater than 0.
    rpn_min_size: an integer number as the minimum proposal height and width,
      as both need to be greater than this number. Note that this number is at
      the original image scale, not the scale used during training or
      inference.
    bbox_reg_weights: None or a list of four integers specifying the weights
      used when decoding the box.

  Returns:
    scores: the scores of the selected proposals, as returned by
      `box_utils.top_k` (presumably [batch_size, k] where k is the post-NMS
      limit -- TODO confirm against box_utils).
    boxes: the selected proposal boxes, as returned by `box_utils.top_k`
      (presumably [batch_size, k, 4]). NOTE(review): the boxes are decoded to
      absolute coordinates and never normalized in this function; confirm
      what downstream consumers expect.
  """
  _, num_candidates = scores.get_shape().as_list()

  # Never ask top_k for more entries than there are candidates.
  pre_nms_k = min(rpn_pre_nms_topn, num_candidates)
  scores, gathered = box_utils.top_k(
      scores, k=pre_nms_k, boxes_list=[boxes, anchor_boxes])
  boxes, anchor_boxes = gathered[0], gathered[1]

  # Decode boxes w.r.t. anchors and transform to the absolute coordinates.
  boxes = box_utils.decode_boxes(boxes, anchor_boxes, bbox_reg_weights)

  # Clip boxes that exceed the image boundary.
  boxes = box_utils.clip_boxes(boxes, height, width)

  # Filter boxes with a side shorter than the rpn_min_size threshold. The
  # scores get a trailing axis for the filter call and lose it again afterwards.
  scores_with_axis = tf.expand_dims(scores, axis=-1)
  boxes, scores_with_axis = box_utils.filter_boxes(
      boxes, scores_with_axis, rpn_min_size, height, width, scale)
  scores = tf.squeeze(scores_with_axis, axis=-1)

  post_nms_k = min(rpn_post_nms_topn, pre_nms_k)

  # NMS (only when a positive threshold is configured).
  if rpn_nms_threshold > 0:
    scores, boxes = box_utils.sorted_non_max_suppression_padded(
        scores,
        boxes,
        max_output_size=post_nms_k,
        iou_threshold=rpn_nms_threshold)

  # Pick top-K post NMS'ed boxes.
  scores, selected = box_utils.top_k(
      scores, k=post_nms_k, boxes_list=[boxes])
  return scores, selected[0]
def multilevel_propose_rois(scores_outputs,
                            box_outputs,
                            all_anchors,
                            image_info,
                            rpn_pre_nms_topn,
                            rpn_post_nms_topn,
                            rpn_nms_threshold,
                            rpn_min_size,
                            bbox_reg_weights,
                            use_batched_nms=False):
  """Proposes RoIs given a group of candidates from different FPN levels.

  Each level is processed independently by `_propose_rois_gpu` or
  `_propose_rois_tpu`. The per-level results are concatenated along axis 1
  and reduced to the overall top-scoring proposals.

  Args:
    scores_outputs: an OrderedDict with keys representing levels and values
      representing logits in [batch_size, height, width, num_anchors].
    box_outputs: an OrderedDict with keys representing levels and values
      representing box regression targets in
      [batch_size, height, width, num_anchors * 4].
    all_anchors: an Anchors object that contains all the anchors; its
      `get_unpacked_boxes()` result is indexed by level.
    image_info: a tensor of shape [batch_size, 5] whose columns encode the
      input image's [height, width, scale, original_height, original_width].
      Height and width are for the input to the network, not the original
      image; scale is the scale factor used to scale the network input size to
      the original image size. See dataloader.DetectionInputProcessor for
      details. Only the first three columns are read here.
    rpn_pre_nms_topn: an integer number of top scoring RPN proposals to keep
      before applying NMS. This is *per FPN level* (not total).
    rpn_post_nms_topn: an integer number of top scoring RPN proposals to keep
      after applying NMS. This is the total number of RPN proposals produced.
    rpn_nms_threshold: a float number between 0 and 1 as the NMS threshold
      used on RPN proposals.
    rpn_min_size: an integer number as the minimum proposal height and width,
      as both need to be greater than this number. Note that this number is at
      the original image scale, not the scale used during training or
      inference.
    bbox_reg_weights: None or a list of four integers specifying the weights
      used when decoding the box.
    use_batched_nms: whether to use batched NMS. The batched NMS uses
      tf.combined_non_max_suppression, which is only available for CPU/GPU.

  Returns:
    scores: the scores of the selected proposals, as returned by
      `box_utils.top_k` (presumably [batch_size, k] with
      k = min(total proposals, rpn_post_nms_topn) -- TODO confirm).
    rois: the selected proposal boxes, as returned by `box_utils.top_k`
      (presumably [batch_size, k, 4]). NOTE(review): the per-level helpers
      decode to absolute coordinates; confirm the coordinate convention
      expected downstream.
  """
  # The per-level implementation does not depend on the level; pick it once.
  per_level_fn = _propose_rois_gpu if use_batched_nms else _propose_rois_tpu

  with tf.name_scope('multilevel_propose_rois'):
    per_level_scores = []
    per_level_rois = []
    unpacked_anchors = all_anchors.get_unpacked_boxes()

    # Slice with ranges to keep the column axis, then add one more axis so
    # each of these ends up with shape [batch_size, 1, 1].
    height = tf.expand_dims(image_info[:, 0:1], axis=-1)
    width = tf.expand_dims(image_info[:, 1:2], axis=-1)
    scale = tf.expand_dims(image_info[:, 2:3], axis=-1)

    for level in scores_outputs.keys():
      with tf.name_scope('level_%d' % level):
        batch_size, feature_h, feature_w, anchors_per_location = (
            scores_outputs[level].get_shape().as_list())
        num_boxes = feature_h * feature_w * anchors_per_location
        flat_scores_shape = [batch_size, num_boxes]
        flat_boxes_shape = [batch_size, num_boxes, 4]

        # Flatten the spatial/anchor axes and turn logits into probabilities.
        level_scores = tf.sigmoid(
            tf.reshape(scores_outputs[level], flat_scores_shape))
        level_boxes = tf.reshape(box_outputs[level], flat_boxes_shape)

        # Anchors carry no batch axis: add one and broadcast it to batch_size
        # by multiplying with ones, then flatten like the boxes.
        anchors_with_batch = tf.expand_dims(unpacked_anchors[level], axis=0)
        batched_anchors = anchors_with_batch * tf.ones([batch_size, 1, 1, 1])
        level_anchors = tf.cast(
            tf.reshape(batched_anchors, flat_boxes_shape),
            dtype=level_scores.dtype)

        level_scores, level_boxes = per_level_fn(
            level_scores, level_boxes, level_anchors, height, width, scale,
            rpn_pre_nms_topn, rpn_post_nms_topn, rpn_nms_threshold,
            rpn_min_size, bbox_reg_weights)
        per_level_scores.append(level_scores)
        per_level_rois.append(level_boxes)

    scores = tf.concat(per_level_scores, axis=1)
    rois = tf.concat(per_level_rois, axis=1)

    with tf.name_scope('roi_post_nms_topk'):
      # Keep at most rpn_post_nms_topn proposals across all levels.
      total_proposals = scores.shape[1]
      final_k = (rpn_post_nms_topn if rpn_post_nms_topn < total_proposals
                 else total_proposals)
      top_k_scores, selected = box_utils.top_k(
          scores, k=final_k, boxes_list=[rois])
      top_k_rois = selected[0]

    return top_k_scores, top_k_rois
def _propose_rois_gpu(scores,
                      boxes,
                      anchor_boxes,
                      height,
                      width,
                      scale,
                      rpn_pre_nms_topn,
                      rpn_post_nms_topn,
                      rpn_nms_threshold,
                      rpn_min_size,
                      bbox_reg_weights):
  """Proposes RoIs given a group of candidates (GPU version).

  Pipeline: decode all candidates against their anchors, clip them to the
  image, optionally filter out boxes that are too small, then either run
  `tf.image.combined_non_max_suppression` or, when NMS is disabled, simply
  keep the top-scoring candidates.

  Args:
    scores: a tensor with a shape of [batch_size, num_boxes].
    boxes: a tensor with a shape of [batch_size, num_boxes, 4], in the encoded
      form.
    anchor_boxes: the anchors with a shape of [batch_size, num_boxes, 4]. The
      caller in this file passes a plain tensor.
    height: a tensor of shape [batch_size, 1, 1] representing the image height.
    width: a tensor of shape [batch_size, 1, 1] representing the image width.
    scale: a tensor of shape [batch_size, 1, 1] representing the image scale.
    rpn_pre_nms_topn: an integer number of top scoring RPN proposals to keep
      before applying NMS. This is *per FPN level* (not total). Here it is
      only used, capped at `num_boxes`, as the per-class output size of the
      combined NMS and to cap the post-NMS limit.
    rpn_post_nms_topn: an integer number of top scoring RPN proposals to keep
      after applying NMS. It is capped at the pre-NMS limit.
    rpn_nms_threshold: a float number between 0 and 1 as the NMS threshold
      used on RPN proposals. NMS is skipped unless this is greater than 0.
    rpn_min_size: an integer number as the minimum proposal height and width,
      as both need to be greater than this number. Filtering is skipped unless
      this is greater than 0. Note that this number is at the original image
      scale, not the scale used during training or inference.
    bbox_reg_weights: None or a list of four integers specifying the weights
      used when decoding the box.

  Returns:
    scores: the scores of the selected proposals (presumably
      [batch_size, k] where k is the post-NMS limit -- TODO confirm for the
      `box_utils.top_k` path).
    boxes: the selected proposal boxes (presumably [batch_size, k, 4]). On
      the NMS path they are passed through
      `box_utils.to_absolute_coordinates` before being returned.
  """
  batch_size, num_boxes = scores.get_shape().as_list()

  pre_nms_k = (rpn_pre_nms_topn if rpn_pre_nms_topn < num_boxes
               else num_boxes)

  boxes = box_utils.decode_boxes(boxes, anchor_boxes, bbox_reg_weights)
  boxes = box_utils.clip_boxes(boxes, height, width)

  if rpn_min_size > 0.0:
    # The scores get a trailing axis for the filter call and lose it again
    # afterwards.
    scores_with_axis = tf.expand_dims(scores, axis=-1)
    boxes, scores_with_axis = box_utils.filter_boxes(
        boxes, scores_with_axis, rpn_min_size, height, width, scale)
    scores = tf.squeeze(scores_with_axis, axis=-1)

  post_nms_k = min(rpn_post_nms_topn, pre_nms_k)

  if not rpn_nms_threshold > 0:
    # NMS disabled: just keep the highest scoring boxes.
    scores, selected = box_utils.top_k(
        scores, k=post_nms_k, boxes_list=[boxes])
    return scores, selected[0]

  # Normalize coordinates as combined_non_max_suppression currently
  # only supports normalized coordinates.
  normalized_boxes = box_utils.to_normalized_coordinates(boxes, height, width)
  # combined_non_max_suppression takes a class axis; use a single class.
  nms_input_boxes = tf.reshape(normalized_boxes,
                               [batch_size, num_boxes, 1, 4])
  nms_input_scores = tf.reshape(scores, [batch_size, num_boxes, 1])
  nmsed_boxes, nmsed_scores, _, _ = tf.image.combined_non_max_suppression(
      nms_input_boxes,
      nms_input_scores,
      max_output_size_per_class=pre_nms_k,
      max_total_size=post_nms_k,
      iou_threshold=rpn_nms_threshold,
      score_threshold=0.0,
      pad_per_class=False)
  # Undo the normalization applied before NMS.
  absolute_boxes = box_utils.to_absolute_coordinates(
      nmsed_boxes, height, width)
  return nmsed_scores, absolute_boxes
def proposal_op(scores_outputs,
                box_outputs,
                all_anchors,
                image_info,
                rpn_pre_nms_topn,
                rpn_post_nms_topn,
                rpn_nms_threshold,
                rpn_min_size):
  """Proposes RoIs for the second stage nets.

  This proposal op performs the following operations.
    1. propose rois at each level (see `_proposal_op_per_level`).
    2. collect all proposals by concatenating them along axis 1.
    3. keep rpn_post_nms_topn proposals by their sorted scores from the
       highest to the lowest.
  Gradients are stopped on both returned tensors.

  Reference:
    https://github.com/facebookresearch/Detectron/blob/master/detectron/ops/collect_and_distribute_fpn_rpn_proposals.py  # pylint: disable=line-too-long

  Args:
    scores_outputs: an OrderedDict with keys representing levels and values
      representing logits in [batch_size, height, width, num_anchors].
    box_outputs: an OrderedDict with keys representing levels and values
      representing box regression targets in
      [batch_size, height, width, num_anchors * 4].
    all_anchors: an Anchors object that contains all the anchors; its
      `get_unpacked_boxes()` result is indexed by level.
    image_info: a tensor of shape [batch_size, 5] whose columns encode the
      input image's [height, width, scale, original_height, original_width].
      Height and width are for the input to the network, not the original
      image; scale is the scale factor used to scale the network input size to
      the original image size. See dataloader.DetectionInputProcessor for
      details.
    rpn_pre_nms_topn: an integer number of top scoring RPN proposals to keep
      before applying NMS. This is *per FPN level* (not total).
    rpn_post_nms_topn: an integer number of top scoring RPN proposals to keep
      after applying NMS. This is the total number of RPN proposals produced.
    rpn_nms_threshold: a float number between 0 and 1 as the NMS threshold
      used on RPN proposals.
    rpn_min_size: an integer number as the minimum proposal height and width,
      as both need to be greater than this number. Note that this number is at
      the original image scale, not the scale used during training or
      inference.

  Returns:
    scores: the scores of the selected proposals, as returned by
      `box_utils.top_k` (presumably [batch_size, k] with
      k = min(total proposals, rpn_post_nms_topn) -- TODO confirm).
    rois: the selected proposal boxes, as returned by `box_utils.top_k`
      (presumably [batch_size, k, 4]). NOTE(review): confirm the coordinate
      convention against `_proposal_op_per_level` and box_utils.
  """
  with tf.name_scope('proposal'):
    collected_scores = []
    collected_rois = []
    unpacked_anchors = all_anchors.get_unpacked_boxes()

    for level in scores_outputs.keys():
      level_logits = scores_outputs[level]
      # Expands the batch dimension for anchors as anchors do not have batch
      # dimension. Note that batch_size is invariant across levels.
      batch_size = level_logits.shape[0]
      anchors_with_batch = tf.expand_dims(unpacked_anchors[level], axis=0)
      tiled_anchors = tf.tile(anchors_with_batch, [batch_size, 1, 1, 1])
      level_anchors = tf.cast(tiled_anchors, dtype=level_logits.dtype)

      level_scores, level_rois = _proposal_op_per_level(
          level_logits, box_outputs[level], level_anchors, image_info,
          rpn_pre_nms_topn, rpn_post_nms_topn, rpn_nms_threshold,
          rpn_min_size, level)
      collected_scores.append(level_scores)
      collected_rois.append(level_rois)

    scores = tf.concat(collected_scores, axis=1)
    rois = tf.concat(collected_rois, axis=1)

    with tf.name_scope('post_nms_topk'):
      # Selects the top-k rois, k being rpn_post_nms_topn or the number of
      # total anchors after non-max suppression, whichever is smaller.
      total_proposals = scores.shape[1]
      final_k = min(rpn_post_nms_topn, total_proposals)
      top_k_scores, selected = box_utils.top_k(
          scores, k=final_k, boxes_list=[rois])
      top_k_rois = selected[0]

    # The proposals are treated as constants by the second stage.
    top_k_scores = tf.stop_gradient(top_k_scores)
    top_k_rois = tf.stop_gradient(top_k_rois)
    return top_k_scores, top_k_rois
def _proposal_op_per_level(scores,
                           boxes,
                           anchor_boxes,
                           image_info,
                           rpn_pre_nms_topn,
                           rpn_post_nms_topn,
                           rpn_nms_threshold,
                           rpn_min_size,
                           level):
  """Proposes RoIs for the second stage nets, for a single FPN level.

  The steps below follow the Detectron reference, listed in the order this
  function executes them:
    1. flatten the (H, W, A) grid of scores, box deltas and anchors.
    2. sort all (proposal, score) pairs by score from highest to lowest and
       take the top rpn_pre_nms_topn proposals before NMS.
    3. apply the predicted bbox deltas to the selected anchors.
    4. clip predicted boxes to the image.
    5. remove predicted boxes with either height or width < threshold.
    6. apply NMS with a loose threshold (e.g. 0.7) to the remaining proposals
       and take the top proposals after NMS.

  Reference:
    https://github.com/facebookresearch/Detectron/blob/master/detectron/ops/generate_proposals.py  # pylint: disable=line-too-long

  Args:
    scores: a tensor with a shape of [batch_size, height, width, num_anchors].
    boxes: a tensor with a shape of
      [batch_size, height, width, num_anchors * 4], in the encoded form.
    anchor_boxes: the anchors with a shape of
      [batch_size, height, width, num_anchors * 4]. The caller in this file
      passes a plain tensor.
    image_info: a tensor of shape [batch_size, 5] whose columns encode the
      input image's [height, width, scale, original_height, original_width].
      Height and width are for the input to the network, not the original
      image; scale is the scale factor used to scale the network input size to
      the original image size. See dataloader.DetectionInputProcessor for
      details.
    rpn_pre_nms_topn: an integer number of top scoring RPN proposals to keep
      before applying NMS. This is *per FPN level* (not total). It is capped
      at height * width * num_anchors.
    rpn_post_nms_topn: an integer number of top scoring RPN proposals to keep
      after applying NMS. It is capped at the pre-NMS limit.
    rpn_nms_threshold: a float number between 0 and 1 as the NMS threshold
      used on RPN proposals. NMS is skipped unless this is greater than 0.
    rpn_min_size: an integer number as the minimum proposal height and width,
      as both need to be greater than this number. Note that this number is at
      the original image scale, not the scale used during training or
      inference.
    level: an integer number for the level that the function operates on;
      only used to name the TensorFlow scope.

  Returns:
    scores: the scores of the selected proposals, as returned by
      `box_utils.top_k` (presumably [batch_size, k] where k is the post-NMS
      limit -- TODO confirm).
    boxes: the selected proposal boxes, as returned by `box_utils.top_k`
      (presumably [batch_size, k, 4]). NOTE(review): confirm the coordinate
      convention against box_utils.
  """
  with tf.name_scope('proposal-l%d' % level):
    batch_size, grid_h, grid_w, anchors_per_cell = (
        scores.get_shape().as_list())

    # Flatten everything but the batch axis so candidates can be ranked.
    scores = tf.reshape(scores, [batch_size, -1])
    boxes = tf.reshape(boxes, [batch_size, -1, 4])
    # Map scores to [0, 1] for convenience of setting min score.
    scores = tf.sigmoid(scores)

    # Sort by score and keep at most rpn_pre_nms_topn candidates, never more
    # than the number of candidates available on this level.
    num_candidates = grid_h * grid_w * anchors_per_cell
    pre_nms_k = min(rpn_pre_nms_topn, num_candidates)

    anchor_boxes = tf.reshape(anchor_boxes, [batch_size, -1, 4])
    scores, gathered = box_utils.top_k(
        scores, k=pre_nms_k, boxes_list=[boxes, anchor_boxes])
    boxes, anchor_boxes = gathered[0], gathered[1]

    # Transforms anchors into proposals via bbox transformations.
    boxes = box_utils.batch_decode_box_outputs_op(anchor_boxes, boxes)

    # Clip proposals to image (may result in proposals with zero area that
    # will be removed in the next step).
    # NOTE(review): `_propose_rois_tpu` in this file calls
    # box_utils.clip_boxes(boxes, height, width); confirm which signature
    # box_utils actually provides.
    boxes = box_utils.clip_boxes(boxes, image_info[:, :2])

    # Remove predicted boxes with either height or width < min_size.
    # NOTE(review): `_propose_rois_tpu` in this file calls
    # box_utils.filter_boxes(boxes, scores, min_size, height, width, scale)
    # and unpacks (boxes, scores); confirm the argument and return order.
    scores, boxes = box_utils.filter_boxes(
        scores, boxes, rpn_min_size, image_info)

    # Apply loose NMS (e.g. threshold = 0.7), take the top proposals after
    # NMS (e.g. 300) and return them (-> RoIs top).
    post_nms_k = min(rpn_post_nms_topn, pre_nms_k)
    if rpn_nms_threshold > 0:
      scores, boxes = box_utils.sorted_non_max_suppression_padded(
          scores,
          boxes,
          max_output_size=post_nms_k,
          iou_threshold=rpn_nms_threshold)

    scores, selected = box_utils.top_k(
        scores, k=post_nms_k, boxes_list=[boxes])
    return scores, selected[0]
def generate_detections_per_image_op(
    cls_outputs, box_outputs, anchor_boxes, image_id, image_info,
    num_detections=100, pre_nms_num_detections=1000, nms_threshold=0.3,
    bbox_reg_weights=(10., 10., 5., 5.)):
  """Generates detections with model outputs and anchors.

  The highest scoring (anchor, class) pairs are selected, their boxes are
  decoded and clipped, NMS is run independently per class, and the overall
  best `num_detections` results are returned.

  Args:
    cls_outputs: a Tensor with shape [N, num_classes], which stacks class
      score outputs on all feature levels. N is the number of total anchors
      on all levels and num_classes is the number of classes predicted by the
      model, with class 0 being the background. The values should already be
      the output of softmax(), not raw logits. The shape must be statically
      known.
    box_outputs: a Tensor with shape [N, num_classes*4], which stacks box
      regression outputs on all feature levels. N is the number of total
      anchors on all levels.
    anchor_boxes: a Tensor with shape [N, 4], which stacks anchors on all
      feature levels. N is the number of total anchors on all levels.
    image_id: an integer number (Python int or scalar tensor) to specify the
      image id.
    image_info: a tensor of shape [5] which encodes the input image's
      [height, width, scale, original_height, original_width].
    num_detections: Number of detections after NMS.
    pre_nms_num_detections: Number of candidates before NMS. It is capped at
      the number of available (anchor, foreground class) pairs.
    nms_threshold: a float number to specify the threshold of NMS.
    bbox_reg_weights: a list of 4 float scalars, which are default weights on
      (dx, dy, dw, dh) for normalizing bbox regression targets.

  Returns:
    detections: detection results in a tensor with each row representing
      [image_id, ymin, xmin, ymax, xmax, score, class].
  """
  num_boxes, num_classes = cls_outputs.get_shape().as_list()
  num_fg_classes = num_classes - 1

  # tf.nn.top_k rejects a k larger than the number of elements it ranks, so
  # never request more candidates than (anchor, foreground class) pairs exist.
  num_candidates = min(pre_nms_num_detections, num_boxes * num_fg_classes)

  # Removes background class scores.
  cls_outputs = cls_outputs[:, 1:num_classes]
  top_k_scores, top_k_indices_with_classes = tf.nn.top_k(
      tf.reshape(cls_outputs, [-1]),
      k=num_candidates,
      sorted=True)
  # The flattened index encodes anchor_index * num_fg_classes + class_index.
  classes = tf.mod(top_k_indices_with_classes, num_fg_classes)
  top_k_indices = tf.floordiv(top_k_indices_with_classes, num_fg_classes)

  anchor_boxes = tf.gather(anchor_boxes, top_k_indices)
  # Drop the background regression outputs, then pick the box that belongs to
  # each selected (anchor, class) pair.
  box_outputs = tf.reshape(
      box_outputs, [num_boxes, num_classes, 4])[:, 1:num_classes, :]
  box_outputs = tf.gather_nd(
      box_outputs, tf.stack([top_k_indices, classes], axis=1))

  # Applies bounding box regression to anchors. The box_utils helpers are
  # called with a leading batch axis of size 1, which is stripped again.
  boxes = box_utils.batch_decode_box_outputs_op(
      tf.expand_dims(anchor_boxes, axis=0),
      tf.expand_dims(box_outputs, axis=0),
      bbox_reg_weights)[0]
  boxes = box_utils.clip_boxes(
      tf.expand_dims(boxes, axis=0),
      tf.expand_dims(image_info[:2], axis=0))[0]

  # Replicate the candidates once per foreground class so that NMS can run
  # per class with the class axis acting as the batch axis.
  classes = tf.tile(tf.reshape(classes, [1, num_candidates]),
                    [num_fg_classes, 1])
  scores = tf.tile(tf.reshape(top_k_scores, [1, num_candidates]),
                   [num_fg_classes, 1])
  boxes = tf.tile(tf.reshape(boxes, [1, num_candidates, 4]),
                  [num_fg_classes, 1, 1])

  # Row c keeps only the scores of candidates predicted as class c.
  class_bitmask = tf.tile(
      tf.reshape(tf.range(num_fg_classes), [num_fg_classes, 1]),
      [1, num_candidates])
  scores = tf.where(tf.equal(classes, class_bitmask), scores,
                    tf.zeros_like(scores))
  # Zero out low-confidence candidates.
  min_score = 0.05
  scores = tf.where(tf.greater(scores, min_score), scores,
                    tf.zeros_like(scores))

  # Reshape classes to be compatible with the top_k function.
  # NOTE(review): other callers in this file pass `boxes_list=` to
  # box_utils.top_k; confirm that `tensors=` is an accepted keyword.
  classes = tf.reshape(classes, [num_fg_classes, num_candidates, 1])
  scores, sorted_tensors = box_utils.top_k(
      scores, k=num_candidates, tensors=[boxes, classes])
  boxes = sorted_tensors[0]
  classes = tf.reshape(sorted_tensors[1], [num_fg_classes, num_candidates])

  (post_nms_scores, post_nms_boxes,
   idx) = non_max_suppression.non_max_suppression_padded(
       scores,
       boxes,
       max_output_size=num_detections,
       iou_threshold=nms_threshold,
       level=0)

  # Sorts all results across classes.
  sorted_scores, sorted_indices = tf.nn.top_k(
      tf.cast(tf.reshape(post_nms_scores, [-1]), tf.float32),
      k=num_detections,
      sorted=True)
  post_nms_boxes = tf.gather(
      tf.reshape(post_nms_boxes, [-1, 4]), sorted_indices)
  classes = tf.batch_gather(classes, idx)
  # Shift by one to restore the class ids that include the background class.
  post_nms_classes = tf.gather(tf.reshape(classes, [-1]), sorted_indices) + 1

  if isinstance(image_id, int):
    image_id = tf.constant(image_id)
  image_id = tf.reshape(image_id, [])
  detections_result = tf.stack(
      [
          tf.cast(tf.fill(tf.shape(sorted_scores), image_id), tf.float32),
          post_nms_boxes[:, 0],
          post_nms_boxes[:, 1],
          post_nms_boxes[:, 2],
          post_nms_boxes[:, 3],
          sorted_scores,
          tf.cast(post_nms_classes, tf.float32),
      ],
      axis=1)
  return detections_result