def _strict_pad_image(image, boxes, height, width, masks=None,
                      value=IMAGENET_MEAN):
  """Always pads the image to the desired height and width uniformly with the
  given pixel value.

  First draws a canvas of size [height, width] filled with pixel value
  `value`, then places the input image at the center, and updates the box
  coordinates (and optionally masks) to the new frame.

  NOTE: no padding will be performed in the height and/or width dimension if
  the desired size is less than that of the image.

  Args:
    image: float tensor of shape [height_in, width_in, channels].
    boxes: float tensor of shape [num_boxes, 4], where each row contains
      normalized (i.e. values varying in [0, 1]) box coordinates:
      [ymin, xmin, ymax, xmax].
    height: float scalar, the desired height of the padded image.
    width: float scalar, the desired width of the padded image.
    masks: (Optional) a tensor of shape [num_boxes, height, width], holding
      binary masks of `num_boxes` instances.
    value: float tensor of shape [3], RGB value to fill the padded region
      with.

  Returns:
    new_image: float tensor of shape [height, width, channels].
    new_boxes: float tensor of shape [num_boxes, 4].
    new_masks: (Optional) float tensor of shape [num_boxes, height, width].
  """
  value = tf.to_float(value)
  img_height, img_width, _ = tf.unstack(tf.shape(image))
  img_height, img_width = tf.to_float(img_height), tf.to_float(img_width)

  # no padding in the height and/or width dimension if the desired height
  # and/or width is less than that of the image
  height = tf.maximum(height, img_height)
  width = tf.maximum(width, img_width)

  pad_up = (height - img_height) // 2
  pad_down = height - img_height - pad_up
  pad_left = (width - img_width) // 2
  pad_right = width - img_width - pad_left

  # pad image
  image -= value
  new_image = tf.pad(image,
                     [[pad_up, pad_down], [pad_left, pad_right], [0, 0]])
  new_image += value

  # pad boxes
  window = -pad_up, -pad_left, img_height + pad_down, img_width + pad_right
  normalizer = img_height, img_width, img_height, img_width
  window = tf.to_float(window) / tf.to_float(normalizer)
  new_boxes = box_list_ops.change_coordinate_frame(
      box_list.BoxList(boxes), window).get()

  # pad masks
  if masks is not None:
    new_masks = tf.pad(masks,
                       [[0, 0], [pad_up, pad_down], [pad_left, pad_right]])
    return new_image, new_boxes, new_masks

  return new_image, new_boxes
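# Illustrative sketch (not part of the original module): a NumPy check of how
# `_strict_pad_image` remaps normalized box coordinates. Padding a 100x100
# image to a 200x200 canvas centers the image, so a box spanning the whole
# input, [0, 0, 1, 1], should land at [0.25, 0.25, 0.75, 0.75] in the padded
# frame. The frame-change formula below mirrors what
# `box_list_ops.change_coordinate_frame` is assumed to compute.
def _example_pad_coordinate_frame():
  import numpy as np

  img_height = img_width = 100.
  height = width = 200.
  pad_up = (height - img_height) // 2
  pad_left = (width - img_width) // 2

  window = np.array([-pad_up, -pad_left,
                     img_height + (height - img_height - pad_up),
                     img_width + (width - img_width - pad_left)])
  window /= np.array([img_height, img_width, img_height, img_width])
  # window == [-0.5, -0.5, 1.5, 1.5]

  box = np.array([0., 0., 1., 1.])  # box covering the whole input image
  win_height, win_width = window[2] - window[0], window[3] - window[1]
  new_box = (box - window[[0, 1, 0, 1]]) / np.array(
      [win_height, win_width, win_height, win_width])
  return new_box  # [0.25, 0.25, 0.75, 0.75]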
def _create_localization_targets(self, anchors_boxlist, gt_boxlist, match):
  """Creates localization targets for a single image.

  Args:
    anchors_boxlist: a BoxList instance, holding float tensor of shape
      [num_anchors, 4] as the anchor boxes coordinates for a single image.
    gt_boxlist: a BoxList instance, holding float tensor of shape
      [num_gt_boxes, 4] as the groundtruth boxes coordinates for a single
      image.
    match: a Match instance.

  Returns:
    loc_targets: a float tensor of shape [num_anchors, 4].
  """
  unmatched_loc_target = self._dummy_localization_target()
  ignored_loc_target = unmatched_loc_target

  loc_targets = match.gather_based_on_match(
      gt_boxlist.get(),
      unmatched_value=unmatched_loc_target,
      ignored_value=ignored_loc_target)
  loc_targets_boxlist = box_list.BoxList(loc_targets)
  # BoxLists `loc_targets_boxlist` and `anchors_boxlist` have one-to-one
  # correspondence
  loc_targets = self._box_coder.encode(loc_targets_boxlist, anchors_boxlist)
  return loc_targets
def batch_decode(batch_box_encodings, anchor_boxlist_list, box_coder):
  """Decodes a batch of box encodings w.r.t. anchors to box coordinates.

  Args:
    batch_box_encodings: a float tensor of shape [batch_size, num_anchors,
      num_classes, 4] holding box encoding predictions.
    anchor_boxlist_list: a list of BoxList instances, each holding a float
      tensor of shape [num_anchors, 4] as anchor box coordinates. Length is
      equal to `batch_size`.
    box_coder: a BoxCoder instance to decode anchor-encoded location
      predictions into box coordinate predictions.

  Returns:
    decoded_boxes: a float tensor of shape [batch_size, num_anchors,
      num_classes, 4].
  """
  shape = shape_utils.combined_static_and_dynamic_shape(batch_box_encodings)
  box_encodings_list = [tf.reshape(box_encoding, [-1, box_coder.code_size])
                        for box_encoding in
                        tf.unstack(batch_box_encodings, axis=0)]
  # tile anchors in the 1st dimension to `shape[2]` (i.e. num of classes)
  anchor_boxlist_list = [box_list.BoxList(
      tf.reshape(tf.tile(tf.expand_dims(anchor_boxlist.get(), 1),
                         [1, shape[2], 1]), [-1, box_coder.code_size]))
      for anchor_boxlist in anchor_boxlist_list]

  decoded_boxes = []
  for box_encodings, anchor_boxlist in zip(
      box_encodings_list, anchor_boxlist_list):
    decoded_boxes.append(
        box_coder.decode(box_encodings, anchor_boxlist).get())

  decoded_boxes = tf.reshape(decoded_boxes, shape)
  return decoded_boxes
def _decode(self, rel_codes, anchors):
  """Decode relative encoding of box coordinates back to coordinates.

  Args:
    rel_codes: a tensor of shape [num_boxes, 4] where each row holds the
      anchor-encoded box coordinates in ty, tx, th, tw format.
    anchors: a BoxList holding `num_boxes` anchors that `rel_codes` are
      decoded relative to.

  Returns:
    boxlist: a BoxList holding `num_boxes` boxes.
  """
  ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes()

  ty, tx, th, tw = tf.unstack(rel_codes, axis=1)
  if self._scale_factors:
    ty /= self._scale_factors[0]
    tx /= self._scale_factors[1]
    th /= self._scale_factors[2]
    tw /= self._scale_factors[3]

  ycenter = ty * ha + ycenter_a
  xcenter = tx * wa + xcenter_a
  h = tf.exp(th) * ha
  w = tf.exp(tw) * wa

  # convert box coordinates back to ymin, xmin, ymax, xmax format.
  ymin = ycenter - h / 2.
  xmin = xcenter - w / 2.
  ymax = ycenter + h / 2.
  xmax = xcenter + w / 2.
  return box_list.BoxList(tf.stack([ymin, xmin, ymax, xmax], axis=1))
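# Illustrative sketch (not part of the original module): the decode math in
# `_decode` above worked out in NumPy for a single anchor, assuming
# scale_factors = [10., 10., 5., 5.].
def _example_decode_math():
  import numpy as np

  ycenter_a, xcenter_a, ha, wa = 0.5, 0.5, 0.2, 0.2   # anchor center / size
  ty, tx, th, tw = (np.array([1.0, 2.0, 0.5, -0.5]) /
                    np.array([10., 10., 5., 5.]))      # rescaled rel codes

  ycenter = ty * ha + ycenter_a    # 0.52
  xcenter = tx * wa + xcenter_a    # 0.54
  h = np.exp(th) * ha              # ~0.221
  w = np.exp(tw) * wa              # ~0.181
  # [ymin, xmin, ymax, xmax]
  return np.array([ycenter - h / 2., xcenter - w / 2.,
                   ycenter + h / 2., xcenter + w / 2.])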
def concatenate(boxlists, scope=None):
  """Concatenate a list of BoxLists.

  Each BoxList in the list must have the same set of fields, and the tensor
  stored in each field must have the same rank, and the same fully defined
  shape except for possibly the 0th dimension (i.e. num of boxes). This
  function will create a brand new BoxList.

  Args:
    boxlists: a list of BoxLists, holding `n_1`, `n_2`, ..., `n_b` boxes.
    scope: string scalar, name scope.

  Returns:
    a BoxList holding `sum(n_1, n_2, ..., n_b)` boxes, along with the
      additional fields holding `b` tensors concatenated along the 0th
      dimension.
  """
  with tf.name_scope(scope, 'Concatenate'):
    concatenated = box_list.BoxList(
        tf.concat([boxlist.get() for boxlist in boxlists], 0))
    fields = boxlists[0].get_extra_fields()
    for field in fields:
      concatenated_field = tf.concat(
          [boxlist.get_field(field) for boxlist in boxlists], 0)
      concatenated.set_field(field, concatenated_field)
    return concatenated
def tile_anchors(grid_height,
                 grid_width,
                 scales,
                 aspect_ratios,
                 anchor_stride,
                 anchor_offset,
                 base_anchor_size=(1.0, 1.0)):
  """Creates a tiled set of anchors strided along a grid in image space.

  Args:
    grid_height: int scalar or int scalar tensor, height of the grid.
    grid_width: int scalar or int scalar tensor, width of the grid.
    scales: a list of floats, the scales of anchors.
    aspect_ratios: a list of floats, the aspect ratios. Has the same length
      as `scales`.
    anchor_stride: a 2-tuple of float scalars or float scalar tensors, the
      distance between neighboring grid centers in the height and width
      dimensions.
    anchor_offset: a 2-tuple of float scalars or float scalar tensors, the
      (height, width) coordinate of the upper left grid.
    base_anchor_size: a float tensor of shape [2], holding the height and
      width of the base anchor. Defaults to a unit square.

  Returns:
    a BoxList instance holding `grid_height * grid_width * len(scales)`
      anchor boxes.
  """
  base_anchor_size = tf.convert_to_tensor(base_anchor_size)
  ratio_sqrts = tf.sqrt(aspect_ratios)
  heights = scales / ratio_sqrts * base_anchor_size[0]
  widths = scales * ratio_sqrts * base_anchor_size[1]

  y_centers = tf.to_float(tf.range(grid_height))
  y_centers = y_centers * anchor_stride[0] + anchor_offset[0]
  x_centers = tf.to_float(tf.range(grid_width))
  x_centers = x_centers * anchor_stride[1] + anchor_offset[1]

  y_centers, x_centers = tf.meshgrid(y_centers, x_centers, indexing='ij')

  y_centers = tf.reshape(y_centers, [-1, 1])
  x_centers = tf.reshape(x_centers, [-1, 1])
  heights = tf.reshape(heights, [1, -1])
  widths = tf.reshape(widths, [1, -1])

  coordinates = tf.reshape(
      tf.stack([y_centers - .5 * heights,
                x_centers - .5 * widths,
                y_centers + .5 * heights,
                x_centers + .5 * widths], axis=2), [-1, 4])

  return box_list.BoxList(coordinates)
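# Illustrative sketch (not part of the original module): the per-anchor shape
# math in `tile_anchors`, in plain Python. With a unit base anchor, height is
# scale / sqrt(aspect_ratio) and width is scale * sqrt(aspect_ratio), so the
# anchor area stays scale**2 regardless of aspect ratio.
def _example_anchor_shapes():
  import math

  scales = [0.5, 1.0]
  aspect_ratios = [2.0, 0.5]
  shapes = []
  for scale, aspect_ratio in zip(scales, aspect_ratios):
    height = scale / math.sqrt(aspect_ratio)   # 0.354, 1.414
    width = scale * math.sqrt(aspect_ratio)    # 0.707, 0.707
    shapes.append((height, width))
  return shapes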
def sample_frcnn_minibatch(model,
                           batch_proposal_boxes,
                           batch_num_proposals,
                           gt_boxlist_list):
  """Samples a minibatch of proposal boxes to send to Fast RCNN at training
  time.

  The decoded, nms'ed, and clipped proposal boxes from RPN are further
  sampled to an even smaller set to be used for extracting ROI feature maps
  for Fast RCNN.

  Note: the sampling takes into account the label of each proposal box,
  determined by the target assigner, so we DON'T have to run the target
  assigner AGAIN.

  Args:
    model: an instance of DetectionModel.
    batch_proposal_boxes: float tensor of shape [batch_size,
      max_num_proposals, 4], nms'ed proposals, potentially padded.
    batch_num_proposals: int tensor of shape [batch_size], holding the actual
      number of non-padded proposal boxes.
    gt_boxlist_list: a list of BoxList instances, each holding `num_gt_boxes`
      groundtruth boxes, with extra field 'labels' holding float tensor of
      shape [num_gt_boxes, num_classes + 1] (groundtruth box labels). Length
      of list is equal to `batch_size`.

  Returns:
    proposal_boxlist_list: a list of BoxList instances, each holding
      `max_num_proposals` proposal boxes (coordinates normalized). The fields
      are potentially zero-padded up to `max_num_proposals`. Length of list
      is equal to `batch_size`.
    batch_num_proposals: int tensor of shape [batch_size], holding the num of
      sampled proposals in `proposal_boxlist_list`.
  """
  proposal_boxlist_list = []
  num_proposals_list = []
  for proposal_boxes, num_proposals, gt_boxlist in zip(
      tf.unstack(batch_proposal_boxes),
      tf.unstack(batch_num_proposals),
      gt_boxlist_list):
    # unpadded proposal BoxList
    proposal_boxlist = box_list.BoxList(proposal_boxes[:num_proposals])

    sampled_proposal_boxlist = _sample_frcnn_minibatch_per_image(
        model, proposal_boxlist, gt_boxlist)

    # re-pad the proposal boxes back to size `max_num_proposals`
    padded_proposal_boxlist = box_list_ops.pad_or_clip_box_list(
        sampled_proposal_boxlist, size=model._frcnn_minibatch_size)

    proposal_boxlist_list.append(padded_proposal_boxlist)
    num_proposals_list.append(
        tf.minimum(sampled_proposal_boxlist.num_boxes(),
                   model._frcnn_minibatch_size))

  return proposal_boxlist_list, tf.stack(num_proposals_list)
def preprocess_groundtruth(tensor_dict):
  """Packages the groundtruth labels tensor and boxes tensor (and optionally
  masks) as a BoxList.

  Args:
    tensor_dict: a dict mapping from tensor names to lists of tensors:
      { 'image': list of tensors of shape [height, width, channels],
        'groundtruth_boxes': list of tensors of shape [num_gt_boxes, 4],
        'groundtruth_labels': list of tensors of shape [num_gt_boxes] or
          [num_gt_boxes, num_classes],
        'groundtruth_masks': (Optional) list of tensors of shape
          [num_gt_boxes, height, width]}
      Length of each list is equal to batch size.

  Returns:
    gt_boxlist_list: a list of BoxList instances, each holding `num_gt_boxes`
      groundtruth boxes with extra field 'labels' (and optionally 'masks').
  """
  gt_boxes_list = tensor_dict[TensorDictFields.groundtruth_boxes]
  gt_labels_list = tensor_dict[TensorDictFields.groundtruth_labels]
  gt_masks_list = None
  if TensorDictFields.groundtruth_masks in tensor_dict:
    gt_masks_list = tensor_dict[TensorDictFields.groundtruth_masks]

  if len(gt_boxes_list) != len(gt_labels_list):
    raise ValueError('`gt_boxes_list` must have the same length as '
                     '`gt_labels_list`.')
  if gt_masks_list is not None and len(gt_masks_list) != len(gt_boxes_list):
    raise ValueError('`gt_masks_list` must have the same length as '
                     '`gt_boxes_list`.')

  gt_masks_list = gt_masks_list or [None] * len(gt_boxes_list)

  gt_boxlist_list = []
  for gt_boxes, gt_labels, gt_masks in zip(gt_boxes_list,
                                           gt_labels_list,
                                           gt_masks_list):
    gt_boxlist = box_list.BoxList(gt_boxes)
    gt_boxlist.set_field('labels', gt_labels)
    if gt_masks is not None:
      gt_boxlist.set_field('masks', gt_masks)
    gt_boxlist_list.append(gt_boxlist)

  return gt_boxlist_list
def process_per_image_detection(image_list,
                                detection_dict,
                                gt_boxlist_list=None):
  """Processes the nms'ed, potentially padded detection results for a single
  image.

  Unpads the detection results and converts normalized coordinates into
  absolute coordinates.

  Args:
    image_list: a list of float tensors of shape [height, width, depth].
      Length is equal to `batch_size`.
    detection_dict: a dict mapping from strings to tensors, holding the
      following entries:
      { 'boxes': float tensor of shape [batch_size, max_num_proposals, 4].
        'scores': float tensor of shape [batch_size, max_num_proposals].
        'classes': float tensor of shape [batch_size, max_num_proposals].
        'num_detections': int tensor of shape [batch_size], holding num of
          valid (not zero-padded) detections in each of the above tensors.
        'masks': (Optional) float tensor of shape [batch_size,
          max_num_proposals, mask_height, mask_width].}
    gt_boxlist_list: a list of BoxList instances, each holding `num_gt_boxes`
      groundtruth boxes, with extra field 'labels' holding float tensor of
      shape [num_gt_boxes, num_classes + 1] (groundtruth box labels). Length
      of list is equal to `batch_size`.

  Returns:
    to_be_run_tensor_dict: a dict mapping from strings to tensors, holding
      the following entries:
      { 'image': uint8 tensor of shape [height, width, depth], holding the
          original image.
        'boxes': float tensor of shape [num_val_detections, 4], holding
          coordinates of predicted boxes.
        'scores': float tensor of shape [num_val_detections], holding
          predicted confidence scores.
        'classes': int tensor of shape [num_val_detections], holding
          predicted class indices.
        'gt_boxes': float tensor of shape [num_gt_boxes, 4], holding
          coordinates of groundtruth boxes.
        'gt_labels': int tensor of shape [num_gt_boxes], holding groundtruth
          box class indices.}
  """
  boxes = detection_dict['boxes']
  scores = detection_dict['scores']
  classes = tf.to_int32(detection_dict['classes'])
  num_detections = detection_dict['num_detections']

  if len(image_list) != 1:
    raise ValueError('`image_list` must contain exactly one image tensor.')
  if not (boxes.shape[0].value == 1 and scores.shape[0].value == 1 and
          classes.shape[0].value == 1 and num_detections.shape[0].value == 1):
    raise ValueError('`boxes`, `scores`, `classes`, `num_detections` must '
                     'have size 1 in the 0th dimension (i.e. batch size).')
  if gt_boxlist_list is not None and len(gt_boxlist_list) != 1:
    raise ValueError('`gt_boxlist_list` must contain exactly one groundtruth '
                     'BoxList.')

  boxes, scores, classes, num_detections, image = (boxes[0],
                                                   scores[0],
                                                   classes[0],
                                                   num_detections[0],
                                                   image_list[0])
  boxes, classes, scores = (boxes[:num_detections],
                            classes[:num_detections],
                            scores[:num_detections])
  height, width = tf.unstack(tf.shape(image)[:2])

  if 'masks' in detection_dict:
    # [max_num_proposals, mask_height, mask_width]
    masks = detection_dict['masks'][0][:num_detections]
    image_size_masks = ops.to_image_size_masks(masks, boxes, height, width)
    image_size_masks = tf.cast(image_size_masks > 0.5, tf.uint8)

  boxes = box_list_ops.to_absolute_coordinates(box_list.BoxList(boxes),
                                               height, width).get()

  to_be_run_tensor_dict = {
      'image': tf.cast(image, tf.uint8),
      'boxes': boxes,
      'scores': scores,
      'classes': classes}
  if 'masks' in detection_dict:
    to_be_run_tensor_dict['masks'] = image_size_masks

  if gt_boxlist_list is not None:
    gt_boxlist = gt_boxlist_list[0]
    gt_boxes = box_list_ops.to_absolute_coordinates(
        gt_boxlist, height, width).get()
    gt_labels = tf.argmax(gt_boxlist.get_field('labels'),
                          axis=1, output_type=tf.int32)
    to_be_run_tensor_dict['gt_boxes'] = gt_boxes
    to_be_run_tensor_dict['gt_labels'] = gt_labels
    if gt_boxlist.has_field('masks'):
      to_be_run_tensor_dict['gt_masks'] = gt_boxlist.get_field('masks')

  return to_be_run_tensor_dict
def compute_rpn_loss(model, rpn_prediction_dict, gt_boxlist_list):
  """Computes the localization and classification (objectness) loss of RPN.

  Args:
    model: an instance of DetectionModel.
    rpn_prediction_dict: a dict mapping from strings to tensors/BoxLists.
      Must hold the following entries:
      { 'box_encoding_predictions': float tensor of shape [batch_size,
          num_anchors, 1, 4],
        'objectness_predictions': float tensor of shape [batch_size,
          num_anchors, 2],
        'anchor_boxlist_list': a list of BoxList instances, each holding
          `num_anchors` anchor boxes. Length is equal to `batch_size`.}
    gt_boxlist_list: a list of BoxList instances, each holding `num_gt_boxes`
      groundtruth boxes. No extra field holding groundtruth class labels is
      needed, as they will be generated for RPN. Length of list is equal to
      `batch_size`.

  Returns:
    rpn_losses_dict: a tensor dict mapping from strings to tensors, holding
      the following entries:
      { 'loc_loss': float scalar tensor, proposal box localization loss.
        'cls_loss': float scalar tensor, proposal box objectness
          (classification) loss.}
  """
  rpn_box_encoding_predictions = tf.squeeze(
      rpn_prediction_dict['box_encoding_predictions'], axis=2)
  rpn_objectness_predictions = rpn_prediction_dict['objectness_predictions']
  anchors_boxlist_list = rpn_prediction_dict['anchor_boxlist_list']

  with tf.name_scope('RPNLoss'):
    batch_size = len(gt_boxlist_list)

    rpn_gt_boxlist_list = []
    # generate objectness labels for rpn_gt_boxlist
    for gt_boxlist in gt_boxlist_list:
      gt_boxlist = box_list.BoxList(gt_boxlist.get())
      gt_boxlist.set_field(
          'labels', tf.tile([[0., 1.]], [gt_boxlist.num_boxes(), 1]))
      rpn_gt_boxlist_list.append(gt_boxlist)

    (batch_loc_targets,
     batch_loc_weights,
     batch_cls_targets,
     batch_cls_weights,
     _, _) = target_assigner.batch_assign_targets(
         model._rpn_target_assigner, anchors_boxlist_list,
         rpn_gt_boxlist_list)

    def rpn_minibatch_subsample_fn(args):
      cls_targets, cls_weights = args
      cls_targets = cls_targets[:, -1]
      return [model._rpn_minibatch_sampler_fn(
          tf.cast(cls_weights, tf.bool),
          model._rpn_minibatch_size,
          tf.cast(cls_targets, tf.bool))]

    # indicator of shape [batch_size, num_anchors], where each row sums to
    # `rpn_minibatch_size`, indicating anchors for which objectness losses
    # are computed.
    batch_sampled_indicator = tf.to_float(shape_utils.static_map_fn(
        rpn_minibatch_subsample_fn,
        [batch_cls_targets, batch_cls_weights]))
    # indicator of shape [batch_size, num_anchors], where each row sums to a
    # value <= `rpn_minibatch_size` * pos_frac, indicating anchors for which
    # localization losses are computed.
    sampled_loc_indicator = batch_sampled_indicator * batch_loc_weights

    # [batch_size]
    sample_sizes = tf.reduce_sum(batch_sampled_indicator, axis=1)
    # [batch_size, num_anchors]
    loc_losses = model._rpn_localization_loss_fn(
        rpn_box_encoding_predictions,
        batch_loc_targets,
        weights=sampled_loc_indicator)
    # [batch_size, num_anchors]
    cls_losses = model._rpn_classification_loss_fn(
        rpn_objectness_predictions,
        batch_cls_targets,
        weights=batch_sampled_indicator)

    # normalize loc and cls losses of shape [batch_size, num_anchors] over
    # anchors in an image, and over images in a batch
    loc_loss = tf.reduce_mean(tf.reduce_sum(loc_losses, axis=1) / sample_sizes)
    cls_loss = tf.reduce_mean(tf.reduce_sum(cls_losses, axis=1) / sample_sizes)

    loc_loss = tf.multiply(loc_loss,
                           model._rpn_localization_loss_weight,
                           name='rpn_loc_loss')
    cls_loss = tf.multiply(cls_loss,
                           model._rpn_classification_loss_weight,
                           name='rpn_cls_loss')

    return {'loc_loss': loc_loss, 'cls_loss': cls_loss}
def postprocess_rpn(self, rpn_prediction_dict, gt_boxlist_list=None):
  """Postprocesses output tensors from RPN.

  The proposal box encoding predictions from RPN will be decoded w.r.t. the
  anchors they are associated with, and will go through non-max suppression.
  If run at training time, the nms'ed proposals will be further sampled to a
  smaller set before being used to extract ROI features in the next stage.

  Note the proposal BoxLists in the output list are potentially zero-padded
  because of the NMS. The actual num of valid proposals is indicated in
  `num_proposals`.

  Args:
    rpn_prediction_dict: a dict mapping from strings to tensors/BoxLists.
      Must hold the following entries:
      { 'box_encoding_predictions': float tensor of shape [batch_size,
          num_anchors, 1, 4],
        'objectness_predictions': float tensor of shape [batch_size,
          num_anchors, 2],
        'anchor_boxlist_list': a list of BoxList instances, each holding
          `num_anchors` anchor boxes. Length is equal to `batch_size`.}
    gt_boxlist_list: a list of BoxList instances, each holding `num_gt_boxes`
      groundtruth boxes, with extra field 'labels' holding float tensor of
      shape [num_gt_boxes, num_classes + 1] (groundtruth box labels). Length
      of list is equal to `batch_size`. Must be provided at training time.

  Returns:
    rpn_detection_dict: a dict mapping from strings to tensors/BoxLists,
      holding the following entries:
      { 'proposal_boxlist_list': a list of BoxList instances, each holding
          `max_num_proposals` proposal boxes (coordinates normalized). The
          fields are potentially zero-padded up to `max_num_proposals`.
          Length of list is equal to `batch_size`.
        'num_proposals': int tensor of shape [batch_size], holding the actual
          num of valid boxes (not zero-padded) in each BoxList of
          `proposal_boxlist_list`.}
  """
  if self.is_training and gt_boxlist_list is None:
    raise ValueError('`gt_boxlist_list` must be provided at training time.')

  box_encoding_predictions = rpn_prediction_dict['box_encoding_predictions']
  objectness_predictions = rpn_prediction_dict['objectness_predictions']
  anchor_boxlist_list = rpn_prediction_dict['anchor_boxlist_list']
  batch_size = objectness_predictions.shape[0].value

  # [batch_size, num_anchors, 1, 4]
  proposal_boxes = box_coder.batch_decode(box_encoding_predictions,
                                          anchor_boxlist_list,
                                          self._box_coder)
  objectness_predictions = self._rpn_score_conversion_fn(
      objectness_predictions)[:, :, 1:]

  # proposal_boxes: [batch_size, max_num_proposals, 4]
  #   nms'ed proposals for each image in a batch, potentially padded
  # num_proposals: [batch_size]
  #   actual number of non-padded proposal boxes for each image in a batch
  (proposal_boxes, _, _, num_proposals) = self._rpn_nms_fn(
      proposal_boxes,
      objectness_predictions,
      clip_window=ops.get_unit_square(batch_size))

  proposal_boxlist_list = [box_list.BoxList(proposal)
                           for proposal in tf.unstack(proposal_boxes)]
  rpn_detection_dict = {'proposal_boxlist_list': proposal_boxlist_list,
                        'num_proposals': num_proposals}

  if self.is_training:
    proposal_boxes = tf.stop_gradient(proposal_boxes)
    # sample an even smaller set of nms'ed proposals for Fast RCNN
    # (e.g. 300 -> 64)
    proposal_boxlist_list, num_proposals = commons.sample_frcnn_minibatch(
        self, proposal_boxes, num_proposals, gt_boxlist_list)
    rpn_detection_dict = {'proposal_boxlist_list': proposal_boxlist_list,
                          'num_proposals': num_proposals}

  return rpn_detection_dict
def multiclass_non_max_suppression(boxes,
                                   scores,
                                   score_thresh,
                                   iou_thresh,
                                   max_size_per_class,
                                   max_total_size=0,
                                   clip_window=None,
                                   scope=None):
  """Performs a multiclass version of non maximum suppression on a single
  image.

  The multiclass NMS is performed in two stages:
  1. NMS is performed on boxes independently for each class, where boxes are
     filtered by score and clipped to a window before going through NMS. Note
     that NMS caps the total number of nms'ed boxes to a given size.
  2. The nms'ed boxes over all classes are then merged and sorted in
     descending order by their class-specific scores, and only the top
     scoring boxes are retained.

  Note it is required that `boxes` and `scores` have matched `num_classes`
  -- `shape(boxes)[1] == shape(scores)[1]`. If different classes (> 1) share
  the same set of box encodings (e.g. SSD, in which case
  shape(boxes)[1] == 1), the caller of this function needs to tile `boxes`
  to have size `num_classes` in the 1st dimension.

  Args:
    boxes: float tensor of shape [num_boxes, num_classes, 4], holding box
      coordinates for each of the `num_classes` classes.
    scores: float tensor of shape [num_boxes, num_classes], holding box
      scores for each of the `num_classes` classes.
    score_thresh: float scalar, boxes with score < `score_thresh` are
      removed.
    iou_thresh: float scalar, IOU threshold for non-max suppression. Must be
      in [0.0, 1.0].
    max_size_per_class: int scalar, max num of retained boxes per class after
      NMS.
    max_total_size: int scalar, max num of boxes retained over all classes.
    clip_window: float tensor of shape [4], holding ymin, xmin, ymax, xmax of
      a clip window.
    scope: string scalar, name scope.

  Returns:
    sorted_boxlist: a BoxList instance holding up to `max_total_size` boxes,
      with extra fields 'scores' (float tensor of shape [num_boxes]) and
      'classes' (int tensor of shape [num_boxes]), where
      `num_boxes` <= `max_total_size`. Note this BoxList contains boxes from
      all classes and they are sorted in descending order of their
      class-specific score.
  """
  if boxes.shape[1].value is None or scores.shape[1].value is None:
    raise ValueError('`shape(boxes)[1]` and `shape(scores)[1]` must be '
                     'statically defined.')
  if boxes.shape[1].value != scores.shape[1].value:
    raise ValueError('`shape(boxes)[1]` must be equal to '
                     '`shape(scores)[1]`.')

  with tf.name_scope(scope, 'MultiClassNonMaxSuppression'):
    num_classes = boxes.shape[1].value
    selected_boxlist_list = []
    per_class_boxes_list = tf.unstack(boxes, axis=1)
    per_class_scores_list = tf.unstack(scores, axis=1)

    # stage 1: class-wise non-max suppression
    for class_index, per_class_boxes, per_class_scores in zip(
        range(num_classes), per_class_boxes_list, per_class_scores_list):
      per_class_boxlist = box_list.BoxList(per_class_boxes)
      per_class_boxlist.set_field(BoxListFields.scores, per_class_scores)

      # filter out boxes with score < `score_thresh`
      boxlist_filtered = box_list_ops.filter_by_score(
          per_class_boxlist, score_thresh)

      # optionally clip boxes to clip_window
      if clip_window is not None:
        boxlist_filtered = box_list_ops.clip_to_window(
            boxlist_filtered, clip_window)

      max_selection_size = tf.minimum(max_size_per_class,
                                      boxlist_filtered.num_boxes())
      # len(selected_indices) <= max_selection_size
      selected_indices = tf.image.non_max_suppression(
          boxlist_filtered.get(),
          boxlist_filtered.get_field(BoxListFields.scores),
          max_selection_size,
          iou_threshold=iou_thresh)
      nmsed_boxlist = box_list_ops.gather(boxlist_filtered, selected_indices)
      nmsed_boxlist.set_field(
          BoxListFields.classes,
          tf.zeros_like(nmsed_boxlist.get_field(BoxListFields.scores)) +
          class_index + 1)
      selected_boxlist_list.append(nmsed_boxlist)

    # stage 2: merge nms'ed boxes from all classes
    selected_boxlist = box_list_ops.concatenate(selected_boxlist_list)
    sorted_boxlist = box_list_ops.sort_by_field(selected_boxlist,
                                                BoxListFields.scores)
    if max_total_size:
      max_total_size = tf.minimum(max_total_size,
                                  sorted_boxlist.num_boxes())
      sorted_boxlist = box_list_ops.gather(sorted_boxlist,
                                           tf.range(max_total_size))
    return sorted_boxlist
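# Usage sketch (not part of the original module): the thresholds and sizes
# below are made-up values, chosen only to show how the two-stage NMS above
# is typically driven and which BoxList fields come back.
def _example_multiclass_nms(boxes, scores):
  """`boxes`: [num_boxes, num_classes, 4]; `scores`: [num_boxes, num_classes]."""
  nmsed_boxlist = multiclass_non_max_suppression(
      boxes,
      scores,
      score_thresh=0.05,
      iou_thresh=0.5,
      max_size_per_class=100,
      max_total_size=300,
      clip_window=tf.constant([0., 0., 1., 1.]))
  # boxes sorted by descending class-specific score, plus per-box score and
  # 1-based class index
  return (nmsed_boxlist.get(),
          nmsed_boxlist.get_field(BoxListFields.scores),
          nmsed_boxlist.get_field(BoxListFields.classes))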
def _strict_random_crop_image(image,
                              boxes,
                              labels,
                              masks=None,
                              min_object_covered=1.0,
                              aspect_ratio_range=(0.75, 1.33),
                              area_range=(0.1, 1.0),
                              overlap_thresh=0.3):
  """Always performs a random crop.

  A random window is cropped out of `image`, and the groundtruth boxes (and
  optionally the masks) associated with the original image will be either
  removed, clipped or retained as is, depending on their location relative
  to the crop window.

  Note: you may end up getting a cropped image without any groundtruth boxes.
  If that is the case, the output boxes and labels would simply be empty
  tensors (i.e. the 0th dimension has size 0).

  Args:
    image: a float tensor of shape [height, width, channels].
    boxes: a float tensor of shape [num_boxes, 4], where each row contains
      normalized (i.e. values varying in [0, 1]) box coordinates:
      [ymin, xmin, ymax, xmax].
    labels: int tensor of shape [num_boxes] holding object classes in
      `boxes`.
    masks: (Optional) a tensor of shape [num_boxes, height, width], holding
      binary masks of `num_boxes` instances.
    min_object_covered: float scalar, the cropped window must cover at least
      `min_object_covered` (ratio) of the area of at least one box in
      `boxes`.
    aspect_ratio_range: a float 2-tuple, lower and upper bound of the aspect
      ratio of the cropped window.
    area_range: a float 2-tuple, lower and upper bound of the ratio between
      the area of the cropped window and the area of the original image.
    overlap_thresh: float scalar, a groundtruth box in `boxes` is retained
      only if the fraction of its area covered by the cropped window is
      >= this threshold.

  Returns:
    new_image: float tensor of shape [new_height, new_width, channels]
      holding the window cropped out of the input `image`.
    new_boxes: float tensor of shape [new_num_boxes, 4] holding new
      groundtruth boxes, with their [ymin, xmin, ymax, xmax] coordinates
      normalized and clipped to the crop window.
    new_labels: int tensor of shape [new_num_boxes] holding object classes in
      `new_boxes`.
    new_masks: (Optional) float tensor of shape [new_num_boxes, height,
      width], holding new instance masks corresponding to `new_boxes`.
  """
  with tf.name_scope('RandomCropImage', values=[image, boxes, labels]):
    # crop_box.shape: [1, 1, 4]
    crop_begin, crop_size, crop_box = tf.image.sample_distorted_bounding_box(
        tf.shape(image),
        bounding_boxes=tf.expand_dims(boxes, 0),
        min_object_covered=min_object_covered,
        aspect_ratio_range=aspect_ratio_range,
        area_range=area_range,
        max_attempts=100,
        use_image_if_no_bounding_boxes=True)

    window = tf.squeeze(crop_box)

    # BoxList shape: [N, 4]
    boxlist = box_list.BoxList(boxes)
    boxlist.set_field('labels', labels)
    # BoxList shape: [1, 4]
    crop_boxlist = box_list.BoxList(tf.squeeze(crop_box, [0]))

    # remove boxes that are completely outside of `window`
    boxlist, indices = box_list_ops.prune_completely_outside_window(
        boxlist, window)
    # remove boxes whose fraction of area that is overlapped with
    # `crop_boxlist` is less than `overlap_thresh`
    boxlist, in_window_indices = box_list_ops.prune_non_overlapping_boxes(
        boxlist, crop_boxlist, overlap_thresh)
    # change the coordinate frame of the remaining boxes
    new_boxlist = box_list_ops.change_coordinate_frame(boxlist, window)

    new_image = tf.slice(image, crop_begin, crop_size)
    new_image.set_shape([None, None, image.get_shape()[2]])

    # clipping is necessary as some of the new boxes may extend beyond the
    # crop window
    new_boxes = tf.clip_by_value(new_boxlist.get(),
                                 clip_value_min=0.0,
                                 clip_value_max=1.0)
    new_labels = new_boxlist.get_field('labels')

    if masks is not None:
      in_window_masks = tf.gather(tf.gather(masks, indices),
                                  in_window_indices)
      new_masks = tf.slice(in_window_masks,
                           [0, crop_begin[0], crop_begin[1]],
                           [-1, crop_size[0], crop_size[1]])
      return new_image, new_boxes, new_labels, new_masks

    return new_image, new_boxes, new_labels
def unbatch_padded_tensors(tensor_dict, static_shapes, keep_padded_list):
  """Unbatches and unpads a batch of padded tensors.

  This function first unbatches a tensor with an outer batch dimension into a
  list of unbatched tensors (with padding), and then unpads each tensor in
  the list by slicing out the portion containing non-padded values. You can
  optionally choose a subset of tensors (specifying their keys in
  `keep_padded_list`) so that these tensors will stay in padded form (e.g.
  'image').

  For example, given the input tensor_dict
    {'image': tensor of shape [batch_size, height, width, 3],
     'image_shape': tensor of shape [batch_size, 3],
     'gt_boxes': tensor of shape [batch_size, num_boxes, 4],
     'gt_boxes_shape': tensor of shape [batch_size, 2],
     'gt_labels': tensor of shape [batch_size, num_boxes],
     'gt_labels_shape': tensor of shape [batch_size, 1]}
  the output tensor_dict would be
    {'image': a list of `batch_size` tensors of shape [height_i, width_i, 3],
     'gt_boxes': a list of `batch_size` tensors of shape [num_boxes_i, 4],
     'gt_labels': a list of `batch_size` tensors of shape [num_boxes_i]}

  Args:
    tensor_dict: a dict mapping from tensor names to tensors. The tensors
      contain both the original tensors and their runtime shapes.
    static_shapes: a dict mapping from tensor names to tf.TensorShape
      instances. Only contains original tensors. Used to set shapes for
      unpadded tensors.
    keep_padded_list: a list or tuple of strings, holding the keys to the
      tensor_dict for which the padded tensor will stay in padded form.

  Returns:
    sliced_tensor_dict: a dict with the same number of entries as
      `tensor_dict`, where each value of the dict is a list (with length
      batch_size) containing properly unpadded tensors as opposed to a single
      tensor in `tensor_dict`.
  """
  tensors = collections.OrderedDict()
  shapes = collections.OrderedDict()
  for key, batched_tensor in tensor_dict.items():
    unbatched_tensor_list = tf.unstack(batched_tensor)
    if TensorDictFields.runtime_shape_str in key:
      shapes[key] = unbatched_tensor_list
    else:
      tensors[key] = unbatched_tensor_list

  sliced_tensor_dict = collections.OrderedDict()
  for key in tensors.keys():
    unbatched_tensor_list = tensors[key]
    unbatched_shape_list = shapes[key + TensorDictFields.runtime_shape_str]

    sliced_tensor_list = []
    for unbatched_tensor, unbatched_shape in zip(unbatched_tensor_list,
                                                 unbatched_shape_list):
      if key not in keep_padded_list:
        sliced_tensor = tf.slice(unbatched_tensor,
                                 tf.zeros_like(unbatched_shape),
                                 unbatched_shape)
      else:
        sliced_tensor = unbatched_tensor
      sliced_tensor.set_shape(static_shapes[key])
      sliced_tensor_list.append(sliced_tensor)

    sliced_tensor_dict[key] = sliced_tensor_list

  # We need to adjust the groundtruth boxes to the new dimensions for padded
  # images (when `batch_size` > 1): convert to absolute coordinates using the
  # original dimensions, and convert back to normalized coordinates using the
  # padded dimensions.
  batch_size = len(sliced_tensor_dict[TensorDictFields.groundtruth_boxes])
  if batch_size > 1:
    for i in range(batch_size):
      boxlist = box_list.BoxList(
          sliced_tensor_dict[TensorDictFields.groundtruth_boxes][i])
      # original dimensions
      height, width = tf.unstack(
          shapes[TensorDictFields.image +
                 TensorDictFields.runtime_shape_str][i][:-1])
      boxlist = box_list_ops.to_absolute_coordinates(boxlist, height, width)
      # padded dimensions
      new_height, new_width = tf.unstack(
          tf.shape(sliced_tensor_dict[TensorDictFields.image][i])[:-1])
      boxlist = box_list_ops.to_normalized_coordinates(boxlist,
                                                       new_height, new_width)
      sliced_tensor_dict[
          TensorDictFields.groundtruth_boxes][i] = boxlist.get()

  return sliced_tensor_dict
def compute_losses(model, prediction_dict, gt_boxlist_list):
  """Creates localization and classification losses.

  Args:
    model: an instance of DetectionModel.
    prediction_dict: a dict mapping from strings to tensors/BoxLists. Must
      hold the following entries:
      { 'box_encoding_predictions': float tensor of shape [batch_size,
          num_anchors, 1, 4],
        'class_predictions': float tensor of shape [batch_size, num_anchors,
          num_classes + 1]}
    gt_boxlist_list: a list of BoxList instances, each holding `num_gt_boxes`
      groundtruth boxes, with extra field 'labels' holding float tensor of
      shape [num_gt_boxes, num_classes + 1] (groundtruth box labels). Length
      of list is equal to `batch_size`.

  Returns:
    losses_dict: a dict mapping from strings to tensors, holding the
      following entries:
      { 'loc_loss': float scalar tensor,
        'cls_loss': float scalar tensor}
  """
  box_encoding_predictions = prediction_dict[
      PredTensorDictFields.box_encoding_predictions]
  class_predictions = prediction_dict[PredTensorDictFields.class_predictions]
  anchors_boxlist_list = prediction_dict['anchor_boxlist_list']

  with tf.name_scope('Loss'):
    (batch_loc_targets,
     batch_loc_weights,
     batch_cls_targets,
     batch_cls_weights,
     _,
     match_list) = target_assigner.batch_assign_targets(
         model.target_assigner, anchors_boxlist_list, gt_boxlist_list)

    # per-anchor losses of shape [batch_size, num_anchors]
    loc_losses = model.localization_loss_fn(
        tf.squeeze(box_encoding_predictions, axis=2),
        batch_loc_targets,
        ignore_nan_targets=True,
        weights=batch_loc_weights)
    cls_losses = model.classification_loss_fn(
        class_predictions,
        batch_cls_targets,
        weights=batch_cls_weights)

    if model.hard_example_miner:
      decoded_boxes = box_coder.batch_decode(
          box_encoding_predictions, anchors_boxlist_list, model.box_coder)
      decoded_boxes_list = tf.unstack(tf.squeeze(decoded_boxes, axis=2))
      decoded_boxlist_list = [box_list.BoxList(decoded_boxes)
                              for decoded_boxes in decoded_boxes_list]

      mined_indicator = model.hard_example_miner(
          loc_losses=loc_losses,
          cls_losses=cls_losses,
          decoded_boxlist_list=decoded_boxlist_list,
          match_list=match_list)

      loc_losses = tf.multiply(loc_losses, mined_indicator)
      cls_losses = tf.multiply(cls_losses, mined_indicator)

    # normalize the summed losses by the total num of anchors matched to a
    # groundtruth box across the batch (at least 1)
    sample_sizes = tf.to_float(
        tf.maximum(tf.reduce_sum(batch_loc_weights), 1))

    loc_loss = tf.reduce_sum(loc_losses) / sample_sizes
    cls_loss = tf.reduce_sum(cls_losses) / sample_sizes

    loc_loss = tf.multiply(loc_loss,
                           model._localization_loss_weight,
                           name='loc_loss')
    cls_loss = tf.multiply(cls_loss,
                           model._classification_loss_weight,
                           name='cls_loss')

    losses_dict = {
        LossTensorDictFields.localization_loss: loc_loss,
        LossTensorDictFields.classification_loss: cls_loss}

    return losses_dict
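# Illustrative sketch (not part of the original module): the normalization
# used in `compute_losses` above, in NumPy. The summed per-anchor losses are
# divided by the number of anchors matched to a groundtruth box across the
# whole batch, clamped to at least 1.
def _example_loss_normalization():
  import numpy as np

  loc_losses = np.array([[0.2, 0.0, 0.4],
                         [0.0, 0.6, 0.0]])      # [batch_size, num_anchors]
  batch_loc_weights = np.array([[1., 0., 1.],
                                [0., 1., 0.]])  # 1 for matched anchors
  sample_size = max(batch_loc_weights.sum(), 1.)  # 3.0
  return loc_losses.sum() / sample_size           # 0.4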