def batch_position_sensitive_crop_regions(images,
                                          boxes,
                                          crop_size,
                                          num_spatial_bins,
                                          global_pool,
                                          parallel_iterations=64):
  """Position sensitive crop with batches of images and boxes.

  This op is exactly like `position_sensitive_crop_regions` below but operates
  on batches of images and boxes. See `position_sensitive_crop_regions`
  function below for the operation applied per batch element.

  Args:
    images: A `Tensor`. Must be one of the following types: `uint8`, `int8`,
      `int16`, `int32`, `int64`, `half`, `float32`, `float64`. A 4-D tensor
      of shape `[batch, image_height, image_width, depth]`. Both
      `image_height` and `image_width` need to be positive.
    boxes: A `Tensor` of type `float32`. A 3-D tensor of shape
      `[batch, num_boxes, 4]`. Each box is specified in normalized coordinates
      `[y1, x1, y2, x2]`. A normalized coordinate value of `y` is mapped to
      the image coordinate at `y * (image_height - 1)`, so the `[0, 1]`
      interval of normalized image height is mapped to
      `[0, image_height - 1]` in image height coordinates. We do allow
      `y1` > `y2`, in which case the sampled crop is an up-down flipped
      version of the original image. The width dimension is treated similarly.
    crop_size: See `position_sensitive_crop_regions` below.
    num_spatial_bins: See `position_sensitive_crop_regions` below.
    global_pool: See `position_sensitive_crop_regions` below.
    parallel_iterations: Number of batch items to process in parallel.

  Returns:
    position_sensitive_features: the per-image outputs of
      `position_sensitive_crop_regions`, stacked along a leading batch
      dimension into a float32 tensor.
  """
  def _position_sensitive_crop_fn(inputs):
    images, boxes = inputs
    return position_sensitive_crop_regions(
        images,
        boxes,
        crop_size=crop_size,
        num_spatial_bins=num_spatial_bins,
        global_pool=global_pool)

  return shape_utils.static_or_dynamic_map_fn(
      _position_sensitive_crop_fn,
      elems=[images, boxes],
      dtype=tf.float32,
      parallel_iterations=parallel_iterations)
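# Illustrative usage sketch (not part of the library). The shapes below
# assume `position_sensitive_crop_regions` requires `crop_size` to be evenly
# divisible by `num_spatial_bins` and `depth` to be evenly divisible by the
# total number of spatial bins.
def _example_batch_position_sensitive_crop():
  # Two images of 40x40 score maps with depth 18 = (3 * 3 bins) * 2 channels,
  # and two normalized boxes per image.
  score_maps = tf.random_uniform([2, 40, 40, 18])
  boxes = tf.random_uniform([2, 2, 4], minval=0.0, maxval=1.0)
  # With global_pool=True each crop is pooled down to a single spatial cell.
  return batch_position_sensitive_crop_regions(
      score_maps,
      boxes,
      crop_size=[6, 6],
      num_spatial_bins=[3, 3],
      global_pool=True)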
def preprocess(self, inputs):
  """Feature-extractor specific preprocessing.

  SSD meta architecture uses a default clip_window of [0, 0, 1, 1] during
  post-processing. On calling the `preprocess` method, clip_window gets
  updated based on `true_image_shapes` returned by `image_resizer_fn`.

  Args:
    inputs: a [batch, height_in, width_in, channels] float tensor representing
      a batch of images with values between 0 and 255.0.

  Returns:
    preprocessed_inputs: a [batch, height_out, width_out, channels] float
      tensor representing a batch of images.
    true_image_shapes: int32 tensor of shape [batch, 3] where each row is of
      the form [height, width, channels] indicating the shapes of true images
      in the resized images, as resized images can be padded with zeros.

  Raises:
    ValueError: if inputs tensor does not have type tf.float32
  """
  if inputs.dtype is not tf.float32:
    raise ValueError('`preprocess` expects a tf.float32 tensor')
  with tf.name_scope('Preprocessor'):
    # TODO(jonathanhuang): revisit whether to always use batch size as
    # the number of parallel iterations vs allow for dynamic batching.
    outputs = shape_utils.static_or_dynamic_map_fn(
        self._image_resizer_fn,
        elems=inputs,
        dtype=[tf.float32, tf.int32])
    resized_inputs = outputs[0]
    true_image_shapes = outputs[1]
    return (self._feature_extractor.preprocess(resized_inputs),
            true_image_shapes)
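# Usage sketch (illustrative; `detection_model` is a hypothetical, already
# built SSD model): `preprocess` runs before `predict`, and the returned
# `true_image_shapes` is threaded through `predict`, `loss` and `postprocess`
# so that zero-padded pixels in the resized images can be ignored.
#
#   images = tf.placeholder(tf.float32, [None, None, None, 3])
#   preprocessed_inputs, true_image_shapes = detection_model.preprocess(images)
#   prediction_dict = detection_model.predict(preprocessed_inputs,
#                                             true_image_shapes)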
def normalized_to_image_coordinates(normalized_boxes, image_shape,
                                    parallel_iterations=32):
  """Converts a batch of boxes from normalized to image coordinates.

  Args:
    normalized_boxes: a float32 tensor of shape [None, num_boxes, 4] in
      normalized coordinates.
    image_shape: a float32 tensor of shape [4] containing the image shape.
    parallel_iterations: parallelism for the map_fn op.

  Returns:
    absolute_boxes: a float32 tensor of shape [None, num_boxes, 4] containing
      the boxes in image coordinates.
  """
  x_scale = tf.cast(image_shape[2], tf.float32)
  y_scale = tf.cast(image_shape[1], tf.float32)

  def _to_absolute_coordinates(normalized_boxes):
    y_min, x_min, y_max, x_max = tf.split(
        value=normalized_boxes, num_or_size_splits=4, axis=1)
    y_min = y_scale * y_min
    y_max = y_scale * y_max
    x_min = x_scale * x_min
    x_max = x_scale * x_max
    scaled_boxes = tf.concat([y_min, x_min, y_max, x_max], 1)
    return scaled_boxes

  absolute_boxes = shape_utils.static_or_dynamic_map_fn(
      _to_absolute_coordinates,
      elems=(normalized_boxes),
      dtype=tf.float32,
      parallel_iterations=parallel_iterations,
      back_prop=True)
  return absolute_boxes
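# Illustrative example (not part of the library): scaling normalized boxes to
# pixel coordinates for a batch of two 300x400 images.
def _example_normalized_to_image_coordinates():
  images = tf.zeros([2, 300, 400, 3])
  normalized_boxes = tf.constant([[[0.1, 0.2, 0.5, 0.6]],
                                  [[0.0, 0.0, 1.0, 1.0]]])
  # tf.shape(images) yields [2, 300, 400, 3], so y_scale = 300 and
  # x_scale = 400; the first box becomes [30., 80., 150., 240.].
  return normalized_to_image_coordinates(normalized_boxes, tf.shape(images))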
def loss(self, prediction_dict, true_image_shapes, scope=None):
  """Compute scalar loss tensors with respect to provided groundtruth.

  Calling this function requires that groundtruth tensors have been
  provided via the provide_groundtruth function.

  Args:
    prediction_dict: a dictionary holding prediction tensors with
      1) box_encodings: 3-D float tensor of shape [batch_size, num_anchors,
        box_code_dimension] containing predicted boxes.
      2) class_predictions_with_background: 3-D float tensor of shape
        [batch_size, num_anchors, num_classes+1] containing class predictions
        (logits) for each of the anchors. Note that this tensor *includes*
        background class predictions.
    true_image_shapes: int32 tensor of shape [batch, 3] where each row is of
      the form [height, width, channels] indicating the shapes of true images
      in the resized images, as resized images can be padded with zeros.
    scope: Optional scope name.

  Returns:
    a dictionary mapping loss keys (`localization_loss` and
    `classification_loss`) to scalar tensors representing corresponding loss
    values.
  """
  with tf.name_scope(scope, 'Loss', prediction_dict.values()):
    keypoints = None
    if self.groundtruth_has_field(fields.BoxListFields.keypoints):
      keypoints = self.groundtruth_lists(fields.BoxListFields.keypoints)
    weights = None
    if self.groundtruth_has_field(fields.BoxListFields.weights):
      weights = self.groundtruth_lists(fields.BoxListFields.weights)
    (batch_cls_targets, batch_cls_weights, batch_reg_targets,
     batch_reg_weights, match_list) = self._assign_targets(
         self.groundtruth_lists(fields.BoxListFields.boxes),
         self.groundtruth_lists(fields.BoxListFields.classes),
         keypoints, weights)
    if self._add_summaries:
      self._summarize_target_assignment(
          self.groundtruth_lists(fields.BoxListFields.boxes), match_list)

    if self._random_example_sampler:
      batch_sampled_indicator = tf.to_float(
          shape_utils.static_or_dynamic_map_fn(
              self._minibatch_subsample_fn,
              [batch_cls_targets, batch_cls_weights],
              dtype=tf.bool,
              parallel_iterations=self._parallel_iterations,
              back_prop=True))
      batch_reg_weights = tf.multiply(batch_sampled_indicator,
                                      batch_reg_weights)
      batch_cls_weights = tf.multiply(batch_sampled_indicator,
                                      batch_cls_weights)

    location_losses = self._localization_loss(
        prediction_dict['box_encodings'],
        batch_reg_targets,
        ignore_nan_targets=True,
        weights=batch_reg_weights)
    cls_losses = ops.reduce_sum_trailing_dimensions(
        self._classification_loss(
            prediction_dict['class_predictions_with_background'],
            batch_cls_targets,
            weights=batch_cls_weights),
        ndims=2)

    if self._hard_example_miner:
      (localization_loss, classification_loss) = self._apply_hard_mining(
          location_losses, cls_losses, prediction_dict, match_list)
      if self._add_summaries:
        self._hard_example_miner.summarize()
    else:
      if self._add_summaries:
        class_ids = tf.argmax(batch_cls_targets, axis=2)
        flattened_class_ids = tf.reshape(class_ids, [-1])
        flattened_classification_losses = tf.reshape(cls_losses, [-1])
        self._summarize_anchor_classification_loss(
            flattened_class_ids, flattened_classification_losses)
      localization_loss = tf.reduce_sum(location_losses)
      classification_loss = tf.reduce_sum(cls_losses)

    # Optionally normalize by number of positive matches
    normalizer = tf.constant(1.0, dtype=tf.float32)
    if self._normalize_loss_by_num_matches:
      normalizer = tf.maximum(tf.to_float(tf.reduce_sum(batch_reg_weights)),
                              1.0)

    localization_loss_normalizer = normalizer
    if self._normalize_loc_loss_by_codesize:
      localization_loss_normalizer *= self._box_coder.code_size
    localization_loss = tf.multiply((self._localization_loss_weight /
                                     localization_loss_normalizer),
                                    localization_loss,
                                    name='localization_loss')
    classification_loss = tf.multiply((self._classification_loss_weight /
                                       normalizer),
                                      classification_loss,
                                      name='classification_loss')

    loss_dict = {
        str(localization_loss.op.name): localization_loss,
        str(classification_loss.op.name): classification_loss
    }
  return loss_dict
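# Usage sketch (illustrative; `detection_model`, `groundtruth_boxes_list` and
# `groundtruth_classes_list` are hypothetical): groundtruth must be provided
# before calling `loss`, and the per-loss scalars can then be summed into a
# total training loss.
#
#   detection_model.provide_groundtruth(groundtruth_boxes_list,
#                                       groundtruth_classes_list)
#   prediction_dict = detection_model.predict(preprocessed_inputs,
#                                             true_image_shapes)
#   losses_dict = detection_model.loss(prediction_dict, true_image_shapes)
#   total_loss = tf.add_n(list(losses_dict.values()))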
def batch_multiclass_non_max_suppression(boxes,
                                         scores,
                                         score_thresh,
                                         iou_thresh,
                                         max_size_per_class,
                                         max_total_size=0,
                                         clip_window=None,
                                         change_coordinate_frame=False,
                                         num_valid_boxes=None,
                                         masks=None,
                                         additional_fields=None,
                                         scope=None,
                                         parallel_iterations=32):
  """Multi-class version of non maximum suppression that operates on a batch.

  This op is similar to `multiclass_non_max_suppression` but operates on a
  batch of boxes and scores. See documentation for
  `multiclass_non_max_suppression` for details.

  Args:
    boxes: A [batch_size, num_anchors, q, 4] float32 tensor containing
      detections. If `q` is 1 then same boxes are used for all classes
      otherwise, if `q` is equal to number of classes, class-specific boxes
      are used.
    scores: A [batch_size, num_anchors, num_classes] float32 tensor
      containing the scores for each of the `num_anchors` detections.
    score_thresh: scalar threshold for score (low scoring boxes are removed).
    iou_thresh: scalar threshold for IOU (new boxes that have high IOU
      overlap with previously selected boxes are removed).
    max_size_per_class: maximum number of retained boxes per class.
    max_total_size: maximum number of boxes retained over all classes. By
      default returns all boxes retained after capping boxes per class.
    clip_window: A float32 tensor of shape [batch_size, 4] where each entry
      is of the form [y_min, x_min, y_max, x_max] representing the window to
      clip boxes to before performing non-max suppression. This argument can
      also be a tensor of shape [4] in which case, the same clip window is
      applied to all images in the batch. If clip_window is None, all boxes
      are used to perform non-max suppression.
    change_coordinate_frame: Whether to normalize coordinates after clipping
      relative to clip_window (this can only be set to True if a clip_window
      is provided).
    num_valid_boxes: (optional) a Tensor of type `int32`. A 1-D tensor of
      shape [batch_size] representing the number of valid boxes to be
      considered for each image in the batch. This parameter allows for
      ignoring zero paddings.
    masks: (optional) a [batch_size, num_anchors, q, mask_height, mask_width]
      float32 tensor containing box masks. `q` can be either number of
      classes or 1 depending on whether a separate mask is predicted per
      class.
    additional_fields: (optional) If not None, a dictionary that maps keys to
      tensors whose dimensions are [batch_size, num_anchors, ...].
    scope: tf scope name.
    parallel_iterations: (optional) number of batch items to process in
      parallel.

  Returns:
    'nmsed_boxes': A [batch_size, max_detections, 4] float32 tensor
      containing the non-max suppressed boxes.
    'nmsed_scores': A [batch_size, max_detections] float32 tensor containing
      the scores for the boxes.
    'nmsed_classes': A [batch_size, max_detections] float32 tensor containing
      the class for boxes.
    'nmsed_masks': (optional) a
      [batch_size, max_detections, mask_height, mask_width] float32 tensor
      containing masks for each selected box. This is set to None if input
      `masks` is None.
    'nmsed_additional_fields': (optional) a dictionary of
      [batch_size, max_detections, ...] float32 tensors corresponding to the
      tensors specified in the input `additional_fields`. This is not
      returned if input `additional_fields` is None.
    'num_detections': A [batch_size] int32 tensor indicating the number of
      valid detections per batch item. Only the top num_detections[i] entries
      in nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The rest of
      the entries are zero paddings.

  Raises:
    ValueError: if `q` in boxes.shape is not 1 or not equal to number of
      classes as inferred from scores.shape.
  """
  q = boxes.shape[2].value
  num_classes = scores.shape[2].value
  if q != 1 and q != num_classes:
    raise ValueError('third dimension of boxes must be either 1 or equal '
                     'to the third dimension of scores')
  if change_coordinate_frame and clip_window is None:
    raise ValueError('if change_coordinate_frame is True, then a clip_window '
                     'must be specified.')
  original_masks = masks
  original_additional_fields = additional_fields
  with tf.name_scope(scope, 'BatchMultiClassNonMaxSuppression'):
    boxes_shape = boxes.shape
    batch_size = boxes_shape[0].value
    num_anchors = boxes_shape[1].value

    if batch_size is None:
      batch_size = tf.shape(boxes)[0]
    if num_anchors is None:
      num_anchors = tf.shape(boxes)[1]

    # If num valid boxes aren't provided, create one and mark all boxes as
    # valid.
    if num_valid_boxes is None:
      num_valid_boxes = tf.ones([batch_size], dtype=tf.int32) * num_anchors

    # If masks aren't provided, create dummy masks so we can only have one
    # copy of _single_image_nms_fn and discard the dummy masks after map_fn.
    if masks is None:
      masks_shape = tf.stack([batch_size, num_anchors, 1, 0, 0])
      masks = tf.zeros(masks_shape)

    if clip_window is None:
      clip_window = tf.stack([
          tf.reduce_min(boxes[:, :, :, 0]),
          tf.reduce_min(boxes[:, :, :, 1]),
          tf.reduce_max(boxes[:, :, :, 2]),
          tf.reduce_max(boxes[:, :, :, 3])
      ])
    if clip_window.shape.ndims == 1:
      clip_window = tf.tile(tf.expand_dims(clip_window, 0), [batch_size, 1])

    if additional_fields is None:
      additional_fields = {}

    def _single_image_nms_fn(args):
      """Runs NMS on a single image and returns padded output.

      Args:
        args: A list of tensors consisting of the following:
          per_image_boxes - A [num_anchors, q, 4] float32 tensor containing
            detections. If `q` is 1 then same boxes are used for all classes
            otherwise, if `q` is equal to number of classes, class-specific
            boxes are used.
          per_image_scores - A [num_anchors, num_classes] float32 tensor
            containing the scores for each of the `num_anchors` detections.
          per_image_masks - A [num_anchors, q, mask_height, mask_width]
            float32 tensor containing box masks. `q` can be either number of
            classes or 1 depending on whether a separate mask is predicted
            per class.
          per_image_clip_window - A 1D float32 tensor of the form
            [ymin, xmin, ymax, xmax] representing the window to clip the
            boxes to.
          per_image_additional_fields - (optional) A variable number of
            float32 tensors each with size [num_anchors, ...].
          per_image_num_valid_boxes - A tensor of type `int32`. A 1-D tensor
            of shape [batch_size] representing the number of valid boxes to
            be considered for each image in the batch. This parameter allows
            for ignoring zero paddings.

      Returns:
        'nmsed_boxes': A [max_detections, 4] float32 tensor containing the
          non-max suppressed boxes.
        'nmsed_scores': A [max_detections] float32 tensor containing the
          scores for the boxes.
        'nmsed_classes': A [max_detections] float32 tensor containing the
          class for boxes.
        'nmsed_masks': (optional) a [max_detections, mask_height, mask_width]
          float32 tensor containing masks for each selected box. This is set
          to None if input `masks` is None.
        'nmsed_additional_fields': (optional) A variable number of float32
          tensors each with size [max_detections, ...] corresponding to the
          input `per_image_additional_fields`.
        'num_detections': A [batch_size] int32 tensor indicating the number
          of valid detections per batch item. Only the top num_detections[i]
          entries in nms_boxes[i], nms_scores[i] and nms_class[i] are valid.
          The rest of the entries are zero paddings.
      """
      per_image_boxes = args[0]
      per_image_scores = args[1]
      per_image_masks = args[2]
      per_image_clip_window = args[3]
      per_image_additional_fields = {
          key: value
          for key, value in zip(additional_fields, args[4:-1])
      }
      per_image_num_valid_boxes = args[-1]
      per_image_boxes = tf.reshape(
          tf.slice(per_image_boxes, 3 * [0],
                   tf.stack([per_image_num_valid_boxes, -1, -1])), [-1, q, 4])
      per_image_scores = tf.reshape(
          tf.slice(per_image_scores, [0, 0],
                   tf.stack([per_image_num_valid_boxes, -1])),
          [-1, num_classes])
      per_image_masks = tf.reshape(
          tf.slice(per_image_masks, 4 * [0],
                   tf.stack([per_image_num_valid_boxes, -1, -1, -1])),
          [-1, q, per_image_masks.shape[2].value,
           per_image_masks.shape[3].value])
      if per_image_additional_fields is not None:
        for key, tensor in per_image_additional_fields.items():
          additional_field_shape = tensor.get_shape()
          additional_field_dim = len(additional_field_shape)
          per_image_additional_fields[key] = tf.reshape(
              tf.slice(per_image_additional_fields[key],
                       additional_field_dim * [0],
                       tf.stack([per_image_num_valid_boxes] +
                                (additional_field_dim - 1) * [-1])),
              [-1] + [dim.value for dim in additional_field_shape[1:]])
      nmsed_boxlist = multiclass_non_max_suppression(
          per_image_boxes,
          per_image_scores,
          score_thresh,
          iou_thresh,
          max_size_per_class,
          max_total_size,
          clip_window=per_image_clip_window,
          change_coordinate_frame=change_coordinate_frame,
          masks=per_image_masks,
          additional_fields=per_image_additional_fields)
      padded_boxlist = box_list_ops.pad_or_clip_box_list(nmsed_boxlist,
                                                         max_total_size)
      num_detections = nmsed_boxlist.num_boxes()
      nmsed_boxes = padded_boxlist.get()
      nmsed_scores = padded_boxlist.get_field(fields.BoxListFields.scores)
      nmsed_classes = padded_boxlist.get_field(fields.BoxListFields.classes)
      nmsed_masks = padded_boxlist.get_field(fields.BoxListFields.masks)
      nmsed_additional_fields = [
          padded_boxlist.get_field(key) for key in per_image_additional_fields
      ]
      return ([nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks] +
              nmsed_additional_fields + [num_detections])

    num_additional_fields = 0
    if additional_fields is not None:
      num_additional_fields = len(additional_fields)
    num_nmsed_outputs = 4 + num_additional_fields

    batch_outputs = shape_utils.static_or_dynamic_map_fn(
        _single_image_nms_fn,
        elems=([boxes, scores, masks, clip_window] +
               list(additional_fields.values()) + [num_valid_boxes]),
        dtype=(num_nmsed_outputs * [tf.float32] + [tf.int32]),
        parallel_iterations=parallel_iterations)

    batch_nmsed_boxes = batch_outputs[0]
    batch_nmsed_scores = batch_outputs[1]
    batch_nmsed_classes = batch_outputs[2]
    batch_nmsed_masks = batch_outputs[3]
    batch_nmsed_additional_fields = {
        key: value
        for key, value in zip(additional_fields, batch_outputs[4:-1])
    }
    batch_num_detections = batch_outputs[-1]

    if original_masks is None:
      batch_nmsed_masks = None

    if original_additional_fields is None:
      batch_nmsed_additional_fields = None

    return (batch_nmsed_boxes, batch_nmsed_scores, batch_nmsed_classes,
            batch_nmsed_masks, batch_nmsed_additional_fields,
            batch_num_detections)
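# Illustrative example (not part of the library): one image with two heavily
# overlapping anchors, boxes shared across both classes (q == 1).
def _example_batch_multiclass_nms():
  boxes = tf.constant([[[[0.0, 0.0, 0.5, 0.5]],
                        [[0.0, 0.0, 0.5, 0.4]]]])  # [1, 2, 1, 4]
  scores = tf.constant([[[0.9, 0.1],
                         [0.8, 0.2]]])             # [1, 2, 2]
  # The anchors overlap with IOU 0.8 > iou_thresh, so the lower-scoring one
  # is suppressed; the class-1 scores fall below score_thresh and are
  # dropped, leaving a single valid detection reported via num_detections.
  (nmsed_boxes, nmsed_scores, nmsed_classes, _, _,
   num_detections) = batch_multiclass_non_max_suppression(
       boxes,
       scores,
       score_thresh=0.5,
       iou_thresh=0.6,
       max_size_per_class=10,
       max_total_size=10)
  return nmsed_boxes, nmsed_scores, nmsed_classes, num_detections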