def _create_classification_targets(
    self, gt_tensor, match, tensor_type='labels'):
  """Builds classification (or mask) targets for a single image.

  The groundtruth tensor is gathered according to `match`. Unmatched and
  ignored anchors are both assigned the same placeholder target.

  Args:
    gt_tensor: a tensor of shape [num_gt_boxes, num_class_slots] holding
      one-hot encoded groundtruth box labels if `tensor_type` == 'labels',
      or a tensor of shape [num_gt_boxes, height, width] holding binary
      instance masks if `tensor_type` == 'masks'.
    match: a Match instance.
    tensor_type: string scalar, type of groundtruth tensor; 'labels' or
      'masks'.

  Returns:
    cls_targets: a float tensor of shape [num_anchors, num_class_slots] if
      `tensor_type` == 'labels', or a tensor of shape
      [num_anchors, height, width] if `tensor_type` == 'masks'.

  Raises:
    ValueError: if `tensor_type` is neither 'labels' nor 'masks'.
  """
  gt_shape = shape_utils.combined_static_and_dynamic_shape(gt_tensor)

  if tensor_type not in ('labels', 'masks'):
    raise ValueError('Unsupported tensor type %s' % tensor_type)

  if tensor_type == 'labels':
    placeholder_target = self._unmatched_classification_target(gt_shape[1])
  else:
    placeholder_target = self._dummy_mask_target(gt_shape[1], gt_shape[2])

  # The same placeholder is used for both unmatched and ignored anchors.
  return match.gather_based_on_match(
      gt_tensor,
      unmatched_value=placeholder_target,
      ignored_value=placeholder_target)
def batch_decode(batch_box_encodings, anchor_boxlist_list, box_coder):
  """Decodes a batch of box encodings w.r.t. anchors into box coordinates.

  Args:
    batch_box_encodings: a float tensor of shape
      [batch_size, num_anchors, num_classes, 4] holding box encoding
      predictions.
    anchor_boxlist_list: a list of BoxList instances, each holding a float
      tensor of shape [num_anchors, 4] as anchor box coordinates. Length is
      equal to `batch_size`.
    box_coder: a BoxCoder instance to decode anchor-encoded location
      predictions into box coordinate predictions.

  Returns:
    decoded_boxes: a float tensor of shape
      [batch_size, num_anchors, num_classes, 4].
  """
  encodings_shape = shape_utils.combined_static_and_dynamic_shape(
      batch_box_encodings)
  num_classes = encodings_shape[2]

  # Flatten each image's encodings to [num_anchors * num_classes, code_size].
  flat_encodings_list = []
  for per_image_encodings in tf.unstack(batch_box_encodings, axis=0):
    flat_encodings_list.append(
        tf.reshape(per_image_encodings, [-1, box_coder.code_size]))

  # Repeat every anchor once per class so that anchors line up row-by-row
  # with the flattened encodings.
  tiled_anchor_boxlists = []
  for anchor_boxlist in anchor_boxlist_list:
    per_class_anchors = tf.tile(
        tf.expand_dims(anchor_boxlist.get(), 1), [1, num_classes, 1])
    tiled_anchor_boxlists.append(box_list.BoxList(
        tf.reshape(per_class_anchors, [-1, box_coder.code_size])))

  decoded_boxes = [
      box_coder.decode(flat_encodings, tiled_anchors).get()
      for flat_encodings, tiled_anchors in zip(
          flat_encodings_list, tiled_anchor_boxlists)]

  # The list of per-image tensors is packed and restored to the input shape.
  return tf.reshape(decoded_boxes, encodings_shape)
def _extract_roi_feature_maps(self, shared_feature_map, proposal_boxes):
  """Crops ROI feature maps for the proposals and max-pools them by 2x2.

  Every proposal box is cropped out of the shared feature map and resized to
  `self._proposal_crop_size` x `self._proposal_crop_size` with
  `tf.image.crop_and_resize`, then downsampled by a 2x2, stride-2 max pool.

  NOTE: `tf.image.crop_and_resize` implements a variant of ROI align in
  which each spatial cell of the output feature map is computed using
  bilinear interpolation based on the nearest four spatial cells of the
  input feature map.

  Args:
    shared_feature_map: float tensor of shape
      [batch_size, height, width, depth], feature map shared by RPN and
      Fast RCNN.
    proposal_boxes: float tensor of shape
      [batch_size, max_num_proposals, 4], holding the decoded, nms'ed and
      clipped proposal box coordinates. Note that a subset of the boxes
      might be zero-paddings.

  Returns:
    roi_feature_maps: float tensor of shape
      [batch_num_proposals, height_roi, width_roi, depth], holding feature
      maps of regions of interest cropped and resized from the input feature
      map. ROIs from different images in a batch are arranged along the 0th
      dimension, so `batch_num_proposals` = `batch_size * max_num_proposals`.
  """
  # e.g.
  # shared_feature_map: 1, ?, ?, 1024
  # proposal_boxes: 1, 64, 4
  dims = shape_utils.combined_static_and_dynamic_shape(proposal_boxes)
  batch_size = dims[0]
  max_num_proposals = dims[1]

  flat_boxes = tf.reshape(proposal_boxes, [batch_size * max_num_proposals, -1])

  # For each flattened box, the index of the image it belongs to:
  # [0, ..., 0, 1, ..., 1, ...], each index repeated `max_num_proposals` times.
  image_ids = tf.expand_dims(tf.range(batch_size), axis=1)
  box_indices = tf.reshape(tf.tile(image_ids, [1, max_num_proposals]), [-1])

  # [batch_size * max_num_proposals, crop_size, crop_size, depth]
  # e.g. 64, 14, 14, 1024
  crop_size = (self._proposal_crop_size, self._proposal_crop_size)
  cropped_feature_maps = tf.image.crop_and_resize(
      shared_feature_map, flat_boxes, box_indices, crop_size)

  # [batch_size * max_num_proposals, crop_size/2, crop_size/2, depth]
  # e.g. 64, 7, 7, 1024
  return slim.max_pool2d(cropped_feature_maps, kernel_size=2, stride=2)
def balanced_subsample(indicator, sample_size, labels, pos_frac=0.5, seed=None):
  """Samples candidates so that the fraction of positives is at most
  `pos_frac`.

  Up to `int(pos_frac * sample_size)` positive candidates are drawn first;
  the remaining slots are then filled with negative candidates.

  Example:
    Given `indicator = [0, 1, 1, 0, 1, 0, 0, 1, 1, 1]`,
    `labels = [0, 1, 0, 0, 0, 0, 0, 1, 0, 1]`, `pos_frac = 0.5`, and
    `sample_size = 5`, `indicator` marks elements 1, 2, 4, 7, 8, 9 as
    candidates. One output might be [0, 0, 1, 0, 1, 0, 0, 1, 1, 1], where
    2, 4, 8 are negatives and 7, 9 are positives, so the positive fraction
    is 2 / 5 <= 0.5.

  Args:
    indicator: bool tensor of shape [batch_size] where only True elements
      are to be sampled.
    sample_size: int scalar, num of samples to be drawn from `indicator`.
    labels: bool tensor of shape [batch_size], holding binary class labels.
    pos_frac: float scalar, fraction of positives of the entire sample.
    seed: int scalar, random seed (passed to both sampling steps).

  Returns:
    sampled_indicator: bool tensor of shape [batch_size] holding the subset
      sampled from the input.
  """
  not_labels = tf.logical_not(labels)
  pos_candidates = tf.logical_and(labels, indicator)
  neg_candidates = tf.logical_and(not_labels, indicator)

  pos_indices = tf.reshape(tf.where(pos_candidates), [-1])
  neg_indices = tf.reshape(tf.where(neg_candidates), [-1])

  max_num_pos = int(pos_frac * sample_size)
  sampled_pos_indices = random_sample(pos_indices, max_num_pos, seed=seed)

  # Negatives fill whatever is left after the positives were drawn.
  num_neg = sample_size - tf.size(sampled_pos_indices)
  sampled_neg_indices = random_sample(neg_indices, num_neg, seed=seed)

  num_elements = shape_utils.combined_static_and_dynamic_shape(indicator)[0]

  # One one-hot row per sampled index; OR-ing the rows together yields the
  # final boolean mask over all elements.
  sampled_indices = tf.concat(
      [sampled_pos_indices, sampled_neg_indices], axis=0)
  one_hot_rows = tf.cast(
      tf.one_hot(sampled_indices, depth=num_elements), tf.bool)
  return tf.reduce_any(one_hot_rows, axis=0)
def random_sample(tensor, sample_size, seed=None):
  """Randomly samples `sample_size` elements from `tensor` along the 0th
  dimension.

  If `sample_size` is greater than or equal to `tf.shape(tensor)[0]`, the
  input `tensor` is returned as is (not shuffled).

  Args:
    tensor: any tensor with rank >= 1.
    sample_size: int scalar or int scalar tensor, sample size.
    seed: int scalar, random seed forwarded to `tf.random_shuffle`.

  Returns:
    sampled_tensor: tensor of shape
      [tf.minimum(sample_size, tf.shape(tensor)[0]), ...], subtensor sampled
      from `tensor`.
  """
  num_elements = shape_utils.combined_static_and_dynamic_shape(tensor)[0]

  def _shuffled_prefix():
    # Shuffle along dim 0 and keep the leading `sample_size` entries.
    return tf.random_shuffle(tensor, seed=seed)[:sample_size]

  def _unchanged():
    return tensor

  has_surplus = tf.greater(num_elements, sample_size)
  return tf.cond(has_surplus, _shuffled_prefix, _unchanged)
def _extract_shared_feature_map(self, inputs):
  """Extracts the feature map shared by both RPN and Fast RCNN.

  Batch norm runs in training mode only when the model is training and
  batch norm is not frozen.

  Args:
    inputs: float tensor of shape [batch_size, height, width, depth].

  Returns:
    shared_feature_map: float tensor of shape
      [batch_size, height_out, width_out, depth_out].
    image_shape: a list of 4 int scalars or int scalar tensors, storing
      batch_size, height, width, and depth of the input tensor.
  """
  train_batch_norm = self.is_training and not self._freeze_batch_norm

  with slim.arg_scope([slim.batch_norm], is_training=train_batch_norm):
    # shared_feature_map:
    # [batch_size, height/output_stride, width/output_stride, depth_out]
    extractor = self._feature_extractor
    shared_feature_map = extractor.extract_first_stage_features(inputs)

  image_shape = shape_utils.combined_static_and_dynamic_shape(inputs)
  return shared_feature_map, image_shape
def _generate_anchors(self, shared_feature_map, image_height, image_width):
  """Generates RPN anchors for the spatial grid of the shared feature map.

  The provided image height and width are used to normalize the anchor box
  coordinates to the unit square (i.e. bounded within [0, 0, 1, 1]).

  Args:
    shared_feature_map: float tensor of shape
      [batch_size, height, width, depth], feature map shared by RPN and
      Fast RCNN.
    image_height: float scalar tensor, height of the batched input images.
    image_width: float scalar tensor, width of the batched input images.

  Returns:
    anchor_boxlist: BoxList instance holding `num_anchors` anchor boxes.
  """
  feature_map_shape = shape_utils.combined_static_and_dynamic_shape(
      shared_feature_map)
  grid_size = (feature_map_shape[1], feature_map_shape[2])

  # The generator takes a list of grid sizes; a single grid is passed here
  # and its output is concatenated into one BoxList.
  anchor_boxlists = self._rpn_anchor_generator.generate(
      [grid_size], height=image_height, width=image_width)
  return box_list_ops.concatenate(anchor_boxlists)
def _check_validity(self, field, value, scope=None):
  """Builds assertion ops validating a value to be inserted into a field.

  If `field` == 'boxes', it must hold that ymin <= ymax and xmin <= xmax.
  For any other field, the size of the first dimension of `value` must be
  equal to `self.num_boxes()` (so 'boxes' must have already been set).

  Args:
    field: string scalar, name of the data field.
    value: tensor to be validated; for 'boxes' it is unstacked along axis 1
      into ymin, xmin, ymax, xmax.
    scope: string scalar, name scope.

  Returns:
    a tuple of Ops that raise `InvalidArgumentError` if the above conditions
    do not hold.
  """
  with tf.name_scope(scope, 'check_validity'):
    if field != 'boxes':
      expected_size = self.num_boxes()
      actual_size = shape_utils.combined_static_and_dynamic_shape(value)[0]
      return (tf.assert_equal(expected_size, actual_size),)

    ymin, xmin, ymax, xmax = tf.unstack(value=value, axis=1)
    return (tf.assert_less_equal(ymin, ymax),
            tf.assert_less_equal(xmin, xmax))
def _compute_loss(self, predictions, targets, weights):
  """Computes the weighted anchorwise softmax cross-entropy loss.

  Logits are divided by `self._logit_scale` before the softmax.

  Args:
    predictions: float tensor of shape [batch_size, num_anchors, num_classes]
      holding predicted logits for each class.
    targets: float tensor of shape [batch_size, num_anchors, num_classes]
      holding one-hot encoded classification targets.
    weights: float tensor of shape [batch_size, num_anchors], holding
      anchorwise weights.

  Returns:
    float tensor of shape [batch_size, num_anchors], holding the anchorwise
    loss.
  """
  num_classes = shape_utils.combined_static_and_dynamic_shape(predictions)[-1]
  scaled_logits = tf.divide(
      predictions, self._logit_scale, name='scale_logit')

  # Collapse the batch and anchor dimensions so each row is one anchor.
  flat_targets = tf.reshape(targets, [-1, num_classes])
  flat_logits = tf.reshape(scaled_logits, [-1, num_classes])
  flat_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
      labels=flat_targets, logits=flat_logits)

  anchorwise_loss = tf.reshape(flat_loss, tf.shape(weights))
  return anchorwise_loss * weights
def _match(self, sim_matrix):
  """Assigns a row index (argmax) to each column of the similarity matrix.

  Typically rows correspond to groundtruth boxes, while columns correspond
  to anchor boxes.

  Args:
    sim_matrix: a float tensor of shape [n, m] holding similarity scores.

  Returns:
    results: an int tensor of shape [m] holding matching results
      (ints >= -2) for each of `m` columns in `sim_matrix`, where
      `results[i] = -2` indicates `i` is ignored;
      `results[i] = -1` indicates `i` is unmatched (negative);
      `results[i] >= 0` indicates `i` is matched (positive).
  """
  sim_matrix_shape = shape_utils.combined_static_and_dynamic_shape(sim_matrix)
  num_rows = sim_matrix_shape[0]
  num_cols = sim_matrix_shape[1]

  all_unmatched = -1 * tf.ones([num_cols], dtype=tf.int32)
  all_ignored = -2 * tf.ones([num_cols], dtype=tf.int32)

  def _no_rows_to_match():
    # With no rows at all, every column is unmatched.
    return all_unmatched

  def _argmax_match():
    # Best row and its score for every column, both of shape [m].
    best_rows = tf.argmax(sim_matrix, 0, output_type=tf.int32)
    best_scores = tf.reduce_max(sim_matrix, 0)

    is_below_unmatched = tf.greater(self._unmatched_thres, best_scores)
    is_between = tf.logical_and(
        tf.greater_equal(best_scores, self._unmatched_thres),
        tf.greater(self._matched_thres, best_scores))

    # Which of the two low-score bands counts as negative and which as
    # ignored depends on the configuration.
    if self._negatives_lower_than_unmatched:
      below_value, between_value = all_unmatched, all_ignored
    else:
      below_value, between_value = all_ignored, all_unmatched

    assignments = tf.where(is_below_unmatched, below_value, best_rows)
    assignments = tf.where(is_between, between_value, assignments)

    if not self._force_match_for_each_row:
      return assignments

    # Each row claims its best-scoring column; these claims override the
    # threshold-based assignments.
    claimed_cols = tf.argmax(sim_matrix, 1, output_type=tf.int32)
    claim_one_hot = tf.one_hot(claimed_cols, depth=num_cols)
    claiming_rows = tf.argmax(claim_one_hot, 0, output_type=tf.int32)
    is_claimed = tf.cast(tf.reduce_max(claim_one_hot, 0), tf.bool)
    return tf.where(is_claimed, claiming_rows, assignments)

  if isinstance(num_rows, tf.Tensor):
    # Row count is only known at run time: choose the branch in the graph.
    return tf.cond(tf.greater(tf.shape(sim_matrix)[0], 0),
                   _argmax_match, _no_rows_to_match)

  if num_rows == 0:
    return _no_rows_to_match()
  return _argmax_match()
def _compute_mask_loss(model,
                       mask_predictions,
                       batch_cls_targets,
                       batch_msk_targets,
                       batch_msk_weights,
                       padding_indicator,
                       proposal_boxlist_list):
  """Computes the mask loss.

  Each proposal (out of `max_num_proposals`) predicts `num_classes` masks of
  shape [mask_height, mask_width]. However, only the one corresponding to
  the groundtruth class label `k` is "selected" and contributes to the loss.

  Note: `batch_num_proposals` = `batch_size` * `max_num_proposals`,
  e.g. 64 = 1 * 64

  Args:
    model: object providing `_frcnn_mask_loss_fn` and
      `_frcnn_mask_loss_weight`.
    mask_predictions: a float tensor of shape
      [batch_num_proposals, num_classes, mask_height, mask_width], holding
      mask predictions.
    batch_cls_targets: a float tensor of shape
      [batch_size, max_num_proposals, num_classes + 1], containing
      anchorwise classification targets.
    batch_msk_targets: a float tensor of shape
      [batch_size, max_num_proposals, image_height, image_width], containing
      instance mask targets.
    batch_msk_weights: a float tensor of shape
      [batch_size, max_num_proposals], containing the proposalwise weights
      applied to the mask loss.
    padding_indicator: a float tensor of shape
      [batch_size, max_num_proposals], holding indicator of padded
      proposals.
    proposal_boxlist_list: a list of BoxList instances, each holding
      `max_num_proposals` proposal boxes (coordinates normalized). The
      fields are potentially zero-padded up to `max_num_proposals`. Length
      of list is equal to `batch_size`.

  Returns:
    msk_loss: float scalar tensor, mask loss.
  """
  predictions_shape = shape_utils.combined_static_and_dynamic_shape(
      mask_predictions)
  batch_num_proposals, _, mask_height, mask_width = predictions_shape
  batch_size = len(proposal_boxlist_list)

  # [batch_size * max_num_proposals, 4] e.g. 64, 4
  stacked_boxes = tf.stack(
      [boxlist.get() for boxlist in proposal_boxlist_list], axis=0)
  proposal_boxes = tf.reshape(stacked_boxes, [batch_num_proposals, -1])

  # Prepend an all-zero slot on the class axis so that class indices taken
  # from the `num_classes + 1`-wide targets address the predictions directly.
  # [batch_size * max_num_proposals, num_classes + 1, mask_height, mask_width]
  # e.g. 64, 91, 33, 33
  padded_predictions = tf.pad(
      mask_predictions, [[0, 0], [1, 0], [0, 0], [0, 0]])

  # Only the `k`th mask prediction contributes to the loss, where `k` is the
  # groundtruth class. E.g. using class indices [64, 1] to gather from
  # [64, 91, 33, 33] gives a tensor of shape [64, 1, 33, 33].
  flat_cls_targets = tf.reshape(batch_cls_targets, [batch_num_proposals, -1])
  gt_class_ids = tf.to_int32(
      tf.expand_dims(tf.argmax(flat_cls_targets, axis=1), axis=-1))
  # [batch_size * max_num_proposals, 1, mask_height, mask_width]
  selected_predictions = tf.batch_gather(padded_predictions, gt_class_ids)

  # [batch_size, max_num_proposals, mask_height * mask_width]
  # e.g. 1, 64, 33 * 33
  selected_predictions = tf.reshape(
      selected_predictions, [batch_size, -1, mask_height * mask_width])

  image_height, image_width = shape_utils.combined_static_and_dynamic_shape(
      batch_msk_targets)[2:]
  # [batch_size * max_num_proposals, image_height, image_width]
  flat_msk_targets = tf.reshape(
      batch_msk_targets, [-1, image_height, image_width])

  # The groundtruth instance masks are FULL SIZE images. Crop the patch
  # under each predicted proposal box and resize it to
  # [mask_height, mask_width] so it matches the size of the predictions.
  # Every proposal crops from its own target mask, hence the identity
  # box-index mapping.
  #
  # [batch_size * max_num_proposals, mask_height, mask_width, 1]
  # e.g. 64, 33, 33, 1
  cropped_targets = tf.image.crop_and_resize(
      tf.expand_dims(flat_msk_targets, -1),
      proposal_boxes,
      tf.range(batch_num_proposals),
      [mask_height, mask_width])

  # [batch_size, max_num_proposals, mask_height * mask_width]
  # e.g. 1, 64, 33 * 33
  cropped_targets = tf.reshape(
      cropped_targets, [batch_size, -1, mask_height * mask_width])

  # [batch_size, max_num_proposals]
  msk_losses = model._frcnn_mask_loss_fn(
      selected_predictions,
      cropped_targets,
      weights=batch_msk_weights * padding_indicator)

  # Normalize by
  # 1) mask size (`mask_height` * `mask_width`)
  # 2) the per-image sum of `batch_msk_weights`, floored at 1 to avoid
  #    dividing by zero (only pos proposals' mask prediction matters)
  mask_area = mask_height * mask_width
  weight_sum = tf.maximum(
      tf.reduce_sum(batch_msk_weights, axis=1, keep_dims=True),
      tf.ones((batch_size, 1)))
  msk_losses = msk_losses / (mask_area * weight_sum)

  msk_loss = tf.reduce_sum(msk_losses)
  return tf.multiply(
      msk_loss, model._frcnn_mask_loss_weight, name='frcnn_msk_loss')
def batch_multiclass_non_max_suppression(boxes,
                                         scores,
                                         score_thresh,
                                         iou_thresh,
                                         max_size_per_class,
                                         max_total_size=0,
                                         clip_window=None,
                                         num_valid_boxes=None,
                                         scope=None):
  """Performs multiclass non maximum suppression on a batch of images.

  NMS is run independently per image via `shape_utils.static_map_fn`; each
  image's result is padded or clipped to `max_total_size` boxes.

  NOTE(review): `clip_window` is forwarded to `static_map_fn` even when it
  is None -- confirm that `static_map_fn` accepts a None element.
  NOTE(review): with the default `max_total_size=0` every per-image result
  is passed to `pad_or_clip_box_list` with size 0 -- confirm that callers
  always supply a positive value.

  Args:
    boxes: float tensor of shape [batch_size, num_boxes, num_classes, 4],
      holding decoded box coordinates for each of the `num_classes` classes
      for each of `batch_size` images.
    scores: float tensor of shape [batch_size, num_boxes, num_classes],
      holding box scores for each of the `num_classes` classes for each of
      `batch_size` images.
    score_thresh: float scalar, boxes with score < `score_thresh` are
      removed.
    iou_thresh: float scalar, IOU threshold for non-max suppression. Must be
      in [0.0, 1.0].
    max_size_per_class: int scalar, max num of retained boxes per class
      after NMS.
    max_total_size: int scalar, max num of boxes retained over all classes.
    clip_window: float tensor of shape [batch_size, 4], holding ymin, xmin,
      ymax, xmax of a window to clip boxes to before NMS.
    num_valid_boxes: int tensor of shape [batch_size], holding the num of
      valid boxes (not zero-padded) to be considered for each image in a
      batch. If None, all boxes in `boxes` are considered valid.
    scope: string scalar, scope name.

  Returns:
    batch_nmsed_boxes: float tensor of shape
      [batch_size, max_total_size, 4].
    batch_nmsed_scores: float tensor of shape [batch_size, max_total_size].
    batch_nmsed_classes: int tensor of shape [batch_size, max_total_size].
    batch_num_valid_boxes: int tensor of shape [batch_size], holding num of
      valid (not zero-padded) boxes per image in a batch.
  """
  with tf.name_scope(scope, 'BatchMultiClassNonMaxSuppression'):
    boxes_shape = shape_utils.combined_static_and_dynamic_shape(boxes)
    batch_size, num_boxes = boxes_shape[:2]

    if num_valid_boxes is None:
      # No explicit counts were given: treat every box as valid.
      num_valid_boxes = tf.ones([batch_size], dtype=tf.int32) * num_boxes

    def _single_image_nms_fn(args):
      image_boxes = args[0]
      image_scores = args[1]
      image_clip_window = args[2]
      image_num_valid = args[-1]

      # Drop the zero-padded tail before running NMS.
      image_boxes = image_boxes[:image_num_valid]
      image_scores = image_scores[:image_num_valid]

      nmsed_boxlist = multiclass_non_max_suppression(
          image_boxes,
          image_scores,
          score_thresh,
          iou_thresh,
          max_size_per_class,
          max_total_size,
          clip_window=image_clip_window)
      padded_boxlist = box_list_ops.pad_or_clip_box_list(
          nmsed_boxlist, max_total_size)

      # The reported count is taken before padding/clipping.
      num_retained = nmsed_boxlist.num_boxes()
      return (padded_boxlist.get(),
              padded_boxlist.get_field(BoxListFields.scores),
              padded_boxlist.get_field(BoxListFields.classes),
              num_retained)

    batch_outputs = shape_utils.static_map_fn(
        _single_image_nms_fn,
        elems=[boxes, scores, clip_window, num_valid_boxes])

    return (batch_outputs[0], batch_outputs[1], batch_outputs[2],
            batch_outputs[-1])
def _predict(self, feature_map_tensor_list):
  """Generates box location encoding predictions and box class score
  predictions.

  Each tensor in the output lists `box_encoding_predictions_list` and
  `class_score_predictions_list` corresponds to a tensor in the input
  `feature_map_tensor_list`, and the num of anchors generated for the `i`th
  feature map is
  `num_anchors_i = height_i * width_i * num_predictions_list[i]`.

  For example, given an input feature map list of shapes
  [[1, 19, 19, channels_1], [1, 10, 10, channels_2], [1, 5, 5, channels_3],
   [1, 3, 3, channels_4], [1, 2, 2, channels_5], [1, 1, 1, channels_6]]
  and `num_predictions_list` = [3, 6, 6, 6, 6, 6], the output tensor lists
  have `num_anchors_i` = [1083, 600, 150, 54, 24, 6].

  Args:
    feature_map_tensor_list: a list of float tensors of shape
      [batch_size, height_i, width_i, channels_i].

  Returns:
    box_encoding_predictions_list: a list of float tensors of shape
      [batch_size, num_anchors_i, 1, 4], holding anchor-encoded box
      coordinate predictions (i.e. t_y, t_x, t_h, t_w).
    class_score_predictions_list: a list of float tensors of shape
      [batch_size, num_anchors_i, num_classes + 1], holding box class score
      predictions.
  """
  box_encoding_predictions_list = []
  class_score_predictions_list = []
  num_class_slots = self._num_classes + 1
  box_code_size = self._box_code_size

  # A single feature map uses no extra variable scope; multiple feature maps
  # each get their own `BoxPredictor_i` scope.
  box_predictor_scopes = [misc_utils.IdentityContextManager()]
  if len(feature_map_tensor_list) > 1:
    box_predictor_scopes = [
        tf.variable_scope('BoxPredictor_{}'.format(i))
        for i in range(len(feature_map_tensor_list))]

  def _project(feature_map, depth, scope_name):
    """Applies the depthwise-separable or regular conv prediction layer."""
    if self._use_depthwise:
      return ops.split_separable_conv2d(
          feature_map,
          depth,
          self._kernel_size,
          depth_multiplier=1,
          stride=1,
          padding='SAME',
          scope=scope_name)
    return slim.conv2d(
        feature_map, depth, self._kernel_size, scope=scope_name)

  # The second arg_scope overrides the settings from
  # `self._conv_hyperparams_fn` so that the conv ops only perform linear
  # projections (i.e. like the output layer in a classification network).
  with slim.arg_scope(self._conv_hyperparams_fn()), slim.arg_scope(
      [slim.conv2d, slim.separable_conv2d],
      activation_fn=None,
      normalizer_fn=None,
      normalizer_params=None):
    for feature_map, num_predictions, predictor_scope in zip(
        feature_map_tensor_list,
        self._num_predictions_list,
        box_predictor_scopes):
      with predictor_scope:
        # Box encoding predictions branching out of `feature_map`.
        box_encoding_predictions = _project(
            feature_map,
            num_predictions * box_code_size,
            'BoxEncodingPredictor')
        # Class score predictions branching out of `feature_map`.
        class_score_predictions = _project(
            feature_map,
            num_predictions * num_class_slots,
            'ClassPredictor')

        batch, height, width, _ = (
            shape_utils.combined_static_and_dynamic_shape(feature_map))

        # Flatten the spatial grid and per-cell predictions into one
        # anchor dimension.
        box_encoding_predictions_list.append(tf.reshape(
            box_encoding_predictions,
            [batch, height * width * num_predictions, 1, box_code_size]))
        class_score_predictions_list.append(tf.reshape(
            class_score_predictions,
            [batch, height * width * num_predictions, num_class_slots]))

  return box_encoding_predictions_list, class_score_predictions_list
def num_boxes(self):
  """Returns the num of boxes as an int scalar or int scalar tensor.

  The value is the size of the first dimension of `self.get()`.
  """
  boxes_shape = shape_utils.combined_static_and_dynamic_shape(self.get())
  return boxes_shape[0]