def get_box_targets(self, matches, det_boxes, gt_boxes):
  ignored_value = tf.zeros(4)
  unmatched_value = tf.zeros(4)
  default_regression_target = tf.constant([4 * [0]], tf.float32)
  # Ignored and unmatched (value -2 and -1, respectively) get zeros as
  # coordinates to gather from.
  gt_boxes_padded = tf.concat(
      [tf.stack([ignored_value, unmatched_value]), gt_boxes], axis=0)
  gather_indices = tf.maximum(matches + 2, 0)
  matched_gt_boxes = tf.gather(gt_boxes_padded, gather_indices)
  matched_box_targets = box_utils.encode_boxes(
      matched_gt_boxes, det_boxes, scale_factors=self.encoder_scales)
  # Zero out the unmatched and ignored regression targets.
  unmatched_ignored_reg_targets = tf.tile(
      default_regression_target, [tf.shape(matched_gt_boxes)[0], 1])
  matched_anchors_mask = tf.greater_equal(matches, 0)
  box_targets = tf.where(matched_anchors_mask, matched_box_targets,
                         unmatched_ignored_reg_targets)
  return box_targets
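# --- Illustrative sketch (not part of the model) ---
# The padded-gather trick used in get_box_targets above: match values of
# -2 (ignored) and -1 (unmatched) are shifted by +2 onto indices 0 and 1,
# which point at the two zero rows prepended to the ground-truth boxes,
# so invalid matches gather all-zero coordinates.
import tensorflow as tf

matches = tf.constant([-2, -1, 0, 1])  # Per-anchor match results.
gt_boxes = tf.constant([[0., 0., 10., 10.], [5., 5., 8., 8.]])
padded = tf.concat([tf.zeros([2, 4]), gt_boxes], axis=0)
gathered = tf.gather(padded, tf.maximum(matches + 2, 0))
# gathered[0] and gathered[1] are all-zero rows; gathered[2:] are the
# matched gt boxes [0, 0, 10, 10] and [5, 5, 8, 8].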
def build_outputs(self, features, labels, mode):
  is_training = mode == mode_keys.TRAIN
  model_outputs = {}

  if 'anchor_boxes' in labels:
    anchor_boxes = labels['anchor_boxes']
  else:
    anchor_boxes = anchor.Anchor(
        self._anchor_params.min_level, self._anchor_params.max_level,
        self._anchor_params.num_scales, self._anchor_params.aspect_ratios,
        self._anchor_params.anchor_size,
        features.get_shape().as_list()[1:3]).multilevel_boxes

  backbone_features = self._backbone_fn(features, is_training)
  fpn_features = self._fpn_fn(backbone_features, is_training)

  rpn_score_outputs, rpn_box_outputs = self._rpn_head_fn(
      fpn_features, is_training)
  model_outputs.update({
      'rpn_score_outputs': rpn_score_outputs,
      'rpn_box_outputs': rpn_box_outputs,
  })

  rpn_rois, _ = self._generate_rois_fn(rpn_box_outputs, rpn_score_outputs,
                                       anchor_boxes,
                                       labels['image_info'][:, 1, :],
                                       is_training)

  if is_training:
    rpn_rois = tf.stop_gradient(rpn_rois)

    # Sample proposals.
    rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
        self._sample_rois_fn(rpn_rois, labels['gt_boxes'],
                             labels['gt_classes']))

    # Create bounding box training targets.
    box_targets = box_utils.encode_boxes(
        matched_gt_boxes, rpn_rois, weights=[10.0, 10.0, 5.0, 5.0])
    # If the target is background, the box target is set to all 0s.
    box_targets = tf.where(
        tf.tile(
            tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
            [1, 1, 4]),
        tf.zeros_like(box_targets), box_targets)
    model_outputs.update({
        'class_targets': matched_gt_classes,
        'box_targets': box_targets,
    })

  roi_features = spatial_transform_ops.multilevel_crop_and_resize(
      fpn_features, rpn_rois, output_size=7)

  class_outputs, box_outputs = self._frcnn_head_fn(roi_features, is_training)
  model_outputs.update({
      'class_outputs': class_outputs,
      'box_outputs': box_outputs,
  })

  if not is_training:
    detection_results = self._generate_detections_fn(
        box_outputs, class_outputs, rpn_rois,
        labels['image_info'][:, 1:2, :])
    model_outputs.update(detection_results)

  if not self._include_mask:
    self._log_model_statistics(features)
    return model_outputs

  if is_training:
    rpn_rois, classes, mask_targets = self._sample_masks_fn(
        rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices,
        labels['gt_masks'])
    mask_targets = tf.stop_gradient(mask_targets)

    classes = tf.cast(classes, dtype=tf.int32)
    model_outputs.update({
        'mask_targets': mask_targets,
        'sampled_class_targets': classes,
    })
  else:
    rpn_rois = detection_results['detection_boxes']
    classes = tf.cast(detection_results['detection_classes'], dtype=tf.int32)

  mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
      fpn_features, rpn_rois, output_size=14)

  mask_outputs = self._mrcnn_head_fn(mask_roi_features, classes, is_training)

  if is_training:
    model_outputs.update({
        'mask_outputs': mask_outputs,
    })
  else:
    model_outputs.update({'detection_masks': tf.nn.sigmoid(mask_outputs)})

  self._log_model_statistics(features)
  return model_outputs
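# --- Illustrative sketch (not part of the model) ---
# The background-masking step used in build_outputs above: rows whose
# class target is 0 (background) get an all-zero regression target, so
# they contribute nothing to the box-regression loss.
import tensorflow as tf

class_targets = tf.constant([[1, 0, 3]])  # [batch=1, num_rois=3]
box_targets = tf.ones([1, 3, 4])
is_background = tf.tile(
    tf.expand_dims(tf.equal(class_targets, 0), axis=-1), [1, 1, 4])
box_targets = tf.where(is_background, tf.zeros_like(box_targets),
                       box_targets)
# box_targets[0, 1] is now all zeros; the other rows are untouched.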
def _build_outputs(self, images, labels, mode):
  is_training = mode == mode_keys.TRAIN
  model_outputs = {}

  if "anchor_boxes" in labels:
    anchor_boxes = labels["anchor_boxes"]
  else:
    anchor_boxes = anchor.Anchor(
        self._params.architecture.min_level,
        self._params.architecture.max_level,
        self._params.anchor.num_scales,
        self._params.anchor.aspect_ratios,
        self._params.anchor.anchor_size,
        images.get_shape().as_list()[1:3],
    ).multilevel_boxes
    batch_size = tf.shape(input=images)[0]
    for level in anchor_boxes:
      anchor_boxes[level] = tf.tile(
          tf.expand_dims(anchor_boxes[level], 0), [batch_size, 1, 1])

  backbone_features = self._backbone_fn(images, is_training)
  fpn_features = self._fpn_fn(backbone_features, is_training)

  rpn_score_outputs, rpn_box_outputs = self._rpn_head_fn(
      fpn_features, is_training)
  model_outputs.update({
      "rpn_score_outputs": rpn_score_outputs,
      "rpn_box_outputs": rpn_box_outputs,
  })
  rpn_rois, _ = self._generate_rois_fn(
      rpn_box_outputs,
      rpn_score_outputs,
      anchor_boxes,
      labels["image_info"][:, 1, :],
      is_training,
  )

  if is_training:
    rpn_rois = tf.stop_gradient(rpn_rois)

    # Sample proposals.
    (
        rpn_rois,
        matched_gt_boxes,
        matched_gt_classes,
        matched_gt_indices,
    ) = self._sample_rois_fn(rpn_rois, labels["gt_boxes"],
                             labels["gt_classes"])

    # Create bounding box training targets.
    box_targets = box_utils.encode_boxes(
        matched_gt_boxes, rpn_rois, weights=[10.0, 10.0, 5.0, 5.0])
    # If the target is background, the box target is set to all 0s.
    box_targets = tf.compat.v1.where(
        tf.tile(
            tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
            [1, 1, 4]),
        tf.zeros_like(box_targets),
        box_targets,
    )
    model_outputs.update({
        "class_targets": matched_gt_classes,
        "box_targets": box_targets,
    })

  roi_features = spatial_transform_ops.multilevel_crop_and_resize(
      fpn_features, rpn_rois, output_size=7)

  class_outputs, box_outputs = self._frcnn_head_fn(roi_features, is_training)
  model_outputs.update({
      "class_outputs": class_outputs,
      "box_outputs": box_outputs,
  })

  if not is_training:
    detection_results = self._generate_detections_fn(
        box_outputs, class_outputs, rpn_rois,
        labels["image_info"][:, 1:2, :])
    model_outputs.update(detection_results)

  if not self._include_mask:
    return model_outputs

  if is_training:
    (
        rpn_rois,
        classes,
        mask_targets,
        gather_nd_gt_indices,
    ) = self._sample_masks_fn(
        rpn_rois,
        matched_gt_boxes,
        matched_gt_classes,
        matched_gt_indices,
        labels["gt_masks"],
    )
    mask_targets = tf.stop_gradient(mask_targets)

    classes = tf.cast(classes, dtype=tf.int32)
    model_outputs.update({
        "mask_targets": mask_targets,
        "sampled_class_targets": classes,
    })
  else:
    rpn_rois = detection_results["detection_boxes"]
    classes = tf.cast(detection_results["detection_classes"], dtype=tf.int32)

  mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
      fpn_features, rpn_rois, output_size=14)

  mask_outputs = self._mrcnn_head_fn(mask_roi_features, classes, is_training)

  if is_training:
    model_outputs.update({
        "mask_outputs": mask_outputs,
    })
  else:
    model_outputs.update({"detection_masks": tf.nn.sigmoid(mask_outputs)})

  if not self._include_attributes:
    return model_outputs

  attribute_outputs = self._attributes_head_fn(mask_roi_features,
                                               is_training)

  if is_training:
    attribute_targets = tf.gather_nd(
        labels["gt_attributes"],
        gather_nd_gt_indices)  # [batch, K, num_attributes]
    model_outputs.update({
        "attribute_outputs": attribute_outputs,
        "attribute_targets": attribute_targets,
    })
  else:
    model_outputs["detection_attributes"] = tf.nn.sigmoid(attribute_outputs)

  return model_outputs
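# --- Illustrative sketch (not part of the model) ---
# The tf.gather_nd lookup used for the attribute targets above;
# gather_nd_gt_indices is assumed to hold (batch_index, gt_index) pairs,
# so each sampled roi pulls the attribute vector of its matched
# ground-truth box. In the model the index tensor is presumably shaped
# [batch, K, 2], yielding targets of shape [batch, K, num_attributes].
import tensorflow as tf

gt_attributes = tf.constant([[[1., 0.], [0., 1.]],   # image 0
                             [[1., 1.], [0., 0.]]])  # image 1
indices = tf.constant([[0, 1], [1, 0]])  # One sampled roi per image.
targets = tf.gather_nd(gt_attributes, indices)
# targets == [[0., 1.], [1., 1.]]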
def _run_frcnn_head(self, fpn_features, rois, labels, is_training,
                    model_outputs, layer_num, iou_threshold,
                    regression_weights):
  """Runs the frcnn head that does both class and box prediction.

  Args:
    fpn_features: `list` of features from the fpn layer that are used to do
      roi pooling from the `rois`.
    rois: `list` of current rois that will be used to predict bbox
      refinement and classes from.
    labels: `dict` of label information. If `is_training` is true, the gt
      bboxes and classes are used to assign the rois their corresponding gt
      box and class used for computing the loss.
    is_training: `bool`, if model is training or being evaluated.
    model_outputs: `dict`, used for storing outputs used for eval and
      losses.
    layer_num: `int`, the current frcnn layer in the cascade.
    iou_threshold: `float`, the threshold used when assigning
      positives/negatives based on rois.
    regression_weights: `list`, weights used for l1 loss in bounding box
      regression.

  Returns:
    class_outputs: Class predictions for rois.
    box_outputs: Box predictions for rois. These are formatted for the
      regression loss and need to be converted before being used as rois
      in the next stage.
    model_outputs: Updated dict with predictions used for losses and eval.
    matched_gt_boxes: If `is_training` is true, these give the gt box
      location of each positive match.
    matched_gt_classes: If `is_training` is true, these give the gt class
      of each predicted box.
    matched_gt_indices: If `is_training` is true, gives the index of the
      positive box match. Used for mask prediction.
    rois: The sampled rois used for this layer.
  """
  # Only used during training.
  matched_gt_boxes, matched_gt_classes, matched_gt_indices = (None, None,
                                                              None)
  if is_training:
    rois = tf.stop_gradient(rois)
    if layer_num == 0:
      # Sample proposals based on all bbox coordinates. NMS is applied here
      # along with sampling criteria that will make the batch have a
      # constant fraction of foreground to background examples.
      rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
          self._sample_rois_fn(rois, labels['gt_boxes'],
                               labels['gt_classes']))
    else:
      # Since we now have a constant number of proposals, we no longer need
      # the fancier sampling that applies NMS and a fixed fg/bg ratio.
      rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
          target_ops.assign_and_sample_proposals(
              rois,
              labels['gt_boxes'],
              labels['gt_classes'],
              num_samples_per_image=self._num_roi_samples,
              mix_gt_boxes=False,
              fg_iou_thresh=iou_threshold,
              bg_iou_thresh_hi=iou_threshold,
              bg_iou_thresh_lo=0.0,
              skip_subsampling=True))
    self.add_scalar_summary(
        'fg_bg_ratio_{}'.format(layer_num),
        tf.reduce_mean(
            tf.cast(tf.greater(matched_gt_classes, 0), rois.dtype)))

    # Create bounding box training targets.
    box_targets = box_utils.encode_boxes(
        matched_gt_boxes, rois, weights=regression_weights)
    # If the target is background, the box target is set to all 0s.
    box_targets = tf.where(
        tf.tile(
            tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
            [1, 1, 4]),
        tf.zeros_like(box_targets), box_targets)
    model_outputs.update({
        'class_targets_{}'.format(layer_num): matched_gt_classes,
        'box_targets_{}'.format(layer_num): box_targets,
    })

  # Get roi features.
  roi_features = spatial_transform_ops.multilevel_crop_and_resize(
      fpn_features, rois, output_size=7)

  # Run frcnn head to get class and bbox predictions.
  with tf.variable_scope('frcnn_layer_{}'.format(layer_num)):
    class_outputs, box_outputs = self._frcnn_head_fn(roi_features,
                                                     is_training)
  model_outputs.update({
      'class_outputs_{}'.format(layer_num): class_outputs,
      'box_outputs_{}'.format(layer_num): box_outputs,
  })
  return (class_outputs, box_outputs, model_outputs, matched_gt_boxes,
          matched_gt_classes, matched_gt_indices, rois)
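# --- Illustrative sketch (not part of the model, hypothetical names) ---
# One way a cascade driver could chain _run_frcnn_head: each stage
# re-assigns proposals at a higher IoU threshold and decodes its box
# outputs into the next stage's rois. The threshold/weight schedules below
# are the usual Cascade R-CNN settings, not values read from this file,
# and box_utils.decode_boxes is assumed to invert box_utils.encode_boxes.
def _run_cascade_sketch(self, fpn_features, rois, labels, is_training,
                        model_outputs):
  iou_thresholds = [0.5, 0.6, 0.7]
  stage_weights = [[10., 10., 5., 5.],
                   [20., 20., 10., 10.],
                   [30., 30., 15., 15.]]
  for layer_num, (iou, weights) in enumerate(
      zip(iou_thresholds, stage_weights)):
    (class_outputs, box_outputs, model_outputs, _, _, _, rois) = (
        self._run_frcnn_head(fpn_features, rois, labels, is_training,
                             model_outputs, layer_num, iou, weights))
    if layer_num < len(iou_thresholds) - 1:
      # Convert regression outputs back into absolute boxes before the
      # next stage, as the docstring above requires.
      rois = tf.stop_gradient(
          box_utils.decode_boxes(box_outputs, rois, weights=weights))
  return class_outputs, box_outputs, model_outputs, rois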
def _build_outputs(self, images, labels, mode):
  is_training = mode == mode_keys.TRAIN
  model_outputs = {}

  if 'anchor_boxes' in labels:
    anchor_boxes = labels['anchor_boxes']
  else:
    anchor_boxes = anchor.Anchor(
        self._params.architecture.min_level,
        self._params.architecture.max_level,
        self._params.anchor.num_scales,
        self._params.anchor.aspect_ratios,
        self._params.anchor.anchor_size,
        images.get_shape().as_list()[1:3]).multilevel_boxes
    batch_size = tf.shape(images)[0]
    for level in anchor_boxes:
      anchor_boxes[level] = tf.tile(
          tf.expand_dims(anchor_boxes[level], 0), [batch_size, 1, 1])

  backbone_features = self._backbone_fn(images, is_training)
  fpn_features = self._fpn_fn(backbone_features, is_training)

  rpn_score_outputs, rpn_box_outputs = self._rpn_head_fn(
      fpn_features, is_training)
  model_outputs.update({
      'rpn_score_outputs': rpn_score_outputs,
      'rpn_box_outputs': rpn_box_outputs,
  })
  rpn_rois, _ = self._generate_rois_fn(rpn_box_outputs, rpn_score_outputs,
                                       anchor_boxes,
                                       labels['image_info'][:, 1, :],
                                       is_training)

  if is_training:
    rpn_rois = tf.stop_gradient(rpn_rois)

    # Sample proposals.
    rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
        self._sample_rois_fn(rpn_rois, labels['gt_boxes'],
                             labels['gt_classes']))
    self.add_scalar_summary(
        'fg_bg_ratio_{}'.format(0),
        tf.reduce_sum(
            tf.cast(tf.greater(matched_gt_classes, 0), tf.float32)) /
        tf.reduce_sum(
            tf.cast(tf.greater_equal(matched_gt_classes, 0), tf.float32)))

    # Create bounding box training targets.
    box_targets = box_utils.encode_boxes(
        matched_gt_boxes, rpn_rois, weights=[10.0, 10.0, 5.0, 5.0])
    # If the target is background, the box target is set to all 0s.
    box_targets = tf.where(
        tf.tile(
            tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
            [1, 1, 4]),
        tf.zeros_like(box_targets), box_targets)
    model_outputs.update({
        'class_targets': matched_gt_classes,
        'box_targets': box_targets,
    })

  _, num_rois_before_cat, _ = rpn_rois.get_shape().as_list()
  if is_training and self._feat_distill:
    tf.logging.info(f'rois before concat distill boxes: {rpn_rois}')
    # [batch_size, num_rois + max_distill_rois, 4]
    rpn_rois = tf.concat([rpn_rois, labels['roi_boxes']], axis=1)
    tf.logging.info(f'rois after concat distill boxes: {rpn_rois}')

  roi_features = spatial_transform_ops.multilevel_crop_and_resize(
      fpn_features, rpn_rois, output_size=7)

  if is_training and self._feat_distill:
    tf.logging.info(f'rois before split: {rpn_rois}')
    rpn_rois, _ = tf.split(
        rpn_rois, [num_rois_before_cat, self._max_distill_rois], axis=1)
    tf.logging.info(f'rois after split: {rpn_rois}')

  (class_outputs, box_outputs, distill_feat_outputs,
   distill_class_outputs) = self._frcnn_head_fn(roi_features, is_training)
  model_outputs.update({
      'class_outputs': class_outputs,
      'box_outputs': box_outputs,
  })
  if is_training and self._feat_distill:
    model_outputs.update({'distill_feat_outputs': distill_feat_outputs})

  if not is_training:
    detection_results = self._generate_detections_fn(
        box_outputs,
        class_outputs,
        rpn_rois,
        labels['image_info'][:, 1:2, :],
        bbox_per_class=not self._params.frcnn_head.class_agnostic_bbox_pred,
        distill_class_outputs=distill_class_outputs,
    )
    model_outputs.update(detection_results)

  if not self._include_mask:
    return model_outputs

  if is_training:
    rpn_rois, classes, mask_targets = self._sample_masks_fn(
        rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices,
        labels['gt_masks'])
    mask_targets = tf.stop_gradient(mask_targets)

    classes = tf.cast(classes, dtype=tf.int32)
    model_outputs.update({
        'mask_targets': mask_targets,
        'sampled_class_targets': classes,
    })
  else:
    rpn_rois = detection_results['detection_boxes']
    classes = tf.cast(detection_results['detection_classes'], dtype=tf.int32)

  mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
      fpn_features, rpn_rois, output_size=14)

  mask_outputs = self._mrcnn_head_fn(mask_roi_features, classes, is_training)

  if is_training:
    model_outputs.update({
        'mask_outputs': mask_outputs,
    })
  else:
    model_outputs.update({'detection_masks': tf.nn.sigmoid(mask_outputs)})

  return model_outputs
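# --- Illustrative sketch (not part of the model) ---
# The concat/split pattern used for feature distillation above: the
# distillation boxes are appended so roi features are computed for them
# in the same multilevel_crop_and_resize pass, then stripped off again so
# only the original proposals reach the detection heads. The sizes below
# are arbitrary example values.
import tensorflow as tf

rpn_rois = tf.ones([2, 512, 4])       # [batch, num_rois, 4]
distill_rois = tf.zeros([2, 300, 4])  # [batch, max_distill_rois, 4]
combined = tf.concat([rpn_rois, distill_rois], axis=1)  # [2, 812, 4]
# ... pooled features for all 812 boxes per image are computed here ...
rpn_rois, _ = tf.split(combined, [512, 300], axis=1)    # Back to [2, 512, 4].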
def __call__(self, proposals, boxes):
  return box_utils.encode_boxes(boxes, proposals, self.weights)
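# --- Illustrative sketch (not part of the model) ---
# A reference form of the standard Faster R-CNN box encoding that
# box_utils.encode_boxes is assumed to implement, with [ymin, xmin, ymax,
# xmax] boxes and per-coordinate weights such as the [10.0, 10.0, 5.0,
# 5.0] used elsewhere in this file. _encode_boxes_reference is a
# hypothetical name, for illustration only.
import tensorflow as tf

def _encode_boxes_reference(gt_boxes, anchors, weights):
  ymin_g, xmin_g, ymax_g, xmax_g = tf.unstack(gt_boxes, axis=-1)
  ymin_a, xmin_a, ymax_a, xmax_a = tf.unstack(anchors, axis=-1)
  h_a, w_a = ymax_a - ymin_a, xmax_a - xmin_a
  h_g, w_g = ymax_g - ymin_g, xmax_g - xmin_g
  cy_a, cx_a = ymin_a + 0.5 * h_a, xmin_a + 0.5 * w_a
  cy_g, cx_g = ymin_g + 0.5 * h_g, xmin_g + 0.5 * w_g
  # Center offsets are normalized by anchor size; sizes are log-ratios.
  ty = weights[0] * (cy_g - cy_a) / h_a
  tx = weights[1] * (cx_g - cx_a) / w_a
  th = weights[2] * tf.math.log(h_g / h_a)
  tw = weights[3] * tf.math.log(w_g / w_a)
  return tf.stack([ty, tx, th, tw], axis=-1)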
def build_outputs(self, inputs, mode):
  is_training = mode == mode_keys.TRAIN
  model_outputs = {}

  image = inputs['image']
  _, image_height, image_width, _ = image.get_shape().as_list()
  backbone_features = self._backbone_fn(image, is_training)
  fpn_features = self._fpn_fn(backbone_features, is_training)

  rpn_score_outputs, rpn_box_outputs = self._rpn_head_fn(
      fpn_features, is_training)
  model_outputs.update({
      'rpn_score_outputs':
          tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
                                rpn_score_outputs),
      'rpn_box_outputs':
          tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
                                rpn_box_outputs),
  })
  input_anchor = anchor.Anchor(self._params.anchor.min_level,
                               self._params.anchor.max_level,
                               self._params.anchor.num_scales,
                               self._params.anchor.aspect_ratios,
                               self._params.anchor.anchor_size,
                               (image_height, image_width))
  rpn_rois, _ = self._generate_rois_fn(rpn_box_outputs, rpn_score_outputs,
                                       input_anchor.multilevel_boxes,
                                       inputs['image_info'][:, 1, :],
                                       is_training)
  if is_training:
    rpn_rois = tf.stop_gradient(rpn_rois)

    # Sample proposals.
    rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
        self._sample_rois_fn(rpn_rois, inputs['gt_boxes'],
                             inputs['gt_classes']))

    # Create bounding box training targets.
    box_targets = box_utils.encode_boxes(
        matched_gt_boxes, rpn_rois, weights=[10.0, 10.0, 5.0, 5.0])
    # If the target is background, the box target is set to all 0s.
    box_targets = tf.where(
        tf.tile(
            tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
            [1, 1, 4]),
        tf.zeros_like(box_targets), box_targets)
    model_outputs.update({
        'class_targets': matched_gt_classes,
        'box_targets': box_targets,
    })

  roi_features = spatial_transform_ops.multilevel_crop_and_resize(
      fpn_features, rpn_rois, output_size=7)

  class_outputs, box_outputs = self._frcnn_head_fn(roi_features, is_training)
  model_outputs.update({
      'class_outputs':
          tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
                                class_outputs),
      'box_outputs':
          tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
                                box_outputs),
  })

  # Add this output to train to make the checkpoint loadable in predict
  # mode. If we skip it in train mode, the heads will be out-of-order and
  # checkpoint loading will fail.
  boxes, scores, classes, valid_detections = self._generate_detections_fn(
      box_outputs, class_outputs, rpn_rois, inputs['image_info'][:, 1:2, :])
  model_outputs.update({
      'num_detections': valid_detections,
      'detection_boxes': boxes,
      'detection_classes': classes,
      'detection_scores': scores,
  })

  if not self._include_mask:
    return model_outputs

  if is_training:
    rpn_rois, classes, mask_targets = self._sample_masks_fn(
        rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices,
        inputs['gt_masks'])
    mask_targets = tf.stop_gradient(mask_targets)

    classes = tf.cast(classes, dtype=tf.int32)
    model_outputs.update({
        'mask_targets': mask_targets,
        'sampled_class_targets': classes,
    })
  else:
    rpn_rois = boxes
    classes = tf.cast(classes, dtype=tf.int32)

  mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
      fpn_features, rpn_rois, output_size=14)

  mask_outputs = self._mrcnn_head_fn(mask_roi_features, classes, is_training)

  if is_training:
    model_outputs.update({
        'mask_outputs':
            tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
                                  mask_outputs),
    })
  else:
    model_outputs.update({'detection_masks': tf.nn.sigmoid(mask_outputs)})

  return model_outputs
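# --- Illustrative sketch (not part of the model) ---
# The tf.nest.map_structure casting used in build_outputs above: under
# mixed precision the heads may emit (b)float16 tensors, so every leaf of
# the (possibly nested) output structure is cast to float32 before the
# losses are computed.
import tensorflow as tf

outputs = {3: tf.ones([1, 10], tf.bfloat16),  # Per-level head outputs.
           4: tf.ones([1, 5], tf.bfloat16)}
outputs = tf.nest.map_structure(lambda x: tf.cast(x, tf.float32), outputs)
# Every leaf tensor is now float32; the dict structure is unchanged.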