def test_normalized_to_image_coordinates(self): normalized_boxes = tf.placeholder(tf.float32, shape=(None, 1, 4)) normalized_boxes_np = np.array([[[0.0, 0.0, 1.0, 1.0]], [[0.5, 0.5, 1.0, 1.0]]]) image_shape = tf.convert_to_tensor([1, 4, 4, 3], dtype=tf.int32) absolute_boxes = ops.normalized_to_image_coordinates( normalized_boxes, image_shape, parallel_iterations=2) expected_boxes = np.array([[[0, 0, 4, 4]], [[2, 2, 4, 4]]]) with self.test_session() as sess: absolute_boxes = sess.run( absolute_boxes, feed_dict={normalized_boxes: normalized_boxes_np}) self.assertAllEqual(absolute_boxes, expected_boxes)
def test_normalized_to_image_coordinates(self): normalized_boxes = tf.placeholder(tf.float32, shape=(None, 1, 4)) normalized_boxes_np = np.array([[[0.0, 0.0, 1.0, 1.0]], [[0.5, 0.5, 1.0, 1.0]]]) image_shape = tf.convert_to_tensor([1, 4, 4, 3], dtype=tf.int32) absolute_boxes = ops.normalized_to_image_coordinates(normalized_boxes, image_shape, parallel_iterations=2) expected_boxes = np.array([[[0, 0, 4, 4]], [[2, 2, 4, 4]]]) with self.test_session() as sess: absolute_boxes = sess.run(absolute_boxes, feed_dict={normalized_boxes: normalized_boxes_np}) self.assertAllEqual(absolute_boxes, expected_boxes)
def _predict_second_stage(self, rpn_box_encodings, rpn_objectness_predictions_with_background, rpn_features, anchors, image_shape, true_image_shapes): """Predicts the output tensors from 2nd stage of R-FCN. Args: rpn_box_encodings: 3-D float tensor of shape [batch_size, num_valid_anchors, self._box_coder.code_size] containing predicted boxes. rpn_objectness_predictions_with_background: 3-D float tensor of shape [batch_size, num_valid_anchors, 2] containing class predictions (logits) for each of the anchors. Note that this tensor *includes* background class predictions (at class index 0). rpn_features: A 4-D float32 tensor with shape [batch_size, height, width, depth] representing image features from the RPN. anchors: 2-D float tensor of shape [num_anchors, self._box_coder.code_size]. image_shape: A 1D int32 tensors of size [4] containing the image shape. true_image_shapes: int32 tensor of shape [batch, 3] where each row is of the form [height, width, channels] indicating the shapes of true images in the resized images, as resized images can be padded with zeros. Returns: prediction_dict: a dictionary holding "raw" prediction tensors: 1) refined_box_encodings: a 3-D tensor with shape [total_num_proposals, num_classes, 4] representing predicted (final) refined box encodings, where total_num_proposals=batch_size*self._max_num_proposals 2) class_predictions_with_background: a 2-D tensor with shape [total_num_proposals, num_classes + 1] containing class predictions (logits) for each of the anchors, where total_num_proposals=batch_size*self._max_num_proposals. Note that this tensor *includes* background class predictions (at class index 0). 3) num_proposals: An int32 tensor of shape [batch_size] representing the number of proposals generated by the RPN. `num_proposals` allows us to keep track of which entries are to be treated as zero paddings and which are not since we always pad the number of proposals to be `self.max_num_proposals` for each image. 4) proposal_boxes: A float32 tensor of shape [batch_size, self.max_num_proposals, 4] representing decoded proposal bounding boxes (in absolute coordinates). 5) proposal_boxes_normalized: A float32 tensor of shape [batch_size, self.max_num_proposals, 4] representing decoded proposal bounding boxes (in normalized coordinates). Can be used to override the boxes proposed by the RPN, thus enabling one to extract box classification and prediction for externally selected areas of the image. 6) box_classifier_features: a 4-D float32 tensor, of shape [batch_size, feature_map_height, feature_map_width, depth], representing the box classifier features. """ image_shape_2d = tf.tile(tf.expand_dims(image_shape[1:], 0), [image_shape[0], 1]) proposal_boxes_normalized, _, num_proposals = self._postprocess_rpn( rpn_box_encodings, rpn_objectness_predictions_with_background, anchors, image_shape_2d, true_image_shapes) box_classifier_features = ( self._feature_extractor.extract_box_classifier_features( rpn_features, scope=self.second_stage_feature_extractor_scope)) if self._rfcn_box_predictor.is_keras_model: box_predictions = self._rfcn_box_predictor( [box_classifier_features], proposal_boxes=proposal_boxes_normalized) else: box_predictions = self._rfcn_box_predictor.predict( [box_classifier_features], num_predictions_per_location=[1], scope=self.second_stage_box_predictor_scope, proposal_boxes=proposal_boxes_normalized) refined_box_encodings = tf.squeeze( tf.concat(box_predictions[box_predictor.BOX_ENCODINGS], axis=1), axis=1) class_predictions_with_background = tf.squeeze( tf.concat( box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], axis=1), axis=1) absolute_proposal_boxes = ops.normalized_to_image_coordinates( proposal_boxes_normalized, image_shape, parallel_iterations=self._parallel_iterations) prediction_dict = { 'refined_box_encodings': refined_box_encodings, 'class_predictions_with_background': class_predictions_with_background, 'num_proposals': num_proposals, 'proposal_boxes': absolute_proposal_boxes, 'box_classifier_features': box_classifier_features, 'proposal_boxes_normalized': proposal_boxes_normalized, } return prediction_dict
def _predict_second_stage(self, rpn_box_encodings, rpn_objectness_predictions_with_background, rpn_features, anchors, image_shape): """Predicts the output tensors from 2nd stage of FasterRCNN. Args: rpn_box_encodings: 4-D float tensor of shape [batch_size, num_valid_anchors, self._box_coder.code_size] containing predicted boxes. rpn_objectness_predictions_with_background: 2-D float tensor of shape [batch_size, num_valid_anchors, 2] containing class predictions (logits) for each of the anchors. Note that this tensor *includes* background class predictions (at class index 0). rpn_features: A 4-D float32 tensor with shape [batch_size, height, width, depth] representing image features from the RPN. anchors: 2-D float tensor of shape [num_anchors, self._box_coder.code_size]. image_shape: A 1D int32 tensors of size [4] containing the image shape. Returns: prediction_dict: a dictionary holding "raw" prediction tensors: 1) refined_box_encodings: a 3-D tensor with shape [total_num_proposals, num_classes, 4] representing predicted (final) refined box encodings, where total_num_proposals=batch_size*self._max_num_proposals 2) class_predictions_with_background: a 3-D tensor with shape [total_num_proposals, num_classes + 1] containing class predictions (logits) for each of the anchors, where total_num_proposals=batch_size*self._max_num_proposals. Note that this tensor *includes* background class predictions (at class index 0). 3) num_proposals: An int32 tensor of shape [batch_size] representing the number of proposals generated by the RPN. `num_proposals` allows us to keep track of which entries are to be treated as zero paddings and which are not since we always pad the number of proposals to be `self.max_num_proposals` for each image. 4) proposal_boxes: A float32 tensor of shape [batch_size, self.max_num_proposals, 4] representing decoded proposal bounding boxes (in absolute coordinates). 5) proposal_boxes_normalized: A float32 tensor of shape [batch_size, self.max_num_proposals, 4] representing decoded proposal bounding boxes (in normalized coordinates). Can be used to override the boxes proposed by the RPN, thus enabling one to extract box classification and prediction for externally selected areas of the image. 6) box_classifier_features: a 4-D float32 tensor, of shape [batch_size, feature_map_height, feature_map_width, depth], representing the box classifier features. """ proposal_boxes_normalized, _, num_proposals = self._postprocess_rpn( rpn_box_encodings, rpn_objectness_predictions_with_background, anchors, image_shape) box_classifier_features = ( self._feature_extractor.extract_box_classifier_features( rpn_features, scope=self.second_stage_feature_extractor_scope)) box_predictions = self._rfcn_box_predictor.predict( box_classifier_features, num_predictions_per_location=1, scope=self.second_stage_box_predictor_scope, proposal_boxes=proposal_boxes_normalized) refined_box_encodings = tf.squeeze( box_predictions[box_predictor.BOX_ENCODINGS], axis=1) class_predictions_with_background = tf.squeeze( box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], axis=1) absolute_proposal_boxes = ops.normalized_to_image_coordinates( proposal_boxes_normalized, image_shape, parallel_iterations=self._parallel_iterations) prediction_dict = { 'refined_box_encodings': refined_box_encodings, 'class_predictions_with_background': class_predictions_with_background, 'num_proposals': num_proposals, 'proposal_boxes': absolute_proposal_boxes, 'box_classifier_features': box_classifier_features, 'proposal_boxes_normalized': proposal_boxes_normalized, } return prediction_dict
'Extremely hard to get through!!!!!!' b: Classification (second stage) 1. _predict_second_stage I: flattened_proposal_feature_maps = self._postprocess_rpn() 'Very complicate function!!!!!!' i: self._format_groundtruth_data(): ii: decoded_boxes = self._box_coder.decode(rpn_box_encodings, box_list.BoxList(anchors)) --> faster_rcnn_box_coder.FasterRcnnBoxCoder._decode() objectness_scores = tf.nn.softmax(rpn_objectness_predictions_with_background) iii:proposal_boxlist = post_processing.multiclass_non_max_suppression() iv: padded_proposals = box_list_ops.pad_or_clip_box_list() II: self._compute_second_stage_input_feature_maps() III: box_classifier_features = self._feature_extractor.extract_box_classifier_features() IV: box_predictions = self._mask_rcnn_box_predictor.predict() V: absolute_proposal_boxes = ops.normalized_to_image_coordinates() B: losses_dict = detection_model.loss a. _loss_rpn 1. target_assigner.batch_assign_targets() I: target_assigner.assign() i: match_quality_matrix = self._similarity_calc.compare(groundtruth_boxes,anchors) -->sim_calc.IouSimilarity()-->box_list_ops.iou() ii: match = self._matcher.match(match_quality_matrix, **params) -->argmax_matcher.ArgMaxMatcher._match() iii: reg_targets = self._create_regression_targets(anchors,groundtruth_boxes,match) iv: cls_targets = self._create_classification_targets(groundtruth_labels,match) v: reg_weights = self._create_regression_weights(match) vi: cls_weights = self._create_classification_weights( match, self._positive_class_weight, self._negative_class_weight) 2. localization_losses = self._first_stage_localization_loss
def _predict_second_stage(self, rpn_box_encodings, rpn_objectness_predictions_with_background, rpn_features, anchors, image_shape): """Predicts the output tensors from 2nd stage of FasterRCNN. Args: rpn_box_encodings: 4-D float tensor of shape [batch_size, num_valid_anchors, self._box_coder.code_size] containing predicted boxes. rpn_objectness_predictions_with_background: 2-D float tensor of shape [batch_size, num_valid_anchors, 2] containing class predictions (logits) for each of the anchors. Note that this tensor *includes* background class predictions (at class index 0). rpn_features: A 4-D float32 tensor with shape [batch_size, height, width, depth] representing image features from the RPN. anchors: 2-D float tensor of shape [num_anchors, self._box_coder.code_size]. image_shape: A 1D int32 tensors of size [4] containing the image shape. Returns: prediction_dict: a dictionary holding "raw" prediction tensors: 1) refined_box_encodings: a 3-D tensor with shape [total_num_proposals, num_classes, 4] representing predicted (final) refined box encodings, where total_num_proposals=batch_size*self._max_num_proposals 2) class_predictions_with_background: a 3-D tensor with shape [total_num_proposals, num_classes + 1] containing class predictions (logits) for each of the anchors, where total_num_proposals=batch_size*self._max_num_proposals. Note that this tensor *includes* background class predictions (at class index 0). 3) num_proposals: An int32 tensor of shape [batch_size] representing the number of proposals generated by the RPN. `num_proposals` allows us to keep track of which entries are to be treated as zero paddings and which are not since we always pad the number of proposals to be `self.max_num_proposals` for each image. 4) proposal_boxes: A float32 tensor of shape [batch_size, self.max_num_proposals, 4] representing decoded proposal bounding boxes (in absolute coordinates). """ proposal_boxes_normalized, _, num_proposals = self._postprocess_rpn( rpn_box_encodings, rpn_objectness_predictions_with_background, anchors, image_shape) box_classifier_features = ( self._feature_extractor.extract_box_classifier_features( rpn_features, scope=self.second_stage_feature_extractor_scope)) box_predictions = self._rfcn_box_predictor.predict( box_classifier_features, num_predictions_per_location=1, scope=self.second_stage_box_predictor_scope, proposal_boxes=proposal_boxes_normalized) refined_box_encodings = tf.squeeze( box_predictions[box_predictor.BOX_ENCODINGS], axis=1) class_predictions_with_background = tf.squeeze( box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], axis=1) absolute_proposal_boxes = ops.normalized_to_image_coordinates( proposal_boxes_normalized, image_shape, parallel_iterations=self._parallel_iterations) prediction_dict = { 'refined_box_encodings': refined_box_encodings, 'class_predictions_with_background': class_predictions_with_background, 'num_proposals': num_proposals, 'proposal_boxes': absolute_proposal_boxes, } return prediction_dict
def _predict_second_stage(self, rpn_box_encodings, rpn_objectness_predictions_with_background, rpn_features, anchors, image_shape, true_image_shapes): """Predicts the output tensors from 2nd stage of R-FCN. Args: rpn_box_encodings: 3-D float tensor of shape [batch_size, num_valid_anchors, self._box_coder.code_size] containing predicted boxes. rpn_objectness_predictions_with_background: 3-D float tensor of shape [batch_size, num_valid_anchors, 2] containing class predictions (logits) for each of the anchors. Note that this tensor *includes* background class predictions (at class index 0). rpn_features: A list of single 4-D float32 tensor with shape [batch_size, height, width, depth] representing image features from the RPN. anchors: 2-D float tensor of shape [num_anchors, self._box_coder.code_size]. image_shape: A 1D int32 tensors of size [4] containing the image shape. true_image_shapes: int32 tensor of shape [batch, 3] where each row is of the form [height, width, channels] indicating the shapes of true images in the resized images, as resized images can be padded with zeros. Returns: prediction_dict: a dictionary holding "raw" prediction tensors: 1) refined_box_encodings: a 3-D tensor with shape [total_num_proposals, num_classes, 4] representing predicted (final) refined box encodings, where total_num_proposals=batch_size*self._max_num_proposals 2) class_predictions_with_background: a 2-D tensor with shape [total_num_proposals, num_classes + 1] containing class predictions (logits) for each of the anchors, where total_num_proposals=batch_size*self._max_num_proposals. Note that this tensor *includes* background class predictions (at class index 0). 3) num_proposals: An int32 tensor of shape [batch_size] representing the number of proposals generated by the RPN. `num_proposals` allows us to keep track of which entries are to be treated as zero paddings and which are not since we always pad the number of proposals to be `self.max_num_proposals` for each image. 4) proposal_boxes: A float32 tensor of shape [batch_size, self.max_num_proposals, 4] representing decoded proposal bounding boxes (in absolute coordinates). 5) proposal_boxes_normalized: A float32 tensor of shape [batch_size, self.max_num_proposals, 4] representing decoded proposal bounding boxes (in normalized coordinates). Can be used to override the boxes proposed by the RPN, thus enabling one to extract box classification and prediction for externally selected areas of the image. 6) box_classifier_features: a 4-D float32 tensor, of shape [batch_size, feature_map_height, feature_map_width, depth], representing the box classifier features. """ image_shape_2d = tf.tile(tf.expand_dims(image_shape[1:], 0), [image_shape[0], 1]) (proposal_boxes_normalized, _, _, num_proposals, _, _) = self._postprocess_rpn( rpn_box_encodings, rpn_objectness_predictions_with_background, anchors, image_shape_2d, true_image_shapes) rpn_features = rpn_features[0] box_classifier_features = ( self._extract_box_classifier_features(rpn_features)) if self._rfcn_box_predictor.is_keras_model: box_predictions = self._rfcn_box_predictor( [box_classifier_features], proposal_boxes=proposal_boxes_normalized) else: box_predictions = self._rfcn_box_predictor.predict( [box_classifier_features], num_predictions_per_location=[1], scope=self.second_stage_box_predictor_scope, proposal_boxes=proposal_boxes_normalized) refined_box_encodings = tf.squeeze(tf.concat( box_predictions[box_predictor.BOX_ENCODINGS], axis=1), axis=1) class_predictions_with_background = tf.squeeze(tf.concat( box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], axis=1), axis=1) absolute_proposal_boxes = ops.normalized_to_image_coordinates( proposal_boxes_normalized, image_shape, parallel_iterations=self._parallel_iterations) prediction_dict = { 'refined_box_encodings': refined_box_encodings, 'class_predictions_with_background': class_predictions_with_background, 'num_proposals': num_proposals, 'proposal_boxes': absolute_proposal_boxes, 'box_classifier_features': box_classifier_features, 'proposal_boxes_normalized': proposal_boxes_normalized, 'final_anchors': absolute_proposal_boxes } if self._return_raw_detections_during_predict: prediction_dict.update( self._raw_detections_and_feature_map_inds( refined_box_encodings, absolute_proposal_boxes)) return prediction_dict
def _box_prediction(self, rpn_features_to_crop, proposal_boxes_normalized, image_shape, true_image_shapes, num_proposals, **side_inputs): """Predicts the output tensors from second stage of Faster R-CNN. Args: rpn_features_to_crop: A list 4-D float32 or bfloat16 tensor with shape [batch_size, height_i, width_i, depth] representing image features to crop using the proposal boxes predicted by the RPN. proposal_boxes_normalized: A float tensor with shape [batch_size, max_num_proposals, 4] representing the (potentially zero padded) proposal boxes for all images in the batch. These boxes are represented as normalized coordinates. image_shape: A 1D int32 tensors of size [4] containing the image shape. true_image_shapes: int32 tensor of shape [batch, 3] where each row is of the form [height, width, channels] indicating the shapes of true images in the resized images, as resized images can be padded with zeros. num_proposals: The number of valid box proposals. **side_inputs: additional tensors that are required by the network. Returns: prediction_dict: a dictionary holding "raw" prediction tensors: 1) refined_box_encodings: a 3-D float32 tensor with shape [total_num_proposals, num_classes, self._box_coder.code_size] representing predicted (final) refined box encodings, where total_num_proposals=batch_size*self._max_num_proposals. If using a shared box across classes the shape will instead be [total_num_proposals, 1, self._box_coder.code_size]. 2) class_predictions_with_background: a 3-D float32 tensor with shape [total_num_proposals, num_classes + 1] containing class predictions (logits) for each of the anchors, where total_num_proposals=batch_size*self._max_num_proposals. Note that this tensor *includes* background class predictions (at class index 0). 3) proposal_boxes: A float32 tensor of shape [batch_size, self.max_num_proposals, 4] representing decoded proposal bounding boxes in absolute coordinates. 4) proposal_boxes_normalized: A float32 tensor of shape [batch_size, self.max_num_proposals, 4] representing decoded proposal bounding boxes in normalized coordinates. Can be used to override the boxes proposed by the RPN, thus enabling one to extract features and get box classification and prediction for externally selected areas of the image. 5) box_classifier_features: a 4-D float32/bfloat16 tensor representing the features for each proposal. If self._return_raw_detections_during_predict is True, the dictionary will also contain: 6) raw_detection_boxes: a 4-D float32 tensor with shape [batch_size, self.max_num_proposals, num_classes, 4] in normalized coordinates. 7) raw_detection_feature_map_indices: a 3-D int32 tensor with shape [batch_size, self.max_num_proposals, num_classes]. 8) final_anchors: a 3-D float tensor of shape [batch_size, self.max_num_proposals, 4] containing the reference anchors for raw detection boxes in normalized coordinates. """ flattened_proposal_feature_maps = ( self._compute_second_stage_input_feature_maps( rpn_features_to_crop, proposal_boxes_normalized, image_shape, num_proposals, **side_inputs)) box_classifier_features = self._extract_box_classifier_features( flattened_proposal_feature_maps, num_proposals, **side_inputs) if self._mask_rcnn_box_predictor.is_keras_model: box_predictions = self._mask_rcnn_box_predictor( [box_classifier_features], prediction_stage=2) else: box_predictions = self._mask_rcnn_box_predictor.predict( [box_classifier_features], num_predictions_per_location=[1], scope=self.second_stage_box_predictor_scope, prediction_stage=2) refined_box_encodings = tf.squeeze( box_predictions[box_predictor.BOX_ENCODINGS], axis=1, name='all_refined_box_encodings') class_predictions_with_background = tf.squeeze( box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], axis=1, name='all_class_predictions_with_background') absolute_proposal_boxes = ops.normalized_to_image_coordinates( proposal_boxes_normalized, image_shape, self._parallel_iterations) prediction_dict = { 'refined_box_encodings': tf.cast(refined_box_encodings, dtype=tf.float32), 'class_predictions_with_background': tf.cast(class_predictions_with_background, dtype=tf.float32), 'proposal_boxes': absolute_proposal_boxes, 'box_classifier_features': box_classifier_features, 'proposal_boxes_normalized': proposal_boxes_normalized, 'final_anchors': proposal_boxes_normalized } if self._return_raw_detections_during_predict: prediction_dict.update( self._raw_detections_and_feature_map_inds( refined_box_encodings, absolute_proposal_boxes, true_image_shapes)) return prediction_dict