def testBasic(self):
    """Smoke-test the outputs produced by the FasterCnnNetwork.

    Runs the network once and checks structural invariants of both the
    final classification output and the RPN output.
    """
    outputs = self._run_network()
    cls_pred = outputs['classification_prediction']
    rpn_pred = outputs['rpn_prediction']

    # Every predicted object must be a box of 4 coordinates.
    self.assertEqual(cls_pred['objects'].shape[1], 4)

    # Predicted objects must already be clipped to the image frame:
    # clipping them again should be a no-op.
    self.assertAllEqual(
        clip_boxes(cls_pred['objects'], self.image_size),
        cls_pred['objects']
    )

    # Exactly one label per predicted object.
    self.assertEqual(
        cls_pred['labels'].shape[0],
        cls_pred['objects'].shape[0]
    )

    # Labels must stay within the configured number of classes.
    self.assertTrue(
        np.less_equal(
            cls_pred['labels'], self.config.model.network.num_classes
        ).all()
    )

    # Class probabilities come from a softmax, so each row sums to 1.
    self.assertAllClose(
        np.sum(cls_pred['rcnn']['cls_prob'], axis=1),
        np.ones((cls_pred['rcnn']['cls_prob'].shape[0]))
    )

    # The same holds for the RPN's class probabilities.
    self.assertAllClose(
        np.sum(rpn_pred['rpn_cls_prob'], axis=1),
        np.ones((rpn_pred['rpn_cls_prob'].shape[0]))
    )

    # Every RPN proposal must also be a 4-coordinate box...
    self.assertEqual(rpn_pred['proposals'].shape[1], 4)

    # ...and already clipped to the image frame.
    self.assertAllEqual(
        clip_boxes(rpn_pred['proposals'], self.image_size),
        rpn_pred['proposals']
    )
def testAnchors(self):
    """Tests about the anchors generated by the FasterRCNN."""
    outputs = self._run_network()

    # Anchors returned by the network must already be clipped to the
    # image, so clipping them again must leave them unchanged.
    self.assertAllEqual(
        clip_boxes(outputs['all_anchors'], self.image_size),
        outputs['all_anchors']
    )

    fake_feature_map = np.random.randint(
        low=0, high=255, size=(1, 32, 32, 1))
    config = self.config
    config.model.anchors.base_size = 16
    config.model.anchors.scales = [0.5, 1, 2]
    config.model.anchors.ratios = [0.5, 1, 2]
    config.model.anchors.stride = 1  # image is 32 x 32

    anchors = self._gen_anchors(config, fake_feature_map.shape)

    # Expected count: 32^2 positions * 3 scales * 3 ratios
    # = 1024 * 9 = 9216 anchors of 4 coordinates each.
    self.assertEqual(anchors.shape, (9216, 4))

    widths = anchors[:, 2] - anchors[:, 0]
    heights = anchors[:, 3] - anchors[:, 1]

    # Since scales and ratios are the same list, the set of unique
    # widths must coincide with the set of unique heights.
    self.assertAllEqual(np.unique(widths), np.unique(heights))

    # 3x3 = 9 candidate areas, 3 of which repeat -> 6 unique values.
    areas = widths * heights
    self.assertAllEqual(np.unique(areas).shape[0], 6)

    # Check the anchors cover all the image.
    # TODO: Check with values calculated from config.
    self.assertEqual(np.min(anchors[:, 0]), -22)
    self.assertEqual(np.max(anchors[:, 0]), 29)
    self.assertEqual(np.min(anchors[:, 1]), -22)
    self.assertEqual(np.max(anchors[:, 1]), 29)
    self.assertEqual(np.min(anchors[:, 2]), 2)
    self.assertEqual(np.max(anchors[:, 2]), 53)
    self.assertEqual(np.min(anchors[:, 3]), 2)
    self.assertEqual(np.max(anchors[:, 3]), 53)

    stride = config.model.anchors.stride
    # Each coordinate column must advance in steps of the stride.
    self._assert_sequential_values(anchors[:, 0], stride)
    self._assert_sequential_values(anchors[:, 1], stride)
    self._assert_sequential_values(anchors[:, 2], stride)
    self._assert_sequential_values(anchors[:, 3], stride)
def _build(self, image, gt_boxes=None, is_training=False):
    """Returns bounding boxes and classification probabilities.

    Args:
        image: A tensor with the image. Its shape should be
            `(height, width, 3)`.
        gt_boxes: A tensor with all the ground truth boxes of that image.
            Its shape should be `(num_gt_boxes, 5)`
            Where for each gt box we have (x1, y1, x2, y2, label),
            in that order.
        is_training: A boolean to whether or not it is used for training.

    Returns:
        A dictionary with the following keys:
        predictions: the prediction results.
        proposal_prediction: information about the proposals.
            A dictionary with:
            proposals: The proposals of the network after appling some
                filters like negative area; and NMS. These are the
                proposals that ultimately survive.
            proposals_label: A tensor with the label for each proposal.
            proposals_label_prob: A tensor with the softmax probability
                for the label of each proposal.
        bbox_offsets: A tensor with the predicted bbox_offsets.
        class_scores: A tensor with the predicted classes scores
            (confidences).
    """
    # Reshape image.
    # NOTE(review): this mutates self.image_shape in place, so a second
    # call to _build on the same instance would append another channel
    # dimension -- confirm _build is only invoked once.
    self.image_shape.append(3)  # Add channels to shape
    image.set_shape(self.image_shape)
    # Single-image network: fake a batch dimension of 1.
    image = tf.expand_dims(image, 0, name='hardcode_batch_size_to_1')

    # Generate feature maps from image.
    self.feature_extractor = SSDFeatureExtractor(
        self._config.base_network, parent_name=self.module_name)
    # Fetch the feature maps.
    # Question: do these correspond to several conv layers or just one
    # conv layer's output?
    # Answer: they are all the feature maps of interest.
    feature_maps = self.feature_extractor(image, is_training=is_training)

    # Build a MultiBox predictor on top of each feature layer and collect
    # the bounding box offsets and the category score logits they produce.
    bbox_offsets_list = []
    class_scores_list = []
    # Iterate over the different output feature maps and predict on each.
    for i, feat_map in enumerate(feature_maps.values()):
        multibox_predictor_name = 'MultiBox_{}'.format(i)
        with tf.name_scope(multibox_predictor_name):
            # Using the per-point anchor count keeps the number of
            # predictions equal to -- and aligned with -- the number of
            # anchors generated further below.
            num_anchors = self._anchors_per_point[i]

            # Predict bbox offsets.
            # A 3x3 convolution predicts the coordinate offsets.
            bbox_offsets_layer = Conv2D(
                num_anchors * 4, [3, 3],
                name=multibox_predictor_name + '_offsets_conv')(feat_map)
            # (H x W x num_anchors, 4); H and W here should match this
            # feature map's spatial size.
            bbox_offsets_flattened = tf.reshape(bbox_offsets_layer, [-1, 4])
            # Collect the offsets of every predicted box.
            bbox_offsets_list.append(bbox_offsets_flattened)

            # Predict class scores.
            # A 3x3 convolution predicts the classes (background included).
            class_scores_layer = Conv2D(
                num_anchors * (self._num_classes + 1), [3, 3],
                name=multibox_predictor_name + '_classes_conv',
            )(feat_map)
            class_scores_flattened = tf.reshape(
                class_scores_layer, [-1, self._num_classes + 1])
            # Collect the class predictions of every predicted box.
            class_scores_list.append(class_scores_flattened)

    # Combine all the predicted bbox offsets and class scores, and
    # compute the corresponding softmax probabilities.
    # (num_bboxes, 4) -- relative to the anchors generated below.
    bbox_offsets = tf.concat(bbox_offsets_list, axis=0,
                             name='concatenate_all_bbox_offsets')
    # (num_bboxes, num_classes + 1) raw scores.
    class_scores = tf.concat(class_scores_list, axis=0,
                             name='concatenate_all_class_scores')
    # (num_bboxes, num_classes + 1) probabilities (also per anchor).
    class_probabilities = tf.nn.softmax(
        class_scores, axis=-1, name='class_probabilities_softmax')

    # These anchors are not predictions: they are reference boxes
    # generated from the feature maps.
    # Generate anchors (generated only once, therefore we use numpy).
    # Using the anchor parameters, generate all anchors for every conv
    # feature map (coordinates in feature-map space).
    raw_anchors_per_featmap = generate_raw_anchors(
        feature_maps, self._anchor_min_scale, self._anchor_max_scale,
        self._anchor_ratios, self._anchors_per_point)

    anchors_list = []
    # Iterate over all feature maps, map their anchors onto the original
    # image, and clip them.
    for i, (feat_map_name, feat_map) in enumerate(feature_maps.items()):
        # TODO: Anchor generation should be simpler. We should create
        # them in image scale from the start instead of scaling
        # them to their feature map size.
        # feat_map should be (num_batch, height, width, channels);
        # keep only (height, width).
        feat_map_shape = feat_map.shape.as_list()[1:3]
        # Map anchors from feature-map space into image space.
        scaled_bboxes = adjust_bboxes(
            raw_anchors_per_featmap[feat_map_name],
            feat_map_shape[0], feat_map_shape[1],
            self.image_shape[0], self.image_shape[1])
        clipped_bboxes = clip_boxes(scaled_bboxes, self.image_shape)
        anchors_list.append(clipped_bboxes)
    # Merge the image-space coordinates of all anchors.
    anchors = np.concatenate(anchors_list, axis=0)
    anchors = tf.convert_to_tensor(anchors, dtype=tf.float32)

    # This is the dict we'll return after filling it with SSD's results.
    prediction_dict = {}

    # Generate targets for training.
    if gt_boxes is not None:
        gt_boxes = tf.cast(gt_boxes, tf.float32)

        # Generate targets.
        target_creator = SSDTarget(self._num_classes, self._config.target,
                                   self._config.variances)
        # Returns, per anchor: a class label (0..num_classes), and -- at
        # foreground anchor positions -- the offset/scale of the matched
        # ground-truth box relative to the anchor (0 elsewhere).
        # The class labels are decided from the best-matching ground
        # truth box, via IoU, for:
        #   1. anchors whose max IoU over all gt boxes exceeds the
        #      threshold, and
        #   2. the best anchor for each gt box.
        # They do not depend on the predictions, except that the
        # predicted class probabilities are used for the background
        # (negative) samples.
        class_targets, bbox_offsets_targets = target_creator(
            class_probabilities, anchors, gt_boxes)

        # Filter the predictions and targets that we will ignore during
        # training due to hard negative mining. We use class_targets to
        # know which ones to ignore (they are marked as -1 if they are to
        # be ignored).
        # Every anchor has exactly one predicted offset/score, so masking
        # all of these tensors with the same filter keeps them aligned;
        # only entries whose class target is >= 0 are kept
        # (boolean_mask keeps the rows where the filter is True).
        with tf.name_scope('hard_negative_mining_filter'):
            predictions_filter = tf.greater_equal(class_targets, 0)

            anchors = tf.boolean_mask(anchors, predictions_filter)
            bbox_offsets_targets = tf.boolean_mask(
                bbox_offsets_targets, predictions_filter)
            class_targets = tf.boolean_mask(
                class_targets, predictions_filter)
            class_scores = tf.boolean_mask(
                class_scores, predictions_filter)
            class_probabilities = tf.boolean_mask(
                class_probabilities, predictions_filter)
            bbox_offsets = tf.boolean_mask(
                bbox_offsets, predictions_filter)

        # Add target tensors to prediction dict.
        # These are the entries related to the ground truth.
        prediction_dict['target'] = {
            'cls': class_targets,
            'bbox_offsets': bbox_offsets_targets,
            'anchors': anchors
        }

        # Add network's raw output to prediction dict.
        # These are the entries related to the predictions.
        prediction_dict['cls_pred'] = class_scores
        prediction_dict['loc_pred'] = bbox_offsets

    # At this point we have all the adjusted raw predictions; they are
    # enough for training, but for prediction (or debug visualization)
    # the data still needs further filtering.
    # We generate proposals when predicting, or when debug=True for
    # generating visualizations during training.
    if not is_training or self._debug:
        proposals_creator = SSDProposal(self._num_classes,
                                        self._config.proposals,
                                        self._config.variances)
        proposals = proposals_creator(
            class_probabilities, bbox_offsets, anchors,
            tf.cast(tf.shape(image)[1:3], tf.float32))
        prediction_dict['classification_prediction'] = proposals

    # Add some non essential metrics for debugging.
    if self._debug:
        prediction_dict['all_anchors'] = anchors
        prediction_dict['cls_prob'] = class_probabilities

    return prediction_dict
def _build(self, image, gt_boxes=None, is_training=False):
    """Returns bounding boxes and classification probabilities.

    Args:
        image: A tensor with the image. Its shape should be
            `(height, width, 3)`.
        gt_boxes: A tensor with all the ground truth boxes of that image.
            Its shape should be `(num_gt_boxes, 5)`
            Where for each gt box we have (x1, y1, x2, y2, label),
            in that order.
        is_training: A boolean to whether or not it is used for training.

    Returns:
        A dictionary with the following keys:
        predictions:
        proposal_prediction: A dictionary with:
            proposals: The proposals of the network after appling some
                filters like negative area; and NMS
            proposals_label: A tensor with the label for each proposal.
            proposals_label_prob: A tensor with the softmax probability
                for the label of each proposal.
        bbox_offsets: A tensor with the predicted bbox_offsets
        class_scores: A tensor with the predicted classes scores
    """
    # Reshape image.
    # NOTE(review): this mutates self.image_shape in place, so a second
    # call to _build on the same instance would append another channel
    # dimension -- confirm _build is only invoked once.
    self.image_shape.append(3)  # Add channels to shape
    image.set_shape(self.image_shape)
    # Single-image network: fake a batch dimension of 1.
    image = tf.expand_dims(image, 0, name='hardcode_batch_size_to_1')

    # Generate feature maps from image.
    self.feature_extractor = SSDFeatureExtractor(
        self._config.base_network, parent_name=self.module_name)
    feature_maps = self.feature_extractor(image, is_training=is_training)

    # Build a MultiBox predictor on top of each feature layer and collect
    # the bounding box offsets and the category score logits they produce.
    bbox_offsets_list = []
    class_scores_list = []
    for i, feat_map in enumerate(feature_maps.values()):
        multibox_predictor_name = 'MultiBox_{}'.format(i)
        with tf.name_scope(multibox_predictor_name):
            # Matches the number of predictions to the number of anchors
            # generated per position further below.
            num_anchors = self._anchors_per_point[i]

            # Predict bbox offsets with a 3x3 convolution.
            bbox_offsets_layer = Conv2D(
                num_anchors * 4, [3, 3],
                name=multibox_predictor_name + '_offsets_conv')(feat_map)
            # Flatten to (H * W * num_anchors, 4).
            bbox_offsets_flattened = tf.reshape(bbox_offsets_layer, [-1, 4])
            bbox_offsets_list.append(bbox_offsets_flattened)

            # Predict class scores (num_classes + 1 accounts for the
            # background class) with a 3x3 convolution.
            class_scores_layer = Conv2D(
                num_anchors * (self._num_classes + 1), [3, 3],
                name=multibox_predictor_name + '_classes_conv',
            )(feat_map)
            class_scores_flattened = tf.reshape(
                class_scores_layer, [-1, self._num_classes + 1])
            class_scores_list.append(class_scores_flattened)

    # Concatenate the per-feature-map predictions into single tensors and
    # turn the raw class scores into softmax probabilities.
    bbox_offsets = tf.concat(bbox_offsets_list, axis=0,
                             name='concatenate_all_bbox_offsets')
    class_scores = tf.concat(class_scores_list, axis=0,
                             name='concatenate_all_class_scores')
    class_probabilities = tf.nn.softmax(
        class_scores, axis=-1, name='class_probabilities_softmax')

    # Generate anchors (generated only once, therefore we use numpy).
    raw_anchors_per_featmap = generate_raw_anchors(
        feature_maps, self._anchor_min_scale, self._anchor_max_scale,
        self._anchor_ratios, self._anchors_per_point)

    anchors_list = []
    for i, (feat_map_name, feat_map) in enumerate(feature_maps.items()):
        # TODO: Anchor generation should be simpler. We should create
        # them in image scale from the start instead of scaling
        # them to their feature map size.
        # Keep only the spatial dims (assumes NHWC layout -- the batch
        # dim is dropped by [1:3]).
        feat_map_shape = feat_map.shape.as_list()[1:3]
        # Scale anchors from feature-map coordinates to image
        # coordinates, then clip them to the image.
        scaled_bboxes = adjust_bboxes(
            raw_anchors_per_featmap[feat_map_name],
            feat_map_shape[0], feat_map_shape[1],
            self.image_shape[0], self.image_shape[1])
        clipped_bboxes = clip_boxes(scaled_bboxes, self.image_shape)
        anchors_list.append(clipped_bboxes)
    anchors = np.concatenate(anchors_list, axis=0)
    anchors = tf.convert_to_tensor(anchors, dtype=tf.float32)

    # This is the dict we'll return after filling it with SSD's results.
    prediction_dict = {}

    # Generate targets for training.
    if gt_boxes is not None:
        gt_boxes = tf.cast(gt_boxes, tf.float32)

        # Generate targets.
        target_creator = SSDTarget(self._num_classes, self._config.target,
                                   self._config.variances)
        class_targets, bbox_offsets_targets = target_creator(
            class_probabilities, anchors, gt_boxes)

        # Filter the predictions and targets that we will ignore during
        # training due to hard negative mining. We use class_targets to
        # know which ones to ignore (they are marked as -1 if they are to
        # be ignored).
        # All tensors are masked with the same filter so that anchors,
        # targets and predictions stay row-aligned.
        with tf.name_scope('hard_negative_mining_filter'):
            predictions_filter = tf.greater_equal(class_targets, 0)

            anchors = tf.boolean_mask(anchors, predictions_filter)
            bbox_offsets_targets = tf.boolean_mask(
                bbox_offsets_targets, predictions_filter)
            class_targets = tf.boolean_mask(
                class_targets, predictions_filter)
            class_scores = tf.boolean_mask(
                class_scores, predictions_filter)
            class_probabilities = tf.boolean_mask(
                class_probabilities, predictions_filter)
            bbox_offsets = tf.boolean_mask(
                bbox_offsets, predictions_filter)

        # Add target tensors to prediction dict.
        prediction_dict['target'] = {
            'cls': class_targets,
            'bbox_offsets': bbox_offsets_targets,
            'anchors': anchors
        }

        # Add network's raw output to prediction dict.
        prediction_dict['cls_pred'] = class_scores
        prediction_dict['loc_pred'] = bbox_offsets

    # We generate proposals when predicting, or when debug=True for
    # generating visualizations during training.
    if not is_training or self._debug:
        proposals_creator = SSDProposal(self._num_classes,
                                        self._config.proposals,
                                        self._config.variances)
        proposals = proposals_creator(
            class_probabilities, bbox_offsets, anchors,
            tf.cast(tf.shape(image)[1:3], tf.float32))
        prediction_dict['classification_prediction'] = proposals

    # Add some non essential metrics for debugging.
    if self._debug:
        prediction_dict['all_anchors'] = anchors
        prediction_dict['cls_prob'] = class_probabilities

    return prediction_dict