# These snippets assume the repo's module-level imports, e.g.:
#   import numpy as np
#   import tensorflow as tf
#   import tensorflow.contrib.slim as slim
# plus the project modules (boxes_utils, build_fpn, build_rpn, build_head,
# make_anchor, encode_and_decode, ...).


def fast_rcnn_net(self, features, is_training):
    """
    Map each ROI feature to its final bbox regressions and class scores.
    :param features: (batch_size, num_proposal, 7, 7, channels)
    :return: fast_rcnn_encode_boxes: (batch_size, num_proposal, num_classes, 4)
             fast_rcnn_scores: (batch_size, num_proposal, num_classes)
    """
    def batch_slice_fast_rcnn_net(features, config, is_training):
        with tf.variable_scope('fast_rcnn_net', reuse=tf.AUTO_REUSE):
            with slim.arg_scope([slim.fully_connected],
                                activation_fn=None,
                                weights_initializer=tf.glorot_uniform_initializer(),
                                weights_regularizer=slim.l2_regularizer(config.WEIGHT_DECAY)):
                batch_norm_params = {
                    'is_training': is_training,
                    'decay': 0.997,
                    'epsilon': 1e-5,
                    'scale': True,
                    'trainable': True,
                    'updates_collections': tf.GraphKeys.UPDATE_OPS,
                }
                with slim.arg_scope([slim.conv2d],
                                    stride=1,
                                    padding="VALID",
                                    activation_fn=tf.nn.relu,
                                    weights_initializer=tf.glorot_uniform_initializer(),
                                    normalizer_fn=slim.batch_norm,
                                    normalizer_params=batch_norm_params,
                                    weights_regularizer=slim.l2_regularizer(config.WEIGHT_DECAY)):
                    with slim.arg_scope([slim.batch_norm], **batch_norm_params):
                        # a ROI_SIZE x ROI_SIZE VALID conv acts as the first
                        # fully connected layer over each pooled ROI
                        net = slim.conv2d(inputs=features,
                                          num_outputs=1024,
                                          kernel_size=[config.ROI_SIZE, config.ROI_SIZE],
                                          scope="fc_1")
                        net = slim.conv2d(inputs=net,
                                          num_outputs=1024,
                                          kernel_size=[1, 1],
                                          scope="fc_2")
                        net = tf.squeeze(net, axis=[1, 2])
                        fast_rcnn_scores = slim.fully_connected(
                            net, config.NUM_CLASS, scope='classifier')
                        fast_rcnn_encode_boxes = slim.fully_connected(
                            net, config.NUM_CLASS * 4, scope='regressor')
                        fast_rcnn_encode_boxes = tf.reshape(
                            fast_rcnn_encode_boxes, [-1, config.NUM_CLASS, 4])
        return fast_rcnn_encode_boxes, fast_rcnn_scores

    fast_rcnn_encode_boxes, fast_rcnn_scores = boxes_utils.batch_slice(
        [features],
        lambda x: batch_slice_fast_rcnn_net(x, self.config, is_training),
        self.config.PER_GPU_IMAGE)
    return fast_rcnn_encode_boxes, fast_rcnn_scores
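# boxes_utils.batch_slice is used throughout but not defined in this section.
# Below is a minimal sketch of what it likely does, assuming it mirrors the
# Mask R-CNN-style helper of the same name: split each input along the batch
# axis, run the graph function per image, and re-stack the outputs. The real
# implementation in boxes_utils may differ.
import tensorflow as tf


def batch_slice(inputs, graph_fn, batch_size):
    """Apply graph_fn to each image slice of `inputs`, then stack the results."""
    if not isinstance(inputs, list):
        inputs = [inputs]
    outputs = []
    for i in range(batch_size):
        inputs_slice = [x[i] for x in inputs]
        output_slice = graph_fn(*inputs_slice)
        if not isinstance(output_slice, (tuple, list)):
            output_slice = [output_slice]
        outputs.append(output_slice)
    # transpose from per-image lists to per-output lists, then stack on axis 0
    outputs = list(zip(*outputs))
    result = [tf.stack(o, axis=0) for o in outputs]
    if len(result) == 1:
        result = result[0]
    return result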
def model_fn(features, labels, mode, params):
    # ***********************************************************************
    # *                           Backbone Net                              *
    # ***********************************************************************
    net_config = params["net_config"]
    IS_TRAINING = (mode == tf.estimator.ModeKeys.TRAIN)

    origin_image_batch = features["image"]
    image_batch = origin_image_batch - tf.convert_to_tensor(
        net_config.PIXEL_MEANS, dtype=tf.float32)
    image_window = features["image_window"]
    # is_training=True keeps batch norm in training mode for the backbone,
    # which matters when fine-tuning from a pretrained checkpoint
    _, share_net = get_network_byname(net_name='resnet_v1_50',
                                      inputs=image_batch,
                                      num_classes=None,
                                      is_training=True,
                                      global_pool=True,
                                      output_stride=None,
                                      spatial_squeeze=True)
    # ***********************************************************************
    # *                                FPN                                  *
    # ***********************************************************************
    feature_pyramid = build_fpn.build_feature_pyramid(share_net, net_config)
    # ***********************************************************************
    # *                                RPN                                  *
    # ***********************************************************************
    gtboxes_and_label_batch = labels.get("gt_box_labels")
    rpn = build_rpn.RPN(feature_pyramid=feature_pyramid,
                        image_window=image_window,
                        config=net_config)
    # rpn_proposals_scores: (2000,)
    rpn_proposals_boxes, rpn_proposals_scores = rpn.rpn_proposals(IS_TRAINING)
    rpn_location_loss, rpn_classification_loss = rpn.rpn_losses(
        labels["minibatch_indices"],
        labels["minibatch_encode_gtboxes"],
        labels["minibatch_objects_one_hot"])
    rpn_total_loss = rpn_classification_loss + rpn_location_loss

    # ***********************************************************************
    # *                          Fast RCNN Head                             *
    # ***********************************************************************
    fpn_fast_rcnn_head = build_head.FPNHead(
        feature_pyramid=feature_pyramid,
        rpn_proposals_boxes=rpn_proposals_boxes,
        origin_image=origin_image_batch,
        gtboxes_and_label=gtboxes_and_label_batch,
        config=net_config,
        is_training=False,
        image_window=image_window)
    detections = fpn_fast_rcnn_head.head_detection()

    if net_config.DEBUG:
        print_tensors(rpn_proposals_scores[0, :50], "scores")
        print_tensors(rpn_proposals_boxes[0, :50, :], "bbox")
        rpn_proposals_vision = draw_boxes_with_scores(
            origin_image_batch[0, :, :, :],
            rpn_proposals_boxes[0, :50, :],
            rpn_proposals_scores[0, :50])
        head_vision = draw_boxes_with_categories_and_scores(
            origin_image_batch[0, :, :, :],
            detections[0, :, :4],
            detections[0, :, 4],
            detections[0, :, 5],
            net_config.LABEL_TO_NAME)
        tf.summary.image("rpn_proposals_vision", rpn_proposals_vision)
        tf.summary.image("head_vision", head_vision)

    head_location_loss, head_classification_loss = fpn_fast_rcnn_head.head_loss()
    head_total_loss = head_location_loss + head_classification_loss

    # train
    with tf.name_scope("regularization_losses"):
        regularization_list = [
            tf.nn.l2_loss(w.read_value()) * net_config.WEIGHT_DECAY /
            tf.cast(tf.size(w.read_value()), tf.float32)
            for w in tf.trainable_variables()
            if 'gamma' not in w.name and 'beta' not in w.name
        ]
        regularization_loss = tf.add_n(regularization_list)

    total_loss = regularization_loss + head_total_loss + rpn_total_loss
    # guard the optimizer against a NaN loss by replacing it with zero
    total_loss = tf.cond(tf.is_nan(total_loss),
                         lambda: tf.constant(0.0, dtype=tf.float32),
                         lambda: total_loss)
    print_tensors(head_total_loss, "head_loss")
    print_tensors(rpn_total_loss, "rpn_loss")
    global_step = tf.train.get_or_create_global_step()
    tf.train.init_from_checkpoint(
        net_config.CHECKPOINT_DIR,
        {net_config.BACKBONE_NET + "/": net_config.BACKBONE_NET + "/"})
    with tf.name_scope("optimizer"):
        lr = tf.train.piecewise_constant(
            global_step,
            boundaries=[np.int64(net_config.BOUNDARY[0])],
            values=[net_config.LEARNING_RATE, net_config.LEARNING_RATE / 10])
        optimizer = tf.train.MomentumOptimizer(lr, momentum=net_config.MOMENTUM)
        optimizer = tf.contrib.estimator.TowerOptimizer(optimizer)
        # run batch-norm moving-average updates before each gradient step
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies([tf.group(*update_ops)]):
            grads = optimizer.compute_gradients(total_loss)
            # clip gradients
            grads = tf.contrib.training.clip_gradient_norms(
                grads, net_config.CLIP_GRADIENT_NORM)
            train_op = optimizer.apply_gradients(grads, global_step)

    # ***********************************************************************
    # *                              Summary                                *
    # ***********************************************************************
    # rpn losses and images
    tf.summary.scalar('rpn_location_loss', rpn_location_loss, family="rpn_loss")
    tf.summary.scalar('rpn_classification_loss', rpn_classification_loss,
                      family="rpn_loss")
    tf.summary.scalar('rpn_total_loss', rpn_total_loss, family="rpn_loss")
    tf.summary.scalar('head_location_loss', head_location_loss, family="head_loss")
    tf.summary.scalar('head_classification_loss', head_classification_loss,
                      family="head_loss")
    tf.summary.scalar('head_total_loss', head_total_loss, family="head_loss")
    tf.summary.scalar("regularization_loss", regularization_loss)
    tf.summary.scalar('total_loss', total_loss)
    tf.summary.scalar('learning_rate', lr)

    meta_hook = MetadataHook(
        save_steps=net_config.SAVE_EVERY_N_STEP * net_config.EPOCH // 2,
        output_dir=net_config.MODLE_DIR)
    summary_hook = tf.train.SummarySaverHook(
        save_steps=net_config.SAVE_EVERY_N_STEP,
        output_dir=net_config.MODLE_DIR,
        summary_op=tf.summary.merge_all())
    hooks = [summary_hook]
    if net_config.COMPUTE_TIME:
        hooks.append(meta_hook)

    if mode == tf.estimator.ModeKeys.TRAIN:
        return tf.estimator.EstimatorSpec(mode,
                                          loss=total_loss,
                                          train_op=train_op,
                                          training_hooks=hooks)
    # ***********************************************************************
    # *                                EVAL                                 *
    # ***********************************************************************
    metric_ap_dict = batch_slice(
        [features["gt_box_labels"][:, :, :4],
         features["gt_box_labels"][:, :, 4],
         detections[:, :, :4],
         detections[:, :, 4],
         detections[:, :, 5]],
        lambda x, y, z, u, v: compute_metric_ap(x, y, z, u, v, net_config),
        net_config.PER_GPU_IMAGE)
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode,
                                          loss=total_loss,
                                          eval_metric_ops=metric_ap_dict)
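# A hedged usage sketch (not from the original repo): because model_fn wraps
# its optimizer in tf.contrib.estimator.TowerOptimizer, it has to be paired
# with tf.contrib.estimator.replicate_model_fn before being handed to the
# Estimator. `train_input_fn` and `net_config.MAX_STEPS` are hypothetical
# names used only for illustration.
import tensorflow as tf


def train(net_config, train_input_fn):
    replicated_model_fn = tf.contrib.estimator.replicate_model_fn(model_fn)
    estimator = tf.estimator.Estimator(
        model_fn=replicated_model_fn,
        model_dir=net_config.MODLE_DIR,
        params={"net_config": net_config})
    estimator.train(input_fn=train_input_fn,
                    max_steps=net_config.MAX_STEPS)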
def build_frcnn_target(self):
    """
    During training, each reference box needs a class label and a matched gtbox.
    In the second stage, iou >= 0.5 is an object and iou < 0.5 is background.
    This function must run under batch_slice.
    :return:
        minibatch_reference_proboxes: (batch_size, config.FAST_RCNN_MINIBATCH_SIZE, 4) [y1, x1, y2, x2]
        minibatch_encode_gtboxes: (batch_size, config.FAST_RCNN_MINIBATCH_SIZE, 4) [dy, dx, log(dh), log(dw)]
        object_mask: (batch_size, config.FAST_RCNN_MINIBATCH_SIZE) 1 means object, 0 means background
        gt_class_ids: (batch_size, config.FAST_RCNN_MINIBATCH_SIZE)
    """
    def batch_slice_build_target(gtboxes_and_label, rpn_proposals_boxes, config):
        with tf.variable_scope('build_faster_rcnn_targets'):
            with tf.variable_scope('fast_rcnn_find_positive_negative_samples'):
                gtboxes = tf.cast(
                    tf.reshape(gtboxes_and_label[:, :-1], [-1, 4]), tf.float32)
                gt_class_ids = tf.cast(
                    tf.reshape(gtboxes_and_label[:, -1], [-1]), tf.int32)
                gtboxes, non_zeros = boxes_utils.trim_zeros_graph(
                    gtboxes, name="trim_gt_box")  # [M, 4]
                gt_class_ids = tf.boolean_mask(gt_class_ids, non_zeros)
                rpn_proposals_boxes, _ = boxes_utils.trim_zeros_graph(
                    rpn_proposals_boxes, name="trim_rpn_proposal_train")

                ious = boxes_utils.iou_calculate(rpn_proposals_boxes, gtboxes)  # [N, M]
                matchs = tf.cast(tf.argmax(ious, axis=1), tf.int32)  # [N, ]
                max_iou_each_row = tf.reduce_max(ious, axis=1)
                positives = tf.cast(
                    tf.greater_equal(max_iou_each_row,
                                     config.FAST_RCNN_IOU_POSITIVE_THRESHOLD),
                    tf.int32)

                reference_boxes_mattached_gtboxes = tf.gather(gtboxes, matchs)  # [N, 4]
                gt_class_ids = tf.gather(gt_class_ids, matchs)  # [N, ]
                object_mask = tf.cast(positives, tf.float32)  # [N, ]
                # background boxes get label 0, so they contribute no gradient
                # for any foreground class
                gt_class_ids = gt_class_ids * positives

            with tf.variable_scope('fast_rcnn_minibatch'):
                # choose the positive indices
                positive_indices = tf.reshape(
                    tf.where(tf.not_equal(object_mask, 0.)), [-1])
                num_of_positives = tf.minimum(
                    tf.shape(positive_indices)[0],
                    tf.cast(config.FAST_RCNN_MINIBATCH_SIZE *
                            config.FAST_RCNN_POSITIVE_RATE, tf.int32))
                positive_indices = tf.random_shuffle(positive_indices)
                positive_indices = tf.slice(positive_indices,
                                            begin=[0], size=[num_of_positives])

                # choose the negative indices, keeping the positive:negative
                # proportion strictly at 1:3
                negative_indices = tf.reshape(
                    tf.where(tf.equal(object_mask, 0.)), [-1])
                num_of_negatives = tf.cast(
                    int(1. / config.FAST_RCNN_POSITIVE_RATE) * num_of_positives,
                    tf.int32) - num_of_positives
                num_of_negatives = tf.minimum(tf.shape(negative_indices)[0],
                                              num_of_negatives)
                negative_indices = tf.random_shuffle(negative_indices)
                negative_indices = tf.slice(negative_indices,
                                            begin=[0], size=[num_of_negatives])

                minibatch_indices = tf.concat(
                    [positive_indices, negative_indices], axis=0)
                minibatch_reference_gtboxes = tf.gather(
                    reference_boxes_mattached_gtboxes, minibatch_indices)
                minibatch_reference_proboxes = tf.gather(
                    rpn_proposals_boxes, minibatch_indices)
                # encode gtboxes
                minibatch_encode_gtboxes = encode_and_decode.encode_boxes(
                    unencode_boxes=minibatch_reference_gtboxes,
                    reference_boxes=minibatch_reference_proboxes,
                    dev_factors=config.BBOX_STD_DEV)
                object_mask = tf.gather(object_mask, minibatch_indices)
                gt_class_ids = tf.gather(gt_class_ids, minibatch_indices)

                # pad to FAST_RCNN_MINIBATCH_SIZE if necessary
                gap = tf.cast(
                    config.FAST_RCNN_MINIBATCH_SIZE -
                    (num_of_positives + num_of_negatives), dtype=tf.int32)
                bbox_padding = tf.zeros((gap, 4))
                minibatch_reference_proboxes = tf.concat(
                    [minibatch_reference_proboxes, bbox_padding], axis=0)
                minibatch_encode_gtboxes = tf.concat(
                    [minibatch_encode_gtboxes, bbox_padding], axis=0)
                object_mask = tf.pad(object_mask, [(0, gap)])
                gt_class_ids = tf.pad(gt_class_ids, [(0, gap)])
        return (minibatch_reference_proboxes, minibatch_encode_gtboxes,
                object_mask, gt_class_ids)

    minibatch_reference_proboxes, minibatch_encode_gtboxes, object_mask, gt_class_ids = \
        boxes_utils.batch_slice(
            [self.gtboxes_and_label, self.rpn_proposals_boxes],
            lambda x, y: batch_slice_build_target(x, y, self.config),
            self.config.PER_GPU_IMAGE)

    if DEBUG:
        gt_vision = draw_boxes_with_categories(self.origin_image[0],
                                               self.gtboxes_and_label[0, :, :4],
                                               self.gtboxes_and_label[0, :, 4])
        tf.summary.image("gt_vision", gt_vision)
        draw_bbox_train = draw_boxes_with_categories(
            self.origin_image[0],
            minibatch_reference_proboxes[0],
            gt_class_ids[0])
        tf.summary.image("positive_proposal", draw_bbox_train)
    return (minibatch_reference_proboxes, minibatch_encode_gtboxes,
            object_mask, gt_class_ids)
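# The [dy, dx, log(dh), log(dw)] targets above come from
# encode_and_decode.encode_boxes, which is not shown in this section. Below is
# a minimal sketch assuming the standard Faster R-CNN parameterization over
# [y1, x1, y2, x2] boxes, with deltas divided by the BBOX_STD_DEV factors; the
# repo's version may differ in details.
import tensorflow as tf


def encode_boxes(unencode_boxes, reference_boxes, dev_factors):
    y1, x1, y2, x2 = tf.unstack(unencode_boxes, axis=1)
    ry1, rx1, ry2, rx2 = tf.unstack(reference_boxes, axis=1)
    # centers and sizes of gt boxes and reference (proposal) boxes
    h, w = y2 - y1, x2 - x1
    yc, xc = y1 + 0.5 * h, x1 + 0.5 * w
    rh, rw = ry2 - ry1, rx2 - rx1
    ryc, rxc = ry1 + 0.5 * rh, rx1 + 0.5 * rw
    # deltas, normalized by the per-coordinate std-dev factors
    dy = (yc - ryc) / rh / dev_factors[0]
    dx = (xc - rxc) / rw / dev_factors[1]
    dh = tf.log(h / rh) / dev_factors[2]
    dw = tf.log(w / rw) / dev_factors[3]
    return tf.stack([dy, dx, dh, dw], axis=1)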
def rpn_proposals(self, is_training):
    """
    :param is_training:
    :return:
        rpn_proposals_boxes: (batch_size, rpn_proposals_num, 4) (y1, x1, y2, x2)
    """
    anchors = make_anchor.generate_pyramid_anchors(self.config)
    if is_training:
        rpn_proposals_num = self.config.MAX_PROPOSAL_NUM_TRAINING
    else:
        rpn_proposals_num = self.config.MAX_PROPOSAL_NUM_INFERENCE

    def batch_slice_rpn_proposals(rpn_encode_boxes, rpn_scores, anchors,
                                  config, rpn_proposals_num):
        with tf.variable_scope('rpn_proposals'):
            rpn_softmax_scores = slim.softmax(rpn_scores)
            rpn_object_score = rpn_softmax_scores[:, 1]  # second column is the object score

            if config.RPN_TOP_K_NMS:
                top_k_indices = tf.nn.top_k(rpn_object_score,
                                            k=config.RPN_TOP_K_NMS).indices
                rpn_object_score = tf.gather(rpn_object_score, top_k_indices)
                rpn_encode_boxes = tf.gather(rpn_encode_boxes, top_k_indices)
                anchors = tf.gather(anchors, top_k_indices)

            rpn_decode_boxes = encode_and_decode.decode_boxes(
                encode_boxes=rpn_encode_boxes,
                reference_boxes=anchors,
                dev_factors=config.RPN_BBOX_STD_DEV)

            valid_indices = boxes_utils.non_maximal_suppression(
                boxes=rpn_decode_boxes,
                scores=rpn_object_score,
                max_output_size=rpn_proposals_num,
                iou_threshold=config.RPN_NMS_IOU_THRESHOLD)
            rpn_decode_boxes = tf.gather(rpn_decode_boxes, valid_indices)
            rpn_object_score = tf.gather(rpn_object_score, valid_indices)

            # clip proposals to the image boundaries (out-of-boundary
            # coordinates are replaced with the boundary values)
            rpn_decode_boxes = boxes_utils.clip_boxes_to_img_boundaries(
                rpn_decode_boxes,
                [0, 0, config.TARGET_SIDE - 1, config.TARGET_SIDE - 1])

            # pad with zero boxes if NMS kept fewer than rpn_proposals_num;
            # note: zero boxes are appended with concat rather than tf.pad here
            padding = tf.maximum(
                rpn_proposals_num - tf.shape(rpn_decode_boxes)[0], 0)
            zeros_padding = tf.zeros((padding, 4), dtype=tf.float32)
            rpn_proposals_boxes = tf.concat(
                [rpn_decode_boxes, zeros_padding], axis=0)
            rpn_object_score = tf.pad(rpn_object_score, [(0, padding)])
        return rpn_proposals_boxes, rpn_object_score

    rpn_proposals_boxes, rpn_object_scores = boxes_utils.batch_slice(
        [self.rpn_encode_boxes, self.rpn_scores],
        lambda x, y: batch_slice_rpn_proposals(x, y, anchors, self.config,
                                               rpn_proposals_num),
        self.config.PER_GPU_IMAGE)
    return rpn_proposals_boxes, rpn_object_scores
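# encode_and_decode.decode_boxes is the inverse of the encoding sketched
# earlier: it shifts and scales the reference boxes by the predicted deltas.
# Again, this is an assumption about the repo's implementation, not a copy
# of it.
import tensorflow as tf


def decode_boxes(encode_boxes, reference_boxes, dev_factors):
    dy, dx, dh, dw = tf.unstack(encode_boxes, axis=1)
    ry1, rx1, ry2, rx2 = tf.unstack(reference_boxes, axis=1)
    rh, rw = ry2 - ry1, rx2 - rx1
    ryc, rxc = ry1 + 0.5 * rh, rx1 + 0.5 * rw
    # undo the std-dev scaling, then shift/scale the reference box
    yc = dy * dev_factors[0] * rh + ryc
    xc = dx * dev_factors[1] * rw + rxc
    h = tf.exp(dh * dev_factors[2]) * rh
    w = tf.exp(dw * dev_factors[3]) * rw
    return tf.stack([yc - 0.5 * h, xc - 0.5 * w,
                     yc + 0.5 * h, xc + 0.5 * w], axis=1)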
def fast_rcnn_proposals(self, rpn_proposal_bbox, encode_boxes, categories,
                        scores, image_window):
    """
    Pad with zeros to keep alignment.
    :return: detection_boxes_scores_labels: (batch_size, config.MAX_DETECTION_INSTANCE, 6)
    """
    def batch_slice_rcnn_proposals(rpn_proposal_bbox, encode_boxes, categories,
                                   scores, image_window, config):
        """
        Multiclass NMS.
        :param rpn_proposal_bbox: (N, 4)
        :param encode_boxes: (N, 4)
        :param categories: (N, )
        :param scores: (N, )
        :param image_window: (y1, x1, y2, x2), the boundary of the image
        :return: detection_boxes_scores_labels: (-1, 6) [y1, x1, y2, x2, class_id, score]
        """
        with tf.variable_scope('fast_rcnn_proposals'):
            # trim the zero-padded proposals
            rpn_proposal_bbox, non_zeros = boxes_utils.trim_zeros_graph(
                rpn_proposal_bbox, name="trim_proposals_detection")
            encode_boxes = tf.boolean_mask(encode_boxes, non_zeros)
            categories = tf.boolean_mask(categories, non_zeros)
            scores = tf.boolean_mask(scores, non_zeros)

            fast_rcnn_decode_boxes = encode_and_decode.decode_boxes(
                encode_boxes=encode_boxes,
                reference_boxes=rpn_proposal_bbox,
                dev_factors=config.BBOX_STD_DEV)
            fast_rcnn_decode_boxes = boxes_utils.clip_boxes_to_img_boundaries(
                fast_rcnn_decode_boxes, image_window)

            # remove the background
            keep = tf.cast(tf.where(categories > 0)[:, 0], tf.int32)
            if DEBUG:
                print_categories = tf.gather(categories, keep)
                print_scores = tf.gather(scores, keep)
                num_item = tf.minimum(tf.shape(print_scores)[0], 50)
                print_scores_vision, print_index = tf.nn.top_k(print_scores,
                                                               k=num_item)
                print_categories_vision = tf.gather(print_categories, print_index)
                print_tensors(print_categories_vision, "categories")
                print_tensors(print_scores_vision, "scores")
                mean_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)

            # filter out low-confidence boxes
            if config.FINAL_SCORE_THRESHOLD:
                conf_keep = tf.cast(
                    tf.where(scores >= config.FINAL_SCORE_THRESHOLD)[:, 0],
                    tf.int32)
                keep = tf.sets.set_intersection(tf.expand_dims(keep, 0),
                                                tf.expand_dims(conf_keep, 0))
                keep = tf.sparse_tensor_to_dense(keep)[0]

            pre_nms_class_ids = tf.gather(categories, keep)
            pre_nms_scores = tf.gather(scores, keep)
            pre_nms_rois = tf.gather(fast_rcnn_decode_boxes, keep)
            unique_pre_nms_class_ids = tf.unique(pre_nms_class_ids)[0]

            def nms_keep_map(class_id):
                """Apply Non-Maximum Suppression on ROIs of the given class."""
                # indices of ROIs of the given class
                ixs = tf.where(tf.equal(pre_nms_class_ids, class_id))[:, 0]
                # apply NMS
                class_keep = tf.image.non_max_suppression(
                    tf.gather(pre_nms_rois, ixs),
                    tf.gather(pre_nms_scores, ixs),
                    max_output_size=config.DETECTION_MAX_INSTANCES,
                    iou_threshold=config.FAST_RCNN_NMS_IOU_THRESHOLD)
                # map indices back into the `keep` index space
                class_keep = tf.gather(keep, tf.gather(ixs, class_keep))
                # pad with -1 so the returned tensors have the same shape
                gap = config.DETECTION_MAX_INSTANCES - tf.shape(class_keep)[0]
                class_keep = tf.pad(class_keep, [(0, gap)],
                                    mode='CONSTANT', constant_values=-1)
                # set shape so map_fn() can infer the result shape
                class_keep.set_shape([config.DETECTION_MAX_INSTANCES])
                return class_keep

            # 2. Map over class IDs
            nms_keep = tf.map_fn(nms_keep_map, unique_pre_nms_class_ids,
                                 dtype=tf.int32)
            # 3. Merge results into one list, and remove -1 padding
            nms_keep = tf.reshape(nms_keep, [-1])
            nms_keep = tf.gather(nms_keep, tf.where(nms_keep > -1)[:, 0])
            # 4. Compute the intersection between keep and nms_keep
            keep = tf.sets.set_intersection(tf.expand_dims(keep, 0),
                                            tf.expand_dims(nms_keep, 0))
            keep = tf.sparse_tensor_to_dense(keep)[0]

            # keep the top detections
            roi_count = config.DETECTION_MAX_INSTANCES
            class_scores_keep = tf.gather(scores, keep)
            num_keep = tf.minimum(tf.shape(class_scores_keep)[0], roi_count)
            top_ids = tf.nn.top_k(class_scores_keep, k=num_keep, sorted=True)[1]
            keep = tf.gather(keep, top_ids)

            # arrange output as [N, (y1, x1, y2, x2, class_id, score)]
            detections = tf.concat([
                tf.gather(fast_rcnn_decode_boxes, keep),
                tf.to_float(tf.gather(categories, keep))[..., tf.newaxis],
                tf.gather(scores, keep)[..., tf.newaxis]
            ], axis=1)

            # pad with zeros if there are fewer detections than
            # DETECTION_MAX_INSTANCES
            gap = config.DETECTION_MAX_INSTANCES - tf.shape(detections)[0]
            detections = tf.pad(detections, [(0, gap), (0, 0)], "CONSTANT")
        return detections

    detections = boxes_utils.batch_slice(
        [rpn_proposal_bbox, encode_boxes, categories, scores, image_window],
        lambda x, y, z, u, v: batch_slice_rcnn_proposals(x, y, z, u, v,
                                                         self.config),
        self.config.PER_GPU_IMAGE)
    return detections
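# boxes_utils.trim_zeros_graph, used in build_frcnn_target and
# fast_rcnn_proposals above, is not defined in this section. A minimal sketch,
# assuming it matches the Mask R-CNN helper of the same name: drop rows that
# are all-zero padding and also return the boolean mask, so tensors aligned
# with the boxes can be filtered the same way (as done with tf.boolean_mask
# above).
import tensorflow as tf


def trim_zeros_graph(boxes, name=None):
    """Remove all-zero rows from (N, 4) `boxes`; return kept boxes and mask."""
    non_zeros = tf.cast(tf.reduce_sum(tf.abs(boxes), axis=1), tf.bool)
    boxes = tf.boolean_mask(boxes, non_zeros, name=name)
    return boxes, non_zeros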