# NOTE: this excerpt assumes the file-level imports of the original repo,
# roughly: tensorflow as tf, numpy as np, tf.contrib.slim as slim,
# collections.namedtuple, plus project modules (tf_util, projection,
# pointnet_sa_module, BoxEncoder, ImgVggPyr, get_box3d_corners_helper,
# huber_loss, and the NUM_* / *_SEARCH_RANGE config constants).

class RCNN(object):
    def __init__(self, batch_size, num_point, num_channel=133, bn_decay=None, is_training=True):
        self.batch_size = batch_size
        self.num_point = num_point
        self.num_channel = num_channel
        self.bn_decay = bn_decay
        self.is_training = is_training
        self.end_points = {}
        self.placeholders = self.get_placeholders()
        self.box_encoder = BoxEncoder(CENTER_SEARCH_RANGE, NUM_CENTER_BIN, HEADING_SEARCH_RANGE, NUM_HEADING_BIN)
        self.build()

    def get_placeholders(self):
        batch_size = self.batch_size
        num_point = self.num_point
        num_channel = self.num_channel
        return {
            'pointclouds': tf.placeholder(tf.float32, shape=(batch_size, num_point, num_channel)),
            'proposal_boxes': tf.placeholder(tf.float32, shape=(batch_size, 7)),
            'class_labels': tf.placeholder(tf.int32, shape=(batch_size,)),
            'center_bin_x_labels': tf.placeholder(tf.int32, shape=(batch_size,)),
            'center_bin_z_labels': tf.placeholder(tf.int32, shape=(batch_size,)),
            'center_x_res_labels': tf.placeholder(tf.float32, shape=(batch_size,)),
            'center_z_res_labels': tf.placeholder(tf.float32, shape=(batch_size,)),
            'center_y_res_labels': tf.placeholder(tf.float32, shape=(batch_size,)),
            'heading_bin_labels': tf.placeholder(tf.int32, shape=(batch_size,)),
            'heading_res_labels': tf.placeholder(tf.float32, shape=(batch_size,)),
            'size_class_labels': tf.placeholder(tf.int32, shape=(batch_size,)),
            'size_res_labels': tf.placeholder(tf.float32, shape=(batch_size, 3)),
            'gt_box_of_prop': tf.placeholder(tf.float32, shape=(batch_size, 8, 3)),
            'img_inputs': tf.placeholder(tf.float32, shape=(batch_size, 360, 1200, 3)),
            'calib': tf.placeholder(tf.float32, shape=(batch_size, 3, 4)),
            'train_regression': tf.placeholder(tf.bool, shape=(batch_size,)),
            'img_seg_map': tf.placeholder(tf.float32, shape=(batch_size, 360, 1200, 4)),
            'is_training_pl': tf.placeholder(tf.bool, shape=())
        }

    def build_img_extractor(self):
        self._img_pixel_size = np.asarray([360, 1200])
        VGG_config = namedtuple('VGG_config', 'vgg_conv1 vgg_conv2 vgg_conv3 vgg_conv4 l2_weight_decay')
        self._img_feature_extractor = ImgVggPyr(VGG_config(**{
            'vgg_conv1': [2, 32],
            'vgg_conv2': [2, 64],
            'vgg_conv3': [3, 128],
            'vgg_conv4': [3, 256],
            'l2_weight_decay': 0.0005
        }))
        self._img_preprocessed = self._img_feature_extractor.preprocess_input(
            self.placeholders['img_inputs'], self._img_pixel_size)
        self.img_feature_maps, self.img_end_points = self._img_feature_extractor.build(
            self._img_preprocessed, self._img_pixel_size, self.is_training)
        #return self.img_feature_maps
        self.img_bottleneck = slim.conv2d(
            self.img_feature_maps, 1, [1, 1],
            scope='bottleneck',
            normalizer_fn=slim.batch_norm,
            normalizer_params={'is_training': self.is_training})
        #tf.summary.image('img_feature', tf.reduce_max(self.img_bottleneck, axis=-1, keepdims=True), max_outputs=3)
        return self.img_bottleneck

    def build(self):
        point_cloud = self.placeholders['pointclouds']
        is_training = self.placeholders['is_training_pl']
        batch_size = self.batch_size
        # image
        '''
        img_bottleneck = self.build_img_extractor()
        box2d_corners, box2d_corners_norm = projection.tf_project_to_image_space(
            self.placeholders['proposal_boxes'],
            self.placeholders['calib'], self._img_pixel_size)
        img_rois = tf.image.crop_and_resize(
            img_bottleneck,
            box2d_corners_norm,
            tf.range(0, batch_size),
            [16, 16])
        '''
        seg_softmax = self.placeholders['img_seg_map']
        seg_pred = tf.expand_dims(tf.argmax(seg_softmax, axis=-1), axis=-1)
        self._img_pixel_size = np.asarray([360, 1200])
        box2d_corners, box2d_corners_norm = projection.tf_project_to_image_space(
            self.placeholders['proposal_boxes'],
            self.placeholders['calib'], self._img_pixel_size)
        # reorder to (y1, x1, y2, x2) as expected by crop_and_resize
        box2d_corners_norm_reorder = tf.stack([
            tf.gather(box2d_corners_norm, 1, axis=-1),
            tf.gather(box2d_corners_norm, 0, axis=-1),
            tf.gather(box2d_corners_norm, 3, axis=-1),
            tf.gather(box2d_corners_norm, 2, axis=-1),
        ], axis=-1)
        img_rois = tf.image.crop_and_resize(
            seg_softmax,
            #seg_pred,
            box2d_corners_norm_reorder,
            tf.range(0, batch_size),
            [16, 16])
        self.end_points['img_rois'] = img_rois
        self.end_points['box2d_corners_norm_reorder'] = box2d_corners_norm_reorder

        l0_xyz = tf.slice(point_cloud, [0, 0, 0], [-1, -1, 3])
        l0_points = tf.slice(point_cloud, [0, 0, 3], [-1, -1, self.num_channel - 3])
        # Set abstraction layers
        l1_xyz, l1_points, _ = pointnet_sa_module(
            l0_xyz, l0_points, npoint=128, radius=0.2, nsample=64,
            mlp=[128, 128, 128], mlp2=None, group_all=False,
            is_training=is_training, bn_decay=self.bn_decay, scope='rcnn-sa1', bn=True)
        l2_xyz, l2_points, _ = pointnet_sa_module(
            l1_xyz, l1_points, npoint=64, radius=0.4, nsample=64,
            mlp=[128, 128, 256], mlp2=None, group_all=False,
            is_training=is_training, bn_decay=self.bn_decay, scope='rcnn-sa2', bn=True)
        l3_xyz, l3_points, _ = pointnet_sa_module(
            l2_xyz, l2_points, npoint=64, radius=0.4, nsample=64,
            mlp=[256, 256, 512], mlp2=None, group_all=True,
            is_training=is_training, bn_decay=self.bn_decay, scope='rcnn-sa3', bn=True)
        point_feats = tf.reshape(l3_points, [batch_size, -1])
        img_feats = tf.reshape(img_rois, [batch_size, -1])
        feats = tf.concat([point_feats, img_feats], axis=-1)
        #tf.summary.scalar('img_features', tf.reduce_mean(img_feats))
        #tf.summary.scalar('point_features', tf.reduce_mean(point_feats))

        # Classification
        cls_net = tf_util.fully_connected(img_feats, 256, bn=True, is_training=is_training, scope='rcnn-cls-fc1', bn_decay=self.bn_decay)
        #cls_net = tf_util.fully_connected(point_feats, 256, bn=True, is_training=is_training, scope='rcnn-cls-fc1', bn_decay=self.bn_decay)
        #cls_net = tf_util.fully_connected(feats, 256, bn=True, is_training=is_training, scope='rcnn-cls-fc1', bn_decay=self.bn_decay)
        cls_net = tf_util.dropout(cls_net, keep_prob=0.5, is_training=is_training, scope='rcnn-cls-dp1')
        cls_net = tf_util.fully_connected(cls_net, 256, bn=True, is_training=is_training, scope='rcnn-cls-fc2', bn_decay=self.bn_decay)
        cls_net = tf_util.dropout(cls_net, keep_prob=0.5, is_training=is_training, scope='rcnn-cls-dp2')
        cls_net = tf_util.fully_connected(cls_net, NUM_OBJ_CLASSES, activation_fn=None, scope='rcnn-cls-fc3')
        self.end_points['cls_logits'] = cls_net

        # Box estimation: use ground-truth class one-hot while training,
        # predicted class one-hot at test time
        cls_label_pred = tf.argmax(tf.nn.softmax(cls_net), axis=1)
        one_hot_pred = tf.one_hot(cls_label_pred, NUM_OBJ_CLASSES)
        one_hot_gt = tf.one_hot(self.placeholders['class_labels'], NUM_OBJ_CLASSES)
        one_hot_vec = tf.cond(is_training, lambda: one_hot_gt, lambda: one_hot_pred)
        est_input = tf.concat([point_feats, one_hot_vec], axis=1)
        net = tf_util.fully_connected(est_input, 512, bn=True, is_training=is_training, scope='rcnn-est-fc1', bn_decay=self.bn_decay)
        net = tf_util.fully_connected(net, 256, bn=True, is_training=is_training, scope='rcnn-est-fc2', bn_decay=self.bn_decay)
        net = tf_util.fully_connected(net, 512, bn=True, is_training=is_training, scope='rcnn-est-fc3', bn_decay=self.bn_decay)
        # The first NUM_CENTER_BIN*2*2: CENTER_BIN class scores and bin residuals for (x,z)
        # next 1: center residual for y
        # next NUM_HEADING_BIN*2: heading bin class scores and residuals
        # next NUM_SIZE_CLUSTER*4: size cluster class scores and residuals (l,w,h)
        output = tf_util.fully_connected(
            net, NUM_CENTER_BIN * 2 * 2 + 1 + NUM_HEADING_BIN * 2 + NUM_SIZE_CLUSTER * 4,
            activation_fn=None, scope='rcnn-est-out')
        self.parse_output_to_tensors(output)
        self.get_output_boxes()

    def parse_output_to_tensors(self, output):
        ''' Parse batch output to separate tensors (added to end_points). '''
        batch_size = self.batch_size
        # objectness and center
        #end_points['objectness'] = tf.slice(output, [0,0,0], [-1,-1,2])
        center_x_scores = tf.slice(output, [0, 0], [-1, NUM_CENTER_BIN])
        center_x_residuals_normalized = tf.slice(output, [0, NUM_CENTER_BIN], [-1, NUM_CENTER_BIN])
        self.end_points['center_x_scores'] = center_x_scores  # (B,NUM_CENTER_BIN)
        self.end_points['center_x_residuals_normalized'] = center_x_residuals_normalized  # (B,NUM_CENTER_BIN)
        center_z_scores = tf.slice(output, [0, NUM_CENTER_BIN * 2], [-1, NUM_CENTER_BIN])
        center_z_residuals_normalized = tf.slice(output, [0, NUM_CENTER_BIN * 3], [-1, NUM_CENTER_BIN])
        self.end_points['center_z_scores'] = center_z_scores  # (B,NUM_CENTER_BIN)
        self.end_points['center_z_residuals_normalized'] = center_z_residuals_normalized  # (B,NUM_CENTER_BIN)
        self.end_points['center_y_residuals'] = tf.slice(output, [0, NUM_CENTER_BIN * 4], [-1, 1])
        # heading
        heading_scores = tf.slice(output, [0, NUM_CENTER_BIN * 4 + 1], [-1, NUM_HEADING_BIN])
        heading_residuals_normalized = tf.slice(output, [0, NUM_CENTER_BIN * 4 + 1 + NUM_HEADING_BIN], [-1, NUM_HEADING_BIN])
        self.end_points['heading_scores'] = heading_scores  # (B,NUM_HEADING_BIN)
        self.end_points['heading_residuals_normalized'] = heading_residuals_normalized  # (B,NUM_HEADING_BIN)
        #end_points['heading_residuals'] = \
        #    heading_residuals_normalized * (np.pi/NUM_HEADING_BIN)  # BxNUM_HEADING_BIN
        # size
        size_scores = tf.slice(output, [0, NUM_CENTER_BIN * 4 + 1 + NUM_HEADING_BIN * 2],
                               [-1, NUM_SIZE_CLUSTER])  # BxNUM_SIZE_CLUSTER
        size_residuals_normalized = tf.slice(
            output, [0, NUM_CENTER_BIN * 4 + 1 + NUM_HEADING_BIN * 2 + NUM_SIZE_CLUSTER],
            [-1, NUM_SIZE_CLUSTER * 3])
        size_residuals_normalized = tf.reshape(size_residuals_normalized, [batch_size, NUM_SIZE_CLUSTER, 3])
        self.end_points['size_scores'] = size_scores
        self.end_points['size_residuals_normalized'] = size_residuals_normalized
        #end_points['size_residuals'] = size_residuals_normalized * \
        #    tf.expand_dims(tf.constant(type_mean_size, dtype=tf.float32), 0)
        return self.end_points

    def get_output_boxes(self):
        end_points = {}
        # adapt the dimension
        for k in ['center_x_scores', 'center_x_residuals_normalized',
                  'center_z_scores', 'center_z_residuals_normalized',
                  'center_y_residuals', 'heading_scores',
                  'heading_residuals_normalized', 'size_scores',
                  'size_residuals_normalized']:
            end_points[k] = tf.expand_dims(self.end_points[k], axis=1)
        box_center, box_angle, box_size = self.box_encoder.tf_decode(end_points)
        box_center = tf.squeeze(box_center, axis=1)
        box_center = box_center + tf.slice(self.placeholders['proposal_boxes'], [0, 0], [-1, 3])
        box_angle = tf.squeeze(box_angle, axis=1)
        box_angle += tf.gather(self.placeholders['proposal_boxes'], 6, axis=-1)  # restore absolute angle
        box_size = tf.squeeze(box_size, axis=1)
        self.end_points['box_center'] = box_center
        self.end_points['box_angle'] = box_angle
        self.end_points['box_size'] = box_size
        corners_3d = get_box3d_corners_helper(box_center, box_angle, box_size)
        self.end_points['box_corners'] = corners_3d
        # box score
        seg_scores = tf.reduce_max(tf.nn.softmax(self.end_points['cls_logits']), axis=-1)  # (B,)
        bin_x_scores = tf.reduce_max(tf.nn.softmax(self.end_points['center_x_scores']), axis=-1)  # (B,M)
        bin_z_scores = tf.reduce_max(tf.nn.softmax(self.end_points['center_z_scores']), axis=-1)  # (B,M)
        heading_scores = tf.reduce_max(tf.nn.softmax(self.end_points['heading_scores']), axis=-1)  # (B,M)
        size_scores = tf.reduce_max(tf.nn.softmax(self.end_points['size_scores']), axis=-1)  # (B,M)
        #confidence = seg_scores + bin_x_scores + bin_z_scores + heading_scores + size_scores
        confidence = seg_scores * bin_x_scores * bin_z_scores * heading_scores * size_scores
        self.end_points['box_score'] = confidence
        return corners_3d

    def get_loss(self):
        end_points = self.end_points
        cls_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=end_points['cls_logits'], labels=self.placeholders['class_labels']))
        tf.summary.scalar('classification loss', cls_loss)
        #is_obj_mask = tf.to_float(tf.not_equal(self.placeholders['class_labels'], 0))
        train_reg_mask = tf.to_float(self.placeholders['train_regression'])
        center_x_cls_loss = tf.reduce_mean(train_reg_mask * tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=end_points['center_x_scores'], labels=self.placeholders['center_bin_x_labels']))
        center_z_cls_loss = tf.reduce_mean(train_reg_mask * tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=end_points['center_z_scores'], labels=self.placeholders['center_bin_z_labels']))
        bin_x_onehot = tf.one_hot(self.placeholders['center_bin_x_labels'],
                                  depth=NUM_CENTER_BIN, on_value=1, off_value=0, axis=-1)  # BxNUM_CENTER_BIN
        # NOTICE: labels['center_x_residuals'] is already normalized
        center_x_residuals_normalized = tf.reduce_sum(
            end_points['center_x_residuals_normalized'] * tf.to_float(bin_x_onehot), axis=-1)  # B
        center_x_residuals_dist = tf.norm(
            self.placeholders['center_x_res_labels'] - center_x_residuals_normalized, axis=-1)
        center_x_res_loss = huber_loss(train_reg_mask * center_x_residuals_dist, delta=1.0)
        bin_z_onehot = tf.one_hot(self.placeholders['center_bin_z_labels'],
                                  depth=NUM_CENTER_BIN, on_value=1, off_value=0, axis=-1)  # BxNUM_CENTER_BIN
        center_z_residuals_normalized = tf.reduce_sum(
            end_points['center_z_residuals_normalized'] * tf.to_float(bin_z_onehot), axis=-1)  # B
        center_z_residuals_dist = tf.norm(
            self.placeholders['center_z_res_labels'] - center_z_residuals_normalized, axis=-1)
        center_z_res_loss = huber_loss(train_reg_mask * center_z_residuals_dist, delta=1.0)
        # y is directly regressed
        center_y_residuals_dist = tf.norm(
            self.placeholders['center_y_res_labels'] -
            tf.gather(end_points['center_y_residuals'], 0, axis=-1), axis=-1)
        center_y_res_loss = huber_loss(train_reg_mask * center_y_residuals_dist, delta=1.0)
        tf.summary.scalar('center_x class loss', center_x_cls_loss)
        tf.summary.scalar('center_z class loss', center_z_cls_loss)
        tf.summary.scalar('center_x residual loss', center_x_res_loss)
        tf.summary.scalar('center_y residual loss', center_y_res_loss)
        tf.summary.scalar('center_z residual loss', center_z_res_loss)
        # Heading loss
        heading_class_loss = tf.reduce_mean(
            train_reg_mask * tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=end_points['heading_scores'], labels=self.placeholders['heading_bin_labels']))
        hcls_onehot = tf.one_hot(self.placeholders['heading_bin_labels'],
                                 depth=NUM_HEADING_BIN, on_value=1, off_value=0, axis=-1)  # BxNUM_HEADING_BIN
        heading_residual_normalized_label = self.placeholders['heading_res_labels']
        heading_res_dist = tf.norm(
            tf.reduce_sum(end_points['heading_residuals_normalized'] * tf.to_float(hcls_onehot), axis=-1) -
            heading_residual_normalized_label)
        heading_res_loss = huber_loss(train_reg_mask * heading_res_dist, delta=1.0)
        tf.summary.scalar('heading class loss', heading_class_loss)
        tf.summary.scalar('heading residual loss', heading_res_loss)
        # Size loss
        size_class_loss = tf.reduce_mean(
            train_reg_mask * tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=end_points['size_scores'], labels=self.placeholders['size_class_labels']))
        scls_onehot = tf.one_hot(self.placeholders['size_class_labels'],
                                 depth=NUM_SIZE_CLUSTER, on_value=1, off_value=0, axis=-1)  # BxNUM_SIZE_CLUSTER
        scls_onehot_tiled = tf.tile(tf.expand_dims(tf.to_float(scls_onehot), -1), [1, 1, 3])  # BxNUM_SIZE_CLUSTERx3
        predicted_size_residual_normalized = tf.reduce_sum(
            end_points['size_residuals_normalized'] * scls_onehot_tiled, axis=1)  # Bx3
        size_residual_label_normalized = self.placeholders['size_res_labels']  # Bx3
        size_dist = tf.norm(size_residual_label_normalized - predicted_size_residual_normalized, axis=-1)
        size_res_loss = huber_loss(train_reg_mask * size_dist, delta=1.0)
        tf.summary.scalar('size class loss', size_class_loss)
        tf.summary.scalar('size residual loss', size_res_loss)

        obj_cls_weight = 1
        cls_weight = 1
        res_weight = 1
        total_loss = obj_cls_weight * cls_loss + \
            cls_weight * (center_x_cls_loss + center_z_cls_loss + heading_class_loss + size_class_loss) + \
            res_weight * (center_x_res_loss + center_z_res_loss + center_y_res_loss + heading_res_loss + size_res_loss)
        loss_endpoints = {
            #'size_class_loss': size_class_loss,
            'size_res_loss': size_res_loss,
            #'heading_class_loss': heading_class_loss,
            #'heading_res_loss': heading_res_loss,
            #'center_x_cls_loss': center_x_cls_loss,
            #'center_z_cls_loss': center_z_cls_loss,
            #'center_x_res_loss': center_x_res_loss,
            #'center_z_res_loss': center_z_res_loss,
            #'center_y_res_loss': center_y_res_loss,
            #'mask_loss': cls_loss,
            #'mean_size_label': mean_size_label,
            'size_residuals_normalized': end_points['size_residuals_normalized']
        }
        return total_loss, loss_endpoints
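# --- Illustrative sketch (not in the original file) ---
# The 'rcnn-est-out' layer packs every box-estimation target into one flat
# vector per RoI, in the order documented above. A minimal NumPy sketch of that
# layout; the bin counts here are hypothetical stand-ins (the real
# NUM_CENTER_BIN, NUM_HEADING_BIN and NUM_SIZE_CLUSTER come from the config):
def _demo_est_output_layout(num_center_bin=12, num_heading_bin=12, num_size_cluster=8):
    """Slice one output row into the fields parse_output_to_tensors expects."""
    dim = num_center_bin * 2 * 2 + 1 + num_heading_bin * 2 + num_size_cluster * 4
    out = np.arange(dim, dtype=np.float32)  # stand-in for one RoI's output row
    fields = [('center_x_scores', num_center_bin),
              ('center_x_residuals', num_center_bin),
              ('center_z_scores', num_center_bin),
              ('center_z_residuals', num_center_bin),
              ('center_y_residual', 1),
              ('heading_scores', num_heading_bin),
              ('heading_residuals', num_heading_bin),
              ('size_scores', num_size_cluster),
              ('size_residuals', num_size_cluster * 3)]
    parsed, i = {}, 0
    for name, width in fields:
        parsed[name] = out[i:i + width]
        i += width
    assert i == dim  # the field widths exactly tile the output vector
    # per-cluster (l, w, h) residuals
    parsed['size_residuals'] = parsed['size_residuals'].reshape(num_size_cluster, 3)
    return parsed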
class SingleStageDetector:
    def __init__(self, batch_size, is_training):
        self.batch_size = batch_size
        self.is_training = is_training

        # placeholders
        self.placeholders_builder = PlaceHolders(self.batch_size)
        self.placeholders_builder.get_placeholders()
        self.placeholders = self.placeholders_builder.placeholders

        self.cls_list = cfg.DATASET.KITTI.CLS_LIST
        self.cls2idx = dict([(cls, i + 1) for i, cls in enumerate(self.cls_list)])
        self.idx2cls = dict([(i + 1, cls) for i, cls in enumerate(self.cls_list)])

        # anchor_builder
        self.anchor_builder = Anchors(0, self.cls_list)
        # encoder_decoder
        self.encoder_decoder = EncoderDecoder(0)
        # postprocessor
        self.postprocessor = PostProcessor(0, len(self.cls_list))
        # loss builder
        self.loss_builder = LossBuilder(0)
        self.corner_loss = cfg.MODEL.FIRST_STAGE.CORNER_LOSS

        # head builder
        self.iou_loss = False
        self.heads = []
        head_cfg = cfg.MODEL.NETWORK.FIRST_STAGE.HEAD
        for i in range(len(head_cfg)):
            self.heads.append(HeadBuilder(self.batch_size,
                                          self.anchor_builder.anchors_num, 0,
                                          head_cfg[i], is_training))
            if self.heads[-1].layer_type == 'IoU':
                self.iou_loss = True

        # target assigner (first stage)
        self.target_assigner = TargetAssigner(0)

        # layer builder
        self.vote_loss = False
        layer_cfg = cfg.MODEL.NETWORK.FIRST_STAGE.ARCHITECTURE
        layers = []
        for i in range(len(layer_cfg)):
            layers.append(LayerBuilder(i, self.is_training, layer_cfg))
            if layers[-1].layer_type == 'Vote_Layer':
                self.vote_loss = True
        self.layers = layers

        self.attr_velo_loss = cfg.MODEL.FIRST_STAGE.PREDICT_ATTRIBUTE_AND_VELOCITY

        self.__init_dict()

    def __init_dict(self):
        self.output = dict()
        # sampled xyz/feature
        self.output[maps_dict.KEY_OUTPUT_XYZ] = []
        self.output[maps_dict.KEY_OUTPUT_FEATURE] = []
        # generated anchors
        self.output[maps_dict.KEY_ANCHORS_3D] = []
        # vote output
        self.output[maps_dict.PRED_VOTE_OFFSET] = []
        self.output[maps_dict.PRED_VOTE_BASE] = []
        # det output
        self.output[maps_dict.PRED_CLS] = []
        self.output[maps_dict.PRED_OFFSET] = []
        self.output[maps_dict.PRED_ANGLE_CLS] = []
        self.output[maps_dict.PRED_ANGLE_RES] = []
        self.output[maps_dict.CORNER_LOSS_PRED_BOXES_CORNERS] = []
        self.output[maps_dict.PRED_ATTRIBUTE] = []
        self.output[maps_dict.PRED_VELOCITY] = []
        # iou output
        self.output[maps_dict.PRED_IOU_3D_VALUE] = []
        # final result
        self.output[maps_dict.PRED_3D_BBOX] = []
        self.output[maps_dict.PRED_3D_SCORE] = []
        self.output[maps_dict.PRED_3D_CLS_CATEGORY] = []
        self.output[maps_dict.PRED_3D_ATTRIBUTE] = []
        self.output[maps_dict.PRED_3D_VELOCITY] = []
        self.prediction_keys = self.output.keys()

        self.labels = dict()
        self.labels[maps_dict.GT_CLS] = []
        self.labels[maps_dict.GT_OFFSET] = []
        self.labels[maps_dict.GT_ANGLE_CLS] = []
        self.labels[maps_dict.GT_ANGLE_RES] = []
        self.labels[maps_dict.GT_ATTRIBUTE] = []
        self.labels[maps_dict.GT_VELOCITY] = []
        self.labels[maps_dict.GT_BOXES_ANCHORS_3D] = []
        self.labels[maps_dict.GT_IOU_3D_VALUE] = []
        self.labels[maps_dict.GT_PMASK] = []
        self.labels[maps_dict.GT_NMASK] = []
        self.labels[maps_dict.CORNER_LOSS_GT_BOXES_CORNERS] = []

    def build_img_extractor(self, img_input):
        self._img_pixel_size = np.asarray([360, 1200])
        VGG_config = namedtuple('VGG_config', 'vgg_conv1 vgg_conv2 vgg_conv3 vgg_conv4 l2_weight_decay')
        self._img_feature_extractor = ImgVggPyr(VGG_config(**{
            'vgg_conv1': [2, 32],
            'vgg_conv2': [2, 64],
            'vgg_conv3': [3, 128],
            'vgg_conv4': [3, 256],
            'l2_weight_decay': 0.0005
        }))
        self._img_preprocessed = self._img_feature_extractor.preprocess_input(img_input, self._img_pixel_size)
        #self._img_preprocessed = img_input
        self.img_feature_maps, self.img_end_points = self._img_feature_extractor.build(
            self._img_preprocessed, self._img_pixel_size, self.is_training)
        #return self.img_feature_maps
        self.img_bottleneck = slim.conv2d(
            self.img_feature_maps,
            128, [1, 1],
            #2, [1, 1],
            scope='bottleneck',
            normalizer_fn=slim.batch_norm,
            #normalizer_fn=None,
            normalizer_params={'is_training': self.is_training})
        return self.img_bottleneck

    def network_forward(self, point_cloud, bn_decay, img_input):
        l0_xyz = tf.slice(point_cloud, [0, 0, 0], [-1, -1, 3])
        l0_points = tf.slice(point_cloud, [0, 0, 3], [-1, -1, -1])
        num_point = l0_xyz.get_shape().as_list()[1]

        img_feature_maps = self.build_img_extractor(img_input)
        pts2d = projection.tf_rect_to_image(
            tf.slice(point_cloud, [0, 0, 0], [-1, -1, 3]),
            self.placeholders[maps_dict.PL_CALIB_P2])
        pts2d = tf.cast(pts2d, tf.int32)  # (B,N,2)
        # build (batch, y, x) indices for the per-point feature lookup; each
        # batch id must repeat num_point times to match the row-major reshape
        # of pts2d (the flat tile in the original only lined up for batch_size == 1)
        batch_idx = tf.reshape(
            tf.tile(tf.expand_dims(tf.range(0, self.batch_size), axis=1), [1, num_point]),
            [self.batch_size * num_point, 1])  # (B*N,1)
        indices = tf.concat([
            batch_idx,
            tf.reshape(pts2d, [self.batch_size * num_point, 2])
        ], axis=-1)  # (B*N,3)
        indices = tf.gather(indices, [0, 2, 1], axis=-1)  # image's shape is (y,x)
        point_img_feats = tf.reshape(
            tf.gather_nd(img_feature_maps, indices),  # (B*N,C)
            [self.batch_size, num_point, -1])  # (B,N,C)

        xyz_list, feature_list, fps_idx_list, point_img_feats_list = \
            [l0_xyz], [l0_points], [None], [point_img_feats]
        for layer in self.layers:
            xyz_list, feature_list, fps_idx_list, point_img_feats_list = layer.build_layer(
                xyz_list, feature_list, fps_idx_list, bn_decay, self.output, point_img_feats_list)
        cur_head_start_idx = len(self.output[maps_dict.KEY_OUTPUT_XYZ])
        for head in self.heads:
            head.build_layer(xyz_list, feature_list, bn_decay, self.output)
        merge_head_prediction(cur_head_start_idx, self.output, self.prediction_keys)

    def model_forward(self, bn_decay=None):
        points_input_det = self.placeholders[maps_dict.PL_POINTS_INPUT]
        img_input_det = self.placeholders[maps_dict.PL_IMG_INPUT]

        # forward the point cloud
        self.network_forward(points_input_det, bn_decay, img_input_det)

        # generate anchors
        base_xyz = self.output[maps_dict.KEY_OUTPUT_XYZ][-1]
        anchors = self.anchor_builder.generate(base_xyz)  # [bs, pts_num, 1/cls_num, 7]
        self.output[maps_dict.KEY_ANCHORS_3D].append(anchors)

        if self.is_training:  # training mode
            self.train_forward(-1, anchors)
        else:  # testing mode
            self.test_forward(-1, anchors)

    def train_forward(self, index, anchors):
        """ Calculate the training loss. """
        base_xyz = self.output[maps_dict.KEY_OUTPUT_XYZ][index]
        pred_offset = self.output[maps_dict.PRED_OFFSET][index]
        pred_angle_cls = self.output[maps_dict.PRED_ANGLE_CLS][index]
        pred_angle_res = self.output[maps_dict.PRED_ANGLE_RES][index]

        gt_boxes_3d = self.placeholders[maps_dict.PL_LABEL_BOXES_3D]
        gt_classes = self.placeholders[maps_dict.PL_LABEL_CLASSES]
        gt_angle_cls = self.placeholders[maps_dict.PL_ANGLE_CLS]
        gt_angle_res = self.placeholders[maps_dict.PL_ANGLE_RESIDUAL]

        if maps_dict.PL_LABEL_ATTRIBUTES in self.placeholders.keys():
            gt_attributes = self.placeholders[maps_dict.PL_LABEL_ATTRIBUTES]
        else:
            gt_attributes = None
        if maps_dict.PL_LABEL_VELOCITY in self.placeholders.keys():
            gt_velocity = self.placeholders[maps_dict.PL_LABEL_VELOCITY]
        else:
            gt_velocity = None

        returned_list = self.target_assigner.assign(
            base_xyz, anchors, gt_boxes_3d, gt_classes,
            gt_angle_cls, gt_angle_res, gt_velocity, gt_attributes)
        assigned_idx, assigned_pmask, assigned_nmask, assigned_gt_boxes_3d, \
            assigned_gt_labels, assigned_gt_angle_cls, assigned_gt_angle_res, \
            assigned_gt_velocity, assigned_gt_attribute = returned_list

        # encode offset
        assigned_gt_offset, assigned_gt_angle_cls, assigned_gt_angle_res = \
            self.encoder_decoder.encode(base_xyz, assigned_gt_boxes_3d, anchors)

        # corner_loss
        corner_loss_angle_cls = tf.cast(
            tf.one_hot(assigned_gt_angle_cls, depth=cfg.MODEL.ANGLE_CLS_NUM,
                       on_value=1, off_value=0, axis=-1),
            tf.float32)  # bs, pts_num, cls_num, -1
        pred_anchors_3d = self.encoder_decoder.decode(
            base_xyz, pred_offset, corner_loss_angle_cls, pred_angle_res,
            self.is_training, anchors)  # [bs, points_num, cls_num, 7]
        pred_corners = transfer_box3d_to_corners(pred_anchors_3d)  # [bs, points_num, cls_num, 8, 3]
        gt_corners = transfer_box3d_to_corners(assigned_gt_boxes_3d)  # [bs, points_num, cls_num, 8, 3]
        self.output[maps_dict.CORNER_LOSS_PRED_BOXES_CORNERS].append(pred_corners)
        self.labels[maps_dict.CORNER_LOSS_GT_BOXES_CORNERS].append(gt_corners)

        self.labels[maps_dict.GT_CLS].append(assigned_gt_labels)
        self.labels[maps_dict.GT_BOXES_ANCHORS_3D].append(assigned_gt_boxes_3d)
        self.labels[maps_dict.GT_OFFSET].append(assigned_gt_offset)
        self.labels[maps_dict.GT_ANGLE_CLS].append(assigned_gt_angle_cls)
        self.labels[maps_dict.GT_ANGLE_RES].append(assigned_gt_angle_res)
        self.labels[maps_dict.GT_ATTRIBUTE].append(assigned_gt_attribute)
        self.labels[maps_dict.GT_VELOCITY].append(assigned_gt_velocity)
        self.labels[maps_dict.GT_PMASK].append(assigned_pmask)
        self.labels[maps_dict.GT_NMASK].append(assigned_nmask)

        self.loss_builder.forward(index, self.labels, self.output, self.placeholders,
                                  self.corner_loss, self.vote_loss,
                                  self.attr_velo_loss, self.iou_loss)

    def test_forward(self, index, anchors):
        base_xyz = self.output[maps_dict.KEY_OUTPUT_XYZ][index]
        pred_cls = self.output[maps_dict.PRED_CLS][index]  # [bs, points_num, cls_num + 1/0]
        pred_offset = self.output[maps_dict.PRED_OFFSET][index]
        pred_angle_cls = self.output[maps_dict.PRED_ANGLE_CLS][index]
        pred_angle_res = self.output[maps_dict.PRED_ANGLE_RES][index]

        # decode predictions
        pred_anchors_3d = self.encoder_decoder.decode(
            base_xyz, pred_offset, pred_angle_cls, pred_angle_res,
            self.is_training, anchors)  # [bs, points_num, cls_num, 7]

        # decode classification
        if cfg.MODEL.FIRST_STAGE.CLS_ACTIVATION == 'Softmax':
            # softmax; drop the background column
            pred_score = tf.nn.softmax(pred_cls)
            pred_score = tf.slice(pred_score, [0, 0, 1], [-1, -1, -1])
        else:
            # sigmoid
            pred_score = tf.nn.sigmoid(pred_cls)

        # using IoU branch proposed by sparse-to-dense
        if self.iou_loss:
            pred_iou = self.output[maps_dict.PRED_IOU_3D_VALUE][index]
            pred_score = pred_score * pred_iou

        if len(self.output[maps_dict.PRED_ATTRIBUTE]) <= 0:
            pred_attribute = None
        else:
            pred_attribute = self.output[maps_dict.PRED_ATTRIBUTE][index]
        if len(self.output[maps_dict.PRED_VELOCITY]) <= 0:
            pred_velocity = None
        else:
            pred_velocity = self.output[maps_dict.PRED_VELOCITY][index]

        self.postprocessor.forward(pred_anchors_3d, pred_score, self.output,
                                   pred_attribute, pred_velocity)
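# --- Illustrative sketch (not in the original file) ---
# network_forward() fetches one image-feature vector per 3D point by projecting
# the point into the image and indexing the feature map at that pixel. A
# minimal NumPy equivalent of the index construction and gather_nd above,
# assuming integer (x, y) pixel coordinates are already available:
def _demo_point_to_pixel_gather(img_feats, pts2d):
    """img_feats: (B,H,W,C) float array; pts2d: (B,N,2) int array of (x,y)."""
    B, N = pts2d.shape[0], pts2d.shape[1]
    batch_idx = np.repeat(np.arange(B), N)   # (B*N,): 0,...,0,1,...,1,...
    xy = pts2d.reshape(B * N, 2)             # row-major: batch 0's points first
    # swap to (y, x): image tensors are indexed as (row, column)
    point_feats = img_feats[batch_idx, xy[:, 1], xy[:, 0]]  # (B*N, C)
    return point_feats.reshape(B, N, -1)     # (B, N, C)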
class ImgSegNet(object):
    """Per-point segmentation from image features only."""
    def __init__(self, batch_size, num_point, num_channel=4, bn_decay=None, is_training=True):
        self.batch_size = batch_size
        self.num_point = num_point
        self.num_channel = num_channel
        self.bn_decay = bn_decay
        self.is_training = is_training
        self.end_points = {}
        self.placeholders = self.get_placeholders()
        self.build()

    def get_placeholders(self):
        batch_size = self.batch_size
        num_point = self.num_point
        return {
            'pointclouds': tf.placeholder(tf.float32, shape=(batch_size, num_point, self.num_channel)),
            'img_inputs': tf.placeholder(tf.float32, shape=(batch_size, 360, 1200, 3)),
            'calib': tf.placeholder(tf.float32, shape=(batch_size, 3, 4)),
            'seg_labels': tf.placeholder(tf.int32, shape=(batch_size, num_point)),
            'is_training_pl': tf.placeholder(tf.bool, shape=())
        }

    def build(self):
        point_cloud = self.placeholders['pointclouds']
        self._img_pixel_size = np.asarray([360, 1200])
        bn_decay = self.bn_decay
        is_training = self.placeholders['is_training_pl']
        VGG_config = namedtuple('VGG_config', 'vgg_conv1 vgg_conv2 vgg_conv3 vgg_conv4 l2_weight_decay')
        self._img_feature_extractor = ImgVggPyr(VGG_config(**{
            'vgg_conv1': [2, 32],
            'vgg_conv2': [2, 64],
            'vgg_conv3': [3, 128],
            'vgg_conv4': [3, 256],
            'l2_weight_decay': 0.0005
        }))
        self._img_preprocessed = self._img_feature_extractor.preprocess_input(
            self.placeholders['img_inputs'], self._img_pixel_size)
        self.img_feature_maps, self.img_end_points = self._img_feature_extractor.build(
            self._img_preprocessed, self._img_pixel_size, self.is_training)
        '''
        self.seg_logits = slim.conv2d(
            self.img_feature_maps,
            NUM_SEG_CLASSES, [1, 1],
            scope='bottleneck',
            normalizer_fn=slim.batch_norm,
            #normalizer_fn=None,
            normalizer_params={'is_training': self.is_training})
        '''
        pts2d = projection.tf_rect_to_image(
            tf.slice(point_cloud, [0, 0, 0], [-1, -1, 3]),
            self.placeholders['calib'])
        pts2d = tf.cast(pts2d, tf.int32)  # (B,N,2)
        # build (batch, y, x) indices; each batch id must repeat num_point times
        # to match the row-major reshape of pts2d (the flat tile in the original
        # only lined up for batch_size == 1)
        batch_idx = tf.reshape(
            tf.tile(tf.expand_dims(tf.range(0, self.batch_size), axis=1), [1, self.num_point]),
            [self.batch_size * self.num_point, 1])  # (B*N,1)
        indices = tf.concat([
            batch_idx,
            tf.reshape(pts2d, [self.batch_size * self.num_point, 2])
        ], axis=-1)  # (B*N,3)
        indices = tf.gather(indices, [0, 2, 1], axis=-1)  # image's shape is (y,x)
        self.end_points['point_img_feats'] = tf.reshape(
            tf.gather_nd(self.img_feature_maps, indices),  # (B*N,C)
            [self.batch_size, self.num_point, -1])  # (B,N,C)
        net = tf_util.conv1d(self.end_points['point_img_feats'], 128, 1,
                             padding='VALID', bn=True, is_training=is_training,
                             scope='img-seg-conv1d-fc1', bn_decay=bn_decay)
        net = tf_util.dropout(net, keep_prob=0.7, is_training=is_training, scope='img-seg-dp1')
        logits = tf_util.conv1d(net, NUM_SEG_CLASSES, 1, padding='VALID',
                                activation_fn=None, scope='img-seg-conv1d-fc2')
        self.end_points['foreground_logits'] = logits

    def get_seg_softmax(self):
        return tf.nn.softmax(self.end_points['foreground_logits'], axis=-1)

    def get_loss(self):
        pls = self.placeholders
        end_points = self.end_points
        # 3D segmentation loss
        mask_loss = focal_loss(
            end_points['foreground_logits'],
            tf.one_hot(pls['seg_labels'], NUM_SEG_CLASSES, axis=-1))
        tf.summary.scalar('mask loss', mask_loss)
        return mask_loss
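# --- Illustrative sketch (not in the original file) ---
# focal_loss() used above is defined elsewhere in the repo. A minimal softmax
# focal loss in the spirit of Lin et al. (2017), matching the
# (logits, one_hot_labels) call signature; the gamma/alpha defaults and the
# uniform alpha weighting are assumptions (many implementations weight only
# the foreground classes), so the repo's version may differ:
def _focal_loss_sketch(logits, one_hot_labels, gamma=2.0, alpha=0.25):
    probs = tf.nn.softmax(logits, axis=-1)
    ce = -tf.to_float(one_hot_labels) * tf.log(tf.clip_by_value(probs, 1e-8, 1.0))
    weight = alpha * tf.pow(1.0 - probs, gamma)  # down-weight easy examples
    return tf.reduce_mean(tf.reduce_sum(weight * ce, axis=-1))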
class RPN(object):
    """docstring for RPN."""
    def __init__(self, batch_size, num_point, num_channel=4, bn_decay=None, is_training=True):
        self.batch_size = batch_size
        self.num_point = num_point
        self.num_channel = num_channel
        self.bn_decay = bn_decay
        self.is_training = is_training
        self.end_points = {}
        self.box_encoder = BoxEncoder(CENTER_SEARCH_RANGE, NUM_CENTER_BIN, HEADING_SEARCH_RANGE, NUM_HEADING_BIN)
        self.placeholders = self.get_placeholders()
        self.build()

    def get_placeholders(self):
        batch_size = self.batch_size
        num_point = self.num_point
        return {
            'pointclouds': tf.placeholder(tf.float32, shape=(batch_size, num_point, self.num_channel)),
            'img_inputs': tf.placeholder(tf.float32, shape=(batch_size, 360, 1200, 3)),
            'calib': tf.placeholder(tf.float32, shape=(batch_size, 3, 4)),
            'seg_labels': tf.placeholder(tf.int32, shape=(batch_size, num_point)),
            'center_bin_x_labels': tf.placeholder(tf.int32, shape=(batch_size, num_point)),
            'center_bin_z_labels': tf.placeholder(tf.int32, shape=(batch_size, num_point)),
            'center_x_residuals_labels': tf.placeholder(tf.float32, shape=(batch_size, num_point)),
            'center_z_residuals_labels': tf.placeholder(tf.float32, shape=(batch_size, num_point)),
            'center_y_residuals_labels': tf.placeholder(tf.float32, shape=(batch_size, num_point)),
            'heading_bin_labels': tf.placeholder(tf.int32, shape=(batch_size, num_point)),
            'heading_residuals_labels': tf.placeholder(tf.float32, shape=(batch_size, num_point)),
            'size_class_labels': tf.placeholder(tf.int32, shape=(batch_size, num_point)),
            'size_residuals_labels': tf.placeholder(tf.float32, shape=(batch_size, num_point, 3)),
            'gt_boxes': tf.placeholder(tf.float32, shape=(batch_size, None, 8, 3)),
            'gt_box_of_point': tf.placeholder(tf.float32, shape=(batch_size, num_point, 8, 3)),
            'img_seg_softmax': tf.placeholder(tf.float32, shape=(batch_size, num_point, NUM_SEG_CLASSES)),
            'is_training_pl': tf.placeholder(tf.bool, shape=())
        }

    def parse_output_to_tensors(self, output, end_points):
        ''' Parse batch output to separate tensors (added to end_points)
        Input:
            output: TF tensor in shape (B,N,NUM_CENTER_BIN*2*2+1+NUM_HEADING_BIN*2+NUM_SIZE_CLUSTER*4)
            end_points: dict
        Output:
            end_points: dict (updated)
        '''
        batch_size = output.get_shape()[0].value
        npoints = output.get_shape()[1].value
        # objectness and center
        #end_points['objectness'] = tf.slice(output, [0,0,0], [-1,-1,2])
        center_x_scores = tf.slice(output, [0, 0, 0], [-1, -1, NUM_CENTER_BIN])
        center_x_residuals_normalized = tf.slice(output, [0, 0, NUM_CENTER_BIN], [-1, -1, NUM_CENTER_BIN])
        end_points['center_x_scores'] = center_x_scores  # (B,N,NUM_CENTER_BIN)
        end_points['center_x_residuals_normalized'] = center_x_residuals_normalized  # (B,N,NUM_CENTER_BIN)
        center_z_scores = tf.slice(output, [0, 0, NUM_CENTER_BIN * 2], [-1, -1, NUM_CENTER_BIN])
        center_z_residuals_normalized = tf.slice(output, [0, 0, NUM_CENTER_BIN * 3], [-1, -1, NUM_CENTER_BIN])
        end_points['center_z_scores'] = center_z_scores  # (B,N,NUM_CENTER_BIN)
        end_points['center_z_residuals_normalized'] = center_z_residuals_normalized  # (B,N,NUM_CENTER_BIN)
        end_points['center_y_residuals'] = tf.slice(output, [0, 0, NUM_CENTER_BIN * 4], [-1, -1, 1])
        # heading
        heading_scores = tf.slice(output, [0, 0, NUM_CENTER_BIN * 4 + 1], [-1, -1, NUM_HEADING_BIN])
        heading_residuals_normalized = tf.slice(
            output, [0, 0, NUM_CENTER_BIN * 4 + 1 + NUM_HEADING_BIN], [-1, -1, NUM_HEADING_BIN])
        end_points['heading_scores'] = heading_scores  # (B,N,NUM_HEADING_BIN)
        end_points['heading_residuals_normalized'] = heading_residuals_normalized  # (B,N,NUM_HEADING_BIN)
        #end_points['heading_residuals'] = \
        #    heading_residuals_normalized * (np.pi/NUM_HEADING_BIN)  # BxNUM_HEADING_BIN
        # size
        size_scores = tf.slice(
            output, [0, 0, NUM_CENTER_BIN * 4 + 1 + NUM_HEADING_BIN * 2],
            [-1, -1, NUM_SIZE_CLUSTER])  # (B,N,NUM_SIZE_CLUSTER)
        size_residuals_normalized = tf.slice(
            output, [0, 0, NUM_CENTER_BIN * 4 + 1 + NUM_HEADING_BIN * 2 + NUM_SIZE_CLUSTER],
            [-1, -1, NUM_SIZE_CLUSTER * 3])
        size_residuals_normalized = tf.reshape(
            size_residuals_normalized, [batch_size, npoints, NUM_SIZE_CLUSTER, 3])
        end_points['size_scores'] = size_scores
        end_points['size_residuals_normalized'] = size_residuals_normalized
        #end_points['size_residuals'] = size_residuals_normalized * \
        #    tf.expand_dims(tf.constant(type_mean_size, dtype=tf.float32), 0)
        box_center, box_angle, box_size = self.box_encoder.tf_decode(end_points)
        box_center = box_center + end_points['fg_points_xyz']
        box_num = batch_size * npoints
        corners_3d = get_box3d_corners_helper(
            tf.reshape(box_center, [box_num, 3]),
            tf.reshape(box_angle, [box_num]),
            tf.reshape(box_size, [box_num, 3]))
        end_points['proposal_boxes'] = tf.reshape(corners_3d, [batch_size, npoints, 8, 3])
        return end_points

    def build_img_extractor(self):
        self._img_pixel_size = np.asarray([360, 1200])
        VGG_config = namedtuple('VGG_config', 'vgg_conv1 vgg_conv2 vgg_conv3 vgg_conv4 l2_weight_decay')
        self._img_feature_extractor = ImgVggPyr(VGG_config(**{
            'vgg_conv1': [2, 32],
            'vgg_conv2': [2, 64],
            'vgg_conv3': [3, 128],
            'vgg_conv4': [3, 256],
            'l2_weight_decay': 0.0005
        }))
        self._img_preprocessed = self._img_feature_extractor.preprocess_input(
            self.placeholders['img_inputs'], self._img_pixel_size)
        self.img_feature_maps, self.img_end_points = self._img_feature_extractor.build(
            self._img_preprocessed, self._img_pixel_size, self.is_training)
        #return self.img_feature_maps
        self.img_bottleneck = slim.conv2d(
            self.img_feature_maps,
            128, [1, 1],
            #2, [1, 1],
            scope='bottleneck',
            normalizer_fn=slim.batch_norm,
            #normalizer_fn=None,
            normalizer_params={'is_training': self.is_training})
        return self.img_bottleneck

    def get_segmentation_net(self, point_cloud, is_training, bn_decay, end_points):
        ''' 3D instance segmentation PointNet v2 network.
        Input:
            point_cloud: TF tensor in shape (B,N,4)
                frustum point clouds with XYZ and intensity in point channels
                XYZs are in frustum coordinate
            is_training: TF boolean scalar
            bn_decay: TF float scalar
            end_points: dict
        Output:
            logits: TF tensor in shape (B,N,2), scores for bkg/clutter and object
            end_points: dict
        '''
        l0_xyz = tf.slice(point_cloud, [0, 0, 0], [-1, -1, 3])
        l0_points = tf.slice(point_cloud, [0, 0, 3], [-1, -1, NUM_CHANNEL - 3])
        # Set abstraction layers
        l1_xyz, l1_points = pointnet_sa_module_msg(
            l0_xyz, l0_points, 4096, [0.1, 0.5], [16, 32],
            [[16, 16, 32], [32, 32, 64]],
            is_training, bn_decay, scope='layer1', bn=True)
        l2_xyz, l2_points = pointnet_sa_module_msg(
            l1_xyz, l1_points, 1024, [0.5, 1.0], [16, 32],
            [[64, 64, 128], [64, 96, 128]],
            is_training, bn_decay, scope='layer2', bn=True)
        l3_xyz, l3_points = pointnet_sa_module_msg(
            l2_xyz, l2_points, 256, [1.0, 2.0], [16, 32],
            [[128, 196, 256], [128, 196, 256]],
            is_training, bn_decay, scope='layer3', bn=True)
        l4_xyz, l4_points = pointnet_sa_module_msg(
            l3_xyz, l3_points, 64, [2.0, 4.0], [16, 32],
            [[256, 256, 512], [256, 384, 512]],
            is_training, bn_decay, scope='layer4', bn=True)
        # Feature propagation layers
        l3_points = pointnet_fp_module(l3_xyz, l4_xyz, l3_points, l4_points,
                                       [512, 512], is_training, bn_decay, scope='fa_layer2', bn=True)
        l2_points = pointnet_fp_module(l2_xyz, l3_xyz, l2_points, l3_points,
                                       [512, 512], is_training, bn_decay, scope='fa_layer3', bn=True)
        l1_points = pointnet_fp_module(l1_xyz, l2_xyz, l1_points, l2_points,
                                       [256, 256], is_training, bn_decay, scope='fa_layer4', bn=True)
        l0_points = pointnet_fp_module(l0_xyz, l1_xyz,
                                       tf.concat([l0_xyz, l0_points], axis=-1), l1_points,
                                       [128, 128], is_training, bn_decay, scope='fa_layer5', bn=True)
        end_points['point_feats'] = tf.concat([l0_xyz, l0_points], axis=-1)  # (B,N,3+C1)
        end_points['point_feats_fuse'] = tf.concat(
            [end_points['point_feats'], end_points['point_img_feats']], axis=-1)  # (B,N,3+C1+C2)
        semantic_features = tf.concat(
            [l0_points, end_points['point_img_feats']], axis=-1)  # (B,N,C1+C2)
        #end_points['point_feats_fuse'] = end_points['point_feats']
        #semantic_features = l0_points
        # FC layers
        net = tf_util.dropout(semantic_features, keep_prob=0.5, is_training=is_training, scope='dp0')
        net = tf_util.conv1d(net, 128, 1, padding='VALID', bn=True,
                             is_training=is_training, scope='conv1d-fc1', bn_decay=bn_decay)
        net = tf_util.dropout(net, keep_prob=0.7, is_training=is_training, scope='dp1')
        logits = tf_util.conv1d(net, NUM_SEG_CLASSES, 1, padding='VALID',
                                activation_fn=None, scope='conv1d-fc2')
        end_points['foreground_logits'] = logits
        return end_points

    def reduce_proposals(self, end_points):
        '''Use NMS to reduce the number of proposals.'''
        batch_size = end_points['fg_points_xyz'].shape[0]
        # confidence
        fg_logits = tf.gather_nd(end_points['foreground_logits'],
                                 end_points['fg_point_indices'])  # (B,M)
        seg_scores = tf.reduce_max(tf.nn.softmax(fg_logits), axis=-1)  # (B,M)
        bin_x_scores = tf.reduce_max(tf.nn.softmax(end_points['center_x_scores']), axis=-1)  # (B,M)
        bin_z_scores = tf.reduce_max(tf.nn.softmax(end_points['center_z_scores']), axis=-1)  # (B,M)
        heading_scores = tf.reduce_max(tf.nn.softmax(end_points['heading_scores']), axis=-1)  # (B,M)
        size_scores = tf.reduce_max(tf.nn.softmax(end_points['size_scores']), axis=-1)  # (B,M)
        #confidence = seg_scores + bin_x_scores + bin_z_scores + heading_scores + size_scores
        confidence = seg_scores * bin_x_scores * bin_z_scores * heading_scores * size_scores
        confidence.set_shape([batch_size, NUM_FG_POINT])
        end_points['proposal_scores'] = confidence
        # BEV boxes: axis-aligned (x,z) bounds of each 3D box
        boxes_3d = end_points['proposal_boxes']  # (B,M,8,3)
        corners_min = tf.gather(tf.reduce_min(boxes_3d, axis=2), [0, 2], axis=-1)
        corners_max = tf.gather(tf.reduce_max(boxes_3d, axis=2), [0, 2], axis=-1)  # (B,M,2) x,z
        boxes_bev = tf.concat([corners_min, corners_max], axis=-1)  # (B,M,4)
        boxes_bev.set_shape([batch_size, NUM_FG_POINT, 4])
        confidence_unpack = tf.unstack(confidence, axis=0)
        boxes_bev_unpack = tf.unstack(boxes_bev, axis=0)
        #boxes_3d_unpack = tf.unstack(end_points['proposal_boxes'], axis=0)
        #boxes_3d_list = []
        batch_nms_indices = []
        for i in range(len(confidence_unpack)):
            nms_indices = tf.image.non_max_suppression(
                boxes_bev_unpack[i], confidence_unpack[i], 300)  # at most 300
            #boxes_3d_list.append(tf.gather(boxes_3d_unpack[i], nms_indices))
            # pad with -1 so every sample has a fixed-length index list
            nms_indices = tf.pad(nms_indices,
                                 [[0, NUM_FG_POINT - tf.shape(nms_indices)[0]]],
                                 mode='CONSTANT', constant_values=-1)
            batch_nms_indices.append(nms_indices)
        end_points['nms_indices'] = tf.stack(batch_nms_indices, axis=0)
        return end_points

    def get_region_proposal_net(self, point_feats, is_training, bn_decay, end_points):
        batch_size = point_feats.get_shape()[0].value
        npoints = point_feats.get_shape()[1].value
        point_feats = tf.slice(point_feats, [0, 0, 3], [-1, -1, -1])  # (B,N,D)
        net = tf.reshape(point_feats, [batch_size * npoints, -1])
        # Fully connected layers
        net = tf_util.fully_connected(net, 256, bn=True, is_training=is_training, scope='rp-fc0', bn_decay=bn_decay)
        #net = tf_util.dropout(net, keep_prob=0.7, is_training=is_training, scope='rp-dp0')
        net = tf_util.fully_connected(net, 256, bn=True, is_training=is_training, scope='rp-fc1', bn_decay=bn_decay)
        #net = tf_util.dropout(net, keep_prob=0.7, is_training=is_training, scope='rp-dp1')
        net = tf_util.fully_connected(net, 512, bn=True, is_training=is_training, scope='rp-fc2', bn_decay=bn_decay)
        #net = tf_util.dropout(net, keep_prob=0.7, is_training=is_training, scope='rp-dp2')
        # The first NUM_CENTER_BIN*2*2: CENTER_BIN class scores and bin residuals for (x,z)
        # next 1: center residual for y
        # next NUM_HEADING_BIN*2: heading bin class scores and residuals
        # next NUM_SIZE_CLUSTER*4: size cluster class scores and residuals (l,w,h)
        output = tf_util.fully_connected(
            net, NUM_CENTER_BIN * 2 * 2 + 1 + NUM_HEADING_BIN * 2 + NUM_SIZE_CLUSTER * 4,
            activation_fn=None, scope='rp-fc3')
        end_points['proposals'] = output
        return output

    def build(self):
        point_cloud = self.placeholders['pointclouds']
        is_training = self.placeholders['is_training_pl']
        mask_label = self.placeholders['seg_labels']
        bn_decay = self.bn_decay
        end_points = self.end_points
        #with tf.device('/gpu:0'):
        img_feature_maps = self.build_img_extractor()  # (B,360,1200,C)
        pts2d = projection.tf_rect_to_image(
            tf.slice(point_cloud, [0, 0, 0], [-1, -1, 3]),
            self.placeholders['calib'])
        pts2d = tf.cast(pts2d, tf.int32)  # (B,N,2)
        # build (batch, y, x) indices; each batch id must repeat num_point times
        # to match the row-major reshape of pts2d (the flat tile in the original
        # only lined up for batch_size == 1)
        batch_idx = tf.reshape(
            tf.tile(tf.expand_dims(tf.range(0, self.batch_size), axis=1), [1, self.num_point]),
            [self.batch_size * self.num_point, 1])  # (B*N,1)
        indices = tf.concat([
            batch_idx,
            tf.reshape(pts2d, [self.batch_size * self.num_point, 2])
        ], axis=-1)  # (B*N,3)
        indices = tf.gather(indices, [0, 2, 1], axis=-1)  # image's shape is (y,x)
        end_points['point_img_feats'] = tf.reshape(
            tf.gather_nd(img_feature_maps, indices),  # (B*N,C)
            [self.batch_size, self.num_point, -1])  # (B,N,C)
        end_points = self.get_segmentation_net(point_cloud, is_training, bn_decay, end_points)
        #with tf.device('/gpu:1'):
        #seg_softmax = tf.nn.softmax(end_points['foreground_logits'], axis=-1) + self.placeholders['img_seg_softmax']
        seg_softmax = tf.nn.softmax(end_points['foreground_logits'], axis=-1)
        # use ground-truth masks while training, predicted masks at test time
        seg_logits = tf.cond(is_training,
                             lambda: tf.one_hot(mask_label, NUM_SEG_CLASSES),
                             lambda: seg_softmax)
        #end_points['point_feats_fuse'] = tf.concat([end_points['point_feats_fuse'], seg_logits], axis=-1)
        # fg_point_feats include xyz
        fg_point_feats, end_points = point_cloud_masking(
            end_points['point_feats'], seg_logits, end_points, xyz_only=False)  # BxNUM_FG_POINTxD
        proposals = self.get_region_proposal_net(fg_point_feats, is_training, bn_decay, end_points)
        proposals_reshaped = tf.reshape(proposals, [self.batch_size, NUM_FG_POINT, -1])
        # Parse output to 3D box parameters
        end_points = self.parse_output_to_tensors(proposals_reshaped, end_points)
        end_points = self.reduce_proposals(end_points)
        # for iou eval
        end_points['gt_box_of_point'] = tf.gather_nd(
            self.placeholders['gt_box_of_point'], end_points['fg_point_indices'])
        end_points['gt_box_of_point'].set_shape([self.batch_size, NUM_FG_POINT, 8, 3])
        return end_points

    def get_seg_loss(self):
        pls = self.placeholders
        end_points = self.end_points
        # 3D segmentation loss
        mask_loss = focal_loss(
            end_points['foreground_logits'],
            tf.one_hot(pls['seg_labels'], NUM_SEG_CLASSES, axis=-1))
        tf.summary.scalar('mask loss', mask_loss)
        return mask_loss, {}

    def get_loss(self):
        pls = self.placeholders
        end_points = self.end_points
        batch_size = self.batch_size
        # 3D segmentation loss
        mask_loss = focal_loss(
            end_points['foreground_logits'],
            tf.one_hot(pls['seg_labels'], NUM_SEG_CLASSES, axis=-1))
        tf.summary.scalar('mask loss', mask_loss)
        #return mask_loss, {}
        # gather box estimation labels of foreground points
        labels_fg = {}
        for k in pls.keys():
            if k not in [
                    'center_bin_x_labels', 'center_bin_z_labels',
                    'center_x_residuals_labels', 'center_z_residuals_labels',
                    'center_y_residuals_labels', 'heading_bin_labels',
                    'heading_residuals_labels', 'size_class_labels',
                    'size_residuals_labels']:
                continue
            labels_fg[k] = tf.gather_nd(pls[k], end_points['fg_point_indices'])
            if k == 'size_residuals_labels':
                labels_fg[k].set_shape([batch_size, NUM_FG_POINT, 3])
            else:
                labels_fg[k].set_shape([batch_size, NUM_FG_POINT])
        # Center loss
        center_x_cls_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=end_points['center_x_scores'], labels=labels_fg['center_bin_x_labels']))
        center_z_cls_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=end_points['center_z_scores'], labels=labels_fg['center_bin_z_labels']))
        bin_x_onehot = tf.one_hot(labels_fg['center_bin_x_labels'],
                                  depth=NUM_CENTER_BIN, on_value=1, off_value=0, axis=-1)  # BxNxNUM_CENTER_BIN
        # NOTICE: labels['center_x_residuals'] is already normalized
        center_x_residuals_normalized = tf.reduce_sum(
            end_points['center_x_residuals_normalized'] * tf.to_float(bin_x_onehot), axis=2)  # BxN
        center_x_residuals_dist = tf.norm(
            labels_fg['center_x_residuals_labels'] - center_x_residuals_normalized, axis=-1)
        center_x_res_loss = huber_loss(center_x_residuals_dist, delta=2.0)
        bin_z_onehot = tf.one_hot(labels_fg['center_bin_z_labels'],
                                  depth=NUM_CENTER_BIN, on_value=1, off_value=0, axis=-1)  # BxNxNUM_CENTER_BIN
        center_z_residuals_normalized = tf.reduce_sum(
            end_points['center_z_residuals_normalized'] * tf.to_float(bin_z_onehot), axis=2)  # BxN
        center_z_residuals_dist = tf.norm(
            labels_fg['center_z_residuals_labels'] - center_z_residuals_normalized, axis=-1)
        center_z_res_loss = huber_loss(center_z_residuals_dist, delta=2.0)
        # y is directly regressed
        center_y_residuals_dist = tf.norm(
            labels_fg['center_y_residuals_labels'] -
            tf.gather(end_points['center_y_residuals'], 0, axis=-1), axis=-1)
        center_y_res_loss = huber_loss(center_y_residuals_dist, delta=2.0)
        tf.summary.scalar('center_x class loss', center_x_cls_loss)
        tf.summary.scalar('center_z class loss', center_z_cls_loss)
        tf.summary.scalar('center_x residual loss', center_x_res_loss)
        tf.summary.scalar('center_y residual loss', center_y_res_loss)
        tf.summary.scalar('center_z residual loss', center_z_res_loss)
        # Heading loss
        heading_class_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=end_points['heading_scores'], labels=labels_fg['heading_bin_labels']))
        hcls_onehot = tf.one_hot(labels_fg['heading_bin_labels'],
                                 depth=NUM_HEADING_BIN, on_value=1, off_value=0, axis=-1)  # BxNxNUM_HEADING_BIN
        heading_residual_normalized_label = labels_fg['heading_residuals_labels']
        heading_res_dist = tf.norm(heading_residual_normalized_label - tf.reduce_sum(
            end_points['heading_residuals_normalized'] * tf.to_float(hcls_onehot), axis=2))
        heading_res_loss = huber_loss(heading_res_dist, delta=1.0)
        tf.summary.scalar('heading class loss', heading_class_loss)
        tf.summary.scalar('heading residual loss', heading_res_loss)
        # Size loss
        size_class_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=end_points['size_scores'], labels=labels_fg['size_class_labels']))
        scls_onehot = tf.one_hot(labels_fg['size_class_labels'],
                                 depth=NUM_SIZE_CLUSTER, on_value=1, off_value=0, axis=-1)  # BxNxNUM_SIZE_CLUSTER
        scls_onehot_tiled = tf.tile(tf.expand_dims(tf.to_float(scls_onehot), -1),
                                    [1, 1, 1, 3])  # BxNxNUM_SIZE_CLUSTERx3
        predicted_size_residual_normalized = tf.reduce_sum(
            end_points['size_residuals_normalized'] * scls_onehot_tiled, axis=2)  # BxNx3
        size_residual_label_normalized = labels_fg['size_residuals_labels']  # BxNx3
        size_dist = tf.norm(size_residual_label_normalized - predicted_size_residual_normalized, axis=-1)
        size_res_loss = huber_loss(size_dist, delta=1.0)
        tf.summary.scalar('size class loss', size_class_loss)
        tf.summary.scalar('size residual loss', size_res_loss)

        seg_weight = 0.1
        cls_weight = 10
        res_weight = 10
        total_loss = seg_weight * mask_loss + \
            cls_weight * (center_x_cls_loss + center_z_cls_loss + heading_class_loss + size_class_loss) + \
            res_weight * (center_x_res_loss + center_z_res_loss + center_y_res_loss + heading_res_loss + size_res_loss)
        loss_endpoints = {
            'size_class_loss': size_class_loss,
            'size_res_loss': size_res_loss,
            'heading_class_loss': heading_class_loss,
            'heading_res_loss': heading_res_loss,
            'center_x_cls_loss': center_x_cls_loss,
            'center_z_cls_loss': center_z_cls_loss,
            'center_x_res_loss': center_x_res_loss,
            'center_z_res_loss': center_z_res_loss,
            'center_y_res_loss': center_y_res_loss,
            'mask_loss': mask_loss
        }
        return total_loss, loss_endpoints
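# --- Illustrative sketch (not in the original file) ---
# huber_loss() above is called on a tensor of residual distances rather than on
# (labels, predictions) pairs. A common definition in that style (as in the
# Frustum PointNets code); the repo's own version may differ slightly:
def _huber_loss_sketch(error, delta=1.0):
    abs_error = tf.abs(error)
    quadratic = tf.minimum(abs_error, delta)  # quadratic inside +/- delta
    linear = abs_error - quadratic            # linear tail outside
    return tf.reduce_mean(0.5 * quadratic ** 2 + delta * linear)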
batch_data, is_last_batch = dataset.get_next_batch(1, need_id=True)
#total += np.sum(batch_data[1] == 1)
#print('foreground points:', np.sum(batch_data[1] == 1))
print(batch_data['ids'])
with tf.Session() as sess:
    img_vgg = ImgVggPyr(VGG_config(**{
        'vgg_conv1': [2, 32],
        'vgg_conv2': [2, 64],
        'vgg_conv3': [3, 128],
        'vgg_conv4': [3, 256],
        'l2_weight_decay': 0.0005
    }))
    img_pixel_size = np.asarray([360, 1200])
    img_preprocessed = img_vgg.preprocess_input(batch_data['images'], img_pixel_size)
    box2d_corners, box2d_corners_norm = projection.tf_project_to_image_space(
        batch_data['prop_box'], batch_data['calib'], img_pixel_size)
    box2d_corners_norm_reorder = tf.stack([
        tf.gather(box2d_corners_norm, 1, axis=-1),
        tf.gather(box2d_corners_norm, 0, axis=-1),
        tf.gather(box2d_corners_norm, 3, axis=-1),
        tf.gather(box2d_corners_norm, 2, axis=-1),
    ], axis=-1)
    img_rois = tf.image.crop_and_resize(
        img_preprocessed,
        box2d_corners_norm_reorder,  # reorder
        tf.range(0, 1),
        [100, 100])
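# --- Illustrative note (not in the original file) ---
# tf.image.crop_and_resize expects normalized boxes ordered (y1, x1, y2, x2),
# while tf_project_to_image_space appears to return (x1, y1, x2, y2); the
# tf.stack/tf.gather above swaps the columns. A NumPy equivalent of the reorder:
#   boxes_yx = boxes_xy[:, [1, 0, 3, 2]]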