def init_modules(self):
    """Build the RCNN submodules: backbone, RPN, RoI pooling, prediction
    heads and per-sample loss modules (reduction happens in loss())."""
    self.feature_extractor = ResNetFeatureExtractor(
        self.feature_extractor_config)
    self.rpn_model = RPNModel(self.rpn_config)
    # RoI feature pooling; spatial scale 1/16 matches the backbone stride.
    if self.pooling_mode == 'align':
        self.rcnn_pooling = RoIAlignAvg(self.pooling_size, self.pooling_size,
                                        1.0 / 16.0)
    elif self.pooling_mode == 'ps':
        self.rcnn_pooling = PSRoIPool(7, 7, 1.0 / 16, 7, self.n_classes)
    elif self.pooling_mode == 'psalign':
        raise NotImplementedError('have not implemented yet!')
    elif self.pooling_mode == 'deformable_psalign':
        raise NotImplementedError('have not implemented yet!')
    else:
        # Fail fast: previously an unknown mode silently left
        # self.rcnn_pooling unset and crashed much later in forward().
        raise ValueError('unknown pooling mode: {}'.format(self.pooling_mode))

    # NOTE(review): out_features is n_classes * 2048, not n_classes — this
    # only makes sense if forward() reshapes the output; confirm against
    # this class's forward() before changing.
    self.rcnn_cls_pred = nn.Linear(2048, self.n_classes * 2048)
    if self.class_agnostic:
        # class-agnostic branch predicts boxes fully convolutionally
        self.rcnn_bbox_pred = nn.Conv2d(2048, 4, 3, 1, 1)
    else:
        self.rcnn_bbox_pred = nn.Linear(2048, 4 * self.n_classes)

    # loss modules; reduce=False keeps per-sample losses so loss() can
    # apply sample weights (NOTE: `reduce` is deprecated in newer torch).
    if self.use_focal_loss:
        self.rcnn_cls_loss = FocalLoss(2)
    else:
        self.rcnn_cls_loss = functools.partial(F.cross_entropy, reduce=False)
    self.rcnn_bbox_loss = nn.modules.SmoothL1Loss(reduce=False)
def init_modules(self):
    """Build detector submodules; optionally adds a spatial self-attention
    branch that switches the classifier head from Conv2d to Linear."""
    self.feature_extractor = ResNetFeatureExtractor(
        self.feature_extractor_config)
    self.rpn_model = RPNModel(self.rpn_config)
    # RoI feature pooling; spatial scale 1/16 matches the backbone stride.
    if self.pooling_mode == 'align':
        self.rcnn_pooling = RoIAlignAvg(self.pooling_size, self.pooling_size,
                                        1.0 / 16.0)
    elif self.pooling_mode == 'ps':
        self.rcnn_pooling = PSRoIPool(7, 7, 1.0 / 16, 7, self.n_classes)
    elif self.pooling_mode == 'psalign':
        raise NotImplementedError('have not implemented yet!')
    elif self.pooling_mode == 'deformable_psalign':
        raise NotImplementedError('have not implemented yet!')
    else:
        # Fail fast instead of leaving self.rcnn_pooling silently unset.
        raise ValueError('unknown pooling mode: {}'.format(self.pooling_mode))

    # With self-attention the classifier consumes a pooled feature vector;
    # otherwise it runs fully convolutionally over the RoI feature map.
    if self.use_self_attention:
        self.rcnn_cls_pred = nn.Linear(2048, self.n_classes)
    else:
        self.rcnn_cls_pred = nn.Conv2d(2048, self.n_classes, 3, 1, 1)
    if self.class_agnostic:
        self.rcnn_bbox_pred = nn.Linear(2048, 4)
    else:
        self.rcnn_bbox_pred = nn.Linear(2048, 4 * self.n_classes)

    # loss modules; reduce=False keeps per-sample losses for weighting
    if self.use_focal_loss:
        self.rcnn_cls_loss = FocalLoss(2)
    else:
        self.rcnn_cls_loss = functools.partial(F.cross_entropy, reduce=False)
    self.rcnn_bbox_loss = nn.modules.SmoothL1Loss(reduce=False)

    # attention: single-channel spatial attention map over the RoI feature
    if self.use_self_attention:
        self.spatial_attention = nn.Conv2d(2048, 1, 3, 1, 1)
def init_modules(self):
    """Build 2D detection submodules plus 3D dims/angle prediction heads
    (multibin orientation formulation)."""
    self.feature_extractor = ResNetFeatureExtractor(
        self.feature_extractor_config)
    self.rpn_model = RPNModel(self.rpn_config)
    # RoI feature pooling; spatial scale 1/16 matches the backbone stride.
    if self.pooling_mode == 'align':
        self.rcnn_pooling = RoIAlignAvg(self.pooling_size, self.pooling_size,
                                        1.0 / 16.0)
    elif self.pooling_mode == 'ps':
        self.rcnn_pooling = PSRoIPool(7, 7, 1.0 / 16, 7, self.n_classes)
    elif self.pooling_mode == 'psalign':
        raise NotImplementedError('have not implemented yet!')
    elif self.pooling_mode == 'deformable_psalign':
        raise NotImplementedError('have not implemented yet!')
    else:
        # Fail fast instead of leaving self.rcnn_pooling silently unset.
        raise ValueError('unknown pooling mode: {}'.format(self.pooling_mode))

    # Fully convolutional classifier; scores are spatially averaged later.
    self.rcnn_cls_pred = nn.Conv2d(2048, self.n_classes, 3, 1, 1)
    # Channel count of the RoI feature fed to the linear heads: spatially
    # averaged (reduce) or flattened 4x4 map.
    if self.reduce:
        in_channels = 2048
    else:
        in_channels = 2048 * 4 * 4
    if self.class_agnostic:
        self.rcnn_bbox_pred = nn.Linear(in_channels, 4)
    else:
        self.rcnn_bbox_pred = nn.Linear(in_channels, 4 * self.n_classes)

    # loss modules; reduce=False keeps per-sample losses for weighting
    if self.use_focal_loss:
        self.rcnn_cls_loss = FocalLoss(2)
    else:
        self.rcnn_cls_loss = functools.partial(F.cross_entropy, reduce=False)
    self.rcnn_bbox_loss = nn.modules.SmoothL1Loss(reduce=False)

    # 3D branch: multibin angle loss plus small MLP heads
    self.rcnn_3d_loss = MultiBinLoss(num_bins=self.num_bins)
    # dims: (h, w, l) regression
    self.rcnn_dims_pred = nn.Sequential(
        *[nn.Linear(in_channels, 256), nn.ReLU(), nn.Linear(256, 3)])
    # angle: (sin, cos) residual per bin
    self.rcnn_angle_pred = nn.Sequential(*[
        nn.Linear(in_channels, 256), nn.ReLU(),
        nn.Linear(256, self.num_bins * 2)
    ])
    # angle conf: 2-way classification per bin
    self.rcnn_angle_conf_pred = nn.Sequential(*[
        nn.Linear(in_channels, 256), nn.ReLU(),
        nn.Linear(256, self.num_bins * 2)
    ])
def init_modules(self):
    """Build 2D detection submodules plus keypoint and 3D branches."""
    self.feature_extractor = ResNetFeatureExtractor(
        self.feature_extractor_config)
    self.modify_feature_extractor()
    self.rpn_model = RPNModel(self.rpn_config)
    # RoI feature pooling; spatial scale 1/16 matches the backbone stride.
    if self.pooling_mode == 'align':
        self.rcnn_pooling = RoIAlignAvg(self.pooling_size, self.pooling_size,
                                        1.0 / 16.0)
    elif self.pooling_mode == 'ps':
        self.rcnn_pooling = PSRoIPool(7, 7, 1.0 / 16, 7, self.n_classes)
    elif self.pooling_mode == 'psalign':
        raise NotImplementedError('have not implemented yet!')
    elif self.pooling_mode == 'deformable_psalign':
        raise NotImplementedError('have not implemented yet!')
    else:
        # Fail fast instead of leaving self.rcnn_pooling silently unset.
        raise ValueError('unknown pooling mode: {}'.format(self.pooling_mode))

    # Larger 14x14 pooling for the mask/keypoint branch.
    self.mask_rcnn_pooling = RoIAlignAvg(14, 14, 1.0 / 16.0)

    # Fully convolutional classifier; scores are spatially averaged later.
    self.rcnn_cls_pred = nn.Conv2d(2048, self.n_classes, 3, 1, 1)
    # Channel count of the RoI feature fed to the linear heads.
    if self.reduce:
        in_channels = 2048
    else:
        in_channels = 2048 * 4 * 4
    if self.class_agnostic:
        self.rcnn_bbox_pred = nn.Linear(in_channels, 4)
    else:
        self.rcnn_bbox_pred = nn.Linear(in_channels, 4 * self.n_classes)

    # loss modules; reduce=False keeps per-sample losses for weighting
    if self.use_focal_loss:
        self.rcnn_cls_loss = FocalLoss(2)
    else:
        self.rcnn_cls_loss = functools.partial(F.cross_entropy, reduce=False)
    # keypoint CE loss; ignore_index=-1 skips unlabeled keypoints
    self.rcnn_kp_loss = functools.partial(
        F.cross_entropy, reduce=False, ignore_index=-1)
    self.rcnn_bbox_loss = nn.modules.SmoothL1Loss(reduce=False)

    # 3D branch: 3-value regression head + orientation loss
    self.rcnn_3d_pred = nn.Linear(in_channels, 3)
    self.rcnn_3d_loss = OrientationLoss(split_loss=True)

    self.keypoint_predictor = KeyPointPredictor2(1024)
def init_modules(self):
    """Build 2D detection submodules plus a combined 3D head predicting
    dims and multibin angles (optionally per-class dims)."""
    self.feature_extractor = ResNetFeatureExtractor(
        self.feature_extractor_config)
    self.rpn_model = RPNModel(self.rpn_config)
    # RoI feature pooling; spatial scale 1/16 matches the backbone stride.
    if self.pooling_mode == 'align':
        # torchvision-style ROIAlign: (output_size, spatial_scale, sampling_ratio)
        self.rcnn_pooling = ROIAlign((self.pooling_size, self.pooling_size),
                                     1.0 / 16.0, 2)
    elif self.pooling_mode == 'ps':
        self.rcnn_pooling = PSRoIPool(7, 7, 1.0 / 16, 7, self.n_classes)
    elif self.pooling_mode == 'psalign':
        raise NotImplementedError('have not implemented yet!')
    elif self.pooling_mode == 'deformable_psalign':
        raise NotImplementedError('have not implemented yet!')
    else:
        # Fail fast instead of leaving self.rcnn_pooling silently unset.
        raise ValueError('unknown pooling mode: {}'.format(self.pooling_mode))

    self.rcnn_cls_preds = nn.Linear(2048, self.n_classes)
    # Channel count of the RoI feature fed to the linear heads.
    if self.reduce:
        in_channels = 2048
    else:
        in_channels = 2048 * 4 * 4
    if self.class_agnostic:
        self.rcnn_bbox_preds = nn.Linear(in_channels, 4)
    else:
        self.rcnn_bbox_preds = nn.Linear(in_channels, 4 * self.n_classes)

    # loss modules; reduce=False keeps per-sample losses for weighting
    if self.use_focal_loss:
        self.rcnn_cls_loss = FocalLoss(self.n_classes)
    else:
        self.rcnn_cls_loss = functools.partial(F.cross_entropy, reduce=False)
    self.rcnn_bbox_loss = nn.modules.SmoothL1Loss(reduce=False)

    # 3D head: 3 dims (per class unless agnostic) + 4 values per angle bin
    if self.class_agnostic_3d:
        self.rcnn_3d_pred = nn.Linear(in_channels, 3 + 4 * self.num_bins)
    else:
        self.rcnn_3d_pred = nn.Linear(
            in_channels, 3 * self.n_classes + 4 * self.num_bins)
    self.rcnn_3d_loss = MultiBinLoss(num_bins=self.num_bins)
def init_modules(self):
    """Build submodules with a builder-chosen backbone, optional spatial
    self-attention, and an extra 1/8-stride pooling + fusion layer."""
    # backbone selected via builder (previously hard-coded ResNet/MobileNet)
    self.feature_extractor = feature_extractors_builder.build(
        self.feature_extractor_config)
    self.rpn_model = RPNModel(self.rpn_config)
    # RoI feature pooling; spatial scale 1/16 matches the backbone stride.
    if self.pooling_mode == 'align':
        self.rcnn_pooling = RoIAlignAvg(self.pooling_size, self.pooling_size,
                                        1.0 / 16.0)
    elif self.pooling_mode == 'ps':
        self.rcnn_pooling = PSRoIPool(7, 7, 1.0 / 16, 7, self.n_classes)
    elif self.pooling_mode == 'psalign':
        raise NotImplementedError('have not implemented yet!')
    elif self.pooling_mode == 'deformable_psalign':
        raise NotImplementedError('have not implemented yet!')
    else:
        # Fail fast instead of leaving self.rcnn_pooling silently unset.
        raise ValueError('unknown pooling mode: {}'.format(self.pooling_mode))

    # self.ndin: feature dimension of the chosen backbone's output
    self.rcnn_cls_pred = nn.Linear(self.ndin, self.n_classes)
    if self.class_agnostic:
        self.rcnn_bbox_pred = nn.Linear(self.ndin, 4)
    else:
        self.rcnn_bbox_pred = nn.Linear(self.ndin, 4 * self.n_classes)

    # loss modules; reduce=False keeps per-sample losses for weighting
    if self.use_focal_loss:
        self.rcnn_cls_loss = FocalLoss(2, gamma=2, alpha=0.25)
    else:
        self.rcnn_cls_loss = functools.partial(F.cross_entropy, reduce=False)
    self.rcnn_bbox_loss = nn.modules.SmoothL1Loss(reduce=False)

    # attention: single-channel spatial attention map
    if self.use_self_attention:
        self.spatial_attention = nn.Conv2d(self.ndin, 1, 3, 1, 1)

    # second pooling at 1/8 stride plus 1x1 fusion of the two feature maps
    self.rcnn_pooling2 = RoIAlignAvg(self.pooling_size, self.pooling_size,
                                     1.0 / 8.0)
    self.reduce_pooling = nn.Sequential(
        nn.Conv2d(512 + 1024, 1024, 1, 1, 0), nn.ReLU())
def init_modules(self):
    """Build detection submodules with an extra 1/8-stride pooling branch
    fused into the main feature via a 1x1 conv."""
    self.feature_extractor = ResNetFeatureExtractor(
        self.feature_extractor_config)
    self.rpn_model = RPNModel(self.rpn_config)
    # RoI feature pooling; spatial scale 1/16 matches the backbone stride.
    if self.pooling_mode == 'align':
        self.rcnn_pooling = RoIAlignAvg(self.pooling_size, self.pooling_size,
                                        1.0 / 16.0)
    elif self.pooling_mode == 'ps':
        self.rcnn_pooling = PSRoIPool(7, 7, 1.0 / 16, 7, self.n_classes)
    elif self.pooling_mode == 'psalign':
        raise NotImplementedError('have not implemented yet!')
    elif self.pooling_mode == 'deformable_psalign':
        raise NotImplementedError('have not implemented yet!')
    else:
        # Fail fast instead of leaving self.rcnn_pooling silently unset.
        raise ValueError('unknown pooling mode: {}'.format(self.pooling_mode))

    self.rcnn_cls_pred = nn.Linear(2048, self.n_classes)
    # Channel count of the RoI feature fed to the bbox head.
    if self.reduce:
        in_channels = 2048
    else:
        in_channels = 2048 * 4 * 4
    if self.class_agnostic:
        self.rcnn_bbox_pred = nn.Linear(in_channels, 4)
    else:
        self.rcnn_bbox_pred = nn.Linear(in_channels, 4 * self.n_classes)

    # loss modules; reduce=False keeps per-sample losses for weighting
    if self.use_focal_loss:
        self.rcnn_cls_loss = FocalLoss(2)
    else:
        self.rcnn_cls_loss = functools.partial(F.cross_entropy, reduce=False)
    self.rcnn_bbox_loss = nn.modules.SmoothL1Loss(reduce=False)

    # second pooling at 1/8 stride plus 1x1 fusion of the two feature maps
    self.rcnn_pooling2 = RoIAlignAvg(self.pooling_size, self.pooling_size,
                                     1.0 / 8.0)
    self.reduce_pooling = nn.Sequential(
        nn.Conv2d(1024 + 512, 1024, 1, 1, 0), nn.ReLU())
def init_modules(self):
    """Assemble the R-FCN style head: backbone, RPN, position-sensitive
    RoI pooling and the 1x1 score/box base convolutions."""
    self.feature_extractor = FeatureExtractor(self.feature_extractor_config)
    self.rpn_model = RPNModel(self.rpn_config)

    # Position-sensitive pooling, one pooler per task (cls / loc).
    spatial_scale = 1.0 / 16
    self.rcnn_pooling_cls = PSRoIPool(7, 7, spatial_scale, 7, self.n_classes)
    self.rcnn_pooling_loc = PSRoIPool(7, 7, spatial_scale, 7, 4)

    # Each output cell of the position-sensitive maps encodes one
    # (class-or-coordinate, spatial-bin) pair.
    num_bins = self.pooling_size * self.pooling_size
    self.rcnn_cls_base = nn.Conv2d(
        in_channels=1024,
        out_channels=self.n_classes * num_bins,
        kernel_size=1,
        stride=1,
        padding=0,
        bias=False)
    self.rcnn_bbox_base = nn.Conv2d(
        in_channels=1024,
        out_channels=4 * num_bins,
        kernel_size=1,
        stride=1,
        padding=0,
        bias=False)
    # 1x1 reduction from the backbone's 2048 channels down to 1024.
    self.rcnn_top = nn.Conv2d(2048, 1024, 1, 1, 0, bias=False)

    # Per-sample losses; the caller applies weighting and reduction.
    if self.use_focal_loss:
        self.rcnn_cls_loss = FocalLoss(2)
    else:
        self.rcnn_cls_loss = functools.partial(F.cross_entropy, reduce=False)
    self.rcnn_bbox_loss = nn.modules.SmoothL1Loss(reduce=False)
class Mono3DAngleNewFasterRCNN(Model):
    """Faster R-CNN variant with a monocular 3D branch predicting object
    dims and a multibin local-angle (orientation) estimate.

    The 3D branch runs on detached RoI features (third_stage_feature), so
    3D gradients do not flow back into the shared backbone.
    """

    def forward(self, feed_dict):
        # import ipdb
        # ipdb.set_trace()
        prediction_dict = {}

        # base model
        base_feat = self.feature_extractor.first_stage_feature(
            feed_dict['img'])
        feed_dict.update({'base_feat': base_feat})

        # rpn model
        prediction_dict.update(self.rpn_model.forward(feed_dict))
        # proposals = prediction_dict['proposals_batch']
        # shape(N,num_proposals,5)
        # pre subsample for reduce consume of memory
        if self.training:
            self.pre_subsample(prediction_dict, feed_dict)
        rois_batch = prediction_dict['rois_batch']

        # note here base_feat (N,C,H,W),rois_batch (N,num_proposals,5)
        pooled_feat = self.rcnn_pooling(base_feat, rois_batch.view(-1, 5))

        ###################################
        # 3d training
        ###################################
        # detach(): the 3D branch must not backprop into the shared trunk
        mono_3d_pooled_feat = self.feature_extractor.third_stage_feature(
            pooled_feat.detach())
        # global average pool to a per-RoI feature vector
        mono_3d_pooled_feat = mono_3d_pooled_feat.mean(3).mean(2)
        # rcnn_3d = self.rcnn_3d_preds_new(mono_3d_pooled_feat)
        # prediction_dict['rcnn_3d'] = rcnn_3d

        # shape(N,C,1,1)
        pooled_feat = self.feature_extractor.second_stage_feature(pooled_feat)

        # classification is fully convolutional; scores are the spatial
        # mean of the score map, and the softmaxed map doubles as a
        # saliency weighting (background channel dropped) on the features.
        rcnn_cls_scores_map = self.rcnn_cls_pred(pooled_feat)
        rcnn_cls_scores = rcnn_cls_scores_map.mean(3).mean(2)
        saliency_map = F.softmax(rcnn_cls_scores_map, dim=1)
        rcnn_cls_probs = F.softmax(rcnn_cls_scores, dim=1)
        pooled_feat = pooled_feat * saliency_map[:, 1:, :, :]

        # shape(N,C)
        if self.reduce:
            pooled_feat = pooled_feat.mean(3).mean(2)
        else:
            pooled_feat = pooled_feat.view(self.rcnn_batch_size, -1)

        rcnn_bbox_preds = self.rcnn_bbox_pred(pooled_feat)
        # rcnn_cls_scores = self.rcnn_cls_pred(pooled_feat)
        # rcnn_cls_probs = F.softmax(rcnn_cls_scores, dim=1)

        # 3D heads: dims (3), per-bin angle residuals and confidences.
        # Layout of rcnn_3d is [dims(3) | bins x (conf(2), sin/cos(2))].
        rcnn_3d_dims = self.rcnn_dims_pred(mono_3d_pooled_feat)
        rcnn_3d_angles = self.rcnn_angle_pred(mono_3d_pooled_feat).view(
            -1, self.num_bins, 2)
        rcnn_3d_angles_cls = self.rcnn_angle_conf_pred(
            mono_3d_pooled_feat).view(-1, self.num_bins, 2)
        rcnn_3d_angles_cls_reg = torch.cat(
            [rcnn_3d_angles_cls, rcnn_3d_angles],
            dim=-1).view(-1, self.num_bins * 4)
        rcnn_3d = torch.cat([rcnn_3d_dims, rcnn_3d_angles_cls_reg], dim=-1)
        prediction_dict['rcnn_3d'] = rcnn_3d

        prediction_dict['rcnn_cls_probs'] = rcnn_cls_probs
        prediction_dict['rcnn_bbox_preds'] = rcnn_bbox_preds
        prediction_dict['rcnn_cls_scores'] = rcnn_cls_scores

        # used for track
        proposals_order = prediction_dict['proposals_order']
        prediction_dict['second_rpn_anchors'] = prediction_dict['anchors'][
            proposals_order]

        if not self.training:
            # import ipdb
            # ipdb.set_trace()
            # At inference: pick the most confident angle bin per RoI and
            # decode its (sin, cos) residual against that bin's center.
            dims = rcnn_3d[:, :3]
            angles = rcnn_3d[:, 3:].view(-1, self.num_bins, 4)
            angles_cls = F.softmax(angles[:, :, :2], dim=-1)
            _, angles_cls_argmax = torch.max(angles_cls[:, :, 1], dim=-1)
            row = torch.arange(
                0, angles_cls_argmax.shape[0]).type_as(angles_cls_argmax)
            angles_oritations = angles[:, :, 2:][row, angles_cls_argmax]
            rcnn_3d = torch.cat([dims, angles_oritations], dim=-1)
            # import ipdb
            # ipdb.set_trace()
            rcnn_3d = self.target_assigner.bbox_coder_3d.decode_batch_angle(
                rcnn_3d, self.rcnn_3d_loss.bin_centers[angles_cls_argmax])
            prediction_dict['rcnn_3d'] = rcnn_3d

        return prediction_dict

    def pre_forward(self):
        """When training only the 3D branch, freeze everything except the
        third-stage features and the 3D heads (BN frozen accordingly)."""
        # params
        if self.train_3d and self.training and not self.train_2d:
            self.freeze_modules()
            for parameter in self.feature_extractor.third_stage_feature.parameters(
            ):
                parameter.requires_grad = True
            # for param in self.rcnn_3d_preds_new.parameters():
            #     param.requires_grad = True
            for param in self.rcnn_angle_conf_pred.parameters():
                param.requires_grad = True
            for param in self.rcnn_angle_pred.parameters():
                param.requires_grad = True
            for param in self.rcnn_dims_pred.parameters():
                param.requires_grad = True
            self.freeze_bn(self)
            self.unfreeze_bn(self.feature_extractor.third_stage_feature)

    def init_weights(self):
        """Initialize submodules; heads get truncated-normal init."""
        # submodule init weights
        self.feature_extractor.init_weights()
        self.rpn_model.init_weights()

        Filler.normal_init(self.rcnn_cls_pred, 0, 0.01, self.truncated)
        Filler.normal_init(self.rcnn_bbox_pred, 0, 0.001, self.truncated)

    def init_modules(self):
        """Construct backbone, RPN, pooling, 2D heads, losses and 3D heads."""
        self.feature_extractor = ResNetFeatureExtractor(
            self.feature_extractor_config)
        self.rpn_model = RPNModel(self.rpn_config)
        if self.pooling_mode == 'align':
            self.rcnn_pooling = RoIAlignAvg(self.pooling_size,
                                            self.pooling_size, 1.0 / 16.0)
        elif self.pooling_mode == 'ps':
            self.rcnn_pooling = PSRoIPool(7, 7, 1.0 / 16, 7, self.n_classes)
        elif self.pooling_mode == 'psalign':
            raise NotImplementedError('have not implemented yet!')
        elif self.pooling_mode == 'deformable_psalign':
            raise NotImplementedError('have not implemented yet!')
        # self.rcnn_cls_pred = nn.Linear(2048, self.n_classes)
        self.rcnn_cls_pred = nn.Conv2d(2048, self.n_classes, 3, 1, 1)
        # channel count of the RoI feature fed to the linear heads
        if self.reduce:
            in_channels = 2048
        else:
            in_channels = 2048 * 4 * 4
        if self.class_agnostic:
            self.rcnn_bbox_pred = nn.Linear(in_channels, 4)
        else:
            self.rcnn_bbox_pred = nn.Linear(in_channels, 4 * self.n_classes)

        # loss module (reduce=False keeps per-sample losses for weighting)
        if self.use_focal_loss:
            self.rcnn_cls_loss = FocalLoss(2)
        else:
            self.rcnn_cls_loss = functools.partial(F.cross_entropy,
                                                   reduce=False)
        self.rcnn_bbox_loss = nn.modules.SmoothL1Loss(reduce=False)

        # some 3d statistic
        # some 2d points projected from 3d
        # self.rcnn_3d_preds_new = nn.Linear(in_channels, 3 + 4 * self.num_bins)
        self.rcnn_3d_loss = MultiBinLoss(num_bins=self.num_bins)
        # dims
        self.rcnn_dims_pred = nn.Sequential(
            *[nn.Linear(in_channels, 256), nn.ReLU(), nn.Linear(256, 3)])
        # angle
        self.rcnn_angle_pred = nn.Sequential(*[
            nn.Linear(in_channels, 256), nn.ReLU(),
            nn.Linear(256, self.num_bins * 2)
        ])
        # angle conf
        self.rcnn_angle_conf_pred = nn.Sequential(*[
            nn.Linear(in_channels, 256), nn.ReLU(),
            nn.Linear(256, self.num_bins * 2)
        ])

    def init_param(self, model_config):
        """Read configuration into attributes used by the other init_* and
        forward/loss methods."""
        classes = model_config['classes']
        self.classes = classes
        self.n_classes = len(classes)
        self.class_agnostic = model_config['class_agnostic']
        self.pooling_size = model_config['pooling_size']
        self.pooling_mode = model_config['pooling_mode']
        self.crop_resize_with_max_pool = model_config[
            'crop_resize_with_max_pool']
        self.truncated = model_config['truncated']
        self.use_focal_loss = model_config['use_focal_loss']
        self.subsample_twice = model_config['subsample_twice']
        self.rcnn_batch_size = model_config['rcnn_batch_size']

        # some submodule config
        self.feature_extractor_config = model_config[
            'feature_extractor_config']
        self.rpn_config = model_config['rpn_config']

        # sampler
        self.sampler = BalancedSampler(model_config['sampler_config'])

        self.reduce = True
        self.visualizer = FeatVisualizer()
        self.num_bins = 2
        # both 2D and 3D branches are trained jointly in this variant
        self.train_3d = True
        # self.train_2d = not self.train_3d
        self.train_2d = True

        # assigner
        self.target_assigner = TargetAssigner(
            model_config['target_assigner_config'])

    def pre_subsample(self, prediction_dict, feed_dict):
        """Assign cls/reg/3D targets to RoIs, subsample a fixed-size batch
        and store normalized weights + targets in prediction_dict."""
        rois_batch = prediction_dict['rois_batch']
        gt_boxes = feed_dict['gt_boxes']
        gt_labels = feed_dict['gt_labels']
        # gt_boxes_3d = feed_dict['coords']
        # dims_2d = feed_dict['dims_2d']
        # use local angle
        # oritations = feed_dict['local_angle_oritation']
        local_angle = feed_dict['local_angle']
        # shape(N,7)
        gt_boxes_3d = feed_dict['gt_boxes_3d']
        # orient
        # here just concat them
        # dims and their projection
        # 3D target = dims (3) + local angle
        gt_boxes_3d = torch.cat([gt_boxes_3d[:, :, :3], local_angle], dim=-1)

        ##########################
        # assigner
        ##########################
        rcnn_cls_targets, rcnn_reg_targets,\
            rcnn_cls_weights, rcnn_reg_weights,\
            rcnn_reg_targets_3d, rcnn_reg_weights_3d = self.target_assigner.assign(
                rois_batch[:, :, 1:], gt_boxes, gt_boxes_3d, gt_labels)

        ##########################
        # subsampler
        ##########################
        cls_criterion = None
        pos_indicator = rcnn_reg_weights > 0
        indicator = rcnn_cls_weights > 0

        # subsample from all
        # shape (N,M)
        batch_sampled_mask = self.sampler.subsample_batch(
            self.rcnn_batch_size,
            pos_indicator,
            indicator=indicator,
            criterion=cls_criterion)
        rcnn_cls_weights = rcnn_cls_weights[batch_sampled_mask]
        rcnn_reg_weights = rcnn_reg_weights[batch_sampled_mask]
        rcnn_reg_weights_3d = rcnn_reg_weights_3d[batch_sampled_mask]
        num_cls_coeff = (rcnn_cls_weights > 0).sum(dim=-1)
        num_reg_coeff = (rcnn_reg_weights > 0).sum(dim=-1)
        # check
        # NOTE(review): truthiness of a tensor asserts it is non-zero;
        # with batch size 1 this works, multi-element would raise — confirm
        assert num_cls_coeff, 'bug happens'
        assert num_reg_coeff, 'bug happens'

        # normalize weights by the number of contributing samples
        prediction_dict[
            'rcnn_cls_weights'] = rcnn_cls_weights / num_cls_coeff.float()
        prediction_dict[
            'rcnn_reg_weights'] = rcnn_reg_weights / num_reg_coeff.float()
        prediction_dict[
            'rcnn_reg_weights_3d'] = rcnn_reg_weights_3d / num_reg_coeff.float(
            )

        prediction_dict['rcnn_cls_targets'] = rcnn_cls_targets[
            batch_sampled_mask]
        prediction_dict['rcnn_reg_targets'] = rcnn_reg_targets[
            batch_sampled_mask]
        prediction_dict['rcnn_reg_targets_3d'] = rcnn_reg_targets_3d[
            batch_sampled_mask]

        # update rois_batch
        prediction_dict['rois_batch'] = rois_batch[batch_sampled_mask].view(
            rois_batch.shape[0], -1, 5)

    def loss(self, prediction_dict, feed_dict):
        """
        assign proposals label and subsample from them
        Then calculate loss
        """
        loss_dict = {}
        if self.train_2d:
            # submodule loss
            loss_dict.update(self.rpn_model.loss(prediction_dict, feed_dict))

            # targets and weights
            rcnn_cls_weights = prediction_dict['rcnn_cls_weights']
            rcnn_reg_weights = prediction_dict['rcnn_reg_weights']
            rcnn_cls_targets = prediction_dict['rcnn_cls_targets']
            rcnn_reg_targets = prediction_dict['rcnn_reg_targets']

            # classification loss
            rcnn_cls_scores = prediction_dict['rcnn_cls_scores']
            rcnn_cls_loss = self.rcnn_cls_loss(rcnn_cls_scores,
                                               rcnn_cls_targets)
            rcnn_cls_loss *= rcnn_cls_weights
            rcnn_cls_loss = rcnn_cls_loss.sum(dim=-1)

            # bounding box regression L1 loss
            rcnn_bbox_preds = prediction_dict['rcnn_bbox_preds']
            rcnn_bbox_loss = self.rcnn_bbox_loss(
                rcnn_bbox_preds, rcnn_reg_targets).sum(dim=-1)
            rcnn_bbox_loss *= rcnn_reg_weights
            rcnn_bbox_loss = rcnn_bbox_loss.sum(dim=-1)

            loss_dict['rcnn_cls_loss'] = rcnn_cls_loss
            loss_dict['rcnn_bbox_loss'] = rcnn_bbox_loss

        ######################################
        # 3d loss
        ######################################
        rcnn_reg_weights_3d = prediction_dict['rcnn_reg_weights_3d']
        rcnn_reg_targets_3d = prediction_dict['rcnn_reg_targets_3d']
        rcnn_3d = prediction_dict['rcnn_3d']
        if self.train_3d:
            # dims
            rcnn_3d_loss_dims = self.rcnn_bbox_loss(
                rcnn_3d[:, :3], rcnn_reg_targets_3d[:, :3]).sum(dim=-1)
            # angles
            rcnn_angle_loss, angle_tp_mask = self.rcnn_3d_loss(
                rcnn_3d[:, 3:], rcnn_reg_targets_3d[:, 3:])
            rcnn_3d_loss = rcnn_3d_loss_dims * rcnn_reg_weights_3d
            rcnn_3d_loss = rcnn_3d_loss.sum(dim=-1)
            rcnn_angle_loss = rcnn_angle_loss * rcnn_reg_weights_3d
            rcnn_angle_loss = rcnn_angle_loss.sum(dim=-1)

            loss_dict['rcnn_3d_loss'] = rcnn_3d_loss
            loss_dict['rcnn_angle_loss'] = rcnn_angle_loss

            # angles stats (accuracy of bin classification on positives)
            angle_tp_mask = angle_tp_mask[rcnn_reg_weights_3d > 0]
            angles_tp_num = angle_tp_mask.int().sum().item()
            angles_all_num = angle_tp_mask.numel()
        else:
            angles_all_num = 0
            angles_tp_num = 0

        # store all stats in target assigner (most entries are placeholders
        # kept for compatibility with a shared stats consumer)
        self.target_assigner.stat.update({
            'angle_num_tp': torch.tensor(0),
            'angle_num_all': 1,
            # stats of orient
            'orient_tp_num': 0,
            'orient_tp_num2': 0,
            'orient_tp_num3': 0,
            'orient_all_num3': 0,
            # 'orient_pr': orient_pr,
            'orient_all_num': 0,
            'orient_tp_num4': 0,
            'orient_all_num4': 0,
            'cls_orient_2s_all_num': angles_all_num,
            'cls_orient_2s_tp_num': angles_tp_num
            # 'angles_tp_num': angles_tp_num,
            # 'angles_all_num': angles_all_num
        })
        # import ipdb
        # ipdb.set_trace()
        return loss_dict
class Mono3DFasterRCNN(Model):
    """Faster R-CNN with a cascaded monocular 3D branch.

    The 2D head predicts boxes first; decoded (refined) boxes are then
    re-pooled and fed to a third-stage feature extractor for 3D dims and
    orientation regression. Optionally concatenates a per-RoI inverse
    homography (h_cat) to the 3D feature.
    """

    def forward(self, feed_dict):
        prediction_dict = {}

        # base model
        base_feat = self.feature_extractor.first_stage_feature(
            feed_dict['img'])
        feed_dict.update({'base_feat': base_feat})

        # rpn model
        prediction_dict.update(self.rpn_model.forward(feed_dict))

        if self.training and self.train_2d:
            self.pre_subsample(prediction_dict, feed_dict)
        rois_batch = prediction_dict['rois_batch']

        # note here base_feat (N,C,H,W),rois_batch (N,num_proposals,5)
        pooled_feat = self.rcnn_pooling(base_feat, rois_batch.view(-1, 5))

        # shape(N,C,1,1)
        pooled_feat = self.feature_extractor.second_stage_feature(pooled_feat)
        # fully convolutional classifier; the softmaxed score map doubles
        # as a saliency weighting (background channel dropped)
        rcnn_cls_scores_map = self.rcnn_cls_pred(pooled_feat)
        rcnn_cls_scores = rcnn_cls_scores_map.mean(3).mean(2)
        saliency_map = F.softmax(rcnn_cls_scores_map, dim=1)
        rcnn_cls_probs = F.softmax(rcnn_cls_scores, dim=1)
        pooled_feat = pooled_feat * saliency_map[:, 1:, :, :]

        reduced_pooled_feat = pooled_feat.mean(3).mean(2)
        rcnn_bbox_preds = self.rcnn_bbox_pred(reduced_pooled_feat)
        # NOTE: the original recomputed rcnn_cls_probs here with an
        # identical softmax; the duplicate was removed.

        prediction_dict['rcnn_cls_probs'] = rcnn_cls_probs
        prediction_dict['rcnn_bbox_preds'] = rcnn_bbox_preds
        prediction_dict['rcnn_cls_scores'] = rcnn_cls_scores

        # used for track
        proposals_order = prediction_dict['proposals_order']
        prediction_dict['second_rpn_anchors'] = prediction_dict['anchors'][
            proposals_order]

        ###################################
        # 3d training
        ###################################
        # decode refined 2D boxes (detached: no 3D gradient into 2D head)
        rcnn_bbox_preds = rcnn_bbox_preds.detach()
        final_bbox = self.target_assigner.bbox_coder.decode_batch(
            rcnn_bbox_preds.unsqueeze(0), rois_batch[:, :, 1:])
        final_rois_inds = torch.zeros_like(final_bbox[:, :, -1:])
        final_rois_batch = torch.cat([final_rois_inds, final_bbox], dim=-1)
        if self.training and self.train_3d:
            # re-assign/subsample against the refined boxes
            prediction_dict['rois_batch'] = final_rois_batch
            self.pre_subsample(prediction_dict, feed_dict)
            final_rois_batch = prediction_dict['rois_batch']

        # shape(M,C,7,7)
        mono_3d_pooled_feat = self.rcnn_pooling(base_feat,
                                                final_rois_batch.view(-1, 5))
        mono_3d_pooled_feat = self.feature_extractor.third_stage_feature(
            mono_3d_pooled_feat)
        mono_3d_pooled_feat = mono_3d_pooled_feat.mean(3).mean(2)
        if self.h_cat:
            # append the per-RoI inverse homography (flattened 3x3)
            H_inv = self.calc_Hinv(final_rois_batch, feed_dict['p2'],
                                   feed_dict['im_info'],
                                   base_feat.shape[-2:])[0].view(-1, 9)
            mono_3d_pooled_feat = torch.cat([mono_3d_pooled_feat, H_inv],
                                            dim=-1)
        rcnn_3d = self.rcnn_3d_pred(mono_3d_pooled_feat)
        prediction_dict['rcnn_3d'] = rcnn_3d

        if not self.training:
            rcnn_3d = self.target_assigner.bbox_coder_3d.decode_batch_dims(
                rcnn_3d, final_rois_batch)
            prediction_dict['rcnn_3d'] = rcnn_3d

        return prediction_dict

    def calc_Hinv(self, final_rois_batch, p2, img_size, feat_size):
        """Per-RoI inverse homography H^-1 where H = K_roi * K_c^-1.

        K_c is the camera intrinsic matrix (from p2); K_roi is the virtual
        intrinsic of the RoI crop after pooling to pooling_size. Returns a
        (1, num_rois, 9) float tensor on the GPU.
        """
        p2 = p2[0]
        K_c = p2[:, :3]
        fx = K_c[0, 0]
        fy = K_c[1, 1]
        px = K_c[0, 2]
        py = K_c[1, 2]
        fw = self.pooling_size
        fh = self.pooling_size
        proposals = final_rois_batch[:, :, 1:]
        # RoI extent in feature-map pixels
        rw = (proposals[:, :, 2] - proposals[:, :, 0] + 1
              ) / img_size[:, 1] * feat_size[1]
        rh = (proposals[:, :, 3] - proposals[:, :, 1] + 1
              ) / img_size[:, 0] * feat_size[0]

        # roi camera intrinsic parameters
        sw = fw / rw
        sh = fh / rh
        fx_roi = fx * sw
        fy_roi = fy * sh
        zeros = torch.zeros_like(fx_roi)
        ones = torch.ones_like(fx_roi)
        px_roi = (px - proposals[:, :, 0]) * sw
        py_roi = (py - proposals[:, :, 1]) * sh
        K_roi = torch.stack(
            [fx_roi, zeros, px_roi, zeros, fy_roi, py_roi, zeros, zeros,
             ones],
            dim=-1).view(-1, 3, 3)
        H = K_roi.matmul(torch.inverse(K_c))
        # batched inverse via numpy (per-matrix torch.inverse in a Python
        # loop was too slow here)
        H_np = H.cpu().numpy()
        H_inv_np = np.linalg.inv(H_np)
        H_inv = torch.from_numpy(H_inv_np).cuda().float()
        return H_inv.view(1, -1, 9)

    def pre_forward(self):
        """When fine-tuning only the 3D branch, freeze everything except
        the third-stage features and the 3D head (BN handled likewise)."""
        if self.train_3d and self.training and not self.train_2d:
            self.freeze_modules()
            for parameter in (
                    self.feature_extractor.third_stage_feature.parameters()):
                parameter.requires_grad = True
            for param in self.rcnn_3d_pred.parameters():
                param.requires_grad = True
            self.freeze_bn(self)
            self.unfreeze_bn(self.feature_extractor.third_stage_feature)

    def init_weights(self):
        """Initialize submodules; heads get truncated-normal init."""
        # submodule init weights
        self.feature_extractor.init_weights()
        self.rpn_model.init_weights()

        Filler.normal_init(self.rcnn_cls_pred, 0, 0.01, self.truncated)
        Filler.normal_init(self.rcnn_bbox_pred, 0, 0.001, self.truncated)

    def init_modules(self):
        """Construct backbone, RPN, pooling, 2D heads, losses and 3D head."""
        self.feature_extractor = ResNetFeatureExtractor(
            self.feature_extractor_config)
        self.rpn_model = RPNModel(self.rpn_config)
        if self.pooling_mode == 'align':
            self.rcnn_pooling = RoIAlignAvg(self.pooling_size,
                                            self.pooling_size, 1.0 / 16.0)
        elif self.pooling_mode == 'ps':
            self.rcnn_pooling = PSRoIPool(7, 7, 1.0 / 16, 7, self.n_classes)
        elif self.pooling_mode == 'psalign':
            raise NotImplementedError('have not implemented yet!')
        elif self.pooling_mode == 'deformable_psalign':
            raise NotImplementedError('have not implemented yet!')
        else:
            # Fail fast instead of leaving self.rcnn_pooling silently unset.
            raise ValueError(
                'unknown pooling mode: {}'.format(self.pooling_mode))

        self.rcnn_cls_pred = nn.Conv2d(2048, self.n_classes, 3, 1, 1)
        # channel count of the RoI feature fed to the linear heads
        if self.reduce:
            in_channels = 2048
        else:
            in_channels = 2048 * 4 * 4
        if self.class_agnostic:
            self.rcnn_bbox_pred = nn.Linear(in_channels, 4)
        else:
            self.rcnn_bbox_pred = nn.Linear(in_channels, 4 * self.n_classes)

        # loss modules; reduce=False keeps per-sample losses for weighting
        if self.use_focal_loss:
            self.rcnn_cls_loss = FocalLoss(2)
        else:
            self.rcnn_cls_loss = functools.partial(F.cross_entropy,
                                                   reduce=False)
        self.rcnn_bbox_loss = nn.modules.SmoothL1Loss(reduce=False)

        # 3D head input may include the flattened 3x3 inverse homography
        if self.h_cat:
            c = in_channels + 9
        else:
            c = in_channels
        # dims (3) + 4 values per angle bin (2 bins)
        self.rcnn_3d_pred = nn.Linear(c, 3 + 4 * 2)
        self.rcnn_3d_loss = OrientationLoss(split_loss=True)

    def init_param(self, model_config):
        """Read configuration into attributes used by init_* and forward."""
        classes = model_config['classes']
        self.classes = classes
        self.n_classes = len(classes)
        self.class_agnostic = model_config['class_agnostic']
        self.pooling_size = model_config['pooling_size']
        self.pooling_mode = model_config['pooling_mode']
        self.crop_resize_with_max_pool = model_config[
            'crop_resize_with_max_pool']
        self.truncated = model_config['truncated']
        self.use_focal_loss = model_config['use_focal_loss']
        self.subsample_twice = model_config['subsample_twice']
        self.rcnn_batch_size = model_config['rcnn_batch_size']

        # some submodule config
        self.feature_extractor_config = model_config[
            'feature_extractor_config']
        self.rpn_config = model_config['rpn_config']

        # sampler
        self.sampler = BalancedSampler(model_config['sampler_config'])

        self.reduce = True
        self.visualizer = FeatVisualizer()
        self.num_bins = 4
        self.train_3d = False
        self.train_2d = not self.train_3d

        # more accurate bbox for 3d prediction
        if self.train_3d:
            fg_thresh = 0.6
        else:
            fg_thresh = 0.5
        model_config['target_assigner_config']['fg_thresh'] = fg_thresh

        # assigner
        self.target_assigner = TargetAssigner(
            model_config['target_assigner_config'])
        self.profiler = Profiler()
        self.h_cat = False

    def pre_subsample(self, prediction_dict, feed_dict):
        """Assign cls/reg/3D targets to RoIs, subsample a fixed-size batch
        and store normalized weights + targets in prediction_dict."""
        rois_batch = prediction_dict['rois_batch']
        gt_boxes = feed_dict['gt_boxes']
        gt_labels = feed_dict['gt_labels']

        # shape(N,7)
        gt_boxes_3d = feed_dict['gt_boxes_3d']
        # 3D target = dims (3) + encoded side points
        encoded_side_points = feed_dict['encoded_side_points']
        gt_boxes_3d = torch.cat(
            [gt_boxes_3d[:, :, :3], encoded_side_points], dim=-1)

        ##########################
        # assigner
        ##########################
        rcnn_cls_targets, rcnn_reg_targets,\
            rcnn_cls_weights, rcnn_reg_weights,\
            rcnn_reg_targets_3d, rcnn_reg_weights_3d = self.target_assigner.assign(
                rois_batch[:, :, 1:], gt_boxes, gt_boxes_3d, gt_labels)

        ##########################
        # subsampler
        ##########################
        cls_criterion = None
        pos_indicator = rcnn_reg_weights > 0
        indicator = rcnn_cls_weights > 0

        # subsample from all
        # shape (N,M)
        batch_sampled_mask = self.sampler.subsample_batch(
            self.rcnn_batch_size,
            pos_indicator,
            indicator=indicator,
            criterion=cls_criterion)
        rcnn_cls_weights = rcnn_cls_weights[batch_sampled_mask]
        rcnn_reg_weights = rcnn_reg_weights[batch_sampled_mask]
        rcnn_reg_weights_3d = rcnn_reg_weights_3d[batch_sampled_mask]
        num_cls_coeff = (rcnn_cls_weights > 0).sum(dim=-1)
        num_reg_coeff = (rcnn_reg_weights > 0).sum(dim=-1)
        # check
        assert num_cls_coeff, 'bug happens'
        # a batch may contain no positives; avoid division by zero
        if num_reg_coeff == 0:
            num_reg_coeff = torch.ones_like(num_reg_coeff)

        # normalize weights by the number of contributing samples
        prediction_dict[
            'rcnn_cls_weights'] = rcnn_cls_weights / num_cls_coeff.float()
        prediction_dict[
            'rcnn_reg_weights'] = rcnn_reg_weights / num_reg_coeff.float()
        prediction_dict[
            'rcnn_reg_weights_3d'] = rcnn_reg_weights_3d / num_reg_coeff.float(
            )

        prediction_dict['rcnn_cls_targets'] = rcnn_cls_targets[
            batch_sampled_mask]
        prediction_dict['rcnn_reg_targets'] = rcnn_reg_targets[
            batch_sampled_mask]
        prediction_dict['rcnn_reg_targets_3d'] = rcnn_reg_targets_3d[
            batch_sampled_mask]

        # update rois_batch
        prediction_dict['rois_batch'] = rois_batch[batch_sampled_mask].view(
            rois_batch.shape[0], -1, 5)

    def loss(self, prediction_dict, feed_dict):
        """Compute RPN + RCNN 2D losses and the 3D regression loss.

        Which entries are present in the returned dict depends on
        self.train_2d / self.train_3d.
        """
        loss_dict = {}
        if self.train_2d:
            # submodule loss
            loss_dict.update(self.rpn_model.loss(prediction_dict, feed_dict))

            # targets and weights
            rcnn_cls_weights = prediction_dict['rcnn_cls_weights']
            rcnn_reg_weights = prediction_dict['rcnn_reg_weights']
            rcnn_cls_targets = prediction_dict['rcnn_cls_targets']
            rcnn_reg_targets = prediction_dict['rcnn_reg_targets']

            # classification loss
            rcnn_cls_scores = prediction_dict['rcnn_cls_scores']
            rcnn_cls_loss = self.rcnn_cls_loss(rcnn_cls_scores,
                                               rcnn_cls_targets)
            rcnn_cls_loss *= rcnn_cls_weights
            rcnn_cls_loss = rcnn_cls_loss.sum(dim=-1)

            # bounding box regression L1 loss
            rcnn_bbox_preds = prediction_dict['rcnn_bbox_preds']
            rcnn_bbox_loss = self.rcnn_bbox_loss(
                rcnn_bbox_preds, rcnn_reg_targets).sum(dim=-1)
            rcnn_bbox_loss *= rcnn_reg_weights
            rcnn_bbox_loss = rcnn_bbox_loss.sum(dim=-1)

            loss_dict['rcnn_cls_loss'] = rcnn_cls_loss
            loss_dict['rcnn_bbox_loss'] = rcnn_bbox_loss

        ######################################
        # 3d loss
        ######################################
        rcnn_reg_weights_3d = prediction_dict['rcnn_reg_weights_3d']
        rcnn_reg_targets_3d = prediction_dict['rcnn_reg_targets_3d']
        rcnn_3d = prediction_dict['rcnn_3d']
        if self.train_3d:
            # smooth-L1 over the whole 3D target vector, sample-weighted
            rcnn_3d_loss = self.rcnn_bbox_loss(
                rcnn_3d, rcnn_reg_targets_3d).sum(dim=-1)
            rcnn_3d_loss = rcnn_3d_loss * rcnn_reg_weights_3d
            loss_dict['rcnn_3d_loss'] = rcnn_3d_loss

        # NOTE(review): the tail of this method in the original consisted
        # entirely of commented-out orientation-statistics code (removed);
        # SOURCE was truncated there, so `return loss_dict` is reconstructed
        # to match the sibling detector class — confirm against upstream.
        return loss_dict
orient_all_num4, # 'cls_orient_2s_all_num': depth_ind_all_num, # 'cls_orient_2s_tp_num': depth_ind_tp_num # }) return loss_dict
class Mono3DFinalFasterRCNN(Model):
    """Two-stage Faster R-CNN variant with an additional 3D regression head.

    On top of the 2D classification/box heads, a linear head predicts 3D
    parameters (3 dims + orientation encoding) from saliency-weighted pooled
    features; at inference the 3D output is decoded against the refined 2D
    boxes.
    """

    def forward(self, feed_dict):
        """Run the full detection pipeline on one batch.

        Returns ``prediction_dict`` with 2D scores/probs/box deltas, the raw
        (training) or decoded (eval) 3D head output under 'rcnn_3d', and
        bookkeeping entries used for tracking proposals.
        """
        prediction_dict = {}

        # base model
        base_feat = self.feature_extractor.first_stage_feature(
            feed_dict['img'])
        feed_dict.update({'base_feat': base_feat})

        # rpn model
        prediction_dict.update(self.rpn_model.forward(feed_dict))

        if self.training:
            # assign targets / subsample proposals before RoI pooling
            self.pre_subsample(prediction_dict, feed_dict)
        rois_batch = prediction_dict['rois_batch']

        # note here base_feat (N,C,H,W),rois_batch (N,num_proposals,5)
        pooled_feat = self.rcnn_pooling(base_feat, rois_batch.view(-1, 5))

        # shape(N,C,1,1)
        pooled_feat = self.feature_extractor.second_stage_feature(pooled_feat)

        # per-location class score map; global scores via spatial averaging
        rcnn_cls_scores_map = self.rcnn_cls_pred(pooled_feat)
        rcnn_cls_scores = rcnn_cls_scores_map.mean(3).mean(2)
        saliency_map = F.softmax(rcnn_cls_scores_map, dim=1)
        rcnn_cls_probs = F.softmax(rcnn_cls_scores, dim=1)

        # weight features by the non-background saliency channel(s)
        pooled_feat = pooled_feat * saliency_map[:, 1:, :, :]

        reduced_pooled_feat = pooled_feat.mean(3).mean(2)

        rcnn_bbox_preds = self.rcnn_bbox_pred(reduced_pooled_feat)
        # rcnn_cls_scores = self.rcnn_cls_pred(pooled_feat)
        rcnn_3d = self.rcnn_3d_pred(reduced_pooled_feat)

        # NOTE(review): duplicate computation — rcnn_cls_probs was already
        # computed identically above.
        rcnn_cls_probs = F.softmax(rcnn_cls_scores, dim=1)

        prediction_dict['rcnn_cls_probs'] = rcnn_cls_probs
        prediction_dict['rcnn_bbox_preds'] = rcnn_bbox_preds
        prediction_dict['rcnn_cls_scores'] = rcnn_cls_scores

        # used for track
        proposals_order = prediction_dict['proposals_order']
        prediction_dict['second_rpn_anchors'] = prediction_dict['anchors'][
            proposals_order]

        ###################################
        # 3d training
        ###################################
        # NOTE(review): a commented-out second-pass 3D pooling branch
        # (third_stage_feature over re-pooled rois) was removed here for
        # readability.
        prediction_dict['rcnn_3d'] = rcnn_3d

        if not self.training:
            # decode 2D boxes first, then decode the 3D head output relative
            # to the refined boxes
            rcnn_bbox_preds = rcnn_bbox_preds.detach()
            final_bbox = self.target_assigner.bbox_coder.decode_batch(
                rcnn_bbox_preds.unsqueeze(0), rois_batch[:, :, 1:])
            # prepend a zero batch-index column to form (idx, x1, y1, x2, y2)
            final_rois_inds = torch.zeros_like(final_bbox[:, :, -1:])
            final_rois_batch = torch.cat([final_rois_inds, final_bbox],
                                         dim=-1)
            rcnn_3d = self.target_assigner.bbox_coder_3d.decode_batch_bbox(
                rcnn_3d, final_rois_batch)

            prediction_dict['rcnn_3d'] = rcnn_3d

        return prediction_dict

    def pre_forward(self):
        """Hook run before forward; currently a no-op."""
        pass
        # NOTE(review): commented-out parameter-freezing logic for 3D-only
        # training (freeze all but third_stage_feature / rcnn_3d_pred) was
        # removed here for readability.

    def init_weights(self):
        """Initialize submodules and the RCNN prediction heads."""
        # submodule init weights
        self.feature_extractor.init_weights()
        self.rpn_model.init_weights()

        Filler.normal_init(self.rcnn_cls_pred, 0, 0.01, self.truncated)
        Filler.normal_init(self.rcnn_bbox_pred, 0, 0.001, self.truncated)

    def init_modules(self):
        """Build feature extractor, RPN, RoI pooling, heads and losses."""
        self.feature_extractor = ResNetFeatureExtractor(
            self.feature_extractor_config)
        self.rpn_model = RPNModel(self.rpn_config)
        if self.pooling_mode == 'align':
            self.rcnn_pooling = RoIAlignAvg(self.pooling_size,
                                            self.pooling_size, 1.0 / 16.0)
        elif self.pooling_mode == 'ps':
            self.rcnn_pooling = PSRoIPool(7, 7, 1.0 / 16, 7, self.n_classes)
        elif self.pooling_mode == 'psalign':
            raise NotImplementedError('have not implemented yet!')
        elif self.pooling_mode == 'deformable_psalign':
            raise NotImplementedError('have not implemented yet!')

        # conv head produces a per-location class score map used as saliency
        self.rcnn_cls_pred = nn.Conv2d(2048, self.n_classes, 3, 1, 1)
        # self.rcnn_cls_pred = nn.Linear(2048, self.n_classes)
        if self.reduce:
            in_channels = 2048
        else:
            in_channels = 2048 * 4 * 4
        if self.class_agnostic:
            self.rcnn_bbox_pred = nn.Linear(in_channels, 4)
        else:
            self.rcnn_bbox_pred = nn.Linear(in_channels, 4 * self.n_classes)

        # loss module
        if self.use_focal_loss:
            self.rcnn_cls_loss = FocalLoss(2)
        else:
            # NOTE(review): `reduce=False` is the legacy (deprecated)
            # spelling of reduction='none'
            self.rcnn_cls_loss = functools.partial(
                F.cross_entropy, reduce=False)
        self.rcnn_bbox_loss = nn.modules.SmoothL1Loss(reduce=False)

        # self.rcnn_3d_pred = nn.Linear(c, 3 + 4 + 11 + 2 + 1)
        # 3 dims + 4 side points x 2 coords
        self.rcnn_3d_pred = nn.Linear(in_channels, 3 + 4 * 2)

        self.rcnn_3d_loss = OrientationLoss(split_loss=True)

    def init_param(self, model_config):
        """Read configuration values and construct config-driven helpers."""
        classes = model_config['classes']
        self.classes = classes
        self.n_classes = len(classes)
        self.class_agnostic = model_config['class_agnostic']
        self.pooling_size = model_config['pooling_size']
        self.pooling_mode = model_config['pooling_mode']
        self.crop_resize_with_max_pool = model_config[
            'crop_resize_with_max_pool']
        self.truncated = model_config['truncated']
        self.use_focal_loss = model_config['use_focal_loss']
        self.subsample_twice = model_config['subsample_twice']
        self.rcnn_batch_size = model_config['rcnn_batch_size']

        # some submodule config
        self.feature_extractor_config = model_config[
            'feature_extractor_config']
        self.rpn_config = model_config['rpn_config']

        # sampler
        self.sampler = BalancedSampler(model_config['sampler_config'])

        # self.reduce = model_config.get('reduce')
        # NOTE(review): hard-coded; the config lookup above is disabled
        self.reduce = True

        self.visualizer = FeatVisualizer()
        self.num_bins = 4

        # NOTE(review): commented-out code adjusting fg_thresh for 3D
        # training was removed here for readability.

        # assigner
        self.target_assigner = TargetAssigner(
            model_config['target_assigner_config'])

        self.profiler = Profiler()

        self.h_cat = False

    def pre_subsample(self, prediction_dict, feed_dict):
        """Assign gt (2D boxes + dims/orientation 3D targets) and subsample.

        Mutates ``prediction_dict`` in place with normalized weights,
        targets and the subsampled 'rois_batch'.
        """
        rois_batch = prediction_dict['rois_batch']
        gt_boxes = feed_dict['gt_boxes']
        gt_labels = feed_dict['gt_labels']

        # shape(N,7)
        gt_boxes_3d = feed_dict['gt_boxes_3d']

        # orient
        cls_orient = torch.unsqueeze(feed_dict['cls_orient'], dim=-1).float()
        reg_orient = feed_dict['reg_orient']
        orient = torch.cat([cls_orient, reg_orient], dim=-1)

        # 3D target: first three gt_boxes_3d components + orientation
        # (classification bit + regression terms)
        gt_boxes_3d = torch.cat([gt_boxes_3d[:, :, :3], orient], dim=-1)

        ##########################
        # assigner
        ##########################
        rcnn_cls_targets, rcnn_reg_targets,\
            rcnn_cls_weights, rcnn_reg_weights,\
            rcnn_reg_targets_3d, rcnn_reg_weights_3d = self.target_assigner.assign(
                rois_batch[:, :, 1:], gt_boxes, gt_boxes_3d, gt_labels)

        ##########################
        # subsampler
        ##########################
        cls_criterion = None
        pos_indicator = rcnn_reg_weights > 0
        indicator = rcnn_cls_weights > 0

        # subsample from all
        # shape (N,M)
        batch_sampled_mask = self.sampler.subsample_batch(
            self.rcnn_batch_size,
            pos_indicator,
            indicator=indicator,
            criterion=cls_criterion)
        rcnn_cls_weights = rcnn_cls_weights[batch_sampled_mask]
        rcnn_reg_weights = rcnn_reg_weights[batch_sampled_mask]
        rcnn_reg_weights_3d = rcnn_reg_weights_3d[batch_sampled_mask]
        num_cls_coeff = (rcnn_cls_weights > 0).sum(dim=-1)
        num_reg_coeff = (rcnn_reg_weights > 0).sum(dim=-1)
        # check
        # NOTE(review): tensor truthiness — only valid for single-element
        # tensors.
        assert num_cls_coeff, 'bug happens'
        # assert num_reg_coeff, 'bug happens'
        if num_reg_coeff == 0:
            # avoid division by zero when no positive regression samples
            num_reg_coeff = torch.ones_like(num_reg_coeff)

        # normalize weights by the number of contributing samples
        prediction_dict[
            'rcnn_cls_weights'] = rcnn_cls_weights / num_cls_coeff.float()
        prediction_dict[
            'rcnn_reg_weights'] = rcnn_reg_weights / num_reg_coeff.float()
        prediction_dict[
            'rcnn_reg_weights_3d'] = rcnn_reg_weights_3d / num_reg_coeff.float(
            )

        prediction_dict['rcnn_cls_targets'] = rcnn_cls_targets[
            batch_sampled_mask]
        prediction_dict['rcnn_reg_targets'] = rcnn_reg_targets[
            batch_sampled_mask]
        prediction_dict['rcnn_reg_targets_3d'] = rcnn_reg_targets_3d[
            batch_sampled_mask]

        # update rois_batch
        prediction_dict['rois_batch'] = rois_batch[batch_sampled_mask].view(
            rois_batch.shape[0], -1, 5)

    def loss(self, prediction_dict, feed_dict):
        """
        assign proposals label and subsample from them
        Then calculate loss
        """
        # import ipdb
        # ipdb.set_trace()
        loss_dict = {}

        # submodule loss
        loss_dict.update(self.rpn_model.loss(prediction_dict, feed_dict))

        # targets and weights
        rcnn_cls_weights = prediction_dict['rcnn_cls_weights']
        rcnn_reg_weights = prediction_dict['rcnn_reg_weights']
        rcnn_cls_targets = prediction_dict['rcnn_cls_targets']
        rcnn_reg_targets = prediction_dict['rcnn_reg_targets']

        # classification loss
        rcnn_cls_scores = prediction_dict['rcnn_cls_scores']
        rcnn_cls_loss = self.rcnn_cls_loss(rcnn_cls_scores, rcnn_cls_targets)
        rcnn_cls_loss *= rcnn_cls_weights
        rcnn_cls_loss = rcnn_cls_loss.sum(dim=-1)

        # bounding box regression L1 loss
        rcnn_bbox_preds = prediction_dict['rcnn_bbox_preds']
        rcnn_bbox_loss = self.rcnn_bbox_loss(rcnn_bbox_preds,
                                             rcnn_reg_targets).sum(dim=-1)
        rcnn_bbox_loss *= rcnn_reg_weights
        rcnn_bbox_loss = rcnn_bbox_loss.sum(dim=-1)

        loss_dict['rcnn_cls_loss'] = rcnn_cls_loss
        loss_dict['rcnn_bbox_loss'] = rcnn_bbox_loss

        ######################################
        # 3d loss
        ######################################
        rcnn_reg_weights_3d = prediction_dict['rcnn_reg_weights_3d']
        rcnn_reg_targets_3d = prediction_dict['rcnn_reg_targets_3d']
        rcnn_3d = prediction_dict['rcnn_3d']

        # dims
        rcnn_3d_loss_dims = self.rcnn_bbox_loss(
            rcnn_3d[:, :3], rcnn_reg_targets_3d[:, :3]).sum(dim=-1)

        # angles — OrientationLoss returns a dict of named sub-losses
        res = self.rcnn_3d_loss(rcnn_3d[:, 3:], rcnn_reg_targets_3d[:, 3:])
        for res_loss_key in res:
            tmp = res[res_loss_key] * rcnn_reg_weights_3d
            res[res_loss_key] = tmp.sum(dim=-1)
        loss_dict.update(res)

        rcnn_3d_loss = rcnn_3d_loss_dims * rcnn_reg_weights_3d
        rcnn_3d_loss = rcnn_3d_loss.sum(dim=-1)

        loss_dict['rcnn_3d_loss'] = rcnn_3d_loss

        # stats of orients: accuracy of the orientation-class bit on
        # positive samples with a valid (> -1) orientation label
        cls_orient_preds = rcnn_3d[:, 3:5]
        cls_orient = rcnn_reg_targets_3d[:, 3]
        _, cls_orient_preds_argmax = torch.max(cls_orient_preds, dim=-1)
        orient_tp_mask = cls_orient.type_as(
            cls_orient_preds_argmax) == cls_orient_preds_argmax
        mask = (rcnn_reg_weights_3d > 0) & (rcnn_reg_targets_3d[:, 3] > -1)
        orient_tp_mask = orient_tp_mask[mask]
        orient_tp_num = orient_tp_mask.int().sum().item()
        orient_all_num = orient_tp_mask.numel()

        # store all stats in target assigner
        # NOTE(review): commented-out entries for additional (disabled)
        # statistics were removed from this dict for readability.
        self.target_assigner.stat.update({
            'orient_tp_num': orient_tp_num,
            'orient_all_num': orient_all_num,
        })

        return loss_dict
class Mono3DSimplerFasterRCNN(Model):
    """Faster R-CNN variant with a keypoint-heatmap head and a 3-dim head.

    A second RoIAlign (14x14) feeds a keypoint predictor producing 4
    heatmaps of 56x56 logits per proposal; a linear head regresses 3 values
    (dims) from saliency-weighted pooled features.
    """

    def forward(self, feed_dict):
        """Run detection plus keypoint/3D heads on one batch."""
        # import ipdb
        # ipdb.set_trace()
        prediction_dict = {}

        # base model
        base_feat = self.feature_extractor.first_stage_feature(
            feed_dict['img'])
        feed_dict.update({'base_feat': base_feat})

        # rpn model
        prediction_dict.update(self.rpn_model.forward(feed_dict))

        if self.training:
            self.pre_subsample(prediction_dict, feed_dict)
        rois_batch = prediction_dict['rois_batch']

        pooled_feat = self.rcnn_pooling(base_feat, rois_batch.view(-1, 5))
        # larger (14x14) pooling for the keypoint branch
        mask_pooled_feat = self.mask_rcnn_pooling(base_feat,
                                                  rois_batch.view(-1, 5))

        pooled_feat = self.feature_extractor.second_stage_feature(pooled_feat)
        # common_pooled_feat = pooled_feat

        rcnn_cls_scores_map = self.rcnn_cls_pred(pooled_feat)
        rcnn_cls_scores = rcnn_cls_scores_map.mean(3).mean(2)
        saliency_map = F.softmax(rcnn_cls_scores_map, dim=1)
        rcnn_cls_probs = F.softmax(rcnn_cls_scores, dim=1)

        # weight features by the non-background saliency channel(s)
        pooled_feat = pooled_feat * saliency_map[:, 1:, :, :]

        reduced_pooled_feat = pooled_feat.mean(3).mean(2)

        rcnn_bbox_preds = self.rcnn_bbox_pred(reduced_pooled_feat)
        # rcnn_cls_scores = self.rcnn_cls_pred(pooled_feat)

        # NOTE(review): duplicate computation; already computed above.
        rcnn_cls_probs = F.softmax(rcnn_cls_scores, dim=1)

        prediction_dict['rcnn_cls_probs'] = rcnn_cls_probs
        prediction_dict['rcnn_bbox_preds'] = rcnn_bbox_preds
        prediction_dict['rcnn_cls_scores'] = rcnn_cls_scores

        # used for track
        proposals_order = prediction_dict['proposals_order']
        prediction_dict['second_rpn_anchors'] = prediction_dict['anchors'][
            proposals_order]

        ###################################
        # 3d training
        ###################################
        # 4 keypoints per proposal, each a 56*56 spatial distribution
        keypoint_heatmap = self.keypoint_predictor(mask_pooled_feat)
        keypoint_scores = keypoint_heatmap.view(-1, 56 * 56)
        keypoint_probs = F.softmax(keypoint_scores, dim=-1)
        prediction_dict['keypoint_probs'] = keypoint_probs
        prediction_dict['keypoint_scores'] = keypoint_scores

        # import ipdb
        # ipdb.set_trace()
        rcnn_3d = self.rcnn_3d_pred(reduced_pooled_feat)
        prediction_dict['rcnn_3d'] = rcnn_3d

        if not self.training:
            # import ipdb
            # ipdb.set_trace()
            # _, keypoint_peak_pos = keypoint_probs.max(dim=-1)
            keypoints = self.keypoint_coder.decode_keypoint_heatmap(
                rois_batch[0, :, 1:], keypoint_probs.view(-1, 4, 56 * 56))
            prediction_dict['keypoints'] = keypoints

        return prediction_dict

    def init_weights(self):
        """Initialize submodules and the RCNN prediction heads."""
        # submodule init weights
        self.feature_extractor.init_weights()
        self.rpn_model.init_weights()

        Filler.normal_init(self.rcnn_cls_pred, 0, 0.01, self.truncated)
        Filler.normal_init(self.rcnn_bbox_pred, 0, 0.001, self.truncated)

    def modify_feature_extractor(self):
        """Replace the second-stage feature with a stride-1 ResNet layer4."""
        from torchvision.models.resnet import Bottleneck
        layer4 = self._make_layer(Bottleneck, 512, 3, stride=1)
        self.feature_extractor.second_stage_feature = layer4

    def init_modules(self):
        """Build feature extractor, RPN, pooling layers, heads and losses."""
        self.feature_extractor = ResNetFeatureExtractor(
            self.feature_extractor_config)
        self.modify_feature_extractor()
        self.rpn_model = RPNModel(self.rpn_config)
        if self.pooling_mode == 'align':
            self.rcnn_pooling = RoIAlignAvg(self.pooling_size,
                                            self.pooling_size, 1.0 / 16.0)
        elif self.pooling_mode == 'ps':
            self.rcnn_pooling = PSRoIPool(7, 7, 1.0 / 16, 7, self.n_classes)
        elif self.pooling_mode == 'psalign':
            raise NotImplementedError('have not implemented yet!')
        elif self.pooling_mode == 'deformable_psalign':
            raise NotImplementedError('have not implemented yet!')

        # separate, larger pooling for the keypoint branch
        self.mask_rcnn_pooling = RoIAlignAvg(14, 14, 1.0 / 16.0)

        # self.rcnn_cls_pred = nn.Linear(2048, self.n_classes)
        self.rcnn_cls_pred = nn.Conv2d(2048, self.n_classes, 3, 1, 1)
        if self.reduce:
            in_channels = 2048
        else:
            in_channels = 2048 * 4 * 4
        if self.class_agnostic:
            self.rcnn_bbox_pred = nn.Linear(in_channels, 4)
        else:
            self.rcnn_bbox_pred = nn.Linear(in_channels, 4 * self.n_classes)

        # loss module
        if self.use_focal_loss:
            self.rcnn_cls_loss = FocalLoss(2)
        else:
            self.rcnn_cls_loss = functools.partial(
                F.cross_entropy, reduce=False)
        # keypoint targets use -1 to mark invisible/invalid keypoints
        self.rcnn_kp_loss = functools.partial(
            F.cross_entropy, reduce=False, ignore_index=-1)
        self.rcnn_bbox_loss = nn.modules.SmoothL1Loss(reduce=False)

        # some 3d statistic
        # some 2d points projected from 3d
        self.rcnn_3d_pred = nn.Linear(in_channels, 3)

        # self.rcnn_3d_loss = MultiBinLoss(num_bins=self.num_bins)
        # self.rcnn_3d_loss = MultiBinRegLoss(num_bins=self.num_bins)
        self.rcnn_3d_loss = OrientationLoss(split_loss=True)

        self.keypoint_predictor = KeyPointPredictor2(1024)

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build a ResNet stage from `blocks` Bottleneck units (1024-ch in)."""
        inplanes = 1024
        downsample = None
        if stride != 1 or inplanes != planes * block.expansion:
            # 1x1 projection to match channel count / stride
            downsample = nn.Sequential(
                nn.Conv2d(
                    inplanes,
                    planes * block.expansion,
                    kernel_size=1,
                    stride=stride,
                    bias=False),
                nn.BatchNorm2d(planes * block.expansion), )

        layers = []
        layers.append(block(inplanes, planes, stride, downsample))
        inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(inplanes, planes))

        return nn.Sequential(*layers)

    def init_param(self, model_config):
        """Read configuration values and construct config-driven helpers."""
        classes = model_config['classes']
        self.classes = classes
        self.n_classes = len(classes)
        self.class_agnostic = model_config['class_agnostic']
        self.pooling_size = model_config['pooling_size']
        self.pooling_mode = model_config['pooling_mode']
        self.crop_resize_with_max_pool = model_config[
            'crop_resize_with_max_pool']
        self.truncated = model_config['truncated']
        self.use_focal_loss = model_config['use_focal_loss']
        self.subsample_twice = model_config['subsample_twice']
        self.rcnn_batch_size = model_config['rcnn_batch_size']

        # some submodule config
        self.feature_extractor_config = model_config[
            'feature_extractor_config']
        self.rpn_config = model_config['rpn_config']

        # sampler
        self.sampler = BalancedSampler(model_config['sampler_config'])

        # self.reduce = model_config.get('reduce')
        self.reduce = True

        self.visualizer = FeatVisualizer()
        self.num_bins = 4

        # assigner
        self.target_assigner = TargetAssigner(
            model_config['target_assigner_config'])

        self.keypoint_coder = self.target_assigner.keypoint_coder

        self.profiler = Profiler()

    def pre_subsample(self, prediction_dict, feed_dict):
        """Assign gt (2D boxes + dims/keypoint 3D targets) and subsample.

        Mutates ``prediction_dict`` in place.
        """
        rois_batch = prediction_dict['rois_batch']
        gt_boxes = feed_dict['gt_boxes']
        gt_labels = feed_dict['gt_labels']

        # shape(N,7)
        gt_boxes_3d = feed_dict['gt_boxes_3d']

        keypoint_gt = feed_dict['keypoint_gt']
        # import ipdb
        # ipdb.set_trace()
        # 3D target: first three gt_boxes_3d components + keypoint gt
        gt_boxes_3d = torch.cat([gt_boxes_3d[:, :, :3], keypoint_gt], dim=-1)

        ##########################
        # assigner
        ##########################
        rcnn_cls_targets, rcnn_reg_targets,\
            rcnn_cls_weights, rcnn_reg_weights,\
            rcnn_reg_targets_3d, rcnn_reg_weights_3d = self.target_assigner.assign(
                rois_batch[:, :, 1:], gt_boxes, gt_boxes_3d, gt_labels)

        ##########################
        # subsampler
        ##########################
        cls_criterion = None
        pos_indicator = rcnn_reg_weights > 0
        indicator = rcnn_cls_weights > 0

        # subsample from all
        # shape (N,M)
        batch_sampled_mask = self.sampler.subsample_batch(
            self.rcnn_batch_size,
            pos_indicator,
            indicator=indicator,
            criterion=cls_criterion)
        rcnn_cls_weights = rcnn_cls_weights[batch_sampled_mask]
        rcnn_reg_weights = rcnn_reg_weights[batch_sampled_mask]
        rcnn_reg_weights_3d = rcnn_reg_weights_3d[batch_sampled_mask]
        num_cls_coeff = (rcnn_cls_weights > 0).sum(dim=-1)
        num_reg_coeff = (rcnn_reg_weights > 0).sum(dim=-1)
        # check
        # NOTE(review): tensor truthiness — only valid for single-element
        # tensors; unlike the sibling classes there is no zero guard for
        # num_reg_coeff here.
        assert num_cls_coeff, 'bug happens'
        assert num_reg_coeff, 'bug happens'

        # normalize weights by the number of contributing samples
        prediction_dict[
            'rcnn_cls_weights'] = rcnn_cls_weights / num_cls_coeff.float()
        prediction_dict[
            'rcnn_reg_weights'] = rcnn_reg_weights / num_reg_coeff.float()
        prediction_dict[
            'rcnn_reg_weights_3d'] = rcnn_reg_weights_3d / num_reg_coeff.float(
            )

        prediction_dict['rcnn_cls_targets'] = rcnn_cls_targets[
            batch_sampled_mask]
        prediction_dict['rcnn_reg_targets'] = rcnn_reg_targets[
            batch_sampled_mask]
        prediction_dict['rcnn_reg_targets_3d'] = rcnn_reg_targets_3d[
            batch_sampled_mask]

        # update rois_batch
        prediction_dict['rois_batch'] = rois_batch[batch_sampled_mask].view(
            rois_batch.shape[0], -1, 5)

    def loss(self, prediction_dict, feed_dict):
        """
        assign proposals label and subsample from them
        Then calculate loss
        """
        loss_dict = {}

        # submodule loss
        loss_dict.update(self.rpn_model.loss(prediction_dict, feed_dict))

        # targets and weights
        rcnn_cls_weights = prediction_dict['rcnn_cls_weights']
        rcnn_reg_weights = prediction_dict['rcnn_reg_weights']
        rcnn_cls_targets = prediction_dict['rcnn_cls_targets']
        rcnn_reg_targets = prediction_dict['rcnn_reg_targets']

        # classification loss
        rcnn_cls_scores = prediction_dict['rcnn_cls_scores']
        rcnn_cls_loss = self.rcnn_cls_loss(rcnn_cls_scores, rcnn_cls_targets)
        rcnn_cls_loss *= rcnn_cls_weights
        rcnn_cls_loss = rcnn_cls_loss.sum(dim=-1)

        # bounding box regression L1 loss
        rcnn_bbox_preds = prediction_dict['rcnn_bbox_preds']
        rcnn_bbox_loss = self.rcnn_bbox_loss(rcnn_bbox_preds,
                                             rcnn_reg_targets).sum(dim=-1)
        rcnn_bbox_loss *= rcnn_reg_weights
        rcnn_bbox_loss = rcnn_bbox_loss.sum(dim=-1)

        loss_dict['rcnn_cls_loss'] = rcnn_cls_loss
        loss_dict['rcnn_bbox_loss'] = rcnn_bbox_loss

        # keypoint heatmap loss
        # keypoint_gt = feed_dict['keypoint_gt']
        # import ipdb
        # ipdb.set_trace()
        rcnn_reg_targets_3d = prediction_dict['rcnn_reg_targets_3d']
        rcnn_reg_weights_3d = prediction_dict['rcnn_reg_weights_3d']
        keypoint_scores = prediction_dict['keypoint_scores']
        # targets hold (position, visibility-weight) pairs per keypoint
        keypoint_gt = rcnn_reg_targets_3d[:, 3:].contiguous().view(-1, 2)
        keypoint_weights = keypoint_gt[:, 1]
        keypoint_pos = keypoint_gt[:, 0]
        # map zero-weight keypoints to ignore_index=-1 of rcnn_kp_loss
        keypoint_pos[keypoint_weights == 0] = -1
        keypoint_loss = self.rcnn_kp_loss(keypoint_scores,
                                          keypoint_pos.long())
        keypoint_loss = keypoint_loss.view(
            -1, 4) * rcnn_reg_weights_3d.unsqueeze(-1)
        # keypoint_loss = keypoint_loss * keypoint_weights
        loss_dict['keypoint_loss'] = keypoint_loss.sum(dim=-1).sum(dim=-1)

        # dims loss
        rcnn_3d = prediction_dict['rcnn_3d']
        rcnn_3d_loss = self.rcnn_bbox_loss(rcnn_3d,
                                           rcnn_reg_targets_3d[:, :3])
        # NOTE(review): suspicious — the weights are summed over the sample
        # dimension BEFORE multiplying, so every element is scaled by the
        # same scalar; sibling classes multiply element-wise and sum after.
        # Confirm intent before changing.
        rcnn_3d_loss = rcnn_3d_loss * rcnn_reg_weights_3d.sum(dim=-1)
        loss_dict['rcnn_3d_loss'] = rcnn_3d_loss.sum(dim=-1).sum(dim=-1)

        return loss_dict
class SemanticFasterRCNN(Model):
    """Faster R-CNN whose box branch is re-weighted by a semantic map.

    Two modes: the default uses a per-location class-score map as a
    saliency mask; with ``use_self_attention`` enabled, classification uses
    a linear head and regression features are re-weighted by learned
    channel/spatial attention instead.
    """

    def forward(self, feed_dict):
        """Run the 2D detection pipeline on one batch."""
        prediction_dict = {}

        # base model
        base_feat = self.feature_extractor.first_stage_feature(
            feed_dict['img'])
        feed_dict.update({'base_feat': base_feat})
        # batch_size = base_feat.shape[0]

        # rpn model
        prediction_dict.update(self.rpn_model.forward(feed_dict))

        # proposals = prediction_dict['proposals_batch']
        # shape(N,num_proposals,5)
        # pre subsample for reduce consume of memory
        if self.training:
            self.pre_subsample(prediction_dict, feed_dict)
        rois_batch = prediction_dict['rois_batch']

        # note here base_feat (N,C,H,W),rois_batch (N,num_proposals,5)
        pooled_feat = self.rcnn_pooling(base_feat, rois_batch.view(-1, 5))

        # shape(N,C,1,1)
        pooled_feat = self.feature_extractor.second_stage_feature(pooled_feat)

        # semantic map
        if self.use_self_attention:
            pooled_feat_cls = pooled_feat.mean(3).mean(2)
            rcnn_cls_scores = self.rcnn_cls_pred(pooled_feat_cls)
            rcnn_cls_probs = F.softmax(rcnn_cls_scores, dim=1)

            # self-attention
            channel_attention = self.generate_channel_attention(pooled_feat)
            spatial_attention = self.generate_spatial_attention(pooled_feat)
            # NOTE(review): the channel-attention product is immediately
            # overwritten by the spatial-attention product below, so channel
            # attention currently has no effect — confirm whether the second
            # line should read `pooled_feat_reg * spatial_attention`.
            pooled_feat_reg = pooled_feat * channel_attention
            pooled_feat_reg = pooled_feat * spatial_attention
            pooled_feat_reg = pooled_feat_reg.mean(3).mean(2)
            rcnn_bbox_preds = self.rcnn_bbox_pred(pooled_feat_reg)
        else:
            rcnn_cls_scores_map = self.rcnn_cls_pred(pooled_feat)
            rcnn_cls_scores = rcnn_cls_scores_map.mean(3).mean(2)
            saliency_map = F.softmax(rcnn_cls_scores_map, dim=1)
            rcnn_cls_probs = F.softmax(rcnn_cls_scores, dim=1)
            # rcnn_cls_probs = rcnn_cls_probs_map.mean(3).mean(2)
            # shape(N,C)
            rcnn_bbox_feat = pooled_feat * saliency_map[:, 1:, :, :]
            # rcnn_bbox_feat = torch.cat([rcnn_bbox_feat, pooled_feat], dim=1)
            rcnn_bbox_feat = rcnn_bbox_feat.mean(3).mean(2)
            # if self.use_score:
            # pooled_feat =
            rcnn_bbox_preds = self.rcnn_bbox_pred(rcnn_bbox_feat)

        prediction_dict['rcnn_cls_probs'] = rcnn_cls_probs
        prediction_dict['rcnn_bbox_preds'] = rcnn_bbox_preds
        prediction_dict['rcnn_cls_scores'] = rcnn_cls_scores

        # used for track
        proposals_order = prediction_dict['proposals_order']
        prediction_dict['second_rpn_anchors'] = prediction_dict['anchors'][0][
            proposals_order]

        return prediction_dict

    def generate_channel_attention(self, feat):
        """Per-channel weight: the spatial mean of each channel."""
        return feat.mean(3, keepdim=True).mean(2, keepdim=True)

    def generate_spatial_attention(self, feat):
        """Per-location weight from a learned 3x3 conv (single channel)."""
        return self.spatial_attention(feat)

    def init_weights(self):
        """Initialize submodules and the RCNN prediction heads."""
        # submodule init weights
        self.feature_extractor.init_weights()
        self.rpn_model.init_weights()

        Filler.normal_init(self.rcnn_cls_pred, 0, 0.01, self.truncated)
        Filler.normal_init(self.rcnn_bbox_pred, 0, 0.001, self.truncated)

    def init_modules(self):
        """Build feature extractor, RPN, RoI pooling, heads and losses."""
        self.feature_extractor = ResNetFeatureExtractor(
            self.feature_extractor_config)
        self.rpn_model = RPNModel(self.rpn_config)
        if self.pooling_mode == 'align':
            self.rcnn_pooling = RoIAlignAvg(self.pooling_size,
                                            self.pooling_size, 1.0 / 16.0)
        elif self.pooling_mode == 'ps':
            self.rcnn_pooling = PSRoIPool(7, 7, 1.0 / 16, 7, self.n_classes)
        elif self.pooling_mode == 'psalign':
            raise NotImplementedError('have not implemented yet!')
        elif self.pooling_mode == 'deformable_psalign':
            raise NotImplementedError('have not implemented yet!')

        # classification head: linear in attention mode, conv score map
        # (used as saliency) otherwise
        if self.use_self_attention:
            self.rcnn_cls_pred = nn.Linear(2048, self.n_classes)
        else:
            self.rcnn_cls_pred = nn.Conv2d(2048, self.n_classes, 3, 1, 1)
        if self.class_agnostic:
            self.rcnn_bbox_pred = nn.Linear(2048, 4)
            # self.rcnn_bbox_pred = nn.Conv2d(2048,4,3,1,1)
        else:
            self.rcnn_bbox_pred = nn.Linear(2048, 4 * self.n_classes)

        # loss module
        if self.use_focal_loss:
            self.rcnn_cls_loss = FocalLoss(2)
        else:
            # `reduce=False` is the legacy spelling of reduction='none'
            self.rcnn_cls_loss = functools.partial(
                F.cross_entropy, reduce=False)
        self.rcnn_bbox_loss = nn.modules.SmoothL1Loss(reduce=False)

        # attention
        if self.use_self_attention:
            self.spatial_attention = nn.Conv2d(2048, 1, 3, 1, 1)

    def init_param(self, model_config):
        """Read configuration values and construct config-driven helpers."""
        classes = model_config['classes']
        self.classes = classes
        self.n_classes = len(classes)
        self.class_agnostic = model_config['class_agnostic']
        self.pooling_size = model_config['pooling_size']
        self.pooling_mode = model_config['pooling_mode']
        self.crop_resize_with_max_pool = model_config[
            'crop_resize_with_max_pool']
        self.truncated = model_config['truncated']
        self.use_focal_loss = model_config['use_focal_loss']
        self.subsample_twice = model_config['subsample_twice']
        self.rcnn_batch_size = model_config['rcnn_batch_size']
        # .get(): None (falsy) when the key is absent
        self.use_self_attention = model_config.get('use_self_attention')

        # some submodule config
        self.feature_extractor_config = model_config[
            'feature_extractor_config']
        self.rpn_config = model_config['rpn_config']

        # assigner
        self.target_assigner = TargetAssigner(
            model_config['target_assigner_config'])

        # sampler
        self.sampler = BalancedSampler(model_config['sampler_config'])

    def pre_subsample(self, prediction_dict, feed_dict):
        """Assign 2D gt to proposals and subsample an RCNN minibatch.

        Mutates ``prediction_dict`` in place (no 3D targets in this class).
        """
        rois_batch = prediction_dict['rois_batch']
        gt_boxes = feed_dict['gt_boxes']
        gt_labels = feed_dict['gt_labels']

        ##########################
        # assigner
        ##########################
        # import ipdb
        # ipdb.set_trace()
        rcnn_cls_targets, rcnn_reg_targets, rcnn_cls_weights, rcnn_reg_weights = self.target_assigner.assign(
            rois_batch[:, :, 1:], gt_boxes, gt_labels)

        ##########################
        # subsampler
        ##########################
        cls_criterion = None
        pos_indicator = rcnn_reg_weights > 0
        indicator = rcnn_cls_weights > 0

        # subsample from all
        # shape (N,M)
        batch_sampled_mask = self.sampler.subsample_batch(
            self.rcnn_batch_size,
            pos_indicator,
            indicator=indicator,
            criterion=cls_criterion)
        rcnn_cls_weights = rcnn_cls_weights[batch_sampled_mask]
        rcnn_reg_weights = rcnn_reg_weights[batch_sampled_mask]
        num_cls_coeff = (rcnn_cls_weights > 0).sum(dim=-1)
        num_reg_coeff = (rcnn_reg_weights > 0).sum(dim=-1)
        # check
        # NOTE(review): tensor truthiness — only valid for single-element
        # tensors.
        assert num_cls_coeff, 'bug happens'
        assert num_reg_coeff, 'bug happens'

        # normalize weights by the number of contributing samples
        prediction_dict[
            'rcnn_cls_weights'] = rcnn_cls_weights / num_cls_coeff.float()
        prediction_dict[
            'rcnn_reg_weights'] = rcnn_reg_weights / num_reg_coeff.float()
        prediction_dict['rcnn_cls_targets'] = rcnn_cls_targets[
            batch_sampled_mask]
        prediction_dict['rcnn_reg_targets'] = rcnn_reg_targets[
            batch_sampled_mask]

        # update rois_batch
        prediction_dict['rois_batch'] = rois_batch[batch_sampled_mask].view(
            rois_batch.shape[0], -1, 5)

        if not self.training:
            # NOTE(review): dead branch — pre_subsample is only invoked from
            # forward when self.training is True; confirm before removing.
            # used for track
            proposals_order = prediction_dict['proposals_order']
            prediction_dict['proposals_order'] = proposals_order[
                batch_sampled_mask]

    def loss(self, prediction_dict, feed_dict):
        """
        assign proposals label and subsample from them
        Then calculate loss
        """
        loss_dict = {}

        # submodule loss
        loss_dict.update(self.rpn_model.loss(prediction_dict, feed_dict))

        # targets and weights
        rcnn_cls_weights = prediction_dict['rcnn_cls_weights']
        rcnn_reg_weights = prediction_dict['rcnn_reg_weights']
        rcnn_cls_targets = prediction_dict['rcnn_cls_targets']
        rcnn_reg_targets = prediction_dict['rcnn_reg_targets']

        # classification loss
        rcnn_cls_scores = prediction_dict['rcnn_cls_scores']
        rcnn_cls_loss = self.rcnn_cls_loss(rcnn_cls_scores, rcnn_cls_targets)
        rcnn_cls_loss *= rcnn_cls_weights
        rcnn_cls_loss = rcnn_cls_loss.sum(dim=-1)

        # bounding box regression L1 loss
        rcnn_bbox_preds = prediction_dict['rcnn_bbox_preds']
        rcnn_bbox_loss = self.rcnn_bbox_loss(rcnn_bbox_preds,
                                             rcnn_reg_targets).sum(dim=-1)
        rcnn_bbox_loss *= rcnn_reg_weights
        # rcnn_bbox_loss *= rcnn_reg_weights
        rcnn_bbox_loss = rcnn_bbox_loss.sum(dim=-1)

        # loss weights has no gradients
        loss_dict['rcnn_cls_loss'] = rcnn_cls_loss
        loss_dict['rcnn_bbox_loss'] = rcnn_bbox_loss

        # add rcnn_cls_targets to get the statics of rpn
        # loss_dict['rcnn_cls_targets'] = rcnn_cls_targets

        return loss_dict
class Mono3DFinalAngleFasterRCNN(Model):
    """Faster R-CNN variant with an extra 3D head (dims + multibin angle)."""

    def forward(self, feed_dict):
        """Run RPN + RCNN heads; at eval time also decode the 3D predictions.

        Returns ``prediction_dict`` with 2D scores/boxes and 'rcnn_3d'
        (raw head output when training, decoded boxes when evaluating).
        """
        # mean dims are data-dependent, so the 3D coder is updated per batch
        self.target_assigner.bbox_coder_3d.mean_dims = feed_dict['mean_dims']
        prediction_dict = {}

        # base model
        base_feat = self.feature_extractor.first_stage_feature(
            feed_dict['img'])
        feed_dict.update({'base_feat': base_feat})

        # rpn model
        prediction_dict.update(self.rpn_model.forward(feed_dict))

        if self.training:
            # assign targets / subsample RoIs before pooling to save memory
            self.pre_subsample(prediction_dict, feed_dict)
        rois_batch = prediction_dict['rois_batch']

        # note here base_feat (N,C,H,W), rois_batch (N,num_proposals,5)
        pooled_feat = self.rcnn_pooling(base_feat, rois_batch.view(-1, 5))

        # shape(N,C,1,1) after global average pooling below
        second_pooled_feat = self.feature_extractor.second_stage_feature(
            pooled_feat)
        second_pooled_feat = second_pooled_feat.mean(3).mean(2)

        rcnn_cls_scores = self.rcnn_cls_preds(second_pooled_feat)
        rcnn_bbox_preds = self.rcnn_bbox_preds(second_pooled_feat)
        rcnn_3d = self.rcnn_3d_pred(second_pooled_feat)

        rcnn_cls_probs = F.softmax(rcnn_cls_scores, dim=1)

        prediction_dict['rcnn_cls_probs'] = rcnn_cls_probs
        prediction_dict['rcnn_bbox_preds'] = rcnn_bbox_preds
        prediction_dict['rcnn_cls_scores'] = rcnn_cls_scores

        # used for track
        proposals_order = prediction_dict['proposals_order']
        prediction_dict['second_rpn_anchors'] = prediction_dict['anchors'][
            proposals_order]

        ###################################
        # 3d training
        ###################################
        prediction_dict['rcnn_3d'] = rcnn_3d

        if not self.training:
            # layout of rcnn_3d: [dims | orientation]; dims are per-class
            # unless class_agnostic_3d
            if self.class_agnostic_3d:
                orient = rcnn_3d[:, 3:]
                dims = rcnn_3d[:, :3]
            else:
                orient = rcnn_3d[:, 3 * self.n_classes:]
                dims = rcnn_3d[:, :3 * self.n_classes]
            # each bin carries (2 cls logits, 2 orientation values)
            angles = orient.view(-1, self.num_bins, 4)
            angles_cls = F.softmax(angles[:, :, :2], dim=-1)
            # pick the most confident bin per RoI
            _, angles_cls_argmax = torch.max(angles_cls[:, :, 1], dim=-1)
            row = torch.arange(
                0, angles_cls_argmax.shape[0]).type_as(angles_cls_argmax)
            angles_oritations = angles[:, :, 2:][row, angles_cls_argmax]
            rcnn_3d = torch.cat([dims, angles_oritations], dim=-1)

            # decode relative to the selected bin center
            rcnn_3d = self.target_assigner.bbox_coder_3d.decode_batch_angle(
                rcnn_3d, self.rcnn_3d_loss.bin_centers[angles_cls_argmax])
            prediction_dict['rcnn_3d'] = rcnn_3d

        return prediction_dict

    def pre_forward(self):
        """Hook run before forward; nothing to do for this model."""
        pass

    def init_weights(self):
        """Initialize submodules and the RCNN prediction heads."""
        # submodule init weights
        self.feature_extractor.init_weights()
        self.rpn_model.init_weights()

        Filler.normal_init(self.rcnn_cls_preds, 0, 0.01, self.truncated)
        Filler.normal_init(self.rcnn_bbox_preds, 0, 0.001, self.truncated)

    def init_modules(self):
        """Build feature extractor, RPN, pooling, heads and loss modules."""
        self.feature_extractor = ResNetFeatureExtractor(
            self.feature_extractor_config)
        self.rpn_model = RPNModel(self.rpn_config)
        if self.pooling_mode == 'align':
            # 1/16 spatial scale, sampling ratio 2
            self.rcnn_pooling = ROIAlign(
                (self.pooling_size, self.pooling_size), 1.0 / 16.0, 2)
        elif self.pooling_mode == 'ps':
            self.rcnn_pooling = PSRoIPool(7, 7, 1.0 / 16, 7, self.n_classes)
        elif self.pooling_mode == 'psalign':
            raise NotImplementedError('have not implemented yet!')
        elif self.pooling_mode == 'deformable_psalign':
            raise NotImplementedError('have not implemented yet!')

        self.rcnn_cls_preds = nn.Linear(2048, self.n_classes)
        if self.reduce:
            # forward() global-average-pools the second-stage feature
            in_channels = 2048
        else:
            in_channels = 2048 * 4 * 4
        if self.class_agnostic:
            self.rcnn_bbox_preds = nn.Linear(in_channels, 4)
        else:
            self.rcnn_bbox_preds = nn.Linear(in_channels, 4 * self.n_classes)

        # loss module
        if self.use_focal_loss:
            self.rcnn_cls_loss = FocalLoss(self.n_classes)
        else:
            self.rcnn_cls_loss = functools.partial(
                F.cross_entropy, reduce=False)
        self.rcnn_bbox_loss = nn.modules.SmoothL1Loss(reduce=False)

        # 3D head: 3 dims (+ per-class copies unless agnostic) and
        # 4 values per angle bin (2 cls logits + 2 orientation values)
        if self.class_agnostic_3d:
            self.rcnn_3d_pred = nn.Linear(in_channels, 3 + 4 * self.num_bins)
        else:
            self.rcnn_3d_pred = nn.Linear(
                in_channels, 3 * self.n_classes + 4 * self.num_bins)

        self.rcnn_3d_loss = MultiBinLoss(num_bins=self.num_bins)

    def init_param(self, model_config):
        """Read model_config and set hyper-parameters and helper objects."""
        classes = model_config['classes']
        self.classes = classes
        # +1 for background
        self.n_classes = len(classes) + 1
        self.class_agnostic = model_config['class_agnostic']
        self.pooling_size = model_config['pooling_size']
        self.pooling_mode = model_config['pooling_mode']
        self.class_agnostic_3d = model_config['class_agnostic_3d']
        self.crop_resize_with_max_pool = model_config[
            'crop_resize_with_max_pool']
        self.truncated = model_config['truncated']

        self.use_focal_loss = model_config['use_focal_loss']
        self.subsample_twice = model_config['subsample_twice']
        self.rcnn_batch_size = model_config['rcnn_batch_size']

        # some submodule config
        self.feature_extractor_config = model_config['feature_extractor_config']
        self.rpn_config = model_config['rpn_config']

        # sampler
        self.sampler = BalancedSampler(model_config['sampler_config'])

        # hard-coded here rather than read from config
        self.reduce = True

        self.visualizer = FeatVisualizer()

        self.num_bins = 4

        # assigner
        self.target_assigner = TargetAssigner(
            model_config['target_assigner_config'])

        self.profiler = Profiler()

        self.h_cat = False

    def pre_subsample(self, prediction_dict, feed_dict):
        """Assign 2D + 3D targets to RoIs and subsample the RCNN batch.

        Mutates ``prediction_dict`` in place, adding 2D targets/weights as
        well as 'rcnn_reg_targets_3d' / 'rcnn_reg_weights_3d'.
        """
        rois_batch = prediction_dict['rois_batch']
        gt_boxes = feed_dict['gt_boxes']
        gt_labels = feed_dict['gt_labels']

        # use local angle
        local_angle = feed_dict['local_angle']

        # shape(N,7)
        gt_boxes_3d = feed_dict['gt_boxes_3d']

        # here just concat dims (first 3 columns) with the local angle
        gt_boxes_3d = torch.cat([gt_boxes_3d[:, :, :3], local_angle], dim=-1)

        ##########################
        # assigner
        ##########################
        rcnn_cls_targets, rcnn_reg_targets,\
            rcnn_cls_weights, rcnn_reg_weights,\
            rcnn_reg_targets_3d, rcnn_reg_weights_3d = self.target_assigner.assign(
                rois_batch[:, :, 1:], gt_boxes, gt_boxes_3d, gt_labels)

        ##########################
        # subsampler
        ##########################
        cls_criterion = None
        pos_indicator = rcnn_reg_weights > 0
        indicator = rcnn_cls_weights > 0

        # subsample from all
        # shape (N,M)
        batch_sampled_mask = self.sampler.subsample_batch(
            self.rcnn_batch_size,
            pos_indicator,
            indicator=indicator,
            criterion=cls_criterion)
        rcnn_cls_weights = rcnn_cls_weights[batch_sampled_mask]
        rcnn_reg_weights = rcnn_reg_weights[batch_sampled_mask]
        rcnn_reg_weights_3d = rcnn_reg_weights_3d[batch_sampled_mask]
        num_cls_coeff = (rcnn_cls_weights > 0).sum(dim=-1)
        num_reg_coeff = (rcnn_reg_weights > 0).sum(dim=-1)
        # check
        assert num_cls_coeff, 'bug happens'
        assert num_reg_coeff, 'bug happens'

        # normalize by the number of active samples
        prediction_dict[
            'rcnn_cls_weights'] = rcnn_cls_weights / num_cls_coeff.float()
        prediction_dict[
            'rcnn_reg_weights'] = rcnn_reg_weights / num_reg_coeff.float()
        # NOTE(review): 3D weights are normalized by the 2D regression count —
        # presumably intentional (same positives); confirm
        prediction_dict[
            'rcnn_reg_weights_3d'] = rcnn_reg_weights_3d / num_reg_coeff.float()
        prediction_dict['rcnn_cls_targets'] = rcnn_cls_targets[
            batch_sampled_mask]
        prediction_dict['rcnn_reg_targets'] = rcnn_reg_targets[
            batch_sampled_mask]
        prediction_dict['rcnn_reg_targets_3d'] = rcnn_reg_targets_3d[
            batch_sampled_mask]

        # update rois_batch
        prediction_dict['rois_batch'] = rois_batch[batch_sampled_mask].view(
            rois_batch.shape[0], -1, 5)

    def squeeze_bbox_preds(self, rcnn_bbox_preds, rcnn_cls_targets, out_c=4):
        """
        squeeze rcnn_bbox_preds from shape (N, out_c * num_classes)
        to shape (N, out_c) by picking each sample's target-class slice
        Args:
            rcnn_bbox_preds: shape(N, num_classes, out_c)
            rcnn_cls_targets: shape(N, 1)
        """
        rcnn_bbox_preds = rcnn_bbox_preds.view(-1, self.n_classes, out_c)
        batch_size = rcnn_bbox_preds.shape[0]
        # flat index of each sample's target-class row
        offset = torch.arange(0, batch_size) * rcnn_bbox_preds.size(1)
        rcnn_cls_targets = rcnn_cls_targets + offset.type_as(rcnn_cls_targets)
        rcnn_bbox_preds = rcnn_bbox_preds.contiguous().view(
            -1, out_c)[rcnn_cls_targets]
        return rcnn_bbox_preds

    def loss(self, prediction_dict, feed_dict):
        """
        assign proposals label and subsample from them
        Then calculate loss

        Returns RPN losses plus 'rcnn_cls_loss', 'rcnn_bbox_loss',
        'rcnn_3d_loss' (dims) and 'rcnn_angle_loss' (multibin orientation).
        Also pushes orientation-accuracy stats into the target assigner.
        """
        loss_dict = {}

        # submodule loss
        loss_dict.update(self.rpn_model.loss(prediction_dict, feed_dict))

        # targets and weights
        rcnn_cls_weights = prediction_dict['rcnn_cls_weights']
        rcnn_reg_weights = prediction_dict['rcnn_reg_weights']
        rcnn_cls_targets = prediction_dict['rcnn_cls_targets']
        rcnn_reg_targets = prediction_dict['rcnn_reg_targets']

        # classification loss
        rcnn_cls_scores = prediction_dict['rcnn_cls_scores']
        rcnn_cls_loss = self.rcnn_cls_loss(rcnn_cls_scores, rcnn_cls_targets)
        rcnn_cls_loss *= rcnn_cls_weights
        rcnn_cls_loss = rcnn_cls_loss.sum(dim=-1)

        # bounding box regression L1 loss
        rcnn_bbox_preds = prediction_dict['rcnn_bbox_preds']
        # NOTE(review): applied unconditionally, even when class_agnostic —
        # relies on squeeze_bbox_preds being shape-compatible; confirm
        rcnn_bbox_preds = self.squeeze_bbox_preds(rcnn_bbox_preds,
                                                  rcnn_cls_targets)
        rcnn_bbox_loss = self.rcnn_bbox_loss(rcnn_bbox_preds,
                                             rcnn_reg_targets).sum(dim=-1)
        rcnn_bbox_loss *= rcnn_reg_weights
        rcnn_bbox_loss = rcnn_bbox_loss.sum(dim=-1)

        loss_dict['rcnn_cls_loss'] = rcnn_cls_loss
        loss_dict['rcnn_bbox_loss'] = rcnn_bbox_loss

        ######################################
        # 3d loss
        ######################################
        rcnn_reg_weights_3d = prediction_dict['rcnn_reg_weights_3d']
        rcnn_reg_targets_3d = prediction_dict['rcnn_reg_targets_3d']

        rcnn_3d = prediction_dict['rcnn_3d']
        # split head output into dims and orientation (layout set in
        # init_modules)
        if not self.class_agnostic_3d:
            dims_pred = rcnn_3d[:, :3 * self.n_classes]
            dims_pred = self.squeeze_bbox_preds(dims_pred, rcnn_cls_targets,
                                                3)
            orient_pred = rcnn_3d[:, 3 * self.n_classes:]
        else:
            dims_pred = rcnn_3d[:, :3]
            orient_pred = rcnn_3d[:, 3:]

        # dims
        rcnn_3d_loss_dims = self.rcnn_bbox_loss(
            dims_pred, rcnn_reg_targets_3d[:, :3]).sum(dim=-1)

        # angles (multibin loss also reports which bins were classified right)
        rcnn_angle_loss, angle_tp_mask = self.rcnn_3d_loss(
            orient_pred, rcnn_reg_targets_3d[:, 3:])

        rcnn_3d_loss = rcnn_3d_loss_dims * rcnn_reg_weights_3d
        rcnn_3d_loss = rcnn_3d_loss.sum(dim=-1)

        rcnn_angle_loss = rcnn_angle_loss * rcnn_reg_weights_3d
        rcnn_angle_loss = rcnn_angle_loss.sum(dim=-1)

        loss_dict['rcnn_3d_loss'] = rcnn_3d_loss
        loss_dict['rcnn_angle_loss'] = rcnn_angle_loss

        # stats of orients (restricted to positives)
        angle_tp_mask = angle_tp_mask[rcnn_reg_weights_3d > 0]
        angles_tp_num = angle_tp_mask.int().sum().item()
        angles_all_num = angle_tp_mask.numel()

        self.target_assigner.stat.update({
            'cls_orient_2s_all_num': angles_all_num,
            'cls_orient_2s_tp_num': angles_tp_num
        })

        return loss_dict
class SINetModel(Model):
    """Faster R-CNN variant that pools from two intermediate feature maps."""

    def collect_intermedia_layers(self, img):
        """Run the first-stage backbone, keeping the last two feature maps.

        Returns (final_feature, end_points) where end_points maps
        'feat2'/'feat3' to the intermediate and final maps respectively.
        """
        feat2 = self.feature_extractor.first_stage_feature[:-1](img)
        feat3 = self.feature_extractor.first_stage_feature[-1](feat2)
        end_points = {'feat2': feat2, 'feat3': feat3}
        return feat3, end_points

    def caroi_pooling(self, all_feats, rois_batch, out_channels):
        """RoI-pool every feature map and concatenate along channels.

        Args:
            all_feats: dict (as produced by collect_intermedia_layers) or
                iterable of feature tensors, each (N, C_i, H_i, W_i).
            rois_batch: (num_rois, 5) rois.
            out_channels: desired channel count; if the concatenation
                differs, a 1x1 conv (reduce_pooling) maps it down.
        """
        # BUGFIX: forward() passes the end_points *dict*; iterating a dict
        # yields its keys (strings), not the feature tensors. Pool the
        # values instead (insertion order: feat2 then feat3, matching the
        # 512 + 1024 input channels of reduce_pooling).
        feats = all_feats.values() if isinstance(all_feats, dict) else all_feats
        pooled_feats = [self.rcnn_pooling(feat, rois_batch) for feat in feats]
        pooled_feats = torch.cat(pooled_feats, dim=1)
        if pooled_feats.shape[1] != out_channels:
            # add 1x1 conv to reduce to the requested channel count
            pooled_feats = self.reduce_pooling(pooled_feats)
        return pooled_feats

    def forward(self, feed_dict):
        """Run RPN + multi-feature RoI pooling + RCNN heads."""
        prediction_dict = {}

        # base model (keep intermediate maps for caroi_pooling)
        base_feat, all_feats = self.collect_intermedia_layers(feed_dict['img'])
        feed_dict.update({'base_feat': base_feat})
        self.add_feat('base_feat', base_feat)

        # rpn model
        prediction_dict.update(self.rpn_model.forward(feed_dict))

        # pre subsample to reduce memory consumption
        if self.training:
            self.pre_subsample(prediction_dict, feed_dict)
        rois_batch = prediction_dict['rois_batch']

        # note here base_feat (N,C,H,W), rois_batch (N,num_proposals,5)
        # NOTE(review): rcnn_pooling (1/16 scale) is applied to both maps even
        # though feat2 comes from an earlier stage; rcnn_pooling2 (1/8) is
        # built in init_modules but never used — confirm intended scales.
        pooled_feat = self.caroi_pooling(
            all_feats, rois_batch.view(-1, 5), out_channels=1024)

        # shape(N,C,1,1) after reduction below
        pooled_feat = self.feature_extractor.second_stage_feature(pooled_feat)
        # shape(N,C)
        if self.reduce:
            pooled_feat = pooled_feat.mean(3).mean(2)
        else:
            pooled_feat = pooled_feat.view(self.rcnn_batch_size, -1)

        rcnn_bbox_preds = self.rcnn_bbox_pred(pooled_feat)
        rcnn_cls_scores = self.rcnn_cls_pred(pooled_feat)

        rcnn_cls_probs = F.softmax(rcnn_cls_scores, dim=1)

        prediction_dict['rcnn_cls_probs'] = rcnn_cls_probs
        prediction_dict['rcnn_bbox_preds'] = rcnn_bbox_preds
        prediction_dict['rcnn_cls_scores'] = rcnn_cls_scores

        # used for track
        proposals_order = prediction_dict['proposals_order']
        prediction_dict['second_rpn_anchors'] = prediction_dict['anchors'][
            proposals_order]

        return prediction_dict

    def init_weights(self):
        """Initialize submodules and RCNN heads."""
        # submodule init weights
        self.feature_extractor.init_weights()
        self.rpn_model.init_weights()

        Filler.normal_init(self.rcnn_cls_pred, 0, 0.01, self.truncated)
        Filler.normal_init(self.rcnn_bbox_pred, 0, 0.001, self.truncated)

    def init_modules(self):
        """Build backbone, RPN, pooling layers, heads and losses."""
        self.feature_extractor = ResNetFeatureExtractor(
            self.feature_extractor_config)
        self.rpn_model = RPNModel(self.rpn_config)
        if self.pooling_mode == 'align':
            self.rcnn_pooling = RoIAlignAvg(self.pooling_size,
                                            self.pooling_size, 1.0 / 16.0)
        elif self.pooling_mode == 'ps':
            self.rcnn_pooling = PSRoIPool(7, 7, 1.0 / 16, 7, self.n_classes)
        elif self.pooling_mode == 'psalign':
            raise NotImplementedError('have not implemented yet!')
        elif self.pooling_mode == 'deformable_psalign':
            raise NotImplementedError('have not implemented yet!')
        self.rcnn_cls_pred = nn.Linear(2048, self.n_classes)
        if self.reduce:
            in_channels = 2048
        else:
            in_channels = 2048 * 4 * 4
        if self.class_agnostic:
            self.rcnn_bbox_pred = nn.Linear(in_channels, 4)
        else:
            self.rcnn_bbox_pred = nn.Linear(in_channels, 4 * self.n_classes)

        # loss module
        if self.use_focal_loss:
            self.rcnn_cls_loss = FocalLoss(2)
        else:
            self.rcnn_cls_loss = functools.partial(
                F.cross_entropy, reduce=False)
        self.rcnn_bbox_loss = nn.modules.SmoothL1Loss(reduce=False)

        # NOTE(review): built but unused — forward/caroi_pooling only use
        # self.rcnn_pooling; presumably meant for the 1/8-stride feat2.
        self.rcnn_pooling2 = RoIAlignAvg(self.pooling_size, self.pooling_size,
                                         1.0 / 8.0)
        # 1x1 conv to squeeze concatenated feat2+feat3 channels down to 1024
        self.reduce_pooling = nn.Sequential(
            nn.Conv2d(1024 + 512, 1024, 1, 1, 0), nn.ReLU())

    def init_param(self, model_config):
        """Read model_config and set hyper-parameters and helper objects."""
        classes = model_config['classes']
        self.classes = classes
        # NOTE(review): no +1 for background here, unlike the other models in
        # this file — confirm intended
        self.n_classes = len(classes)
        self.class_agnostic = model_config['class_agnostic']
        self.pooling_size = model_config['pooling_size']
        self.pooling_mode = model_config['pooling_mode']
        self.crop_resize_with_max_pool = model_config[
            'crop_resize_with_max_pool']
        self.truncated = model_config['truncated']

        self.use_focal_loss = model_config['use_focal_loss']
        self.subsample_twice = model_config['subsample_twice']
        self.rcnn_batch_size = model_config['rcnn_batch_size']

        # some submodule config
        self.feature_extractor_config = model_config['feature_extractor_config']
        self.rpn_config = model_config['rpn_config']

        # assigner
        self.target_assigner = TargetAssigner(
            model_config['target_assigner_config'])

        # sampler
        self.sampler = BalancedSampler(model_config['sampler_config'])

        # hard-coded here rather than read from config
        self.reduce = True

    def pre_subsample(self, prediction_dict, feed_dict):
        """Assign targets, subsample the RCNN batch, and keep the match
        indices ('fake_match') used by loss() for AP analysis."""
        rois_batch = prediction_dict['rois_batch']
        gt_boxes = feed_dict['gt_boxes']
        gt_labels = feed_dict['gt_labels']

        ##########################
        # assigner
        ##########################
        rcnn_cls_targets, rcnn_reg_targets, rcnn_cls_weights, rcnn_reg_weights = self.target_assigner.assign(
            rois_batch[:, :, 1:], gt_boxes, gt_labels)

        ##########################
        # subsampler
        ##########################
        cls_criterion = None
        pos_indicator = rcnn_reg_weights > 0
        indicator = rcnn_cls_weights > 0

        # subsample from all
        # shape (N,M)
        batch_sampled_mask = self.sampler.subsample_batch(
            self.rcnn_batch_size,
            pos_indicator,
            indicator=indicator,
            criterion=cls_criterion)
        rcnn_cls_weights = rcnn_cls_weights[batch_sampled_mask]
        rcnn_reg_weights = rcnn_reg_weights[batch_sampled_mask]
        num_cls_coeff = (rcnn_cls_weights > 0).sum(dim=-1)
        num_reg_coeff = (rcnn_reg_weights > 0).sum(dim=-1)
        # check
        assert num_cls_coeff, 'bug happens'
        assert num_reg_coeff, 'bug happens'

        # normalize weights over the active samples
        prediction_dict[
            'rcnn_cls_weights'] = rcnn_cls_weights / num_cls_coeff.float()
        prediction_dict[
            'rcnn_reg_weights'] = rcnn_reg_weights / num_reg_coeff.float()
        prediction_dict['rcnn_cls_targets'] = rcnn_cls_targets[
            batch_sampled_mask]
        prediction_dict['rcnn_reg_targets'] = rcnn_reg_targets[
            batch_sampled_mask]
        prediction_dict['fake_match'] = self.target_assigner.analyzer.match[
            batch_sampled_mask]

        # update rois_batch
        prediction_dict['rois_batch'] = rois_batch[batch_sampled_mask].view(
            rois_batch.shape[0], -1, 5)

        if not self.training:
            # used for track
            proposals_order = prediction_dict['proposals_order']
            prediction_dict['proposals_order'] = proposals_order[
                batch_sampled_mask]

    def loss(self, prediction_dict, feed_dict):
        """
        assign proposals label and subsample from them
        Then calculate loss

        Returns RPN losses plus 'rcnn_cls_loss' and 'rcnn_bbox_loss';
        also feeds foreground probabilities to the assigner's AP analyzer.
        """
        loss_dict = {}

        # submodule loss
        loss_dict.update(self.rpn_model.loss(prediction_dict, feed_dict))

        # targets and weights
        rcnn_cls_weights = prediction_dict['rcnn_cls_weights']
        rcnn_reg_weights = prediction_dict['rcnn_reg_weights']
        rcnn_cls_targets = prediction_dict['rcnn_cls_targets']
        rcnn_reg_targets = prediction_dict['rcnn_reg_targets']

        # classification loss
        rcnn_cls_scores = prediction_dict['rcnn_cls_scores']
        rcnn_cls_loss = self.rcnn_cls_loss(rcnn_cls_scores, rcnn_cls_targets)
        rcnn_cls_loss *= rcnn_cls_weights
        rcnn_cls_loss = rcnn_cls_loss.sum(dim=-1)

        # bounding box regression L1 loss
        rcnn_bbox_preds = prediction_dict['rcnn_bbox_preds']
        rcnn_bbox_loss = self.rcnn_bbox_loss(rcnn_bbox_preds,
                                             rcnn_reg_targets).sum(dim=-1)
        rcnn_bbox_loss *= rcnn_reg_weights
        rcnn_bbox_loss = rcnn_bbox_loss.sum(dim=-1)

        # loss weights has no gradients
        loss_dict['rcnn_cls_loss'] = rcnn_cls_loss
        loss_dict['rcnn_bbox_loss'] = rcnn_bbox_loss

        # analysis ap
        rcnn_cls_probs = prediction_dict['rcnn_cls_probs']
        num_gt = feed_dict['gt_labels'].numel()
        fake_match = prediction_dict['fake_match']
        self.target_assigner.analyzer.analyze_ap(
            fake_match, rcnn_cls_probs[:, 1], num_gt, thresh=0.5)

        return loss_dict
class RFCNModel(Model):
    """R-FCN: position-sensitive score maps + PSRoI pooling, no per-RoI FC."""

    def forward(self, feed_dict):
        """Run RPN, then the position-sensitive cls/bbox branches."""
        prediction_dict = {}

        # base model
        base_feat = self.feature_extractor.first_stage_feature(
            feed_dict['img'])
        top_feat = self.feature_extractor.second_stage_feature(base_feat)
        # 2048 -> 1024 channel reduction before the PS score maps
        top_feat = self.rcnn_top(top_feat)
        top_feat = F.relu(top_feat)
        feed_dict.update({'base_feat': base_feat})

        # rpn model
        prediction_dict.update(self.rpn_model.forward(feed_dict))

        # pre subsample to reduce memory consumption
        if self.training:
            self.pre_subsample(prediction_dict, feed_dict)
        rois_batch = prediction_dict['rois_batch']

        # note here base_feat (N,C,H,W), rois_batch (N,num_proposals,5)
        # BUGFIX: removed a live `import ipdb; ipdb.set_trace()` breakpoint
        # left here — it halted every forward pass.

        # ps roi pooling for cls and bbox
        cls_pooling_feat = self.rcnn_cls_base(top_feat)
        rcnn_cls_scores = self.rcnn_pooling_cls(cls_pooling_feat,
                                                rois_batch.view(-1, 5))

        bbox_pooling_feat = self.rcnn_bbox_base(top_feat)
        rcnn_bbox_preds = self.rcnn_pooling_loc(bbox_pooling_feat,
                                                rois_batch.view(-1, 5))

        # vote over the pooled grid: shape (N,C)
        rcnn_cls_scores = rcnn_cls_scores.mean(3).mean(2)
        rcnn_bbox_preds = rcnn_bbox_preds.mean(3).mean(2)

        rcnn_cls_probs = F.softmax(rcnn_cls_scores, dim=1)

        if not self.training:
            prediction_dict['rcnn_cls_probs'] = rcnn_cls_probs
        prediction_dict['rcnn_bbox_preds'] = rcnn_bbox_preds
        prediction_dict['rcnn_cls_scores'] = rcnn_cls_scores

        # used for track
        # NOTE(review): anchors is indexed with [0] here, unlike the other
        # models in this file — confirm the RPN output layout
        proposals_order = prediction_dict['proposals_order']
        prediction_dict['second_rpn_anchors'] = prediction_dict['anchors'][0][
            proposals_order]

        return prediction_dict

    def init_weights(self):
        """Initialize submodules and the PS score-map convolutions."""
        # submodule init weights
        self.feature_extractor.init_weights()
        self.rpn_model.init_weights()

        Filler.normal_init(self.rcnn_cls_base, 0, 0.01, self.truncated)
        Filler.normal_init(self.rcnn_bbox_base, 0, 0.001, self.truncated)
        Filler.normal_init(self.rcnn_top, 0, 0.001, self.truncated)

    def init_modules(self):
        """Build backbone, RPN, PSRoI pooling layers and loss modules."""
        self.feature_extractor = FeatureExtractor(
            self.feature_extractor_config)
        self.rpn_model = RPNModel(self.rpn_config)
        self.rcnn_pooling_cls = PSRoIPool(7, 7, 1.0 / 16, 7, self.n_classes)
        self.rcnn_pooling_loc = PSRoIPool(7, 7, 1.0 / 16, 7, 4)
        # position-sensitive score maps: one k*k grid per class / per coord
        self.rcnn_cls_base = nn.Conv2d(
            in_channels=1024,
            out_channels=self.n_classes * self.pooling_size *
            self.pooling_size,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False)
        self.rcnn_bbox_base = nn.Conv2d(
            in_channels=1024,
            out_channels=4 * self.pooling_size * self.pooling_size,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False)
        self.rcnn_top = nn.Conv2d(2048, 1024, 1, 1, 0, bias=False)

        # loss module
        if self.use_focal_loss:
            self.rcnn_cls_loss = FocalLoss(2)
        else:
            self.rcnn_cls_loss = functools.partial(
                F.cross_entropy, reduce=False)
        self.rcnn_bbox_loss = nn.modules.SmoothL1Loss(reduce=False)

    def init_param(self, model_config):
        """Read model_config and set hyper-parameters and helper objects."""
        classes = model_config['classes']
        self.classes = classes
        self.n_classes = len(classes)
        self.class_agnostic = model_config['class_agnostic']
        self.pooling_size = model_config['pooling_size']
        self.pooling_mode = model_config['pooling_mode']
        self.crop_resize_with_max_pool = model_config[
            'crop_resize_with_max_pool']
        self.truncated = model_config['truncated']

        self.use_focal_loss = model_config['use_focal_loss']
        self.subsample_twice = model_config['subsample_twice']
        self.rcnn_batch_size = model_config['rcnn_batch_size']

        # some submodule config
        self.feature_extractor_config = model_config['feature_extractor_config']
        self.rpn_config = model_config['rpn_config']

        # assigner
        self.target_assigner = TargetAssigner(
            model_config['target_assigner_config'])

        # sampler
        self.sampler = BalancedSampler(model_config['sampler_config'])

    def pre_subsample(self, prediction_dict, feed_dict):
        """Assign targets to RoIs and subsample a fixed-size RCNN batch."""
        rois_batch = prediction_dict['rois_batch']
        gt_boxes = feed_dict['gt_boxes']
        gt_labels = feed_dict['gt_labels']

        ##########################
        # assigner
        ##########################
        rcnn_cls_targets, rcnn_reg_targets, rcnn_cls_weights, rcnn_reg_weights = self.target_assigner.assign(
            rois_batch[:, :, 1:], gt_boxes, gt_labels)

        ##########################
        # subsampler
        ##########################
        cls_criterion = None
        # NOTE(review): positives keyed on cls targets here (labels > 0),
        # unlike the reg-weight criterion used by the other models — confirm
        pos_indicator = rcnn_cls_targets > 0
        indicator = rcnn_cls_weights > 0

        # subsample from all
        # shape (N,M)
        batch_sampled_mask = self.sampler.subsample_batch(
            self.rcnn_batch_size,
            pos_indicator,
            indicator=indicator,
            criterion=cls_criterion)
        rcnn_cls_weights = rcnn_cls_weights[batch_sampled_mask]
        rcnn_reg_weights = rcnn_reg_weights[batch_sampled_mask]
        num_cls_coeff = rcnn_cls_weights.type(torch.cuda.ByteTensor).sum(
            dim=-1)
        num_reg_coeff = rcnn_reg_weights.type(torch.cuda.ByteTensor).sum(
            dim=-1)
        # check
        assert num_cls_coeff, 'bug happens'
        assert num_reg_coeff, 'bug happens'

        # normalize weights over the active samples
        prediction_dict[
            'rcnn_cls_weights'] = rcnn_cls_weights / num_cls_coeff.float()
        prediction_dict[
            'rcnn_reg_weights'] = rcnn_reg_weights / num_reg_coeff.float()
        prediction_dict['rcnn_cls_targets'] = rcnn_cls_targets[
            batch_sampled_mask]
        prediction_dict['rcnn_reg_targets'] = rcnn_reg_targets[
            batch_sampled_mask]

        # update rois_batch
        prediction_dict['rois_batch'] = rois_batch[batch_sampled_mask].view(
            rois_batch.shape[0], -1, 5)

        if not self.training:
            # used for track
            proposals_order = prediction_dict['proposals_order']
            prediction_dict['proposals_order'] = proposals_order[
                batch_sampled_mask]

    def loss(self, prediction_dict, feed_dict):
        """
        assign proposals label and subsample from them
        Then calculate loss

        Returns RPN losses plus 'rcnn_cls_loss', 'rcnn_bbox_loss' and the
        sampled 'rcnn_cls_targets' (exported for RPN statistics).
        """
        loss_dict = {}

        # submodule loss
        loss_dict.update(self.rpn_model.loss(prediction_dict, feed_dict))

        # targets and weights
        rcnn_cls_weights = prediction_dict['rcnn_cls_weights']
        rcnn_reg_weights = prediction_dict['rcnn_reg_weights']
        rcnn_cls_targets = prediction_dict['rcnn_cls_targets']
        rcnn_reg_targets = prediction_dict['rcnn_reg_targets']

        # classification loss
        rcnn_cls_scores = prediction_dict['rcnn_cls_scores']
        rcnn_cls_loss = self.rcnn_cls_loss(rcnn_cls_scores, rcnn_cls_targets)
        rcnn_cls_loss *= rcnn_cls_weights
        rcnn_cls_loss = rcnn_cls_loss.sum(dim=-1)

        # bounding box regression L1 loss
        rcnn_bbox_preds = prediction_dict['rcnn_bbox_preds']
        rcnn_bbox_loss = self.rcnn_bbox_loss(rcnn_bbox_preds,
                                             rcnn_reg_targets).sum(dim=-1)
        rcnn_bbox_loss *= rcnn_reg_weights
        rcnn_bbox_loss = rcnn_bbox_loss.sum(dim=-1)

        # loss weights has no gradients
        loss_dict['rcnn_cls_loss'] = rcnn_cls_loss
        loss_dict['rcnn_bbox_loss'] = rcnn_bbox_loss

        # add rcnn_cls_targets to get the statics of rpn
        loss_dict['rcnn_cls_targets'] = rcnn_cls_targets

        return loss_dict