def _get_bboxes_single(self,
                       cls_scores,
                       bbox_preds,
                       mlvl_points,
                       img_shape,
                       scale_factor,
                       cfg,
                       rescale=False,
                       with_nms=True):
    """Transform outputs for a single batch item into bbox predictions.

    Args:
        cls_scores (list[Tensor]): Box iou-aware scores for a single scale
            level with shape (num_points * num_classes, H, W).
        bbox_preds (list[Tensor]): Box offsets for a single scale
            level with shape (num_points * 4, H, W).
        mlvl_points (list[Tensor]): Box reference for a single scale level
            with shape (num_total_points, 2).
        img_shape (tuple[int]): Shape of the input image,
            (height, width, 3).
        scale_factor (ndarray): Scale factor of the image arranged as
            (w_scale, h_scale, w_scale, h_scale).
        cfg (mmcv.Config | None): Test / postprocessing configuration,
            if None, test_cfg would be used.
        rescale (bool): If True, return boxes in original image space.
            Default: False.
        with_nms (bool): If True, do nms before returning boxes.
            Default: True.

    Returns:
        tuple(Tensor):
            det_bboxes (Tensor): BBox predictions in shape (n, 5), where
                the first 4 columns are bounding box positions
                (tl_x, tl_y, br_x, br_y) and the 5-th column is a score
                between 0 and 1.
            det_labels (Tensor): A (n,) tensor where each item is the
                predicted class label of the corresponding box.
    """
    cfg = self.test_cfg if cfg is None else cfg
    assert len(cls_scores) == len(bbox_preds) == len(mlvl_points)
    mlvl_bboxes = []
    mlvl_scores = []
    for cls_score, bbox_pred, points in zip(cls_scores, bbox_preds,
                                            mlvl_points):
        assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
        scores = cls_score.permute(1, 2, 0).reshape(
            -1, self.cls_out_channels).contiguous().sigmoid()
        bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4).contiguous()

        nms_pre = cfg.get('nms_pre', -1)
        if 0 < nms_pre < scores.shape[0]:
            max_scores, _ = scores.max(dim=1)
            _, topk_inds = max_scores.topk(nms_pre)
            points = points[topk_inds, :]
            bbox_pred = bbox_pred[topk_inds, :]
            scores = scores[topk_inds, :]
        bboxes = distance2bbox(points, bbox_pred, max_shape=img_shape)
        mlvl_bboxes.append(bboxes)
        mlvl_scores.append(scores)
    mlvl_bboxes = torch.cat(mlvl_bboxes)
    if rescale:
        mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
    mlvl_scores = torch.cat(mlvl_scores)
    padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
    # remind that we set FG labels to [0, num_class-1] since mmdet v2.0
    # BG cat_id: num_class
    mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
    if with_nms:
        det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores,
                                                cfg.score_thr, cfg.nms,
                                                cfg.max_per_img)
        return det_bboxes, det_labels
    else:
        return mlvl_bboxes, mlvl_scores
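# Hedged sketch (not the mmdet source): distance2bbox, used above, decodes
# per-point (left, top, right, bottom) distances into (x1, y1, x2, y2) boxes,
# optionally clamping them to the image. The name distance2bbox_sketch is
# illustrative only; it assumes points of shape (N, 2) and max_shape given as
# (height, width, ...).
import torch


def distance2bbox_sketch(points, distance, max_shape=None):
    # points: (N, 2) reference centers; distance: (N, 4) l/t/r/b offsets
    x1 = points[:, 0] - distance[:, 0]
    y1 = points[:, 1] - distance[:, 1]
    x2 = points[:, 0] + distance[:, 2]
    y2 = points[:, 1] + distance[:, 3]
    if max_shape is not None:
        # clamp the decoded boxes to the image boundary
        x1 = x1.clamp(min=0, max=max_shape[1])
        y1 = y1.clamp(min=0, max=max_shape[0])
        x2 = x2.clamp(min=0, max=max_shape[1])
        y2 = y2.clamp(min=0, max=max_shape[0])
    return torch.stack([x1, y1, x2, y2], dim=-1)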
def loss(self,
         cls_scores,
         bbox_preds,
         bbox_preds_refine,
         gt_bboxes,
         gt_labels,
         img_metas,
         gt_bboxes_ignore=None):
    """Compute loss of the head.

    Args:
        cls_scores (list[Tensor]): Box iou-aware scores for each scale
            level, each is a 4D-tensor, the channel number is
            num_points * num_classes.
        bbox_preds (list[Tensor]): Box offsets for each scale level, each
            is a 4D-tensor, the channel number is num_points * 4.
        bbox_preds_refine (list[Tensor]): Refined Box offsets for each
            scale level, each is a 4D-tensor, the channel number is
            num_points * 4.
        gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
            shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
        gt_labels (list[Tensor]): class indices corresponding to each box.
        img_metas (list[dict]): Meta information of each image, e.g.,
            image size, scaling factor, etc.
        gt_bboxes_ignore (None | list[Tensor]): specify which bounding
            boxes can be ignored when computing the loss. Default: None.

    Returns:
        dict[str, Tensor]: A dictionary of loss components.
    """
    assert len(cls_scores) == len(bbox_preds) == len(bbox_preds_refine)
    featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
    all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype,
                                       bbox_preds[0].device)
    labels, label_weights, bbox_targets, bbox_weights = self.get_targets(
        cls_scores, all_level_points, gt_bboxes, gt_labels, img_metas,
        gt_bboxes_ignore)

    num_imgs = cls_scores[0].size(0)
    # flatten cls_scores, bbox_preds and bbox_preds_refine
    flatten_cls_scores = [
        cls_score.permute(0, 2, 3,
                          1).reshape(-1,
                                     self.cls_out_channels).contiguous()
        for cls_score in cls_scores
    ]
    flatten_bbox_preds = [
        bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4).contiguous()
        for bbox_pred in bbox_preds
    ]
    flatten_bbox_preds_refine = [
        bbox_pred_refine.permute(0, 2, 3, 1).reshape(-1, 4).contiguous()
        for bbox_pred_refine in bbox_preds_refine
    ]
    flatten_cls_scores = torch.cat(flatten_cls_scores)
    flatten_bbox_preds = torch.cat(flatten_bbox_preds)
    flatten_bbox_preds_refine = torch.cat(flatten_bbox_preds_refine)
    flatten_labels = torch.cat(labels)
    flatten_bbox_targets = torch.cat(bbox_targets)
    # repeat points to align with bbox_preds
    flatten_points = torch.cat(
        [points.repeat(num_imgs, 1) for points in all_level_points])

    # FG cat_id: [0, num_classes - 1], BG cat_id: num_classes
    bg_class_ind = self.num_classes
    pos_inds = torch.where(
        ((flatten_labels >= 0) & (flatten_labels < bg_class_ind)) > 0)[0]
    num_pos = len(pos_inds)

    pos_bbox_preds = flatten_bbox_preds[pos_inds]
    pos_bbox_preds_refine = flatten_bbox_preds_refine[pos_inds]
    pos_labels = flatten_labels[pos_inds]

    # sync num_pos across all gpus
    if self.sync_num_pos:
        num_pos_avg_per_gpu = reduce_mean(
            pos_inds.new_tensor(num_pos).float()).item()
        num_pos_avg_per_gpu = max(num_pos_avg_per_gpu, 1.0)
    else:
        num_pos_avg_per_gpu = num_pos

    if num_pos > 0:
        pos_bbox_targets = flatten_bbox_targets[pos_inds]
        pos_points = flatten_points[pos_inds]

        pos_decoded_bbox_preds = distance2bbox(pos_points, pos_bbox_preds)
        pos_decoded_target_preds = distance2bbox(pos_points,
                                                 pos_bbox_targets)
        iou_targets_ini = bbox_overlaps(
            pos_decoded_bbox_preds,
            pos_decoded_target_preds.detach(),
            is_aligned=True).clamp(min=1e-6)
        bbox_weights_ini = iou_targets_ini.clone().detach()
        iou_targets_ini_avg_per_gpu = reduce_mean(
            bbox_weights_ini.sum()).item()
        bbox_avg_factor_ini = max(iou_targets_ini_avg_per_gpu, 1.0)
        loss_bbox = self.loss_bbox(
            pos_decoded_bbox_preds,
            pos_decoded_target_preds.detach(),
            weight=bbox_weights_ini,
            avg_factor=bbox_avg_factor_ini)

        pos_decoded_bbox_preds_refine = \
            distance2bbox(pos_points, pos_bbox_preds_refine)
        iou_targets_rf = bbox_overlaps(
            pos_decoded_bbox_preds_refine,
            pos_decoded_target_preds.detach(),
            is_aligned=True).clamp(min=1e-6)
        bbox_weights_rf = iou_targets_rf.clone().detach()
        iou_targets_rf_avg_per_gpu = reduce_mean(
            bbox_weights_rf.sum()).item()
        bbox_avg_factor_rf = max(iou_targets_rf_avg_per_gpu, 1.0)
        loss_bbox_refine = self.loss_bbox_refine(
            pos_decoded_bbox_preds_refine,
            pos_decoded_target_preds.detach(),
            weight=bbox_weights_rf,
            avg_factor=bbox_avg_factor_rf)

        # build IoU-aware cls_score targets
        if self.use_vfl:
            pos_ious = iou_targets_rf.clone().detach()
            cls_iou_targets = torch.zeros_like(flatten_cls_scores)
            cls_iou_targets[pos_inds, pos_labels] = pos_ious
    else:
        loss_bbox = pos_bbox_preds.sum() * 0
        loss_bbox_refine = pos_bbox_preds_refine.sum() * 0
        if self.use_vfl:
            cls_iou_targets = torch.zeros_like(flatten_cls_scores)

    if self.use_vfl:
        loss_cls = self.loss_cls(
            flatten_cls_scores,
            cls_iou_targets,
            avg_factor=num_pos_avg_per_gpu)
    else:
        loss_cls = self.loss_cls(
            flatten_cls_scores,
            flatten_labels,
            weight=label_weights,
            avg_factor=num_pos_avg_per_gpu)

    return dict(
        loss_cls=loss_cls,
        loss_bbox=loss_bbox,
        loss_bbox_rf=loss_bbox_refine)
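# Hedged sketch (assumed helper, not copied from mmdet): reduce_mean, used
# above for sync_num_pos and for the IoU-based avg factors, averages a scalar
# tensor across all GPUs so that every rank normalizes its loss by the same
# factor. reduce_mean_sketch is an illustrative name.
import torch.distributed as dist


def reduce_mean_sketch(tensor):
    # fall back to the input when not running in distributed mode
    if not (dist.is_available() and dist.is_initialized()):
        return tensor
    tensor = tensor.clone()
    # divide by the world size first, then sum-reduce -> mean across ranks
    dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM)
    return tensor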
def loss_single(self, anchors, cls_score, bbox_pred, labels, label_weights,
                bbox_targets, stride, num_total_samples):
    """Compute loss of a single scale level.

    Args:
        anchors (Tensor): Box reference for each scale level with shape
            (N, num_total_anchors, 4).
        cls_score (Tensor): Cls and quality joint scores for each scale
            level has shape (N, num_classes, H, W).
        bbox_pred (Tensor): Box distribution logits for each scale
            level with shape (N, 4*(n+1), H, W), n is max value of integral
            set.
        labels (Tensor): Labels of each anchor with shape
            (N, num_total_anchors).
        label_weights (Tensor): Label weights of each anchor with shape
            (N, num_total_anchors).
        bbox_targets (Tensor): BBox regression targets of each anchor with
            shape (N, num_total_anchors, 4).
        stride (tuple): Stride in this scale level.
        num_total_samples (int): Number of positive samples that is
            reduced over all GPUs.

    Returns:
        tuple[Tensor]: loss_cls, loss_bbox, loss_dfl and the sum of the
            quality weights, which is used later to normalize the losses.
    """
    assert stride[0] == stride[1], 'h stride is not equal to w stride!'
    anchors = anchors.reshape(-1, 4)
    cls_score = cls_score.permute(0, 2, 3,
                                  1).reshape(-1, self.cls_out_channels)
    bbox_pred = bbox_pred.permute(0, 2, 3,
                                  1).reshape(-1, 4 * (self.reg_max + 1))
    bbox_targets = bbox_targets.reshape(-1, 4)
    labels = labels.reshape(-1)
    label_weights = label_weights.reshape(-1)

    # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
    bg_class_ind = self.num_classes
    pos_inds = ((labels >= 0)
                & (labels < bg_class_ind)).nonzero().squeeze(1)
    score = label_weights.new_zeros(labels.shape)

    if len(pos_inds) > 0:
        pos_bbox_targets = bbox_targets[pos_inds]  # GT bbox values, image scale
        pos_bbox_pred = bbox_pred[pos_inds]  # distribution logits, feature-map scale
        pos_anchors = anchors[pos_inds]  # square anchors, image scale
        # centers of the positive anchors, converted to feature-map scale
        pos_anchor_centers = self.anchor_center(pos_anchors) / stride[0]

        weight_targets = cls_score.detach().sigmoid()
        # score of the predicted (max-scoring) class
        weight_targets = weight_targets.max(dim=1)[0][pos_inds]
        # convert the distribution to float distances to the 4 box sides;
        # note these are feature-map-scale values, not image-scale values
        # (the two differ only by the stride factor)
        pos_bbox_pred_corners = self.integral(pos_bbox_pred)
        # decode back to bbox coordinates, feature-map scale
        pos_decode_bbox_pred = distance2bbox(pos_anchor_centers,
                                             pos_bbox_pred_corners)
        # convert targets from image scale to feature-map scale
        pos_decode_bbox_targets = pos_bbox_targets / stride[0]
        # compute IoU; is_aligned=True means the two sets of boxes are
        # matched one-to-one
        score[pos_inds] = bbox_overlaps(
            pos_decode_bbox_pred.detach(),
            pos_decode_bbox_targets,
            is_aligned=True)
        pred_corners = pos_bbox_pred.reshape(-1, self.reg_max + 1)
        # encode GT bboxes as distances to the 4 sides, feature-map scale
        target_corners = bbox2distance(pos_anchor_centers,
                                       pos_decode_bbox_targets,
                                       self.reg_max).reshape(-1)

        # regression loss; note the classification score is used as the
        # weight to further enforce cls-reg consistency
        loss_bbox = self.loss_bbox(
            pos_decode_bbox_pred,
            pos_decode_bbox_targets,
            weight=weight_targets,
            avg_factor=1.0)

        # DFL loss; again weighted by the classification score to enforce
        # consistency
        loss_dfl = self.loss_dfl(
            pred_corners,
            target_corners,
            weight=weight_targets[:, None].expand(-1, 4).reshape(-1),
            avg_factor=4.0)
    else:
        loss_bbox = bbox_pred.sum() * 0
        loss_dfl = bbox_pred.sum() * 0
        weight_targets = bbox_pred.new_tensor(0.)

    # cls (QFL) loss; note it takes the IoU score as the soft target
    loss_cls = self.loss_cls(
        cls_score, (labels, score),
        weight=label_weights,
        avg_factor=num_total_samples)

    return loss_cls, loss_bbox, loss_dfl, weight_targets.sum()
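# Hedged sketch (assumed behavior, not copied from mmdet): self.integral above
# converts each side's (reg_max + 1)-bin distribution into its expected value,
# i.e. a softmax over the bins followed by a weighted sum with bin indices
# 0..reg_max. IntegralSketch and its attributes are illustrative names only.
import torch
import torch.nn as nn
import torch.nn.functional as F


class IntegralSketch(nn.Module):

    def __init__(self, reg_max=16):
        super().__init__()
        self.reg_max = reg_max
        # bin values 0, 1, ..., reg_max used as projection weights
        self.register_buffer('project',
                             torch.linspace(0, reg_max, reg_max + 1))

    def forward(self, x):
        # x: (N, 4 * (reg_max + 1)) distribution logits
        x = F.softmax(x.reshape(-1, self.reg_max + 1), dim=1)
        # expectation over the bins -> (N, 4) float distances, feature-map scale
        return (x * self.project).sum(dim=1).reshape(-1, 4)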
def _get_bboxes_single(self,
                       cls_scores,
                       bbox_preds,
                       mlvl_anchors,
                       img_shape,
                       scale_factor,
                       cfg,
                       rescale=False,
                       with_nms=True):
    """Transform outputs for a single batch item into labeled boxes.

    Args:
        cls_scores (list[Tensor]): Box scores for a single scale level
            has shape (num_classes, H, W).
        bbox_preds (list[Tensor]): Box distribution logits for a single
            scale level with shape (4*(n+1), H, W), n is max value of
            integral set.
        mlvl_anchors (list[Tensor]): Box reference for a single scale level
            with shape (num_total_anchors, 4).
        img_shape (tuple[int]): Shape of the input image,
            (height, width, 3).
        scale_factor (ndarray): Scale factor of the image arranged as
            (w_scale, h_scale, w_scale, h_scale).
        cfg (mmcv.Config | None): Test / postprocessing configuration,
            if None, test_cfg would be used.
        rescale (bool): If True, return boxes in original image space.
            Default: False.
        with_nms (bool): If True, do nms before returning boxes.
            Default: True.

    Returns:
        tuple(Tensor):
            det_bboxes (Tensor): Bbox predictions in shape (N, 5), where
                the first 4 columns are bounding box positions
                (tl_x, tl_y, br_x, br_y) and the 5-th column is a score
                between 0 and 1.
            det_labels (Tensor): A (N,) tensor where each item is the
                predicted class label of the corresponding box.
    """
    cfg = self.test_cfg if cfg is None else cfg
    assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors)
    mlvl_bboxes = []
    mlvl_scores = []
    for cls_score, bbox_pred, stride, anchors in zip(
            cls_scores, bbox_preds, self.anchor_generator.strides,
            mlvl_anchors):
        assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
        assert stride[0] == stride[1]

        scores = cls_score.permute(1, 2, 0).reshape(
            -1, self.cls_out_channels).sigmoid()
        bbox_pred = bbox_pred.permute(1, 2, 0)
        # the integral output is in feature-map scale;
        # multiply by the stride to convert it to image scale
        bbox_pred = self.integral(bbox_pred) * stride[0]

        nms_pre = cfg.get('nms_pre', -1)
        if nms_pre > 0 and scores.shape[0] > nms_pre:
            max_scores, _ = scores.max(dim=1)
            _, topk_inds = max_scores.topk(nms_pre)
            anchors = anchors[topk_inds, :]
            bbox_pred = bbox_pred[topk_inds, :]
            scores = scores[topk_inds, :]

        bboxes = distance2bbox(
            self.anchor_center(anchors), bbox_pred, max_shape=img_shape)
        mlvl_bboxes.append(bboxes)
        mlvl_scores.append(scores)

    mlvl_bboxes = torch.cat(mlvl_bboxes)
    if rescale:
        mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)

    mlvl_scores = torch.cat(mlvl_scores)
    # Add a dummy background class to the backend when using sigmoid
    # remind that we set FG labels to [0, num_class-1] since mmdet v2.0
    # BG cat_id: num_class
    padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
    mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)

    if with_nms:
        det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores,
                                                cfg.score_thr, cfg.nms,
                                                cfg.max_per_img)
        return det_bboxes, det_labels
    else:
        return mlvl_bboxes, mlvl_scores
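# Hedged sketch (assumed behavior): self.anchor_center above reduces each
# (x1, y1, x2, y2) anchor to its center point, which GFL then uses as the
# reference location for distance2bbox. anchor_center_sketch is an
# illustrative name only.
import torch


def anchor_center_sketch(anchors):
    # anchors: (N, 4) in (x1, y1, x2, y2); returns (N, 2) centers (cx, cy)
    anchors_cx = (anchors[:, 2] + anchors[:, 0]) / 2
    anchors_cy = (anchors[:, 3] + anchors[:, 1]) / 2
    return torch.stack([anchors_cx, anchors_cy], dim=-1)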
def loss(self,
         cls_scores,
         bbox_preds,
         centernesses,
         gt_bboxes,
         gt_labels,
         img_metas,
         gt_bboxes_ignore=None):
    """Compute loss of the head.

    Args:
        cls_scores (list[Tensor]): Box scores for each scale level,
            each is a 4D-tensor, the channel number is
            num_points * num_classes. [batchsize,80,H_i,W_i]
        bbox_preds (list[Tensor]): Box energies / deltas for each scale
            level, each is a 4D-tensor, the channel number is
            num_points * 4. [batchsize,4,H_i,W_i]
        centernesses (list[Tensor]): Centerness for each scale level, each
            is a 4D-tensor, the channel number is num_points * 1.
            [batchsize,1,H_i,W_i]
        gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
            shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            [batchsize][num_obj,4]
        gt_labels (list[Tensor]): class indices corresponding to each box.
            [batchsize][num_obj]
        img_metas (list[dict]): Meta information of each image, e.g.,
            image size, scaling factor, etc.
            [batchsize][(dict)dict_keys(['filename', 'ori_shape',
            'img_shape', 'pad_shape', 'scale_factor', 'flip',
            'img_norm_cfg'])]
        gt_bboxes_ignore (None | list[Tensor]): specify which bounding
            boxes can be ignored when computing the loss.

    Returns:
        dict[str, Tensor]: A dictionary of loss components.
    """
    assert len(cls_scores) == len(bbox_preds) == len(centernesses)
    # sizes of the P3-P7 feature maps, e.g.
    # [torch.Size([100, 152]), torch.Size([50, 76]), torch.Size([25, 38]),
    #  torch.Size([13, 19]), torch.Size([7, 10])]
    featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
    # The feature-map size determines how coarse the grid over the original
    # image is: each feature-map pixel maps back to the center of its grid
    # cell, so feature maps of different sizes define grids of different
    # granularity.
    # bbox_preds[0].dtype: torch.float32
    # all_level_points: (list) [5][n_points][2]
    all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype,
                                       bbox_preds[0].device)
    labels, bbox_targets = self.get_targets(all_level_points, gt_bboxes,
                                            gt_labels)

    if self.debug:
        is_upsample = False  # whether to use the upsampling visualization mode
        circle_ratio = [2, 4, 6, 8, 8]  # visualize positive-sample regions
        # iterate over each image
        batch_size = cls_scores[0].shape[0]
        for i in range(batch_size):
            gt_bbox = gt_bboxes[i]
            img_meta = img_metas[i]
            img = img_meta['img'].data.numpy()
            mean = img_meta['img_norm_cfg']['mean']
            std = img_meta['img_norm_cfg']['std']
            # the input is BGR data
            img = np.transpose(img.copy(), (1, 2, 0))
            img = img * std.reshape([1, 1, 3]) + mean.reshape([1, 1, 3])
            img = img.astype(np.uint8)
            img = cv_core.show_bbox(
                img,
                gt_bbox.cpu().numpy(),
                is_show=False,
                thickness=2,
                color=(255, 255, 255))

            # iterate over each output level
            disp_img = []
            for j in range(len(labels)):
                bbox_target = bbox_targets[j]
                bbox_target = bbox_target[
                    i * len(bbox_target) // batch_size:(i + 1) *
                    len(bbox_target) // batch_size] * self.strides[j]
                label = labels[j]
                level_label = label[i * len(label) // batch_size:(i + 1) *
                                    len(label) // batch_size]
                print(bbox_target[(level_label >= 0)
                                  & (level_label < self.num_classes)])
                level_label = level_label.view(featmap_sizes[j][0],
                                               featmap_sizes[j][1])
                # get positive-sample locations
                pos_mask = (level_label >= 0) & (
                    level_label < self.num_classes)
                if pos_mask.is_cuda:
                    pos_mask = pos_mask.cpu()
                pos_mask = pos_mask.data.numpy()
                # map the feature-map mask back to the original image size
                if is_upsample:
                    # first rescale by the stride
                    pos_mask = (pos_mask * 255).astype(np.uint8)
                    pos_mask = cv_core.imrescale(
                        pos_mask, self.strides[j], interpolation="nearest")
                    # then crop
                    resize_pos_mask = pos_mask[0:img.shape[0],
                                               0:img.shape[1]]
                    # convert to BGR so it can be blended with the image
                    resize_pos_mask = cv_core.gray2bgr(resize_pos_mask)
                    img_ = cv2.addWeighted(img, 0.5, resize_pos_mask, 0.5,
                                           0)
                else:
                    pos_mask = pos_mask.astype(np.uint8)
                    index = pos_mask.nonzero()
                    index_yx = np.stack(index, axis=1)
                    # map back to the original image scale
                    pos_img_yx = index_yx * self.strides[j] + \
                        self.strides[j] // 2
                    img_ = img.copy()
                    # circle mode
                    for z in range(pos_img_yx.shape[0]):
                        point = (int(pos_img_yx[z, 1]),
                                 int(pos_img_yx[z, 0]))
                        cv2.circle(img_, point, 1, (0, 0, 255),
                                   circle_ratio[j])
                    # point mode
                    # img_[pos_img_yx[:, 0], pos_img_yx[:, 1], :] = (0, 255, 0)
                disp_img.append(img_)
            cv_core.show_img(disp_img)

    num_imgs = cls_scores[0].size(0)
    # flatten cls_scores, bbox_preds and centerness
    flatten_cls_scores = [
        cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels)
        for cls_score in cls_scores
    ]
    flatten_bbox_preds = [
        bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
        for bbox_pred in bbox_preds
    ]
    flatten_centerness = [
        centerness.permute(0, 2, 3, 1).reshape(-1)
        for centerness in centernesses
    ]
    flatten_cls_scores = torch.cat(flatten_cls_scores)
    flatten_bbox_preds = torch.cat(flatten_bbox_preds)
    flatten_centerness = torch.cat(flatten_centerness)
    flatten_labels = torch.cat(labels)
    flatten_bbox_targets = torch.cat(bbox_targets)
    # repeat points to align with bbox_preds
    flatten_points = torch.cat(
        [points.repeat(num_imgs, 1) for points in all_level_points])

    # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
    bg_class_ind = self.num_classes
    pos_inds = ((flatten_labels >= 0)
                & (flatten_labels < bg_class_ind)).nonzero().reshape(-1)
    num_pos = len(pos_inds)
    loss_cls = self.loss_cls(
        flatten_cls_scores, flatten_labels,
        avg_factor=num_pos + num_imgs)  # avoid num_pos being 0

    pos_bbox_preds = flatten_bbox_preds[pos_inds]
    pos_centerness = flatten_centerness[pos_inds]

    if num_pos > 0:
        pos_bbox_targets = flatten_bbox_targets[pos_inds]
        pos_centerness_targets = self.centerness_target(pos_bbox_targets)
        pos_points = flatten_points[pos_inds]
        pos_decoded_bbox_preds = distance2bbox(pos_points, pos_bbox_preds)
        pos_decoded_target_preds = distance2bbox(pos_points,
                                                 pos_bbox_targets)
        # centerness weighted iou loss
        loss_bbox = self.loss_bbox(
            pos_decoded_bbox_preds,
            pos_decoded_target_preds,
            weight=pos_centerness_targets,
            avg_factor=pos_centerness_targets.sum())
        loss_centerness = self.loss_centerness(pos_centerness,
                                               pos_centerness_targets)
    else:
        loss_bbox = pos_bbox_preds.sum()
        loss_centerness = pos_centerness.sum()

    return dict(
        loss_cls=loss_cls,
        loss_bbox=loss_bbox,
        loss_centerness=loss_centerness)
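# Hedged sketch (assumed behavior, matching the FCOS paper definition):
# self.centerness_target above maps each positive sample's (l, t, r, b)
# regression target to sqrt((min(l, r) / max(l, r)) * (min(t, b) / max(t, b))),
# which is 1 at the box center and decays toward the box borders.
# centerness_target_sketch is an illustrative name only.
import torch


def centerness_target_sketch(pos_bbox_targets):
    # pos_bbox_targets: (num_pos, 4) distances to the left/top/right/bottom sides
    left_right = pos_bbox_targets[:, [0, 2]]
    top_bottom = pos_bbox_targets[:, [1, 3]]
    centerness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * (
        top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])
    return torch.sqrt(centerness)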