def get_bboxes(self, cls_preds, reg_preds, img_metas): """Decode the outputs to bboxes. Args: cls_preds (Tensor): Shape (num_imgs, num_points, num_classes). reg_preds (Tensor): Shape (num_imgs, num_points, 4 * (regmax + 1)). img_metas (dict): Dict of image info. Returns: results_list (list[tuple]): List of detection bboxes and labels. """ device = cls_preds.device b = cls_preds.shape[0] input_height, input_width = img_metas["img"].shape[2:] input_shape = (input_height, input_width) featmap_sizes = [(math.ceil(input_height / stride), math.ceil(input_width) / stride) for stride in self.strides] # get grid cells of one image mlvl_center_priors = [ self.get_single_level_center_priors( b, featmap_sizes[i], stride, dtype=torch.float32, device=device, ) for i, stride in enumerate(self.strides) ] center_priors = torch.cat(mlvl_center_priors, dim=1) dis_preds = self.distribution_project(reg_preds) * center_priors[..., 2, None] bboxes = distance2bbox(center_priors[..., :2], dis_preds, max_shape=input_shape) scores = cls_preds.sigmoid() result_list = [] for i in range(b): # add a dummy background class at the end of all labels # same with mmdetection2.0 score, bbox = scores[i], bboxes[i] padding = score.new_zeros(score.shape[0], 1) score = torch.cat([score, padding], dim=1) results = multiclass_nms( bbox, score, score_thr=0.05, nms_cfg=dict(type="nms", iou_threshold=0.6), max_num=100, ) result_list.append(results) return result_list
def get_bboxes_single( self, cls_scores, bbox_preds, mlvl_anchors, img_shape, # input shape!!!! scale_factor, rescale=False): assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) mlvl_bboxes = [] mlvl_scores = [] for stride, cls_score, bbox_pred, anchors in zip( self.anchor_strides, cls_scores, bbox_preds, mlvl_anchors): assert cls_score.size()[-2:] == bbox_pred.size()[-2:] scores = cls_score.permute(1, 2, 0).reshape( -1, self.cls_out_channels).sigmoid() bbox_pred = bbox_pred.permute(1, 2, 0) bbox_pred = self.distribution_project(bbox_pred) * stride # nms_pre = cfg.get('nms_pre', -1) nms_pre = 1000 if nms_pre > 0 and scores.shape[0] > nms_pre: max_scores, _ = scores.max(dim=1) _, topk_inds = max_scores.topk(nms_pre) anchors = anchors[topk_inds, :] bbox_pred = bbox_pred[topk_inds, :] scores = scores[topk_inds, :] bboxes = distance2bbox(self.anchor_center(anchors), bbox_pred, max_shape=img_shape) mlvl_bboxes.append(bboxes) mlvl_scores.append(scores) mlvl_bboxes = torch.cat(mlvl_bboxes) if rescale: mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) mlvl_scores = torch.cat(mlvl_scores) # add a dummy background class at the end of all labels, same with mmdetection2.0 padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores, score_thr=0.05, nms_cfg=dict( type='nms', iou_threshold=0.6), max_num=100) return det_bboxes, det_labels
def get_bboxes_single(self, cls_scores, bbox_preds, img_shape, scale_factor, device, rescale=False): """ Decode output tensors to bboxes on one image. :param cls_scores: classification prediction tensors of all stages :param bbox_preds: regression prediction tensors of all stages :param img_shape: shape of input image :param scale_factor: scale factor of boxes :param device: device of the tensor :return: predict boxes and labels """ assert len(cls_scores) == len(bbox_preds) mlvl_bboxes = [] mlvl_scores = [] for stride, cls_score, bbox_pred in zip(self.strides, cls_scores, bbox_preds): assert cls_score.size()[-2:] == bbox_pred.size()[-2:] featmap_size = cls_score.size()[-2:] y, x = self.get_single_level_center_point(featmap_size, stride, cls_score.dtype, device, flatten=True) center_points = torch.stack([x, y], dim=-1) scores = cls_score.permute(1, 2, 0).reshape( -1, self.cls_out_channels).sigmoid() bbox_pred = bbox_pred.permute(1, 2, 0) bbox_pred = self.distribution_project(bbox_pred) * stride nms_pre = 1000 if scores.shape[0] > nms_pre: max_scores, _ = scores.max(dim=1) _, topk_inds = max_scores.topk(nms_pre) center_points = center_points[topk_inds, :] bbox_pred = bbox_pred[topk_inds, :] scores = scores[topk_inds, :] bboxes = distance2bbox(center_points, bbox_pred, max_shape=img_shape) mlvl_bboxes.append(bboxes) mlvl_scores.append(scores) mlvl_bboxes = torch.cat(mlvl_bboxes) if rescale: mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) mlvl_scores = torch.cat(mlvl_scores) # add a dummy background class at the end of all labels, same with mmdetection2.0 padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores, score_thr=0.05, nms_cfg=dict( type='nms', iou_threshold=0.6), max_num=100) return det_bboxes, det_labels
def loss_single(self, grid_cells, cls_score, bbox_pred, labels, label_weights, bbox_targets, stride, num_total_samples): grid_cells = grid_cells.reshape(-1, 4) cls_score = cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4 * (self.reg_max + 1)) bbox_targets = bbox_targets.reshape(-1, 4) labels = labels.reshape(-1) label_weights = label_weights.reshape(-1) # FG cat_id: [0, num_classes -1], BG cat_id: num_classes bg_class_ind = self.num_classes pos_inds = torch.nonzero( (labels >= 0) & (labels < bg_class_ind), as_tuple=False).squeeze(1) # 找到标签大于0且标签小于类数目的索引 score = label_weights.new_zeros(labels.shape) if len(pos_inds) > 0: pos_bbox_targets = bbox_targets[pos_inds] pos_bbox_pred = bbox_pred[pos_inds] # (n, 4 * (reg_max + 1)) pos_grid_cells = grid_cells[pos_inds] pos_grid_cell_centers = self.grid_cells_to_center( pos_grid_cells) / stride weight_targets = cls_score.detach().sigmoid() weight_targets = weight_targets.max(dim=1)[0][pos_inds] pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred) pos_decode_bbox_pred = distance2bbox(pos_grid_cell_centers, pos_bbox_pred_corners) pos_decode_bbox_targets = pos_bbox_targets / stride score[pos_inds] = bbox_overlaps(pos_decode_bbox_pred.detach(), pos_decode_bbox_targets, is_aligned=True) pred_corners = pos_bbox_pred.reshape(-1, self.reg_max + 1) target_corners = bbox2distance(pos_grid_cell_centers, pos_decode_bbox_targets, self.reg_max).reshape(-1) # regression loss loss_bbox = self.loss_bbox(pos_decode_bbox_pred, pos_decode_bbox_targets, weight=weight_targets, avg_factor=1.0) # dfl loss loss_dfl = self.loss_dfl(pred_corners, target_corners, weight=weight_targets[:, None].expand( -1, 4).reshape(-1), avg_factor=4.0) else: loss_bbox = bbox_pred.sum() * 0 loss_dfl = bbox_pred.sum() * 0 weight_targets = torch.tensor(0).to(cls_score.device) # qfl loss loss_qfl = self.loss_qfl(cls_score, (labels, score), weight=label_weights, avg_factor=num_total_samples) return loss_qfl, loss_bbox, loss_dfl, weight_targets.sum()
def loss(self, preds, gt_meta, aux_preds=None): """Compute losses. Args: preds (Tensor): Prediction output. gt_meta (dict): Ground truth information. aux_preds (tuple[Tensor], optional): Auxiliary head prediction output. Returns: loss (Tensor): Loss tensor. loss_states (dict): State dict of each loss. """ gt_bboxes = gt_meta["gt_bboxes"] gt_labels = gt_meta["gt_labels"] device = preds.device batch_size = preds.shape[0] input_height, input_width = gt_meta["img"].shape[2:] featmap_sizes = [(math.ceil(input_height / stride), math.ceil(input_width) / stride) for stride in self.strides] # get grid cells of one image mlvl_center_priors = [ self.get_single_level_center_priors( batch_size, featmap_sizes[i], stride, dtype=torch.float32, device=device, ) for i, stride in enumerate(self.strides) ] center_priors = torch.cat(mlvl_center_priors, dim=1) cls_preds, reg_preds = preds.split( [self.num_classes, 4 * (self.reg_max + 1)], dim=-1) dis_preds = self.distribution_project(reg_preds) * center_priors[..., 2, None] decoded_bboxes = distance2bbox(center_priors[..., :2], dis_preds) if aux_preds is not None: # use auxiliary head to assign aux_cls_preds, aux_reg_preds = aux_preds.split( [self.num_classes, 4 * (self.reg_max + 1)], dim=-1) aux_dis_preds = (self.distribution_project(aux_reg_preds) * center_priors[..., 2, None]) aux_decoded_bboxes = distance2bbox(center_priors[..., :2], aux_dis_preds) batch_assign_res = multi_apply( self.target_assign_single_img, aux_cls_preds.detach(), center_priors, aux_decoded_bboxes.detach(), gt_bboxes, gt_labels, ) else: # use self prediction to assign batch_assign_res = multi_apply( self.target_assign_single_img, cls_preds.detach(), center_priors, decoded_bboxes.detach(), gt_bboxes, gt_labels, ) loss, loss_states = self._get_loss_from_assign(cls_preds, reg_preds, decoded_bboxes, batch_assign_res) if aux_preds is not None: aux_loss, aux_loss_states = self._get_loss_from_assign( aux_cls_preds, aux_reg_preds, aux_decoded_bboxes, batch_assign_res) loss = loss + aux_loss for k, v in aux_loss_states.items(): loss_states["aux_" + k] = v return loss, loss_states