def kmeans_anchors(self):
    self.logger.info(
        f'Start clustering {self.num_anchors} YOLO anchors with K-means...')
    bboxes = self.get_zero_center_bbox_tensor()
    # Randomly pick initial cluster centers from the candidate boxes.
    cluster_center_idx = torch.randint(
        0, bboxes.shape[0], (self.num_anchors, )).to(self.device)

    assignments = torch.zeros((bboxes.shape[0], )).to(self.device)
    cluster_centers = bboxes[cluster_center_idx]
    if self.num_anchors == 1:
        # With a single anchor, one maximization step suffices.
        cluster_centers = self.kmeans_maximization(bboxes, assignments,
                                                   cluster_centers)
        anchors = bbox_xyxy_to_cxcywh(cluster_centers)[:, 2:].cpu().numpy()
        anchors = sorted(anchors, key=lambda x: x[0] * x[1])
        return anchors

    prog_bar = mmcv.ProgressBar(self.iters)
    for i in range(self.iters):
        # E-step: reassign boxes to their nearest cluster center.
        converged, assignments = self.kmeans_expectation(
            bboxes, assignments, cluster_centers)
        if converged:
            self.logger.info(f'K-means process has converged at iter {i}.')
            break
        # M-step: recompute cluster centers from the new assignments.
        cluster_centers = self.kmeans_maximization(bboxes, assignments,
                                                   cluster_centers)
        prog_bar.update()
    print('\n')
    # Mean best-match IoU between boxes and anchors as a quality check.
    avg_iou = bbox_overlaps(bboxes,
                            cluster_centers).max(1)[0].mean().item()
    anchors = bbox_xyxy_to_cxcywh(cluster_centers)[:, 2:].cpu().numpy()
    anchors = sorted(anchors, key=lambda x: x[0] * x[1])
    self.logger.info(f'Anchor clustering finished. Average IoU: {avg_iou}')
    return anchors
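
# A minimal, self-contained sketch of the (w, h) extraction performed on
# the converged cluster centers above. `xyxy_to_cxcywh` is a local
# stand-in for mmdet's bbox_xyxy_to_cxcywh; the box values are made up,
# assuming the zero-centered xyxy format get_zero_center_bbox_tensor()
# produces.
import torch


def xyxy_to_cxcywh(bbox):
    """(x1, y1, x2, y2) -> (cx, cy, w, h)."""
    x1, y1, x2, y2 = bbox.unbind(-1)
    return torch.stack(
        ((x1 + x2) / 2, (y1 + y2) / 2, x2 - x1, y2 - y1), dim=-1)


# Two zero-centered cluster centers: a 10x20 box and a 40x30 box.
centers = torch.tensor([[-5., -10., 5., 10.], [-20., -15., 20., 15.]])
anchors = xyxy_to_cxcywh(centers)[:, 2:].numpy()  # keep only (w, h)
anchors = sorted(anchors, key=lambda x: x[0] * x[1])  # smallest area first
# anchors == [array([10., 20.]), array([40., 30.])]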
def _get_target_single(self,
                       cls_score,
                       bbox_pred,
                       gt_bboxes,
                       gt_labels,
                       img_meta,
                       gt_bboxes_ignore=None):
    """Compute regression and classification targets for one image.

    Outputs from a single decoder layer of a single feature level are used.

    Args:
        cls_score (Tensor): Box score logits from a single decoder layer
            for one image. Shape [num_query, cls_out_channels].
        bbox_pred (Tensor): Sigmoid outputs from a single decoder layer
            for one image, with normalized coordinate (cx, cy, w, h) and
            shape [num_query, 4].
        gt_bboxes (Tensor): Ground truth bboxes for one image with
            shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
        gt_labels (Tensor): Ground truth class indices for one image
            with shape (num_gts, ).
        img_meta (dict): Meta information for one image.
        gt_bboxes_ignore (Tensor, optional): Bounding boxes
            which can be ignored. Default None.

    Returns:
        tuple[Tensor]: a tuple containing the following for one image.

            - labels (Tensor): Labels of each image.
            - label_weights (Tensor): Label weights of each image.
            - bbox_targets (Tensor): BBox targets of each image.
            - bbox_weights (Tensor): BBox weights of each image.
            - pos_inds (Tensor): Sampled positive indices for each image.
            - neg_inds (Tensor): Sampled negative indices for each image.
    """
    num_bboxes = bbox_pred.size(0)
    # assigner and sampler
    assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes,
                                         gt_labels, img_meta,
                                         gt_bboxes_ignore)
    sampling_result = self.sampler.sample(assign_result, bbox_pred,
                                          gt_bboxes)
    pos_inds = sampling_result.pos_inds
    neg_inds = sampling_result.neg_inds

    # label targets
    labels = gt_bboxes.new_full((num_bboxes, ),
                                self.num_classes,
                                dtype=torch.long)
    labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
    label_weights = gt_bboxes.new_ones(num_bboxes)

    # bbox targets
    bbox_targets = torch.zeros_like(bbox_pred)
    bbox_weights = torch.zeros_like(bbox_pred)
    bbox_weights[pos_inds] = 1.0
    img_h, img_w, _ = img_meta['img_shape']

    # DETR regresses the relative position of boxes (cxcywh) in the image,
    # so the learning target is normalized by the image size and the box
    # format is converted from the default x1y1x2y2 to cxcywh.
    factor = bbox_pred.new_tensor([img_w, img_h, img_w,
                                   img_h]).unsqueeze(0)
    pos_gt_bboxes_normalized = sampling_result.pos_gt_bboxes / factor
    pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized)
    bbox_targets[pos_inds] = pos_gt_bboxes_targets
    return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
            neg_inds)
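
# Hedged sketch of the target encoding at the end of _get_target_single:
# an absolute xyxy GT box is normalized by the image size, then converted
# to (cx, cy, w, h). All values here are illustrative, and the inline
# stack replaces the bbox_xyxy_to_cxcywh call.
import torch

img_h, img_w = 480, 640
gt_xyxy = torch.tensor([[100., 120., 300., 360.]])  # one positive GT box
factor = torch.tensor([img_w, img_h, img_w, img_h]).unsqueeze(0)
x1, y1, x2, y2 = (gt_xyxy / factor).unbind(-1)  # normalize to [0, 1]
target = torch.stack(
    ((x1 + x2) / 2, (y1 + y2) / 2, x2 - x1, y2 - y1), dim=-1)
# target == tensor([[0.3125, 0.5000, 0.3125, 0.5000]])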
def forward_train(self,
                  x,
                  proposal_boxes,
                  proposal_features,
                  img_metas,
                  gt_bboxes,
                  gt_labels,
                  gt_bboxes_ignore=None,
                  imgs_whwh=None,
                  gt_masks=None):
    """Forward function in training stage.

    Args:
        x (list[Tensor]): list of multi-level img features.
        proposal_boxes (Tensor): Decoded proposal bboxes, has shape
            (batch_size, num_proposals, 4)
        proposal_features (Tensor): Expanded proposal features, has shape
            (batch_size, num_proposals, proposal_feature_channel)
        img_metas (list[dict]): list of image info dict where each dict
            has: 'img_shape', 'scale_factor', 'flip', and may also contain
            'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For
            details on the values of these keys see
            `mmdet/datasets/pipelines/formatting.py:Collect`.
        gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
            shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
        gt_labels (list[Tensor]): class indices corresponding to each box
        gt_bboxes_ignore (None | list[Tensor]): specify which bounding
            boxes can be ignored when computing the loss.
        imgs_whwh (Tensor): Tensor with shape (batch_size, 1, 4), the last
            dimension means [img_width, img_height, img_width, img_height].
        gt_masks (None | Tensor): true segmentation masks for each box
            used if the architecture supports a segmentation task.

    Returns:
        dict[str, Tensor]: a dictionary of loss components of all stages.
    """
    num_imgs = len(img_metas)
    num_proposals = proposal_boxes.size(1)
    imgs_whwh = imgs_whwh.repeat(1, num_proposals, 1)
    all_stage_bbox_results = []
    proposal_list = [proposal_boxes[i] for i in range(len(proposal_boxes))]
    object_feats = proposal_features
    all_stage_loss = {}
    for stage in range(self.num_stages):
        rois = bbox2roi(proposal_list)
        bbox_results = self._bbox_forward(stage, x, rois, object_feats,
                                          img_metas)
        all_stage_bbox_results.append(bbox_results)
        if gt_bboxes_ignore is None:
            # TODO support ignore
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        cls_pred_list = bbox_results['detach_cls_score_list']
        proposal_list = bbox_results['detach_proposal_list']
        for i in range(num_imgs):
            # Assignment is done on image-size-normalized cxcywh boxes.
            normalize_bbox_ccwh = bbox_xyxy_to_cxcywh(proposal_list[i] /
                                                      imgs_whwh[i])
            assign_result = self.bbox_assigner[stage].assign(
                normalize_bbox_ccwh, cls_pred_list[i], gt_bboxes[i],
                gt_labels[i], img_metas[i])
            sampling_result = self.bbox_sampler[stage].sample(
                assign_result, proposal_list[i], gt_bboxes[i])
            sampling_results.append(sampling_result)
        bbox_targets = self.bbox_head[stage].get_targets(
            sampling_results, gt_bboxes, gt_labels, self.train_cfg[stage],
            True)
        cls_score = bbox_results['cls_score']
        decode_bbox_pred = bbox_results['decode_bbox_pred']

        single_stage_loss = self.bbox_head[stage].loss(
            cls_score.view(-1, cls_score.size(-1)),
            decode_bbox_pred.view(-1, 4),
            *bbox_targets,
            imgs_whwh=imgs_whwh)
        for key, value in single_stage_loss.items():
            all_stage_loss[f'stage{stage}_{key}'] = value * \
                self.stage_loss_weights[stage]
        object_feats = bbox_results['object_feats']

    return all_stage_loss
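
# Sketch of the imgs_whwh expansion and per-image proposal normalization
# used above, with made-up shapes: two images (640x480 and 800x600) with
# three proposals each.
import torch

imgs_whwh = torch.tensor([[[640., 480., 640., 480.]],
                          [[800., 600., 800., 600.]]])  # (B=2, 1, 4)
num_proposals = 3
imgs_whwh = imgs_whwh.repeat(1, num_proposals, 1)       # (B, N, 4)

proposals = torch.tensor([[[10., 20., 110., 220.],
                           [50., 60., 150., 260.],
                           [0., 0., 640., 480.]],
                          [[100., 100., 300., 300.],
                           [20., 30., 420., 330.],
                           [0., 0., 800., 600.]]])      # (B, N, 4) xyxy
normalized = proposals[0] / imgs_whwh[0]                # image 0, in [0, 1]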
def _get_l1_target(self, l1_target, gt_bboxes, priors, eps=1e-8):
    """Convert gt bboxes to center offset and log width height."""
    gt_cxcywh = bbox_xyxy_to_cxcywh(gt_bboxes)
    l1_target[:, :2] = (gt_cxcywh[:, :2] - priors[:, :2]) / priors[:, 2:]
    l1_target[:, 2:] = torch.log(gt_cxcywh[:, 2:] / priors[:, 2:] + eps)
    return l1_target
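
# Worked sketch of the (dx, dy, log dw, log dh) encoding above, using a
# made-up prior and GT box; priors are (cx, cy, w, h) in pixels.
import torch

priors = torch.tensor([[50., 50., 20., 20.]])     # one prior: center + size
gt_cxcywh = torch.tensor([[60., 60., 40., 40.]])  # GT [40,40,80,80] as cxcywh

l1_target = torch.zeros_like(priors)
l1_target[:, :2] = (gt_cxcywh[:, :2] - priors[:, :2]) / priors[:, 2:]
l1_target[:, 2:] = torch.log(gt_cxcywh[:, 2:] / priors[:, 2:] + 1e-8)
# l1_target == tensor([[0.5000, 0.5000, 0.6931, 0.6931]])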