def _first_stage(self, imgs: torch.Tensor):
    """Run the proposal network (``self.pNet``) over an image pyramid.

    The batch is resized repeatedly, starting at scale ``12 / self.minSize``
    and multiplying the scale by ``self.factor`` after every level, for as
    long as the shorter image side times the scale is at least 12. At every
    level, cells whose score (channel 1 of the second ``pNet`` output)
    reaches ``self.pNetThreshold`` become candidate boxes, are refined with
    ``self._bb_reg`` and pruned per image with NMS. Candidates from all
    levels are then merged and pruned once more.

    Args:
        imgs: 4-D image batch; the last two dimensions are read as height
            and width.

    Returns:
        ``(boxes, image_indices)`` where ``boxes`` are integer boxes clipped
        to the image and ``image_indices`` holds the batch index of each
        box, or ``None`` when no level produced a candidate.
    """
    with EvalScope(self.pNet):
        _, _, h, w = imgs.shape
        scale = 12.0 / self.minSize  # initial pyramid scale
        min_l = min(h, w)
        all_boxes, all_scores, all_img_idx = [], [], []

        while min_l * scale >= 12.:
            # NOTE: each level is resized from the previous level's output,
            # not from the original input (imgs is rebound here).
            imgs = _nnf.interpolate(imgs, size=[int(h * scale), int(w * scale)], mode='area')
            reg, pro = self.pNet(imgs)
            pro = pro[:, 1]
            # Cell step and cell size expressed in original-image pixels
            # (presumably pNet has output stride 2 and a 12 px window).
            strd = 2. / scale
            cell = 12. / scale
            msk = torch.ge(pro, self.pNetThreshold)  # b, h, w
            if msk.any():
                indices = msk.nonzero()  # n, 3 <- (image, row, col)
                idx, rows, cols = indices[:, 0], indices[:, 1], indices[:, 2]
                pro = pro[msk]
                reg = reg.permute(0, 2, 3, 1)  # b, h, w, c <- (x1^, y1^, x2^, y2^)
                reg = reg[msk]
                x1, y1 = cols * strd, rows * strd
                x2, y2 = x1 + cell, y1 + cell
                bbs = torch.stack([x1, y1, x2, y2], dim=1)  # n, 4
                bbs = self._bb_reg(bbs, reg)
                nms_idx = batched_nms(bbs, pro, idx, self.nmsThreshold)
                all_boxes.append(bbs[nms_idx])
                all_scores.append(pro[nms_idx])
                all_img_idx.append(idx[nms_idx])
            scale = scale * self.factor

        if not all_boxes:
            return None

        b = torch.cat(all_boxes, dim=0)
        s = torch.cat(all_scores, dim=0)
        i = torch.cat(all_img_idx, dim=0)
        nms_idx = batched_nms(b, s, i, self.nmsThreshold)
        # clip_boxes_to_image takes size as (height, width); the previous
        # (w, h) order clipped x against the height and y against the width.
        b = clip_boxes_to_image(b[nms_idx], size=(h, w)).int()
        i = i[nms_idx]
        return b, i
def test_batched_nms_implementations(self):
    """Make sure that both implementations of batched_nms yield identical results."""
    num_boxes = 1000
    iou_threshold = .9

    # Top-left corners lie in [0, 1) and bottom-right corners in [10, 11),
    # so every box is well-formed.
    boxes = torch.cat(
        (torch.rand(num_boxes, 2), torch.rand(num_boxes, 2) + 10), dim=1)
    assert boxes[:, 0].max() < boxes[:, 2].min()  # x1 < x2
    assert boxes[:, 1].max() < boxes[:, 3].min()  # y1 < y2

    scores = torch.rand(num_boxes)
    idxs = torch.randint(0, 4, size=(num_boxes, ))

    keep_vanilla = ops.boxes._batched_nms_vanilla(boxes, scores, idxs, iou_threshold)
    keep_trick = ops.boxes._batched_nms_coordinate_trick(
        boxes, scores, idxs, iou_threshold)

    err_msg = "The vanilla and the trick implementation yield different nms outputs."
    # The outputs are integer index tensors, so compare them exactly.
    # torch.equal also returns False on a length mismatch, whereas
    # torch.allclose would raise a broadcasting error and hide err_msg.
    self.assertTrue(torch.equal(keep_vanilla, keep_trick), err_msg)

    # Also make sure an empty tensor is returned if boxes is empty
    empty = torch.empty((0, ), dtype=torch.int64)
    self.assertTrue(
        torch.equal(empty, ops.batched_nms(empty, None, None, None)))
def forward(
    self,
    head_outputs: List[Tensor],
    anchors_tuple: Tuple[Tensor, Tensor, Tensor],
) -> List[Dict[str, Tensor]]:
    """
    Turn raw head outputs into per-image detections.

    Every head output is flattened to ``[batch_size, -1, K]`` and the
    levels are concatenated along dim 1. For each image the logits are
    passed through a sigmoid, channel 4 (box confidence) is multiplied with
    channels 5+ (class confidences) to obtain scores, and channels 0-3 are
    decoded to boxes by ``self.box_coder.decode_single``. Scores above
    ``self.score_thresh`` are kept, NMS is applied per label, and the result
    is truncated to ``self.detections_per_img`` entries.

    Parameters:
        head_outputs: list of 5-D tensors; the first dimension is the batch
            size and the last dimension is K.
        anchors_tuple: anchor tensors forwarded unchanged to
            ``self.box_coder.decode_single``.

    Returns:
        One dict per image with the keys 'scores', 'labels' and 'boxes'.
    """
    batch_size, _, _, _, K = head_outputs[0].shape

    flattened: List[Tensor] = []
    for level_output in head_outputs:
        flattened.append(level_output.reshape(batch_size, -1, K))  # Size=(NN, HWA, K)
    merged = torch.cat(flattened, dim=1)

    detections: List[Dict[str, Tensor]] = []
    for idx in range(batch_size):  # one image at a time
        probs = torch.sigmoid(merged[idx])

        # box_conf x class_conf, w/ shape: num_anchors x num_classes
        box_conf = probs[:, 4:5]
        class_conf = probs[:, 5:]
        scores = class_conf * box_conf

        boxes = self.box_coder.decode_single(probs[:, :4], anchors_tuple)

        # drop every (anchor, class) pair that does not beat the threshold
        inds, labels = torch.where(scores > self.score_thresh)
        boxes = boxes[inds]
        scores = scores[inds, labels]

        # NMS runs independently per label; keep only the top entries
        keep = batched_nms(boxes, scores, labels, self.nms_thresh)
        keep = keep[:self.detections_per_img]

        detections.append({
            'scores': scores[keep],
            'labels': labels[keep],
            'boxes': boxes[keep]
        })

    return detections
def _apply_nms(final_boxes: Tensor, final_batch_idx: Tensor, nms_threshold: float,
               num_classes: int) -> Tuple[Tensor, Tensor]:
    """Run class- and image-aware NMS over a flat set of detections.

    Args:
        final_boxes: detections, one per row; the first four columns are
            the box coordinates, the second-to-last column is the score
            used for NMS and the last column is the class id.
        final_batch_idx: image index of each detection; indexed as a 2-D
            tensor with one row per detection.
        nms_threshold: IoU threshold passed to ``batched_nms``.
        num_classes: number of classes, used to build a group id that is
            unique per (image, class) pair.

    Returns:
        The surviving rows of ``final_boxes`` and ``final_batch_idx``, both
        contiguous, in the order returned by ``batched_nms``.
    """
    coords = final_boxes[..., :4]
    scaled_score = final_boxes[..., -2, None]
    class_id = final_boxes[..., -1, None]

    # torchvision NMS cant do batches of images, but it can separate based on class id
    # create a new "class id" that distinguishes batch and class
    idx = (final_batch_idx * num_classes + class_id.view_as(final_batch_idx)).view(-1).long()

    # Boxes and scores are cast to the same dtype: torchvision's CPU NMS
    # kernel rejects inputs whose dtypes differ (e.g. half-precision input).
    keep = batched_nms(coords.float(), scaled_score.view(-1).float(), idx, nms_threshold)

    final_boxes = final_boxes[keep, :]
    final_batch_idx = final_batch_idx[keep, :]
    return final_boxes.contiguous(), final_batch_idx.contiguous()
def run(self, image_metas: list, pred_bboxes: torch.Tensor, pred_scores: torch.Tensor,
        pred_class_ids: torch.Tensor) -> list:
    """
    Build the list of result dicts from the raw predictions.

    Each image's predictions are passed through class-aware NMS
    (``batched_nms`` with ``self.iou_thresh``); every surviving box becomes
    one result dict. No confidence filtering is performed in this method.

    Args:
        image_metas (list): image metadata; 'image_id', 'height' and
            'width' are read from each entry
        pred_bboxes (torch.Tensor): predicted boxes [N, num_preds, 4]
            (coord fmt: [xmin, ymin, xmax, ymax])
        pred_scores (torch.Tensor): predicted confidences [N, num_preds]
        pred_class_ids (torch.Tensor): predicted class ids [N, num_preds]

    Returns:
        list: result dicts with the keys 'image_id', 'category_id',
            'bbox' ([x, y, width, height], scaled by the image width and
            height) and 'score'
    """
    result = []
    per_image = zip(image_metas, pred_bboxes, pred_scores, pred_class_ids)

    for image_meta, bboxes, scores, class_ids in per_image:
        # Remove duplicates (non-maximum suppression)
        keep = batched_nms(bboxes, scores, class_ids, iou_threshold=self.iou_thresh)
        kept = zip(bboxes[keep], scores[keep], class_ids[keep])

        H, W = image_meta['height'], image_meta['width']

        for box, score, class_id in kept:
            xmin, ymin, xmax, ymax = box
            # Corner format -> [x, y, w, h], scaled by the image size
            scaled_box = [
                xmin.item() * W,
                ymin.item() * H,
                (xmax - xmin).item() * W,
                (ymax - ymin).item() * H
            ]
            result.append({
                'image_id': image_meta['image_id'],
                'category_id': class_id.item(),
                'bbox': scaled_box,
                'score': score.item(),
            })

    return result
def _third_stage(
    self, imgs: torch.Tensor, r_bbs: torch.Tensor, r_idxs: torch.Tensor
) -> Optional[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
    """Score candidate boxes with ``self.oNet`` and return the survivors.

    The regions described by ``r_bbs`` / ``r_idxs`` are gathered from
    ``imgs`` via ``self._gather_rois`` (with size argument 48) and fed to
    ``self.oNet``. Candidates whose score (channel 1 of the third output)
    reaches ``self.oNetThreshold`` are refined with ``self._bb_reg``, pruned
    per image with NMS and clipped to the image.

    Args:
        imgs: image batch; ``imgs.shape[2:]`` is used as the (height,
            width) clipping size.
        r_bbs: candidate boxes, one row per candidate.
        r_idxs: image index of each candidate.

    Returns:
        ``(boxes, image_indices, landmarks)`` for the surviving candidates
        (boxes as integers), or ``None`` when no candidate reaches the
        threshold.
    """
    _imgs = self._gather_rois(imgs, r_bbs, r_idxs, 48)
    with EvalScope(self.oNet):
        reg, lmk, pro = self.oNet(_imgs)
        face_prob = pro[:, 1]
        mask = torch.ge(face_prob, self.oNetThreshold)
        if not mask.any():
            return None

        reg = reg[mask]
        # Landmarks must be filtered with the same mask as everything else:
        # the NMS indices below refer to rows of the masked subset, so
        # indexing the unmasked tensor would return the wrong rows.
        lmk = lmk[mask]
        pro = face_prob[mask]
        b = r_bbs[mask].type(torch.float32)
        i = r_idxs[mask]

        b = self._bb_reg(b, reg)
        j = batched_nms(b, pro, i, self.nmsThreshold)
        b = clip_boxes_to_image(b[j], size=imgs.shape[2:]).int()
        i = i[j]
        return b, i, lmk[j]
def forward(self, boxes, scores, idxs):
    """Return ``ops.batched_nms`` of the inputs with a fixed IoU threshold of 0.5."""
    iou_threshold = 0.5
    kept = ops.batched_nms(boxes, scores, idxs, iou_threshold)
    return kept
# Draw every raw prediction of the first image (box outline plus a
# "<category>: <score>" caption) onto the already-prepared canvas.
img_preds = prediction[0]
for (x1, y1, x2, y2), label, score in zip(img_preds["boxes"].tolist(),
                                          img_preds["labels"].tolist(),
                                          img_preds["scores"].tolist()):
    label = int(label)
    score = float(score)
    draw.rectangle(((x1, y1), (x2, y2)), outline="red")
    text = f'{dataset.cat2name[label]}: {score}'
    draw.text((x1+5, y1+5), text)
display(pred_img)

### Post Processing #################################################
img_preds = prediction[0]
keep_idx = batched_nms(boxes=img_preds["boxes"],
                       scores=img_preds["scores"],
                       idxs=img_preds["labels"],
                       iou_threshold=params['IOU_THRESHOLD'])
# Build the membership set once; testing `i in keep_idx` against the tensor
# scans the whole tensor for every box.
kept = set(keep_idx.tolist())

# convert the image, which has been rescaled to 0-1 and had the channels flipped
pred_img = Image.fromarray(img.mul(255).permute(1, 2, 0).byte().numpy())
draw = ImageDraw.Draw(pred_img)

# Same drawing as above, restricted to the boxes that survived NMS; boxes
# are visited in their original order so the draw order is unchanged.
for i, ((x1, y1, x2, y2), label, score) in enumerate(zip(img_preds["boxes"].tolist(),
                                                         img_preds["labels"].tolist(),
                                                         img_preds["scores"].tolist())):
    if i not in kept:
        continue
    label = int(label)
    score = float(score)
    draw.rectangle(((x1, y1), (x2, y2)), outline="red")
    text = f'{dataset.cat2name[label]}: {score}'
    draw.text((x1+5, y1+5), text)
def val_one_epoch(model, data_loader, coco_gt, dist_logger, epoch_idx, nms_cfg):
    """Run one validation pass and hand the detections to ``dist_logger``.

    For every image the ``nms_pre`` best points (ranked by class score times
    centerness) are selected, clipped to the input image, filtered by class
    score, pruned with class-aware NMS, mapped back to the original image
    size with ``bbox_ops.recover_bboxes`` and converted to ``xywh``.

    Args:
        model: detector; called as ``model(img)`` and expected to return
            ``(class_pred, distance_pred, centerness_pred)``.
        data_loader: validation loader, wrapped by
            ``dist_logger.init_processor``; yields ``(img, data)`` where
            ``data`` provides 'points' and 'img_id'.
        coco_gt: ground-truth object providing ``loadImgs``.
        dist_logger: collects the predictions and reports the results.
        epoch_idx: epoch index forwarded to the tensorboard update.
        nms_cfg: dict with the keys 'nms_pre', 'cls_score_thr' and 'iou_thr'.

    NOTE(review): no ``torch.no_grad()`` is entered here; presumably the
    caller disables autograd - confirm.
    """
    pred_instances = []
    nms_pre = nms_cfg['nms_pre']
    cls_score_thr = nms_cfg['cls_score_thr']
    iou_thr = nms_cfg['iou_thr']

    model.eval()
    processor = dist_logger.init_processor(data_loader)
    for img, data in processor:
        img = img.cuda(non_blocking=True)
        points = data['points'].cuda(non_blocking=True)
        img_info_list = coco_gt.loadImgs(data['img_id'].numpy())

        class_pred, distance_pred, centerness_pred = model(img)
        class_pred = class_pred.sigmoid()  # [B, num_points, num_classes]
        cls_pred_scores, cls_pred_indexes = class_pred.max(dim=-1)  # [B, num_points]
        bbox_pred = bbox_ops.convert_distance_to_bbox(points, distance_pred)  # [B, num_points, 4]
        centerness_pred = centerness_pred.sigmoid()  # [B, num_points]

        batch_size = class_pred.shape[0]
        _, _, ih, iw = img.shape

        for batch_idx in range(batch_size):
            b_cls_pred_scores = cls_pred_scores[batch_idx]  # [num_points]
            b_cls_pred_indexes = cls_pred_indexes[batch_idx]
            b_centerness_pred = centerness_pred[batch_idx]
            b_bbox_pred = bbox_pred[batch_idx, :]  # [num_points, 4]

            # topk raises when k exceeds the number of candidates, so clamp it.
            k = min(nms_pre, b_cls_pred_scores.numel())
            _, top_idx = (b_cls_pred_scores * b_centerness_pred).topk(k)  # [topk]

            top_class_pred_scores = b_cls_pred_scores[top_idx]
            top_class_pred_indexes = b_cls_pred_indexes[top_idx]
            top_centerness_pred = b_centerness_pred[top_idx]
            nms_scores = top_class_pred_scores * top_centerness_pred  # [topk]
            top_bbox_pred = b_bbox_pred[top_idx, :]  # [topk, 4]
            top_bbox_pred = cv_ops.clip_boxes_to_image(top_bbox_pred, size=(ih, iw))

            # The threshold is applied to the class score alone; the product
            # with centerness is only used to rank boxes inside NMS.
            valid_mask = top_class_pred_scores > cls_score_thr
            valid_class_pred_scores = top_class_pred_scores[valid_mask]
            valid_class_pred_indexes = top_class_pred_indexes[valid_mask]
            valid_nms_scores = nms_scores[valid_mask]
            valid_bbox_pred = top_bbox_pred[valid_mask, :]

            keep_idx = cv_ops.batched_nms(valid_bbox_pred, valid_nms_scores,
                                          valid_class_pred_indexes, iou_thr)
            keep_class_pred_scores = valid_class_pred_scores[keep_idx]
            keep_class_pred_indexes = valid_class_pred_indexes[keep_idx]
            keep_bbox_pred = valid_bbox_pred[keep_idx, :]

            oh = img_info_list[batch_idx]['height']
            ow = img_info_list[batch_idx]['width']
            keep_bbox_pred = bbox_ops.recover_bboxes(keep_bbox_pred, oh, ow, ih, iw)
            keep_bbox_pred = cv_ops.box_convert(keep_bbox_pred, in_fmt='xyxy', out_fmt='xywh')

            image_id = int(data['img_id'][batch_idx])
            for cls_score, cls_idx, bbox in zip(keep_class_pred_scores.tolist(),
                                                keep_class_pred_indexes.tolist(),
                                                keep_bbox_pred.tolist()):
                pred_instances.append({
                    'image_id': image_id,
                    'category_id': int(cls_idx) + 1,
                    'bbox': [float('%.1f' % coord) for coord in bbox],
                    # NOTE(review): the score is rounded to one decimal place,
                    # which collapses the ranking between detections - confirm
                    # that this is intended.
                    'score': float('%.1f' % cls_score)
                })

    dist_logger.save_pred_instances_local_rank(pred_instances)
    dist_logger.save_val_file()
    dist_logger.update_tensorboard_val_results(coco_gt, epoch_idx)
def val_one_epoch(model, data_loader, coco_gt, dist_logger, epoch_idx, nms_cfg):
    """Run one validation pass and evaluate the detections against ``coco_gt``.

    For every image the ``nms_pre`` best points (ranked by class score times
    centerness) are selected, clipped to the input image, filtered by class
    score, pruned with class-aware NMS, mapped back to the original image
    size with ``bbox_ops.recover_bboxes`` and converted to ``xywh``.
    Predicted labels are translated to category ids through the map returned
    by ``tools.get_cat_label_map``.

    Args:
        model: detector; called as ``model(img, points)`` and expected to
            return a dict with the keys 'class', 'centerness' and 'distance'.
        data_loader: validation loader yielding ``(img, points, img_ids)``.
        coco_gt: ground-truth object providing ``loadImgs``.
        dist_logger: collects the predictions and runs the evaluation; its
            ``is_master_rank`` flag controls the progress bar.
        epoch_idx: not used by this function.
        nms_cfg: dict with the keys 'nms_pre', 'cls_score_thr' and 'iou_thr'.

    NOTE(review): no ``torch.no_grad()`` is entered here; presumably the
    caller disables autograd - confirm.
    """
    pred_instances = []
    nms_pre = nms_cfg['nms_pre']
    cls_score_thr = nms_cfg['cls_score_thr']
    iou_thr = nms_cfg['iou_thr']
    _, _, label_to_cat_map = tools.get_cat_label_map(coco_gt, tools.COCO_CLASSES)

    model.eval()
    processor = tqdm.tqdm(data_loader, disable=not dist_logger.is_master_rank)
    for img, points, img_ids in processor:
        img = img.cuda(non_blocking=True)
        points = points.cuda(non_blocking=True)
        img_info_list = coco_gt.loadImgs(img_ids.numpy())

        pred = model(img, points)
        class_pred = pred['class'].sigmoid()  # [B, num_points, num_classes]
        centerness_pred = pred['centerness'].sigmoid()  # [B, num_points]
        bbox_pred = bbox_ops.convert_distance_to_bbox(points, pred['distance'])  # [B, num_points, 4]

        cls_pred_scores, cls_pred_indexes = class_pred.max(dim=-1)  # [B, num_points]
        batch_size = class_pred.shape[0]
        _, _, ih, iw = img.shape

        for batch_idx in range(batch_size):
            b_cls_pred_scores = cls_pred_scores[batch_idx]
            b_cls_pred_indexes = cls_pred_indexes[batch_idx]
            b_centerness_pred = centerness_pred[batch_idx]
            b_bbox_pred = bbox_pred[batch_idx, :]  # [num_points, 4]

            # topk raises when k exceeds the number of candidates, so clamp it.
            k = min(nms_pre, b_cls_pred_scores.numel())
            _, top_idx = (b_cls_pred_scores * b_centerness_pred).topk(k)

            top_class_pred_scores = b_cls_pred_scores[top_idx]
            top_class_pred_indexes = b_cls_pred_indexes[top_idx]
            top_centerness_pred = b_centerness_pred[top_idx]
            top_bbox_pred = b_bbox_pred[top_idx, :]  # [topk, 4]
            nms_scores = top_class_pred_scores * top_centerness_pred
            top_bbox_pred = cv_ops.clip_boxes_to_image(top_bbox_pred, size=(ih, iw))

            # The threshold is applied to the class score alone; the product
            # with centerness is only used to rank boxes inside NMS.
            valid_mask = top_class_pred_scores > cls_score_thr
            valid_class_pred_scores = top_class_pred_scores[valid_mask]
            valid_class_pred_indexes = top_class_pred_indexes[valid_mask]
            valid_nms_scores = nms_scores[valid_mask]
            valid_bbox_pred = top_bbox_pred[valid_mask, :]

            keep_idx = cv_ops.batched_nms(valid_bbox_pred, valid_nms_scores,
                                          valid_class_pred_indexes, iou_thr)
            keep_class_pred_scores = valid_class_pred_scores[keep_idx]
            keep_class_pred_indexes = valid_class_pred_indexes[keep_idx]
            keep_bbox_pred = valid_bbox_pred[keep_idx, :]

            oh = img_info_list[batch_idx]['height']
            ow = img_info_list[batch_idx]['width']
            keep_bbox_pred = bbox_ops.recover_bboxes(keep_bbox_pred, oh, ow, ih, iw)
            keep_bbox_pred = cv_ops.box_convert(keep_bbox_pred, in_fmt='xyxy', out_fmt='xywh')

            image_id = int(img_ids[batch_idx])
            for cls_score, cls_idx, bbox in zip(keep_class_pred_scores.tolist(),
                                                keep_class_pred_indexes.tolist(),
                                                keep_bbox_pred.tolist()):
                pred_instances.append({
                    'image_id': image_id,
                    'category_id': label_to_cat_map[int(cls_idx) + 1],
                    'bbox': [float('%.1f' % coord) for coord in bbox],
                    # NOTE(review): the score is rounded to one decimal place,
                    # which collapses the ranking between detections - confirm
                    # that this is intended.
                    'score': float('%.1f' % cls_score)
                })

    dist_logger.save_pred_instances_local_rank(pred_instances)
    dist_logger.save_val_file()
    dist_logger.evaluate(coco_gt)