def _nms(self, cxywh_score_cls, nms_mode=4):
    """
    Non maximum suppression.

    Source: https://www.pyimagesearch.com/2015/02/16/faster-non-maximum-suppression-python/

    Args:
        cxywh_score_cls (tensor): Bounding boxes and scores from
            get_detections. Assumes columns 0:4 are cx, cy, w, h, Column 4
            is confidence, and column 5 is class id.
        nms_mode (int): selects which backend performs the suppression:
            0 - `torch_nms`, passing the class column as `classes`
            1 - `util.non_max_supression` on numpy arrays, classes ignored
            2 - `util.non_max_supression` on numpy arrays, with `classes`
            3 - `util.non_max_supression` with `impl='torch'` and `classes`
            4 - `torch_nms`, classes ignored (default)

    Return:
        (tensor): Pruned boxes

    Raises:
        KeyError: if `nms_mode` is not one of the values listed above.

    Examples:
        >>> import torch
        >>> torch.random.manual_seed(0)
        >>> anchors = np.array([(1.3221, 1.73145), (3.19275, 4.00944),
        ...                     (5.05587, 8.09892), (9.47112, 4.84053),
        ...                     (11.2364, 10.0071)])
        >>> self = GetBoundingBoxes(anchors=anchors, num_classes=20,
        ...                         conf_thresh=.01, nms_thresh=0.5)
        >>> output = torch.randn(8, 5, 5 + 20, 9, 9)
        >>> boxes_ = self._get_boxes(output.data)
        >>> boxes = torch.Tensor(boxes_[0])
        >>> ans0 = self._nms(boxes, nms_mode=0)
        >>> ans1 = self._nms(boxes, nms_mode=1)
        >>> ans2 = self._nms(boxes, nms_mode=2)

    Ignore:
        >>> from netharn import util
        >>> scores = boxes[..., 4:5]
        >>> classes = boxes[..., 5:6]
        >>> cxywh = util.Boxes(boxes[..., 0:4], 'cxywh')
        >>> tlbr = cxywh.to_tlbr()
        >>> util.non_max_supression(tlbr.data.numpy(), scores.numpy().ravel(), self.nms_thresh)

    Benchmark:
        boxes = torch.Tensor(boxes_[0])

        import ubelt
        for timer in ubelt.Timerit(100, bestof=10, label='nms0+cpu'):
            with timer:
                self._nms(boxes, nms_mode=0)

        for timer in ubelt.Timerit(100, bestof=10, label='nms1+cpu'):
            with timer:
                self._nms(boxes, nms_mode=1)

        boxes = boxes.to(0)
        import ubelt
        for timer in ubelt.Timerit(100, bestof=10, label='nms0+gpu'):
            with timer:
                self._nms(boxes, nms_mode=0)

        for timer in ubelt.Timerit(100, bestof=10, label='nms1+gpu'):
            with timer:
                self._nms(boxes, nms_mode=1)
    """
    # Nothing to suppress; hand the (empty) input straight back.
    if cxywh_score_cls.numel() == 0:
        return cxywh_score_cls

    a = cxywh_score_cls[:, :2]
    b = cxywh_score_cls[:, 2:4]
    # convert to tlbr
    tlbr_tensor = torch.cat([a - b / 2, a + b / 2], 1)
    scores = cxywh_score_cls[:, 4]

    if nms_mode == 0:
        from netharn.util._nms_backend.torch_nms import torch_nms
        cls_tensor = cxywh_score_cls[:, 5]
        keep = torch_nms(tlbr_tensor, scores, classes=cls_tensor,
                         thresh=self.nms_thresh, bias=0)
        # The torch backend result is used directly as the index.
        return cxywh_score_cls[keep]
    elif nms_mode == 1:
        # Dont group by classes, just NMS
        tlbr_np = tlbr_tensor.cpu().numpy().astype(np.float32)
        scores_np = scores.cpu().numpy().astype(np.float32)
        keep = util.non_max_supression(tlbr_np, scores_np, self.nms_thresh,
                                       bias=0)
        keep = sorted(keep)
    elif nms_mode == 2:
        # Group and use NMS
        tlbr_np = tlbr_tensor.cpu().numpy().astype(np.float32)
        scores_np = scores.cpu().numpy().astype(np.float32)
        # `np.int` was only an alias of the builtin `int` and has been
        # removed from NumPy, so use the builtin directly.
        classes_np = cxywh_score_cls[:, 5].cpu().numpy().astype(int)
        keep = util.non_max_supression(tlbr_np, scores_np, self.nms_thresh,
                                       classes=classes_np, bias=0)
        keep = sorted(keep)
    elif nms_mode == 3:
        # Group and use NMS
        classes_np = cxywh_score_cls[:, 5].cpu().numpy().astype(int)
        keep = util.non_max_supression(tlbr_tensor, scores, self.nms_thresh,
                                       classes=classes_np, bias=0,
                                       impl='torch')
        keep = sorted(keep)
    elif nms_mode == 4:
        # Dont group, but use torch
        from netharn.util._nms_backend.torch_nms import torch_nms
        keep = torch_nms(tlbr_tensor, scores, thresh=self.nms_thresh, bias=0)
        return cxywh_score_cls[keep]
    else:
        raise KeyError(nms_mode)

    # Modes 1-3 produce a sorted list of kept row indices.
    return cxywh_score_cls[torch.LongTensor(keep)]
def _benchmark():
    """
    Time the available NMS implementations on random boxes and plot them.

    For each box count this runs `torch_nms` on the CPU (and on the GPU when
    CUDA is available) and `non_max_supression` with the 'py', 'cpu' and
    (when CUDA is available) 'gpu' implementations. The minimum time of
    each method is recorded. A message is printed for any method whose kept
    boxes still overlap by more than the threshold, and the Jaccard
    similarity between the kept index sets of every pair of methods is
    printed. Finally the timings are plotted against the number of boxes.
    """
    import ubelt as ub
    import torch
    import numpy as np
    import netharn as nh
    from netharn.util.nms.torch_nms import torch_nms
    from netharn.util import non_max_supression
    import itertools as it

    num_trials = 100
    best_of = 10
    iou_thresh = 0.5

    timings = ub.ddict(list)
    box_counts = [
        10, 20, 40, 80, 100, 200, 300, 400, 500, 600, 700, 1000, 1500, 2000
    ]

    rng = nh.util.ensure_rng(0)
    has_cuda = torch.cuda.is_available()

    for num_boxes in box_counts:
        kept_by_method = {}

        # Random boxes (tlbr) with random scores
        random_boxes = nh.util.Boxes.random(num_boxes, scale=10.0, rng=rng,
                                            format='tlbr', tensor=True)
        boxes = random_boxes.data
        scores = torch.Tensor(rng.rand(len(boxes)))

        # --- torch backend on the CPU ---
        ti = ub.Timerit(num_trials, bestof=best_of, label='torch(cpu)')
        for timer in ti:
            with timer:
                flags = torch_nms(boxes, scores, thresh=iou_thresh)
        timings[ti.label].append(ti.min())
        kept_by_method[ti.label] = np.where(flags.cpu().numpy())[0]

        # --- torch backend on the GPU ---
        if has_cuda:
            gpu_boxes = boxes.cuda()
            gpu_scores = scores.cuda()

            ti = ub.Timerit(num_trials, bestof=best_of, label='torch(gpu)')
            for timer in ti:
                with timer:
                    flags = torch_nms(gpu_boxes, gpu_scores,
                                      thresh=iou_thresh)
                    torch.cuda.synchronize()
            timings[ti.label].append(ti.min())
            kept_by_method[ti.label] = np.where(flags.cpu().numpy())[0]

        # --- numpy-input implementations ---
        np_boxes = boxes.cpu().numpy()
        np_scores = scores.cpu().numpy()

        numpy_impls = [('numpy(cpu)', 'py'), ('cython(cpu)', 'cpu')]
        if has_cuda:
            numpy_impls.append(('cython(gpu)', 'gpu'))

        for label, impl in numpy_impls:
            ti = ub.Timerit(num_trials, bestof=best_of, label=label)
            for timer in ti:
                with timer:
                    kept = non_max_supression(np_boxes, np_scores,
                                              thresh=iou_thresh, impl=impl)
            timings[ti.label].append(ti.min())
            kept_by_method[ti.label] = sorted(kept)

        # No pair of kept boxes may overlap by more than the threshold
        for method, kept_idxs in kept_by_method.items():
            kept_boxes = np_boxes[kept_idxs]
            ious = nh.util.box_ious(kept_boxes, kept_boxes)
            # lower triangle with the self-overlap on the diagonal removed
            pairwise = np.tril(ious) - np.eye(len(ious))
            max_iou = pairwise.max()
            if max_iou > iou_thresh:
                print('{} produced a bad result with max_iou={}'.format(
                    method, max_iou))

        # How similar are the kept sets of each pair of methods?
        print('Result consistency:')
        for name1, name2 in it.combinations(kept_by_method.keys(), 2):
            set1 = set(kept_by_method[name1])
            set2 = set(kept_by_method[name2])
            jaccard = len(set1 & set2) / len(set1 | set2)
            print('{}, {}: {}'.format(name1, name2, jaccard))

    nh.util.mplutil.qtensure()
    nh.util.mplutil.multi_plot(box_counts, timings, xlabel='num boxes',
                               ylabel='seconds')
def _benchmark():
    """
    Benchmark the NMS implementations against each other and against lightnet.

    For every box count in `xdata` this times each enabled implementation,
    records the minimum time, reports any method whose kept boxes still
    overlap by more than the threshold, prints how many boxes each method
    kept and the pairwise Jaccard similarity of the kept index sets, and
    finally plots seconds against the number of boxes.

    CommandLine:
        python -m netharn.util.nms.torch_nms _benchmark --show

    SeeAlso:
        PJR Darknet NonMax supression
        https://github.com/pjreddie/darknet/blob/master/src/box.c

        Lightnet NMS
        https://gitlab.com/EAVISE/lightnet/blob/master/lightnet/data/transform/_postprocess.py#L116
    """
    import torch
    import numpy as np
    import netharn as nh
    from netharn.util.nms.torch_nms import torch_nms
    from netharn.util import non_max_supression
    import ubelt as ub
    import itertools as it

    N = 100
    bestof = 10

    ydata = ub.ddict(list)
    # xdata = [10, 20, 40, 80, 100, 200, 300, 400, 500, 600, 700, 1000, 1500, 2000]

    # max number of boxes yolo will spit out at a time
    max_boxes = 19 * 19 * 5

    xdata = [
        10, 20, 40, 80, 100, 200, 300, 400, 500, 600, 700, 1000, 1500,
        max_boxes
    ]
    # xdata = [10, 20, 40, 80, 100, 200, 300, 400, 500]
    # Quick subset that is actually used; it overrides the full sweep above.
    xdata = [10, 100, 500]

    rng = nh.util.ensure_rng(0)

    thresh = 0.5

    def _to_numpy(data):
        # numpy cannot read CUDA tensors, so copy tensors to host memory first
        if torch.is_tensor(data):
            return data.detach().cpu().numpy()
        return np.asarray(data)

    def _ln_output_to_keep(ln_output, ln_boxes):
        # Map each row returned by lightnet back to its row index in ln_boxes
        boxes_np = _to_numpy(ln_boxes)
        rows_np = [_to_numpy(row) for row in ln_output]
        keep = []
        for row in rows_np:
            # Find the index that we kept
            idxs = np.where(np.all(np.isclose(boxes_np, row), axis=1))[0]
            assert len(idxs) == 1
            keep.append(idxs[0])
        assert all(np.all(np.isclose(boxes_np[idx], row))
                   for idx, row in zip(keep, rows_np))
        return keep

    for num in xdata:
        print('\n\n---- number of boxes = {} ----\n'.format(num))

        outputs = {}

        # Build random test boxes and scores
        cpu_boxes = nh.util.Boxes.random(num, scale=10.0, rng=rng,
                                         format='tlbr', tensor=True)
        cpu_tlbr = cpu_boxes.to_tlbr().data
        # cpu_scores = torch.Tensor(rng.rand(len(cpu_tlbr)))
        # make all scores unique to ensure comparability
        cpu_scores = torch.Tensor(np.linspace(0, 1, len(cpu_tlbr)))
        cpu_cls = torch.LongTensor(rng.randint(0, 10, len(cpu_tlbr)))

        # Format boxes in lightnet format
        cpu_ln_boxes = torch.cat([
            cpu_boxes.to_cxywh().data,
            cpu_scores[:, None],
            cpu_cls.float()[:, None]
        ], dim=-1)

        # Move boxes to numpy
        np_tlbr = cpu_tlbr.numpy()
        np_scores = cpu_scores.numpy()
        np_cls = cpu_cls.numpy()  # NOQA

        gpu = torch.device('cuda', 0)

        measure_gpu = torch.cuda.is_available()
        # torch(cpu) is only timed when there is no GPU to measure
        measure_cpu = False or not torch.cuda.is_available()

        if measure_gpu:
            # Move boxes to the GPU
            gpu_tlbr = cpu_tlbr.to(gpu)
            gpu_scores = cpu_scores.to(gpu)
            gpu_cls = cpu_cls.to(gpu)  # NOQA
            gpu_ln_boxes = cpu_ln_boxes.to(gpu)

            t1 = ub.Timerit(N, bestof=bestof, label='torch(gpu)')
            for timer in t1:
                with timer:
                    keep = torch_nms(gpu_tlbr, gpu_scores, thresh=thresh)
                    torch.cuda.synchronize()
            ydata[t1.label].append(t1.min())
            outputs[t1.label] = np.where(keep.cpu().numpy())[0]

            t1 = ub.Timerit(N, bestof=bestof, label='cython(gpu)')
            for timer in t1:
                with timer:
                    keep = non_max_supression(np_tlbr, np_scores,
                                              thresh=thresh, impl='gpu')
                    torch.cuda.synchronize()
            ydata[t1.label].append(t1.min())
            outputs[t1.label] = sorted(keep)

            from lightnet.data.transform._postprocess import NonMaxSupression
            t1 = ub.Timerit(N, bestof=bestof, label='lightnet-slow(gpu)')
            for timer in t1:
                with timer:
                    ln_output = NonMaxSupression._nms(gpu_ln_boxes,
                                                      nms_thresh=thresh,
                                                      class_nms=False,
                                                      fast=False)
                    torch.cuda.synchronize()
            # convert lightnet NMS output to keep for consistency
            keep = _ln_output_to_keep(ln_output, gpu_ln_boxes)
            ydata[t1.label].append(t1.min())
            outputs[t1.label] = sorted(keep)

            if False:
                t1 = ub.Timerit(N, bestof=bestof, label='lightnet-fast(gpu)')
                for timer in t1:
                    with timer:
                        ln_output = NonMaxSupression._nms(gpu_ln_boxes,
                                                          nms_thresh=thresh,
                                                          class_nms=False,
                                                          fast=True)
                        torch.cuda.synchronize()
                # convert lightnet NMS output to keep for consistency
                keep = _ln_output_to_keep(ln_output, gpu_ln_boxes)
                ydata[t1.label].append(t1.min())
                outputs[t1.label] = sorted(keep)

        if measure_cpu:
            t1 = ub.Timerit(N, bestof=bestof, label='torch(cpu)')
            for timer in t1:
                with timer:
                    keep = torch_nms(cpu_tlbr, cpu_scores, thresh=thresh)
            ydata[t1.label].append(t1.min())
            outputs[t1.label] = np.where(keep.cpu().numpy())[0]

        if True:
            t1 = ub.Timerit(N, bestof=bestof, label='cython(cpu)')
            for timer in t1:
                with timer:
                    keep = non_max_supression(np_tlbr, np_scores,
                                              thresh=thresh, impl='cpu')
            ydata[t1.label].append(t1.min())
            outputs[t1.label] = sorted(keep)

            t1 = ub.Timerit(N, bestof=bestof, label='numpy(cpu)')
            for timer in t1:
                with timer:
                    keep = non_max_supression(np_tlbr, np_scores,
                                              thresh=thresh, impl='py')
            ydata[t1.label].append(t1.min())
            outputs[t1.label] = sorted(keep)

        # Check that all kept boxes do not have more than `threshold` ious
        for key, idxs in outputs.items():
            ious = nh.util.box_ious(np_tlbr[idxs], np_tlbr[idxs])
            # lower triangle with the self-overlap on the diagonal removed
            max_iou = (np.tril(ious) - np.eye(len(ious))).max()
            if max_iou > thresh:
                print('{} produced a bad result with max_iou={}'.format(
                    key, max_iou))

        # Check result consistency:
        print('\nResult stats:')
        for key in sorted(outputs.keys()):
            print('    * {:<20}: num={}'.format(key, len(outputs[key])))

        print('\nResult overlaps (method1, method2: jaccard):')
        datas = []
        for k1, k2 in it.combinations(sorted(outputs.keys()), 2):
            idxs1 = set(outputs[k1])
            idxs2 = set(outputs[k2])
            jaccard = len(idxs1 & idxs2) / len(idxs1 | idxs2)
            datas.append((k1, k2, jaccard))
        # most similar pairs first
        datas = sorted(datas, key=lambda x: -x[2])
        for k1, k2, jaccard in datas:
            print('    * {:<20}, {:<20}: {:0.4f}'.format(k1, k2, jaccard))

    nh.util.mplutil.autompl()
    nh.util.mplutil.multi_plot(xdata, ydata, xlabel='num boxes',
                               ylabel='seconds')
    nh.util.show_if_requested()
def _nms(self, boxes, mode=0):
    """
    Non maximum suppression.

    Source: https://www.pyimagesearch.com/2015/02/16/faster-non-maximum-suppression-python/

    Args:
        boxes (tensor): Bounding boxes from get_detections. Columns 0:4 are
            read as cx, cy, w, h, column 4 as the score and (in mode 2)
            column 5 as the class id.
        mode (int): selects which implementation performs the suppression:
            0 - `_nms_torch` on the tensors (default)
            1 - `util.non_max_supression` on numpy arrays, classes ignored
            2 - `util.non_max_supression` applied to each class separately

    Return:
        (tensor): Pruned boxes

    Raises:
        KeyError: if `mode` is not 0, 1 or 2.

    CommandLine:
        python -m netharn.models.yolo2.light_postproc GetBoundingBoxes._nms --profile

    Examples:
        >>> import torch
        >>> torch.random.manual_seed(0)
        >>> anchors = np.array([(1.3221, 1.73145), (3.19275, 4.00944),
        ...                     (5.05587, 8.09892), (9.47112, 4.84053),
        ...                     (11.2364, 10.0071)])
        >>> self = GetBoundingBoxes(anchors=anchors, num_classes=20,
        ...                         conf_thresh=.01, nms_thresh=0.5)
        >>> output = torch.randn(8, 5, 5 + 20, 9, 9)
        >>> boxes_ = self._get_boxes(output.data)
        >>> boxes = torch.Tensor(boxes_[0])
        >>> ans0 = self._nms(boxes, mode=0)
        >>> ans1 = self._nms(boxes, mode=1)
        >>> ans2 = self._nms(boxes, mode=2)

    Ignore:
        >>> from netharn import util
        >>> scores = boxes[..., 4:5]
        >>> classes = boxes[..., 5:6]
        >>> cxywh = util.Boxes(boxes[..., 0:4], 'cxywh')
        >>> tlbr = cxywh.to_tlbr()
        >>> util.non_max_supression(tlbr.data.numpy(), scores.numpy().ravel(), self.nms_thresh)

    Benchmark:
        boxes = torch.Tensor(boxes_[0])

        import ubelt
        for timer in ubelt.Timerit(100, bestof=10, label='nms0+cpu'):
            with timer:
                self._nms(boxes, mode=0)

        for timer in ubelt.Timerit(100, bestof=10, label='nms1+cpu'):
            with timer:
                self._nms(boxes, mode=1)

        boxes = boxes.cuda()
        import ubelt
        for timer in ubelt.Timerit(100, bestof=10, label='nms0+gpu'):
            with timer:
                self._nms(boxes, mode=0)

        for timer in ubelt.Timerit(100, bestof=10, label='nms1+gpu'):
            with timer:
                self._nms(boxes, mode=1)
    """
    # Nothing to suppress; hand the (empty) input straight back.
    if boxes.numel() == 0:
        return boxes

    a = boxes[:, :2]
    b = boxes[:, 2:4]
    # convert to tlbr
    tlbr_tensor = torch.cat([a - b / 2, a + b / 2], 1)
    scores = boxes[:, 4]

    if mode == 0:
        keep = _nms_torch(tlbr_tensor, scores, nms_thresh=self.nms_thresh)
        keep = sorted(keep)
    elif mode == 1:
        # Dont group by classes, just NMS
        tlbr_np = tlbr_tensor.cpu().numpy().astype(np.float32)
        scores_np = scores.cpu().numpy().astype(np.float32)
        keep = util.non_max_supression(tlbr_np, scores_np, self.nms_thresh)
        keep = sorted(keep)
    elif mode == 2:
        # Group and use NMS
        tlbr_np = tlbr_tensor.cpu().numpy().astype(np.float32)
        scores_np = scores.cpu().numpy().astype(np.float32)
        # `np.int` was only an alias of the builtin `int` and has been
        # removed from NumPy, so use the builtin directly.
        classes_np = boxes[..., 5].cpu().numpy().astype(int)
        keep = []
        # Run NMS independently inside each class, then translate the
        # per-class results back into row indices of the full box array.
        class_to_idxs = ub.group_items(range(len(classes_np)), classes_np)
        for idxs in class_to_idxs.values():
            cls_tlbr_np = tlbr_np.take(idxs, axis=0)
            cls_scores_np = scores_np.take(idxs, axis=0)
            cls_keep = util.non_max_supression(cls_tlbr_np, cls_scores_np,
                                               self.nms_thresh)
            keep.extend(ub.take(idxs, cls_keep))
        keep = sorted(keep)
    else:
        raise KeyError(mode)
    return boxes[torch.LongTensor(keep)]