def __call__(self, image: pintu.imaging.Image) -> List[Dict]:
    """
    Perform object detection inference on an image

    :param image: Image to detect objects in.
    :return: List with one dictionary per detected object, each dictionary
        containing the following fields:
        - "class" : (str) The name of the object detected.
        - "confidence" : (float) The confidence in the detection.
        - "left" : (float) Absolute x-coordinate of the left edge.
        - "top" : (float) Absolute y-coordinate of the top edge.
        - "right" : (float) Absolute x-coordinate of the right edge.
        - "bottom" : (float) Absolute y-coordinate of the bottom edge.
        Coordinates are expressed in pixels of ``image`` and clipped to
        ``[0, image.width]`` horizontally and ``[0, image.height]``
        vertically. An empty list is returned when nothing passes
        ``self.prob_threshold``.
    """
    # Resize so the longer side fills the network input while keeping the
    # aspect ratio. input_shape[1] is used as the target width and
    # input_shape[0] as the target height.
    if image.width > image.height:
        scale = float(self.input_shape[1]) / image.width
        w = self.input_shape[1]
        h = int(image.height * scale)
    else:
        scale = float(self.input_shape[0]) / image.height
        h = self.input_shape[0]
        w = int(image.width * scale)

    mat_in = ncnn.Mat.from_pixels_resize(
        image.data,
        ncnn.Mat.PixelType.PIXEL_BGR,
        image.width,
        image.height,
        w,
        h,
    )

    # Pad each side up to the next multiple of 32. The padding is split
    # between both borders; the left/top share is remembered because it is
    # exactly the offset that must be removed from the decoded boxes later.
    wpad = (w + 31) // 32 * 32 - w
    hpad = (h + 31) // 32 * 32 - h
    pad_left = wpad // 2
    pad_top = hpad // 2
    mat_in_pad = ncnn.copy_make_border(
        mat_in,
        pad_top,
        hpad - pad_top,
        pad_left,
        wpad - pad_left,
        ncnn.BorderType.BORDER_CONSTANT,
        0,
    )

    # Normalize image
    mat_in_pad.substract_mean_normalize(self.mean_vals, self.norm_vals)

    ex = self.net.create_extractor()
    ex.input(self.input_name, mat_in_pad)

    # One score blob and one box-distribution blob per stride.
    scores = [ex.extract(x)[1] for x in self.score_output_names]
    scores = [numpy.reshape(x, (-1, 80)) for x in scores]
    raw_boxes = [ex.extract(x)[1] for x in self.boxes_output_names]
    raw_boxes = [numpy.reshape(x, (-1, 32)) for x in raw_boxes]

    # Bin indices used to turn each discrete distribution into its
    # expected value; identical for every stride, so built once.
    reg_range = numpy.arange(self.reg_max + 1)

    decode_boxes = []
    select_scores = []
    for stride, box_distribute, score in zip(self.strides, raw_boxes, scores):
        # Feature-map size: one side comes from the padded input size, the
        # other from the number of score rows (one row per grid cell).
        # Both branches must divide the row count, score.shape[0];
        # score.shape[1] is the per-class axis.
        if mat_in_pad.w > mat_in_pad.h:
            fm_w = mat_in_pad.w // stride
            fm_h = score.shape[0] // fm_w
        else:
            fm_h = mat_in_pad.h // stride
            fm_w = score.shape[0] // fm_h

        # Cell centres in padded-input pixels, laid out as (x, y, x, y) so
        # they can be combined directly with the four side distances.
        ww, hh = numpy.meshgrid(numpy.arange(fm_w), numpy.arange(fm_h))
        ct_row = (hh.flatten() + 0.5) * stride
        ct_col = (ww.flatten() + 0.5) * stride
        center = numpy.stack((ct_col, ct_row, ct_col, ct_row), axis=1)

        # box distribution to distance: softmax over the bins, expected
        # bin index, then scale by the stride to get pixels.
        box_distance = box_distribute.reshape((-1, self.reg_max + 1))
        box_distance = ncnn.utils.functional.softmax(box_distance)
        box_distance = box_distance * numpy.expand_dims(reg_range, axis=0)
        box_distance = numpy.sum(box_distance, axis=1).reshape((-1, 4))
        box_distance = box_distance * stride

        # top K candidates by best class score
        topk_idx = numpy.argsort(score.max(axis=1))[::-1]
        topk_idx = topk_idx[:self.num_candidate]
        center = center[topk_idx]
        score = score[topk_idx]
        box_distance = box_distance[topk_idx]

        # decode box: centre minus left/top distance, plus right/bottom.
        decode_box = center + [-1, -1, 1, 1] * box_distance

        select_scores.append(score)
        decode_boxes.append(decode_box)

    # Per-class NMS over the candidates of all strides.
    bboxes = numpy.concatenate(decode_boxes, axis=0)
    confidences = numpy.concatenate(select_scores, axis=0)
    picked_box = []
    picked_probs = []
    picked_labels = []
    for class_index in range(0, confidences.shape[1]):
        probs = confidences[:, class_index]
        mask = probs > self.prob_threshold
        probs = probs[mask]
        if probs.shape[0] == 0:
            continue
        subset_boxes = bboxes[mask, :]
        picked = ncnn.utils.functional.nms(
            subset_boxes,
            probs,
            iou_threshold=self.nms_threshold,
            top_k=self.top_k,
        )
        picked_box.append(subset_boxes[picked])
        picked_probs.append(probs[picked])
        picked_labels.extend([class_index] * len(picked))

    if not picked_box:
        return []

    picked_box = numpy.concatenate(picked_box)
    picked_probs = numpy.concatenate(picked_probs)

    # Map boxes from padded-input pixels back to source-image pixels:
    # remove the left/top border, undo the resize, then clip to the image.
    offset = numpy.array([pad_left, pad_top, pad_left, pad_top])
    upper = numpy.array(
        [image.width, image.height, image.width, image.height]
    )
    picked_box = numpy.clip((picked_box - offset) / scale, 0, upper)

    return [
        {
            "class": str(self.class_names[label]),
            "confidence": float(score),
            "left": float(bbox[0]),
            "top": float(bbox[1]),
            "right": float(bbox[2]),
            "bottom": float(bbox[3]),
        }
        for label, score, bbox in zip(picked_labels, picked_probs, picked_box)
    ]
def __call__(self, img):
    """Run YOLOv5 inference on ``img`` and return the detected objects.

    :param img: Image array indexed as ``(height, width, ...)``; it is
        handed to ncnn as BGR pixel data and converted to RGB.
    :return: List of ``Detect_Object`` built from
        ``(obj[5], obj[4], x, y, width, height)`` for each row returned by
        ``non_max_suppression``, with the box expressed in pixels of the
        original ``img`` (letterbox padding removed, resize undone).
    """
    img_h, img_w = img.shape[0], img.shape[1]

    # Resize so the longer side equals target_size, keeping aspect ratio.
    if img_w > img_h:
        scale = float(self.target_size) / img_w
        w = self.target_size
        h = int(img_h * scale)
    else:
        scale = float(self.target_size) / img_h
        h = self.target_size
        w = int(img_w * scale)

    mat_in = ncnn.Mat.from_pixels_resize(
        img, ncnn.Mat.PixelType.PIXEL_BGR2RGB, img_w, img_h, w, h
    )

    # pad to target_size rectangle
    # yolov5/utils/datasets.py letterbox
    # The left/top share of the padding is kept because it is the offset
    # that has to be removed from the predicted boxes afterwards.
    wpad = (w + 31) // 32 * 32 - w
    hpad = (h + 31) // 32 * 32 - h
    pad_left = wpad // 2
    pad_top = hpad // 2
    mat_in_pad = ncnn.copy_make_border(
        mat_in,
        pad_top,
        hpad - pad_top,
        pad_left,
        wpad - pad_left,
        ncnn.BorderType.BORDER_CONSTANT,
        114.0,
    )

    mat_in_pad.substract_mean_normalize(self.mean_vals, self.norm_vals)

    ex = self.net.create_extractor()
    ex.input("images", mat_in_pad)

    # anchor setting from yolov5/models/yolov5s.yaml
    _, mat_out1 = ex.extract("output")  # stride 8
    _, mat_out2 = ex.extract("781")  # stride 16
    _, mat_out3 = ex.extract("801")  # stride 32

    # Ordered from the stride-32 output to the stride-8 output; index i
    # must line up with self.stride[i], self.grid[i], self.anchor_grid[i].
    pred = [np.array(mat_out3), np.array(mat_out2), np.array(mat_out1)]

    z = []
    for i, layer in enumerate(pred):
        # One grid side comes from the padded input size, the other from
        # the number of cells in this output.
        num_grid = layer.shape[1]
        if mat_in_pad.w > mat_in_pad.h:
            num_grid_x = mat_in_pad.w // self.stride[i]
            num_grid_y = num_grid // num_grid_x
        else:
            num_grid_y = mat_in_pad.h // self.stride[i]
            num_grid_x = num_grid // num_grid_y

        # NOTE(review): this cache check compares the first two axes of
        # the stored grid with (x, y); confirm it matches the layout that
        # make_grid() produces, otherwise the grid is rebuilt every call.
        if (self.grid[i].shape[0] != num_grid_x
                or self.grid[i].shape[1] != num_grid_y):
            self.grid[i] = make_grid(num_grid_x, num_grid_y)

        y = sigmoid(layer)
        y = y.reshape(layer.shape[0], num_grid_y, num_grid_x, layer.shape[2])
        y[..., 0:2] = (y[..., 0:2] * 2.0 - 0.5 + self.grid[i]) * self.stride[i]  # xy
        y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
        z.append(y.reshape(1, -1, y.shape[-1]))

    pred = np.concatenate(z, 1)

    result = non_max_suppression(
        pred, self.prob_threshold, self.nms_threshold
    )[0]

    # Boxes come back in padded-input pixels: remove the letterbox border
    # before undoing the resize, otherwise every box is shifted by the
    # padding. Width/height are differences, so only the scale applies.
    return [
        Detect_Object(
            obj[5],
            obj[4],
            (obj[0] - pad_left) / scale,
            (obj[1] - pad_top) / scale,
            (obj[2] - obj[0]) / scale,
            (obj[3] - obj[1]) / scale,
        )
        for obj in result
    ]
def __call__(self, img):
    """Run NanoDet inference on ``img`` and return the detected objects.

    :param img: Image array indexed as ``(height, width, ...)``; it is
        handed to ncnn as BGR pixel data.
    :return: List of ``Detect_Object`` built from
        ``(class_index, score, x, y, width, height)``, with the box in
        pixels of the original ``img`` and clipped to the image bounds.
        An empty list is returned when nothing passes
        ``self.prob_threshold``.
    """
    img_h, img_w = img.shape[0], img.shape[1]

    # Resize so the longer side equals target_size, keeping aspect ratio.
    if img_w > img_h:
        scale = float(self.target_size) / img_w
        w = self.target_size
        h = int(img_h * scale)
    else:
        scale = float(self.target_size) / img_h
        h = self.target_size
        w = int(img_w * scale)

    mat_in = ncnn.Mat.from_pixels_resize(
        img, ncnn.Mat.PixelType.PIXEL_BGR, img_w, img_h, w, h
    )

    # pad to target_size rectangle (each side up to a multiple of 32).
    # The left/top share of the padding is kept because it is the offset
    # that has to be removed from the decoded boxes afterwards.
    wpad = (w + 31) // 32 * 32 - w
    hpad = (h + 31) // 32 * 32 - h
    pad_left = wpad // 2
    pad_top = hpad // 2
    mat_in_pad = ncnn.copy_make_border(
        mat_in,
        pad_top,
        hpad - pad_top,
        pad_left,
        wpad - pad_left,
        ncnn.BorderType.BORDER_CONSTANT,
        0,
    )

    mat_in_pad.substract_mean_normalize(self.mean_vals, self.norm_vals)

    ex = self.net.create_extractor()
    ex.input("input.1", mat_in_pad)

    # One score blob and one box-distribution blob per stride.
    score_out_name = ["792", "814", "836"]
    scores = [ex.extract(x)[1] for x in score_out_name]
    scores = [np.reshape(x, (-1, 80)) for x in scores]

    boxes_out_name = ["795", "817", "839"]
    raw_boxes = [ex.extract(x)[1] for x in boxes_out_name]
    raw_boxes = [np.reshape(x, (-1, 32)) for x in raw_boxes]

    # Bin indices used to turn each discrete distribution into its
    # expected value; identical for every stride, so built once.
    reg_range = np.arange(self.reg_max + 1)

    decode_boxes = []
    select_scores = []
    for stride, box_distribute, score in zip(self.strides, raw_boxes, scores):
        # Feature-map size: one side comes from the padded input size, the
        # other from the number of score rows (one row per grid cell).
        # Both branches must divide the row count, score.shape[0];
        # score.shape[1] is the per-class axis.
        if mat_in_pad.w > mat_in_pad.h:
            fm_w = mat_in_pad.w // stride
            fm_h = score.shape[0] // fm_w
        else:
            fm_h = mat_in_pad.h // stride
            fm_w = score.shape[0] // fm_h

        # Cell centres in padded-input pixels, laid out as (x, y, x, y) so
        # they can be combined directly with the four side distances.
        ww, hh = np.meshgrid(np.arange(fm_w), np.arange(fm_h))
        ct_row = (hh.flatten() + 0.5) * stride
        ct_col = (ww.flatten() + 0.5) * stride
        center = np.stack((ct_col, ct_row, ct_col, ct_row), axis=1)

        # box distribution to distance: softmax over the bins, expected
        # bin index, then scale by the stride to get pixels.
        box_distance = box_distribute.reshape((-1, self.reg_max + 1))
        box_distance = softmax(box_distance)
        box_distance = box_distance * np.expand_dims(reg_range, axis=0)
        box_distance = np.sum(box_distance, axis=1).reshape((-1, 4))
        box_distance = box_distance * stride

        # top K candidates by best class score
        topk_idx = np.argsort(score.max(axis=1))[::-1]
        topk_idx = topk_idx[:self.num_candidate]
        center = center[topk_idx]
        score = score[topk_idx]
        box_distance = box_distance[topk_idx]

        # decode box: centre minus left/top distance, plus right/bottom.
        decode_box = center + [-1, -1, 1, 1] * box_distance

        select_scores.append(score)
        decode_boxes.append(decode_box)

    # Per-class NMS over the candidates of all strides.
    bboxes = np.concatenate(decode_boxes, axis=0)
    confidences = np.concatenate(select_scores, axis=0)
    picked_box = []
    picked_probs = []
    picked_labels = []
    for class_index in range(0, confidences.shape[1]):
        probs = confidences[:, class_index]
        mask = probs > self.prob_threshold
        probs = probs[mask]
        if probs.shape[0] == 0:
            continue
        subset_boxes = bboxes[mask, :]
        picked = nms(
            subset_boxes,
            probs,
            iou_threshold=self.nms_threshold,
            top_k=self.top_k,
        )
        picked_box.append(subset_boxes[picked])
        picked_probs.append(probs[picked])
        picked_labels.extend([class_index] * len(picked))

    if not picked_box:
        return []

    picked_box = np.concatenate(picked_box)
    picked_probs = np.concatenate(picked_probs)

    # Map corners from padded-input pixels back to source-image pixels
    # (remove the left/top border, undo the resize) and clip them to the
    # image BEFORE deriving width/height, so a clipped left/top edge does
    # not leave an oversized box.
    offset = np.array([pad_left, pad_top, pad_left, pad_top])
    upper = np.array([img_w, img_h, img_w, img_h])
    picked_box = np.clip((picked_box - offset) / scale, 0, upper)

    return [
        Detect_Object(
            label,
            score,
            bbox[0],
            bbox[1],
            bbox[2] - bbox[0],
            bbox[3] - bbox[1],
        )
        for label, score, bbox in zip(picked_labels, picked_probs, picked_box)
    ]