def test_random_crop_with_bbox_constraints(self):
    """Check the result of ``random_crop_with_bbox_constraints``.

    If no constraint was selected the image must be returned unchanged.
    Otherwise the output must equal the region described by the returned
    slices and must respect the requested scale, aspect-ratio and IoU
    limits.
    """
    img = np.random.randint(0, 256, size=(3, 480, 640)).astype(np.float32)
    bbox = generate_random_bbox(10, img.shape[1:], 0.1, 0.9)

    out, param = random_crop_with_bbox_constraints(
        img, bbox,
        min_scale=0.3, max_scale=1, max_aspect_ratio=2,
        return_param=True)

    if param['constraint'] is None:
        np.testing.assert_equal(out, img)
    else:
        np.testing.assert_equal(
            out, img[:, param['y_slice'], param['x_slice']])

        # The crop height and width may be rounded down to integers, so
        # the area can fall just short of the exact bound. Allow one
        # extra pixel per side, as the aspect-ratio checks below do.
        self.assertGreaterEqual(
            out.shape[0] * (out.shape[1] + 1) * (out.shape[2] + 1),
            img.size * 0.3 * 0.3)
        self.assertLessEqual(out.size, img.size * 1 * 1)

        # to ignore rounding error, add 1
        self.assertLessEqual(
            out.shape[1] / (out.shape[2] + 1),
            img.shape[1] / img.shape[2] * 2)
        self.assertLessEqual(
            out.shape[2] / (out.shape[1] + 1),
            img.shape[2] / img.shape[1] * 2)

        # Crop window as (y_min, x_min, y_max, x_max).
        bb = np.array((
            param['y_slice'].start, param['x_slice'].start,
            param['y_slice'].stop, param['x_slice'].stop))
        iou = bbox_iou(bb[np.newaxis], bbox)
        min_iou, max_iou = param['constraint']
        # A falsy bound (None or 0) means "no limit to verify".
        if min_iou:
            self.assertGreaterEqual(iou.min(), min_iou)
        if max_iou:
            self.assertLessEqual(iou.max(), max_iou)
def check(self, bbox_a, bbox_b, expected):
    """Assert that ``bbox_iou(bbox_a, bbox_b)`` matches ``expected``.

    The result must be an instance of the same type as ``expected`` and,
    once both are copied to the host, equal to it element for element.
    """
    actual = bbox_iou(bbox_a, bbox_b)
    expected_type = type(expected)
    self.assertIsInstance(actual, expected_type)

    actual_on_host = cuda.to_cpu(actual)
    expected_on_host = cuda.to_cpu(expected)
    np.testing.assert_equal(actual_on_host, expected_on_host)
def __call__(self, *inputs):
    """Run the wrapped link on a batch and report IoU and VOC metrics.

    Reports ``mean_iou`` (mean IoU between prediction ``i`` and label
    ``i``), ``map`` and ``ap/sheep`` through ``reporter``.

    Args:
        *inputs: Batch contents. Only the first two entries are used:
            the images and the ground-truth labels (boxes).
    """
    images, labels = inputs[:2]
    # NOTE(review): the extent of this ``with`` block was reconstructed
    # from collapsed source; everything is kept inside it.
    with cuda.Device(self.device):
        _, bboxes = self.link(images)

        # Everything after the forward pass is computed on the host.
        bboxes = cuda.to_cpu(bboxes.data)
        labels = cuda.to_cpu(labels)

        xp = cuda.get_array_module(bboxes)

        bboxes = self.extract_corners(bboxes)
        bboxes = self.scale_bboxes(bboxes, Size._make(images.shape[-2:]))

        # bbox_iou yields all pairwise IoUs; only prediction i against
        # label i is wanted, i.e. the diagonal. The builtin ``bool`` is
        # used because the ``bool`` alias on the array module was removed
        # in NumPy 1.24.
        diagonal = xp.eye(len(bboxes)).astype(bool)
        ious = bbox_iou(bboxes.data.copy(), xp.squeeze(labels))[diagonal]
        mean_iou = ious.mean()

        reporter.report({'mean_iou': mean_iou})

        # One predicted box per image, with a constant score of 1 and a
        # single class (label 0) for predictions and ground truth alike.
        pred_bboxes = [
            bbox.data[xp.newaxis, ...].astype(xp.int32)
            for bbox in F.separate(bboxes, axis=0)
        ]
        pred_scores = xp.ones((len(bboxes), 1))
        pred_labels = xp.zeros_like(pred_scores)

        gt_bboxes = [bbox.data[...] for bbox in F.separate(labels, axis=0)]
        gt_labels = xp.zeros_like(pred_scores)

        result = chainercv.evaluations.eval_detection_voc(
            pred_bboxes,
            pred_labels,
            pred_scores,
            gt_bboxes,
            gt_labels
        )

        reporter.report({'map': result['map']})
        reporter.report({'ap/sheep': result['ap'][0]})
def _asign_gt_to_anchor(self, anchors, locs, confs, gt_bboxes, gt_labels):
    """Select foreground/background anchors and attach ground truth.

    For every sample, anchors whose best IoU with a ground-truth box
    exceeds ``self._fg_thresh`` are foreground and anchors whose best IoU
    is below ``self._bg_thresh`` are background. The rest are dropped.
    Foreground rows come first in every returned array, followed by the
    background rows.

    Args:
        anchors, locs, confs: Per-sample anchor boxes and predictions.
        gt_bboxes, gt_labels: Per-sample ground-truth boxes and labels.

    Returns:
        tuple of five lists (one entry per sample): selected anchors,
        locs, confs, target boxes (zeros for background) and target
        labels (original label + 1 for foreground, 0 for background).
    """
    xp = self.xp
    picked_anchors, picked_locs, picked_confs = [], [], []
    target_bboxes, target_labels = [], []

    samples = zip(anchors, locs, confs, gt_bboxes, gt_labels)
    for anchor, loc, conf, gt_bbox, gt_label in samples:
        if gt_label.shape[0] > 0:
            overlap = bbox_iou(anchor, gt_bbox)
            best_iou = xp.max(overlap, axis=-1)
            best_gt = xp.argmax(overlap, axis=-1)
        else:
            # No annotation for this sample: every anchor gets IoU 0.
            n_anchor = conf.shape[0]
            best_iou = xp.zeros(n_anchor, xp.float32)
            best_gt = xp.empty(n_anchor, xp.float32)

        is_fg = best_iou > self._fg_thresh
        is_bg = best_iou < self._bg_thresh
        n_bg = xp.where(is_bg)[0].shape[0]

        fg_gt = best_gt[is_fg]
        fg_labels = xp.array([gt_label[k] + 1 for k in fg_gt], xp.int32)
        fg_bboxes = xp.array([gt_bbox[k] for k in fg_gt], xp.float32)
        if fg_bboxes.shape[0] == 0:
            # No foreground anchor: keep a (0, 4) shape so stacking works.
            fg_bboxes = xp.empty((0, 4), xp.float32)

        for sink, values in ((picked_anchors, anchor),
                             (picked_locs, loc),
                             (picked_confs, conf)):
            sink.append(F.vstack((values[is_fg], values[is_bg])))

        bg_bboxes = xp.zeros((n_bg, 4))
        bg_labels = xp.zeros(n_bg)
        target_bboxes.append(xp.vstack((fg_bboxes, bg_bboxes)))
        target_labels.append(xp.hstack((fg_labels, bg_labels)))

    return (picked_anchors, picked_locs, picked_confs,
            target_bboxes, target_labels)
def merge_entries(entry1, entry2, thresh):
    """Merge the boxes of two entries, collapsing heavily overlapping ones.

    Boxes from both entries are pooled and sorted by descending score.
    A box whose largest IoU with any other box is at most ``thresh`` is
    kept as is. Otherwise it is replaced by its intersection with its
    highest-IoU partner when that partner ranks lower, and dropped when
    the partner ranks higher.

    Args:
        entry1, entry2 (dict): Mappings with ``'bbox'`` and ``'score'``.
        thresh (float): IoU above which two boxes are merged.

    Returns:
        tuple: ``(bbox, score)``. If there are no boxes at all, the
        plain concatenations are returned; otherwise float32 arrays of
        shape ``(n, 4)`` and ``(n,)``.
    """
    pooled = [(entry['bbox'], entry['score']) for entry in (entry1, entry2)]
    boxes = np.concatenate([pair[0] for pair in pooled], axis=0)
    scores = np.concatenate([pair[1] for pair in pooled], axis=0)
    if len(scores) == 0:
        return boxes, scores

    ranking = scores.argsort()[::-1]
    boxes, scores = boxes[ranking], scores[ranking]

    overlap = bbox_iou(boxes, boxes)
    # Zero the diagonal so a box never counts as overlapping itself.
    overlap *= 1 - np.eye(len(boxes))

    kept_boxes, kept_scores = [], []
    for idx in range(len(boxes)):
        row = overlap[idx]
        if row.max() <= thresh:
            kept_boxes.append(boxes[idx])
            kept_scores.append(scores[idx])
            continue
        partner = row.argmax()
        if partner > idx:
            kept_boxes.append(get_bbox_intersection(boxes[idx], boxes[partner]))
            kept_scores.append(scores[idx])

    merged_boxes = np.array(kept_boxes, dtype=np.float32).reshape(-1, 4)
    merged_scores = np.array(kept_scores, dtype=np.float32)
    return merged_boxes, merged_scores
def crop_with_bbox_constraints(
        img, bbox, crop_width=None, crop_height=None, constraints=None,
        max_trial=10, return_param=False):
    """Crop a fixed-size window from an image under bounding box IoU constraints.

    A centered window is always a candidate. In addition, for each
    constraint up to ``max_trial`` random window positions are tried and
    the first whose IoU with every box satisfies the constraint becomes a
    candidate. One candidate is then chosen uniformly at random.

    Args:
        img (~numpy.ndarray): Image in CHW format.
        bbox (~numpy.ndarray): Boxes of shape :math:`(R, 4)` used for the
            constraints. With no boxes, only the centered window is used.
        crop_width (int): Width of the window. :obj:`None` (the default)
            means the full image width.
        crop_height (int): Height of the window. :obj:`None` (the
            default) means the full image height.
        constraints (iterable of tuples): ``(min_iou, max_iou)`` pairs;
            :obj:`None` in a pair means 0 or 1 respectively. Defaults to
            ``((0.1, None), (0.3, None), (0.5, None), (0.7, None),
            (0.9, None), (None, 1))``.
        max_trial (int): Number of random positions tried per constraint.
        return_param (bool): If :obj:`True`, also return the chosen
            parameters.

    Returns:
        ~numpy.ndarray or (~numpy.ndarray, dict):
        The cropped image and, if requested, a dict with keys
        ``'constraint'`` (:obj:`None` for the centered window),
        ``'y_slice'`` and ``'x_slice'``.
    """
    if constraints is None:
        constraints = (
            (0.1, None),
            (0.3, None),
            (0.5, None),
            (0.7, None),
            (0.9, None),
            (None, 1),
        )

    _, H, W = img.shape

    # The signature defaults to None, which int() cannot convert; treat
    # a missing size as "use the whole axis".
    crop_h = H if crop_height is None else int(crop_height)
    crop_w = W if crop_width is None else int(crop_width)

    diff_h = int((H - crop_h) / 2.)
    diff_w = int((W - crop_w) / 2.)

    # The centered window is the fallback when no constraint is met.
    params = [{
        'constraint': None,
        'y_slice': slice(diff_h, diff_h + crop_h),
        'x_slice': slice(diff_w, diff_w + crop_w)}]

    if len(bbox) == 0:
        constraints = list()

    range_H = H - crop_h
    range_W = W - crop_w

    for min_iou, max_iou in constraints:
        if min_iou is None:
            min_iou = 0
        if max_iou is None:
            max_iou = 1

        for _ in six.moves.range(max_trial):
            # randrange(0) raises, so a window spanning the whole axis
            # can only start at 0.
            crop_t = 0 if range_H == 0 else random.randrange(range_H)
            crop_l = 0 if range_W == 0 else random.randrange(range_W)
            crop_bb = np.array((
                crop_t, crop_l, crop_t + crop_h, crop_l + crop_w))

            iou = utils.bbox_iou(bbox, crop_bb[np.newaxis])
            # The lower bound is strict: every box must exceed min_iou.
            if min_iou < iou.min() and iou.max() <= max_iou:
                params.append({
                    'constraint': (min_iou, max_iou),
                    'y_slice': slice(crop_t, crop_t + crop_h),
                    'x_slice': slice(crop_l, crop_l + crop_w)})
                break

    param = random.choice(params)
    img = img[:, param['y_slice'], param['x_slice']]

    if return_param:
        return img, param
    else:
        return img
def calc_loss(self, image_size, predicted_grids, gt_bbox_points, objectness_scores):
    """Compute the box regression loss and the objectness loss.

    Args:
        image_size: Forwarded to ``self.get_corners``.
        predicted_grids: Forwarded to ``self.get_corners``; its dtype is
            also used for the zero loss returned when there is no
            ground truth.
        gt_bbox_points: Iterable of ground-truth boxes.
            Presumably each holds four coordinates in the same order as
            the boxes built in step 1 -- TODO confirm.
        objectness_scores: Scores passed to ``F.softmax_cross_entropy``;
            its length determines the number of objectness labels.

    Returns:
        ``(F.mean(loss), objectness_loss)``.

        NOTE(review): when ``gt_bbox_points`` is empty a single zero
        ``Variable`` is returned instead of a pair -- confirm that
        callers handle both shapes.
    """
    predicted_bbox_points = self.get_corners(predicted_grids, image_size, scale_to_image_size=False)

    # 1. transform box coordinates to aabb coordinates for determination of iou
    # (entries 0, 3, 1 and 5 of the corner tuple are stacked along axis 1)
    predicted_bbox_points = predicted_bbox_points[0], predicted_bbox_points[3], predicted_bbox_points[1], predicted_bbox_points[5]
    predicted_bbox_points = F.stack(predicted_bbox_points, axis=1)

    # 2. find best prediction area for each gt bbox
    gt_bboxes_to_use_for_loss = []
    positive_anchor_indices = self.xp.empty((0,), dtype=self.xp.int32)
    not_contributing_anchors = self.xp.empty((0,), dtype=self.xp.int32)
    for index, gt_bbox in enumerate(gt_bbox_points):
        # determine which bboxes are positive boxes as they have high iou with gt and also which bboxes are negative
        # this is also used to train objectness classification
        # The gt box is repeated once per prediction; every row is identical,
        # which is why only ious[0] is read below.
        gt_bbox = self.xp.tile(gt_bbox[None, ...], (len(predicted_bbox_points), 1))

        ious = bbox_iou(gt_bbox, predicted_bbox_points.data)
        # IoU >= 0.7: positive; 0.3 < IoU < 0.7: excluded from the objectness loss.
        positive_boxes = self.xp.where((ious[0] >= 0.7))
        not_contributing_boxes = self.xp.where(self.xp.logical_and(0.3 < ious[0], ious[0] < 0.7))
        if len(positive_boxes[0]) == 0:
            # No prediction reaches 0.7: fall back to the single best one so
            # that every gt box contributes at least one positive.
            best_iou_index = ious[0, :].argmax()
            positive_anchor_indices = self.xp.concatenate((positive_anchor_indices, best_iou_index[None, ...]), axis=0)
            gt_bboxes_to_use_for_loss.append(gt_bbox[0])
        else:
            positive_anchor_indices = self.xp.concatenate((positive_anchor_indices, positive_boxes[0]), axis=0)
            # One copy of the gt box per positive prediction.
            gt_bboxes_to_use_for_loss.extend(gt_bbox[:len(positive_boxes[0])])
        # NOTE(review): nesting reconstructed from collapsed source; this
        # statement is assumed to run for every gt box.
        not_contributing_anchors = self.xp.concatenate((not_contributing_anchors, not_contributing_boxes[0]), axis=0)

    if len(gt_bboxes_to_use_for_loss) == 0:
        return Variable(self.xp.array(0, dtype=predicted_grids.dtype))

    gt_bboxes_to_use_for_loss = F.stack(gt_bboxes_to_use_for_loss)

    # filter predicted bboxes and only keep bboxes from those regions that actually contain a bbox
    predicted_bbox_points = F.get_item(predicted_bbox_points, positive_anchor_indices)

    # 3. calculate the bbox regression loss (the code uses F.huber_loss
    # with a third argument of 1, not a plain L1 loss)
    loss = F.huber_loss(
        predicted_bbox_points,
        gt_bboxes_to_use_for_loss,
        1
    )

    # 4. calculate objectness loss
    # Labels: 0 by default, -1 (ignored) for the in-between boxes, 1 for
    # positives. Positives are written last, so they win over a -1 that
    # targets the same index.
    objectness_labels = self.xp.zeros(len(objectness_scores), dtype=self.xp.int32)
    objectness_labels[not_contributing_anchors] = -1
    objectness_labels[positive_anchor_indices] = 1

    objectness_loss = F.softmax_cross_entropy(
        objectness_scores,
        objectness_labels,
        ignore_label=-1,
    )

    return F.mean(loss), objectness_loss
def rebase_sst(self, s_in, s_st, bboxes):
    """Sort each ``s_st`` mask stack by how well it completes the box.

    For every ``(sin, sst, bbox)`` triple, each mask in ``sst`` is OR-ed
    with ``sin``, the bounding box of that union is computed, and the
    masks are reordered by descending IoU between that box and ``bbox``.

    Returns:
        list: The reordered mask stacks, one per input triple.
    """
    reordered = []
    for inside, candidates, bbox in zip(s_in, s_st, bboxes):
        count, height, width = candidates.shape
        merged = np.empty((count, height, width), dtype=np.float32)
        for k in range(count):
            merged[k] = np.bitwise_or(inside, candidates[k])

        merged_boxes = mask_to_bbox(merged)
        target = np.array([bbox])
        scores = np.squeeze(bbox_iou(merged_boxes, target))

        # Highest IoU first.
        ranking = np.argsort(scores, axis=0)[::-1]
        reordered.append(candidates[ranking])
    return reordered
def box_alignment(self, img, bboxes, masks, boxes):
    """Greedily grow each initial mask to improve its fit to its box.

    For every box, masks from the IoU-sorted ``s_st`` stack are OR-ed
    into the initial mask one at a time, for as long as the IoU between
    the grown mask's bounding box and the target box does not decrease.

    Args:
        img, bboxes, masks, boxes: Forwarded to
            ``self.get_initial_sets``; ``bboxes`` also supplies the
            target box for each mask.

    Returns:
        tuple: ``(final_boxes, final_masks, added_superpixel_masks)``.
        The first two are NumPy arrays and the third is a list. Three
        empty lists are returned when either initial set is empty.
    """
    s_in, s_st = self.get_initial_sets(img, bboxes, masks, boxes)

    if len(s_in) == 0 or len(s_st) == 0:
        return [], [], []

    # Sort every candidate stack so the most promising mask comes first.
    s_st = self.rebase_sst(s_in, s_st, bboxes)

    final_boxes = []
    final_masks = []
    added_superpixel_masks = []
    for bbox, sin, sst in zip(bboxes, s_in, s_st):
        s = sin
        # Zero-dimensional entries carry no mask; skip them.
        if s.ndim == 0:
            continue
        assert len(sst) >= 1, "No straddling boxes are found"

        proc = 0
        new_superpixels = np.zeros_like(s)
        # ``s`` is the accepted mask; ``new_s`` is the pending candidate
        # that includes one more entry of ``sst``.
        new_s = np.bitwise_or(s, sst[0])
        iou_old = bbox_iou(mask_to_bbox(np.array([s])), np.array([bbox]))[0][0]
        iou_new = bbox_iou(mask_to_bbox(np.array([new_s])), np.array([bbox]))[0][0]
        for sk in sst[1:]:
            # Stop as soon as the pending candidate is worse than the
            # accepted mask.
            if iou_old > iou_new:
                break
            iou_old = iou_new
            s = new_s
            new_s = np.bitwise_or(s, sk)
            iou_new = bbox_iou(mask_to_bbox(np.array([new_s])), np.array([bbox]))[0][0]
            proc += 1
            # NOTE(review): this records ``sk``, which is only pending at
            # this point, while the entry just accepted into ``s`` is the
            # previous one -- confirm this bookkeeping is intended.
            new_superpixels = np.bitwise_or(new_superpixels, sk)

        final_masks.append(s)
        final_boxes.append(mask_to_bbox(np.array([s]))[-1])
        added_superpixel_masks.append(new_superpixels.astype(np.int32))

        # NOTE(review): nesting reconstructed from collapsed source; the
        # report is assumed to be printed once per box.
        if self.verbosity:
            print('No. of superpixels added: {:2d}'.format(proc))

    final_masks, final_boxes = np.array(final_masks), np.array(final_boxes)
    return final_boxes, final_masks, added_superpixel_masks
def encode(self, bbox, label, iou_thresh=0.5):
    """Encode ground-truth boxes and labels against the default boxes.

    Each ground-truth box is first matched to a distinct default box by
    repeatedly taking the largest remaining IoU. Unmatched default boxes
    whose best IoU is at least ``iou_thresh`` are then assigned to their
    best ground-truth box. Everything else is background.

    Args:
        bbox (array): Ground-truth boxes of shape ``(R, 4)`` given as
            ``(min, min, max, max)`` corners.
        label (array): Class of each ground-truth box, shape ``(R,)``.
        iou_thresh (float): Minimum IoU for the second matching stage.

    Returns:
        tuple: ``(mb_loc, mb_label)`` as float32 and int32 arrays with
        one row per default box. Label 0 is background; foreground
        labels are shifted up by one. Both are all zeros when ``bbox``
        is empty.
    """
    xp = self.xp
    anchors = self._default_bbox

    if len(bbox) == 0:
        return (
            xp.zeros(anchors.shape, dtype=np.float32),
            xp.zeros(anchors.shape[0], dtype=np.int32))

    # Default boxes are stored as (center, size); convert to corners.
    corner_form = xp.hstack(
        (anchors[:, :2] - anchors[:, 2:] / 2,
         anchors[:, :2] + anchors[:, 2:] / 2))
    iou = utils.bbox_iou(corner_form, bbox)

    # -1 marks background.
    index = xp.full(len(anchors), -1, dtype=int)

    remaining = iou.copy()
    while True:
        row, col = xp.unravel_index(remaining.argmax(), remaining.shape)
        if remaining[row, col] < 1e-6:
            break
        index[row] = col
        # Neither this default box nor this ground truth may match again.
        remaining[row, :] = 0
        remaining[:, col] = 0

    unmatched = xp.logical_and(index < 0, iou.max(axis=1) >= iou_thresh)
    index[unmatched] = iou[unmatched].argmax(axis=1)

    mb_bbox = bbox[index].copy()
    # corners -> (min, size) -> (center, size)
    mb_bbox[:, 2:] -= mb_bbox[:, :2]
    mb_bbox[:, :2] += mb_bbox[:, 2:] / 2

    center_var, size_var = self._variance[0], self._variance[1]
    mb_loc = xp.empty_like(mb_bbox)
    mb_loc[:, :2] = (mb_bbox[:, :2] - anchors[:, :2]) / \
        (center_var * anchors[:, 2:])
    mb_loc[:, 2:] = xp.log(mb_bbox[:, 2:] / anchors[:, 2:]) / \
        size_var

    mb_label = label[index] + 1
    mb_label[index < 0] = 0

    return mb_loc.astype(np.float32), mb_label.astype(np.int32)
def get_loss(self, g_bboxes, g_labels, p_bboxes, p_confs, p_objs ): """ Generate loss """ b_loss = 0 c_loss = 0 p_loss = 0 for g_bbox, g_label, p_bbox, p_conf, p_obj in zip( g_bboxes, g_labels, p_bboxes, p_confs, p_objs ): IoU = bbox_iou(g_bbox, p_bbox) pick = self.xp.argmax(IoU, axis=-1) p_bbox = p_bbox[pick] p_conf = p_conf[pick] p_obj = p_obj[pick] b_loss += F.sum((p_bbox - g_bbox) ** 2) c_loss += F.sum((p_conf - ))
def get_naive_zoom(image, paste_x, paste_y, stamp):
    """Crop a randomly zoomed-out window around a pasted stamp.

    The window is the stamp size enlarged by a random factor and capped
    at the image size. Its origin is drawn between the bounds computed
    from the paste position; if those bounds cross, the lower one is
    used.

    Args:
        image: Object with ``width``, ``height`` and ``crop``.
        paste_x, paste_y: Top-left corner of the pasted stamp.
        stamp: Object with ``width`` and ``height``.

    Returns:
        tuple: The cropped image and the IoU between the crop window and
        the stamp's box, both expressed as ``(x0, y0, x1, y1)``.
    """
    zoom = random.random() * 10 + 0.3
    crop_width = min(stamp.width + zoom * stamp.width, image.width)
    crop_height = min(stamp.height + zoom * stamp.height, image.height)

    # Drawn in (horizontal, vertical) order.
    ratios = (random.random(), random.random())

    upper_bounds = (
        min(paste_x, image.width - crop_width),
        min(paste_y, image.height - crop_height),
    )
    lower_bounds = (
        max(paste_x + stamp.width - crop_width, 0),
        max(paste_y + stamp.height - crop_height, 0),
    )

    origin = []
    for low, high, ratio in zip(lower_bounds, upper_bounds, ratios):
        if high < low:
            high = low
        origin.append(int(low + ratio * (high - low)))
    left, top = origin

    crop_bbox = [left, top, left + crop_width, top + crop_height]
    paste_bbox = np.array([
        paste_x, paste_y, paste_x + stamp.width, paste_y + stamp.height])

    stamp_with_background = image.crop(crop_bbox)
    iou = bbox_iou(np.array(crop_bbox)[None, ...], paste_bbox[None, ...])[0, 0]
    return stamp_with_background, iou
def get_iou_crop(image, paste_x, paste_y, stamp):
    """Crop a window whose IoU with the pasted stamp is close to a target.

    The target IoU is taken from the module-level ``iou_ranges`` by
    advancing the module-level ``iou_index`` on every call. Candidate
    windows are generated until one has an IoU in
    ``(desired_iou - 0.05, desired_iou]``.

    Args:
        image: Object with ``width``, ``height`` and ``crop``.
        paste_x, paste_y: Top-left corner of the pasted stamp.
        stamp: Object with ``width`` and ``height``.

    Returns:
        tuple: ``(image.crop(crop_bbox), ious[0])`` for the accepted
        window.

    Raises:
        ValueError: If no acceptable window is found within the retry
            budget.
    """
    global iou_index
    # Cycle through the configured targets; values are divided by 100
    # (presumably percentages -- TODO confirm) and capped at 1.0.
    iou_index = (iou_index + 1) % len(iou_ranges)
    desired_iou = min(iou_ranges[iou_index % len(iou_ranges)] / 100, 1.0)

    num_retries = 0
    good_bbox_found = False
    while not good_bbox_found and num_retries < 200:
        # Stamp box as (x0, y0, x1, y1) and its (width, height).
        paste_bbox = np.array([paste_x, paste_y, paste_x + stamp.width, paste_y + stamp.height])
        paste_bbox_size = paste_bbox[2:] - paste_bbox[:2]

        # The lower the target IoU, the more the crop size may deviate
        # from the stamp size.
        max_size_deviation = 1.0 - desired_iou

        for _ in range(200):
            if desired_iou < 0.3:
                # Low targets use a fixed, enlarged window capped at the
                # image size.
                crop_width = int(min(stamp.width + (1 - desired_iou) * 10 * stamp.width, image.width))
                crop_height = int(min(stamp.height + (1 - desired_iou) * 10 * stamp.height, image.height))
            else:
                crop_width = random.randint(
                    max(int(paste_bbox_size[0] - paste_bbox_size[0] * max_size_deviation), 1),
                    int(paste_bbox_size[0] + paste_bbox_size[0] * max_size_deviation)
                )
                crop_height = random.randint(
                    max(int(paste_bbox_size[1] - paste_bbox_size[1] * max_size_deviation), 1),
                    int(paste_bbox_size[1] + paste_bbox_size[1] * max_size_deviation)
                )

            crop_bbox = iou_crop(image, paste_bbox, crop_width, crop_height, desired_iou)
            ious = bbox_iou(crop_bbox[None, ...], paste_bbox[None, ...])[0]
            largest_iou = abs(np.max(ious))
            if desired_iou - 0.05 < largest_iou <= desired_iou:
                good_bbox_found = True
                break
        # NOTE(review): nesting reconstructed from collapsed source; the
        # counter is assumed to advance once per outer iteration.
        num_retries += 1

    if good_bbox_found is False:
        raise ValueError("No Good BBOX Found")

    return image.crop(crop_bbox), ious[0]
def test_random_crop_with_bbox_constraints(self):
    """Verify crop contents, size limits and IoU bounds of a random crop.

    When no constraint was chosen the image must come back untouched.
    Otherwise the crop must match the reported slices, stay within the
    requested scale and aspect-ratio limits (with one pixel of slack for
    rounding) and satisfy the chosen IoU bounds.
    """
    source = np.random.randint(0, 256, size=(3, 480, 640)).astype(np.float32)
    boxes = generate_random_bbox(10, source.shape[1:], 0.1, 0.9)

    cropped, info = random_crop_with_bbox_constraints(
        source, boxes,
        min_scale=0.3, max_scale=1, max_aspect_ratio=2,
        return_param=True)

    chosen = info['constraint']
    if chosen is None:
        np.testing.assert_equal(cropped, source)
        return

    rows, cols = info['y_slice'], info['x_slice']
    np.testing.assert_equal(cropped, source[:, rows, cols])

    channels, out_h, out_w = cropped.shape
    _, src_h, src_w = source.shape

    # to ignore rounding error, add 1
    self.assertGreaterEqual(
        channels * (out_h + 1) * (out_w + 1),
        source.size * 0.3 * 0.3)
    self.assertLessEqual(cropped.size, source.size * 1 * 1)
    self.assertLessEqual(out_h / (out_w + 1), src_h / src_w * 2)
    self.assertLessEqual(out_w / (out_h + 1), src_w / src_h * 2)

    window = np.array((rows.start, cols.start, rows.stop, cols.stop))
    overlap = bbox_iou(window[np.newaxis], boxes)
    lower, upper = chosen
    if lower:
        self.assertGreaterEqual(overlap.min(), lower)
    if upper:
        self.assertLessEqual(overlap.max(), upper)
def test_bbox_iou_invalid(self):
    """``bbox_iou`` must raise ``IndexError`` for this fixture's boxes."""
    boxes_a = np.array(self.bbox_a, dtype=np.float32)
    boxes_b = np.array(self.bbox_b, dtype=np.float32)
    self.assertRaises(IndexError, bbox_iou, boxes_a, boxes_b)
def check(self, bbox_a, bbox_b, expected):
    """Compare ``bbox_iou(bbox_a, bbox_b)`` with ``expected``.

    The result has to be an instance of ``type(expected)``. Values are
    compared after both arrays are transferred to the host.
    """
    result = bbox_iou(bbox_a, bbox_b)
    self.assertIsInstance(result, type(expected))

    host_result = cuda.to_cpu(result)
    host_expected = cuda.to_cpu(expected)
    np.testing.assert_equal(host_result, host_expected)
def head_loss_pre(rois, roi_indices, std, bboxes, labels):
    """Assign ground truth to RoIs and sample them for the head loss.

    Args:
        rois (iterable of arrays): One array of RoIs per feature level,
            each of shape ``(R_l, 4)``.
        roi_indices (iterable of arrays): Image index of every RoI, one
            array per feature level.
        std (tuple of floats): Two coefficients dividing the encoded
            center offsets and log size ratios respectively.
        bboxes (list of arrays): Ground-truth boxes per image.
        labels (list of arrays): Ground-truth labels per image.

    Returns:
        tuple of four lists, each with one entry per feature level:
        the sampled ``rois``, their ``roi_indices``, the encoded targets
        ``gt_locs`` and the labels ``gt_labels`` (0 is background,
        foreground labels are shifted up by one).
    """
    thresh = 0.5
    batchsize_per_image = 512
    fg_ratio = 0.25

    xp = cuda.get_array_module(*rois)

    n_level = len(rois)
    # Remember which level every RoI came from. hstack needs a real
    # sequence: passing a generator is deprecated in NumPy and rejected
    # by newer releases.
    roi_levels = xp.hstack(
        [xp.array((level,) * len(rois[level])) for level in range(n_level)]
    ).astype(np.int32)
    rois = xp.vstack(rois).astype(np.float32)
    roi_indices = xp.hstack(roi_indices).astype(np.int32)

    rois_yx = (rois[:, 2:] + rois[:, :2]) / 2
    rois_hw = rois[:, 2:] - rois[:, :2]
    indices = np.unique(cuda.to_cpu(roi_indices))

    gt_locs = xp.empty_like(rois)
    gt_labels = xp.empty_like(roi_indices)
    for i in indices:
        mask = roi_indices == i

        if len(bboxes[i]) > 0:
            iou = utils.bbox_iou(rois[mask], bboxes[i])
            gt_index = iou.argmax(axis=1)

            gt_loc = bboxes[i][gt_index].copy()
        else:
            # No ground truth: the values are placeholders, and every
            # RoI of this image is labelled background below.
            gt_loc = xp.empty_like(rois[mask])
        # tlbr -> yxhw
        gt_loc[:, 2:] -= gt_loc[:, :2]
        gt_loc[:, :2] += gt_loc[:, 2:] / 2
        # offset
        gt_loc[:, :2] = (gt_loc[:, :2] - rois_yx[mask]) / \
            rois_hw[mask] / std[0]
        gt_loc[:, 2:] = xp.log(gt_loc[:, 2:] / rois_hw[mask]) / std[1]

        if len(bboxes[i]) > 0:
            gt_label = labels[i][gt_index] + 1
            gt_label[iou.max(axis=1) < thresh] = 0
        else:
            gt_label = xp.zeros(int(mask.sum()), dtype=np.int32)

        # Label -1 marks RoIs discarded by the sampling below.
        fg_index = xp.where(gt_label > 0)[0]
        n_fg = int(batchsize_per_image * fg_ratio)
        if len(fg_index) > n_fg:
            gt_label[_choice(fg_index, size=len(fg_index) - n_fg)] = -1

        bg_index = xp.where(gt_label == 0)[0]
        n_bg = batchsize_per_image - int((gt_label > 0).sum())
        if len(bg_index) > n_bg:
            gt_label[_choice(bg_index, size=len(bg_index) - n_bg)] = -1

        gt_locs[mask] = gt_loc
        gt_labels[mask] = gt_label

    # Drop the discarded RoIs, then split everything back per level.
    keep = gt_labels >= 0
    rois = rois[keep]
    roi_indices = roi_indices[keep]
    roi_levels = roi_levels[keep]
    gt_locs = gt_locs[keep]
    gt_labels = gt_labels[keep]

    masks = [roi_levels == level for level in range(n_level)]
    rois = [rois[m] for m in masks]
    roi_indices = [roi_indices[m] for m in masks]
    gt_locs = [gt_locs[m] for m in masks]
    gt_labels = [gt_labels[m] for m in masks]

    return rois, roi_indices, gt_locs, gt_labels
def random_crop_with_bbox_constraints(
        img, bbox, min_scale=0.3, max_scale=1,
        max_aspect_ratio=2, constraints=None,
        max_trial=50, return_param=False):
    """Crop an image randomly with bounding box constraints.

    This data augmentation is used in training of
    Single Shot Multibox Detector [#]_. More details can be found in
    data augmentation section of the original paper.

    .. [#] Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy,
       Scott Reed, Cheng-Yang Fu, Alexander C. Berg.
       SSD: Single Shot MultiBox Detector. ECCV 2016.

    Args:
        img (~numpy.ndarray): An image array to be cropped. This is in
            CHW format.
        bbox (~numpy.ndarray): Bounding boxes used for constraints.
            The shape is :math:`(R, 4)`.
            :math:`R` is the number of bounding boxes.
        min_scale (float): The minimum ratio between a cropped
            region and the original image. The default value is :obj:`0.3`.
        max_scale (float): The maximum ratio between a cropped
            region and the original image. The default value is :obj:`1`.
        max_aspect_ratio (float): The maximum aspect ratio of cropped region.
            The default value is :obj:`2`.
        constraints (iterable of tuples): An iterable of constraints.
            Each constraint should be :obj:`(min_iou, max_iou)` format.
            If you set :obj:`min_iou` or :obj:`max_iou` to :obj:`None`,
            it means not limited.
            If this argument is not specified, :obj:`((0.1, None), (0.3, None),
            (0.5, None), (0.7, None), (0.9, None), (None, 1))` will be used.
        max_trial (int): The maximum number of trials to be conducted
            for each constraint. If this function
            can not find any region that satisfies the constraint in
            :math:`max\\_trial` trials, this function skips the constraint.
            The default value is :obj:`50`.
        return_param (bool): If :obj:`True`, this function returns
            information of intermediate values.

    Returns:
        ~numpy.ndarray or (~numpy.ndarray, dict):

        If :obj:`return_param = False`,
        returns an array :obj:`img` that is cropped from the input
        array.

        If :obj:`return_param = True`,
        returns a tuple whose elements are :obj:`img, param`.
        :obj:`param` is a dictionary of intermediate parameters whose
        contents are listed below with key, value-type and the description
        of the value.

        * **constraint** (*tuple*): The chosen constraint. This is \
            :obj:`None` when the whole image is returned.
        * **y_slice** (*slice*): A slice in vertical direction used to crop \
            the input image.
        * **x_slice** (*slice*): A slice in horizontal direction used to crop \
            the input image.

    """

    if constraints is None:
        constraints = (
            (0.1, None),
            (0.3, None),
            (0.5, None),
            (0.7, None),
            (0.9, None),
            (None, 1),
        )

    _, H, W = img.shape
    # Returning the whole image is always one of the candidates.
    params = [{
        'constraint': None,
        'y_slice': slice(0, H), 'x_slice': slice(0, W)}]

    if len(bbox) == 0:
        constraints = []

    for min_iou, max_iou in constraints:
        if min_iou is None:
            min_iou = 0
        if max_iou is None:
            max_iou = 1

        for _ in six.moves.range(max_trial):
            scale = random.uniform(min_scale, max_scale)
            aspect_ratio = random.uniform(
                max(1 / max_aspect_ratio, scale * scale),
                min(max_aspect_ratio, 1 / (scale * scale)))
            crop_h = int(H * scale / np.sqrt(aspect_ratio))
            crop_w = int(W * scale * np.sqrt(aspect_ratio))

            # randrange(0) raises ValueError, so a crop that spans the
            # whole axis (e.g. min_scale == max_scale == 1) must start
            # at 0.
            crop_t = 0 if crop_h == H else random.randrange(H - crop_h)
            crop_l = 0 if crop_w == W else random.randrange(W - crop_w)
            crop_bb = np.array((
                crop_t, crop_l, crop_t + crop_h, crop_l + crop_w))

            iou = utils.bbox_iou(bbox, crop_bb[np.newaxis])
            if min_iou <= iou.min() and iou.max() <= max_iou:
                # Keep the first crop that satisfies this constraint.
                params.append({
                    'constraint': (min_iou, max_iou),
                    'y_slice': slice(crop_t, crop_t + crop_h),
                    'x_slice': slice(crop_l, crop_l + crop_w)})
                break

    param = random.choice(params)
    img = img[:, param['y_slice'], param['x_slice']]

    if return_param:
        return img, param
    else:
        return img
def random_crop_with_bbox_constraints(img,
                                      bbox,
                                      min_scale=0.3,
                                      max_scale=1,
                                      max_aspect_ratio=2,
                                      constraints=None,
                                      max_trial=50,
                                      return_param=False):
    """Crop an image randomly with bounding box constraints.

    For each constraint, up to ``max_trial`` random crops are tried and
    the first one whose IoU with every box satisfies the constraint
    becomes a candidate. The whole image is always a candidate as well.
    One candidate is chosen uniformly at random.

    Unlike a plain ``min_iou <= iou`` check, the lower bound here is
    strict. For the unrestricted constraint (bounds 0 and 1), the scale
    is drawn from ``[0.9, max_scale]``.

    Args:
        img (~numpy.ndarray): Image in CHW format.
        bbox (~numpy.ndarray): Boxes of shape :math:`(R, 4)`. With no
            boxes the image is returned uncropped.
        min_scale (float): Lower bound of the sampled scale.
        max_scale (float): Upper bound of the sampled scale.
        max_aspect_ratio (float): Maximum aspect ratio of the crop.
        constraints (iterable of tuples): ``(min_iou, max_iou)`` pairs;
            :obj:`None` in a pair means 0 or 1 respectively. Defaults to
            ``((0.1, None), (0.3, None), (0.5, None), (0.7, None),
            (0.9, None), (None, 1))``.
        max_trial (int): Number of crops tried per constraint.
        return_param (bool): If :obj:`True`, also return the chosen
            parameters.

    Returns:
        ~numpy.ndarray or (~numpy.ndarray, dict):
        The cropped image and, if requested, a dict with keys
        ``'constraint'`` (:obj:`None` for the whole image),
        ``'y_slice'`` and ``'x_slice'``.
    """
    if constraints is None:
        constraints = (
            (0.1, None),
            (0.3, None),
            (0.5, None),
            (0.7, None),
            (0.9, None),
            (None, 1),
        )

    _, H, W = img.shape
    params = [{
        'constraint': None,
        'y_slice': slice(0, H),
        'x_slice': slice(0, W)
    }]

    if len(bbox) == 0:
        constraints = list()

    for min_iou, max_iou in constraints:
        if min_iou is None:
            min_iou = 0
        if max_iou is None:
            max_iou = 1

        for _ in six.moves.range(max_trial):
            if min_iou == 0 and max_iou == 1:
                # Disregard IoU and pick a value such that the crop is
                # sure to contain the entire bounding box.
                scale = random.uniform(0.9, max_scale)
            else:
                scale = random.uniform(min_scale, max_scale)
            aspect_ratio = random.uniform(
                max(1 / max_aspect_ratio, scale * scale),
                min(max_aspect_ratio, 1 / (scale * scale)))
            crop_h = int(H * scale / np.sqrt(aspect_ratio))
            crop_w = int(W * scale * np.sqrt(aspect_ratio))

            # randrange(0) raises ValueError, so a crop that spans the
            # whole axis must start at 0.
            crop_t = 0 if crop_h == H else random.randrange(H - crop_h)
            crop_l = 0 if crop_w == W else random.randrange(W - crop_w)
            crop_bb = np.array(
                (crop_t, crop_l, crop_t + crop_h, crop_l + crop_w))

            iou = utils.bbox_iou(bbox, crop_bb[np.newaxis])
            if min_iou < iou.min() and iou.max() <= max_iou:
                params.append({
                    'constraint': (min_iou, max_iou),
                    'y_slice': slice(crop_t, crop_t + crop_h),
                    'x_slice': slice(crop_l, crop_l + crop_w)
                })
                break

    param = random.choice(params)
    img = img[:, param['y_slice'], param['x_slice']]

    if return_param:
        return img, param
    else:
        return img
def rpn_loss(locs, confs, anchors, sizes, bboxes):
    """Loss function for RPN.

    Args:
        locs (iterable of arrays): An iterable of arrays whose shape is
            :math:`(N, K_l, 4)`, where :math:`K_l` is the number of
            the anchor boxes of the :math:`l`-th level.
        confs (iterable of arrays): An iterable of arrays whose shape is
            :math:`(N, K_l)`.
        anchors (list of arrays): A list of arrays returned by
            :meth:`anchors`.
        sizes (list of tuples of two ints): A list of
            :math:`(H_n, W_n)`, where :math:`H_n` and :math:`W_n`
            are height and width of the :math:`n`-th image.
        bboxes (list of arrays): A list of arrays whose shape is
            :math:`(R_n, 4)`, where :math:`R_n` is the number of
            ground truth bounding boxes.

    Returns:
        tuple of two variables:
        :obj:`loc_loss` and :obj:`conf_loss`.
    """
    fg_thresh = 0.7
    bg_thresh = 0.3
    batchsize_per_image = 256
    fg_ratio = 0.25

    locs = F.concat(locs)
    confs = F.concat(confs)

    xp = cuda.get_array_module(locs.array, confs.array)

    # Anchors of all levels in one array, plus their centers and sizes.
    anchors = xp.vstack(anchors)
    anchors_yx = (anchors[:, 2:] + anchors[:, :2]) / 2
    anchors_hw = anchors[:, 2:] - anchors[:, :2]

    loc_loss = 0
    conf_loss = 0
    for i in range(len(sizes)):
        if len(bboxes[i]) > 0:
            iou = utils.bbox_iou(anchors, bboxes[i])

            # Regression target: the ground-truth box with the highest
            # IoU for each anchor.
            gt_loc = bboxes[i][iou.argmax(axis=1)].copy()
            # tlbr -> yxhw
            gt_loc[:, 2:] -= gt_loc[:, :2]
            gt_loc[:, :2] += gt_loc[:, 2:] / 2
            # offset
            gt_loc[:, :2] = (gt_loc[:, :2] - anchors_yx) / anchors_hw
            gt_loc[:, 2:] = xp.log(gt_loc[:, 2:] / anchors_hw)
        else:
            # No ground truth: placeholder values; no anchor is labelled
            # foreground below, so none of them is selected.
            gt_loc = xp.empty_like(anchors)

        # Labels: -1 = ignored, 0 = background, 1 = foreground.
        gt_label = xp.empty(len(anchors), dtype=np.int32)
        gt_label[:] = -1

        # Only anchors lying entirely inside the image take part.
        mask = xp.logical_and(
            anchors[:, :2] >= 0,
            anchors[:, 2:] < xp.array(sizes[i])).all(axis=1)

        if len(bboxes[i]) > 0:
            # Foreground: for every ground-truth box the in-image
            # anchor(s) with the highest IoU, plus every in-image anchor
            # whose best IoU reaches fg_thresh.
            gt_label[xp.where(mask)[0]
                     [(iou[mask] == iou[mask].max(axis=0)).any(axis=1)]] = 1
            gt_label[xp.logical_and(mask, iou.max(axis=1) >= fg_thresh)] = 1

        # Cap the number of foreground anchors; the surplus is ignored.
        fg_index = xp.where(gt_label == 1)[0]
        n_fg = int(batchsize_per_image * fg_ratio)
        if len(fg_index) > n_fg:
            gt_label[choice(fg_index, size=len(fg_index) - n_fg)] = -1

        if len(bboxes[i]) > 0:
            bg_index = xp.where(
                xp.logical_and(mask, iou.max(axis=1) < bg_thresh))[0]
        else:
            bg_index = xp.where(mask)[0]
        n_bg = batchsize_per_image - int((gt_label == 1).sum())
        # NOTE(review): background labels are only written when there
        # are more candidates than n_bg; randint draws with replacement,
        # and a drawn index that is already foreground is overwritten
        # with 0 -- confirm this is intended.
        if len(bg_index) > n_bg:
            gt_label[bg_index[
                xp.random.randint(len(bg_index), size=n_bg)]] = 0

        # Both losses are normalised by the number of sampled anchors.
        n_sample = (gt_label >= 0).sum()
        loc_loss += F.sum(smooth_l1(
            locs[i][gt_label == 1], gt_loc[gt_label == 1], 1 / 9)) / n_sample
        conf_loss += F.sum(F.sigmoid_cross_entropy(
            confs[i][gt_label >= 0], gt_label[gt_label >= 0], reduce='no')) \
            / n_sample

    # Average over the images of the batch.
    loc_loss /= len(sizes)
    conf_loss /= len(sizes)

    return loc_loss, conf_loss
def encode(self, bbox, label, iou_thresh=0.5):
    """Encode ground-truth boxes and classes for the multibox loss.

    :obj:`bbox` and :obj:`label` are converted into :obj:`mb_loc` and
    :obj:`mb_label`, one row per default bounding box.

    Args:
        bbox (array): A float array of shape :math:`(R, 4)`, where
            :math:`R` is the number of bounding boxes in an image. Each
            bounding box is organized by
            :math:`(y_{min}, x_{min}, y_{max}, x_{max})`
            in the second axis.
        label (array): An integer array of shape :math:`(R,)` holding
            the class of each bounding box.
        iou_thresh (float): Minimum IoU for a default bounding box that
            was not matched one-to-one to still be assigned to a ground
            truth. The default value is :obj:`0.5`.

    Returns:
        tuple of two arrays:
        :obj:`(mb_loc, mb_label)`.

        * **mb_loc**: A float32 array of shape :math:`(K, 4)`, where
          :math:`K` is the number of default bounding boxes.
        * **mb_label**: An int32 array of shape :math:`(K,)`. Zero means
          background; foreground classes are shifted up by one.

    """
    xp = self.xp
    defaults = self._default_bbox

    if len(bbox) == 0:
        empty_loc = xp.zeros(defaults.shape, dtype=np.float32)
        empty_label = xp.zeros(defaults.shape[0], dtype=np.int32)
        return empty_loc, empty_label

    # (center_y, center_x, height, width) -> corner representation
    default_corners = xp.hstack(
        (defaults[:, :2] - defaults[:, 2:] / 2,
         defaults[:, :2] + defaults[:, 2:] / 2))
    iou = utils.bbox_iou(default_corners, bbox)

    # -1 is for background
    index = xp.full(len(defaults), -1, dtype=int)

    # Stage 1: one-to-one matching by largest remaining IoU.
    candidates = iou.copy()
    while True:
        default_id, gt_id = _unravel_index(
            candidates.argmax(), candidates.shape)
        if candidates[default_id, gt_id] <= 1e-6:
            break
        index[default_id] = gt_id
        candidates[default_id, :] = 0
        candidates[:, gt_id] = 0

    # Stage 2: remaining default boxes with enough overlap.
    still_free = xp.logical_and(index < 0, iou.max(axis=1) >= iou_thresh)
    index[still_free] = iou[still_free].argmax(axis=1)

    mb_bbox = bbox[index].copy()
    # (y_min, x_min, y_max, x_max) -> (y_min, x_min, height, width)
    mb_bbox[:, 2:] -= mb_bbox[:, :2]
    # (y_min, x_min, height, width) -> (center_y, center_x, height, width)
    mb_bbox[:, :2] += mb_bbox[:, 2:] / 2

    center_var, size_var = self._variance[0], self._variance[1]
    mb_loc = xp.empty_like(mb_bbox)
    mb_loc[:, :2] = (mb_bbox[:, :2] - defaults[:, :2]) / \
        (center_var * defaults[:, 2:])
    mb_loc[:, 2:] = xp.log(mb_bbox[:, 2:] / defaults[:, 2:]) / \
        size_var

    # [0, n_fg_class - 1] -> [1, n_fg_class]
    mb_label = label[index] + 1
    # 0 is for background
    mb_label[index < 0] = 0

    return mb_loc.astype(np.float32), mb_label.astype(np.int32)
def random_crop_with_bbox_constraints(img,
                                      bbox,
                                      min_scale=0.3,
                                      max_scale=1,
                                      max_aspect_ratio=2,
                                      constraints=None,
                                      max_trial=50,
                                      return_param=False):
    """Crop an image randomly with bounding box constraints.

    This data augmentation is used in training of
    Single Shot Multibox Detector [#]_. More details can be found in
    data augmentation section of the original paper.

    .. [#] Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy,
       Scott Reed, Cheng-Yang Fu, Alexander C. Berg.
       SSD: Single Shot MultiBox Detector. ECCV 2016.

    Args:
        img (~numpy.ndarray): An image array to be cropped. This is in
            CHW format.
        bbox (~numpy.ndarray): Bounding boxes used for constraints.
            The shape is :math:`(R, 4)`.
            :math:`R` is the number of bounding boxes.
        min_scale (float): The minimum ratio between a cropped
            region and the original image. The default value is :obj:`0.3`.
        max_scale (float): The maximum ratio between a cropped
            region and the original image. The default value is :obj:`1`.
        max_aspect_ratio (float): The maximum aspect ratio of cropped region.
            The default value is :obj:`2`.
        constraints (iterable of tuples): An iterable of constraints.
            Each constraint should be :obj:`(min_iou, max_iou)` format.
            If you set :obj:`min_iou` or :obj:`max_iou` to :obj:`None`,
            it means not limited.
            If this argument is not specified, :obj:`((0.1, None), (0.3, None),
            (0.5, None), (0.7, None), (0.9, None), (None, 1))` will be used.
        max_trial (int): The maximum number of trials to be conducted
            for each constraint. If this function
            can not find any region that satisfies the constraint in
            :math:`max\\_trial` trials, this function skips the constraint.
            The default value is :obj:`50`.
        return_param (bool): If :obj:`True`, this function returns
            information of intermediate values.

    Returns:
        ~numpy.ndarray or (~numpy.ndarray, dict):

        If :obj:`return_param = False`,
        returns an array :obj:`img` that is cropped from the input
        array.

        If :obj:`return_param = True`,
        returns a tuple whose elements are :obj:`img, param`.
        :obj:`param` is a dictionary of intermediate parameters whose
        contents are listed below with key, value-type and the description
        of the value.

        * **constraint** (*tuple*): The chosen constraint. This is \
            :obj:`None` when the whole image is returned.
        * **y_slice** (*slice*): A slice in vertical direction used to crop \
            the input image.
        * **x_slice** (*slice*): A slice in horizontal direction used to crop \
            the input image.

    """

    if constraints is None:
        constraints = (
            (0.1, None),
            (0.3, None),
            (0.5, None),
            (0.7, None),
            (0.9, None),
            (None, 1),
        )

    _, H, W = img.shape
    # The uncropped image is always available as a fallback choice.
    params = [{
        'constraint': None,
        'y_slice': slice(0, H),
        'x_slice': slice(0, W)
    }]

    if len(bbox) == 0:
        constraints = list()

    for min_iou, max_iou in constraints:
        if min_iou is None:
            min_iou = 0
        if max_iou is None:
            max_iou = 1

        for _ in six.moves.range(max_trial):
            scale = random.uniform(min_scale, max_scale)
            aspect_ratio = random.uniform(
                max(1 / max_aspect_ratio, scale * scale),
                min(max_aspect_ratio, 1 / (scale * scale)))
            crop_h = int(H * scale / np.sqrt(aspect_ratio))
            crop_w = int(W * scale * np.sqrt(aspect_ratio))

            # A crop as tall or as wide as the image leaves an empty
            # range, for which randrange raises ValueError; its only
            # valid offset is 0.
            crop_t = 0 if crop_h == H else random.randrange(H - crop_h)
            crop_l = 0 if crop_w == W else random.randrange(W - crop_w)
            crop_bb = np.array(
                (crop_t, crop_l, crop_t + crop_h, crop_l + crop_w))

            iou = utils.bbox_iou(bbox, crop_bb[np.newaxis])
            if min_iou <= iou.min() and iou.max() <= max_iou:
                # The first satisfying crop ends the trials for this
                # constraint.
                params.append({
                    'constraint': (min_iou, max_iou),
                    'y_slice': slice(crop_t, crop_t + crop_h),
                    'x_slice': slice(crop_l, crop_l + crop_w)
                })
                break

    param = random.choice(params)
    img = img[:, param['y_slice'], param['x_slice']]

    if return_param:
        return img, param
    else:
        return img
def rpn_loss(locs, confs, anchors, sizes, bboxes):
    """Compute the localization and confidence losses of an RPN.

    Args:
        locs (iterable of arrays): Predicted offsets, concatenated with
            ``F.concat`` and indexed per image.
        confs (iterable of arrays): Predicted confidences, handled the
            same way as ``locs``.
        anchors (list of arrays): Anchor boxes, stacked into a single
            ``(K, 4)`` array with corners in ``(min, min, max, max)``
            order.
        sizes (list of tuples): One size per image; anchors must lie
            within it to take part in the loss.
        bboxes (list of arrays): Ground-truth boxes per image.

    Returns:
        tuple: ``(loc_loss, conf_loss)``, each averaged over the images.
    """
    fg_thresh = 0.7
    bg_thresh = 0.3
    batchsize_per_image = 256
    fg_ratio = 0.25

    locs = F.concat(locs)
    confs = F.concat(confs)

    xp = cuda.get_array_module(locs.array, confs.array)

    # Anchors of all levels in one array, plus their centers and sizes.
    anchors = xp.vstack(anchors)
    anchors_yx = (anchors[:, 2:] + anchors[:, :2]) / 2
    anchors_hw = anchors[:, 2:] - anchors[:, :2]

    loc_loss = 0
    conf_loss = 0
    for i in range(len(sizes)):
        if len(bboxes[i]) > 0:
            iou = utils.bbox_iou(anchors, bboxes[i])

            # Regression target: the ground-truth box with the highest
            # IoU for each anchor.
            gt_loc = bboxes[i][iou.argmax(axis=1)].copy()
            # tlbr -> yxhw
            gt_loc[:, 2:] -= gt_loc[:, :2]
            gt_loc[:, :2] += gt_loc[:, 2:] / 2
            # offset
            gt_loc[:, :2] = (gt_loc[:, :2] - anchors_yx) / anchors_hw
            gt_loc[:, 2:] = xp.log(gt_loc[:, 2:] / anchors_hw)
        else:
            # No ground truth: placeholder values; no anchor is labelled
            # foreground below, so none of them is selected.
            gt_loc = xp.empty_like(anchors)

        # Labels: -1 = ignored, 0 = background, 1 = foreground.
        gt_label = xp.empty(len(anchors), dtype=np.int32)
        gt_label[:] = -1

        # Only anchors lying entirely inside the image take part.
        mask = xp.logical_and(
            anchors[:, :2] >= 0,
            anchors[:, 2:] < xp.array(sizes[i])).all(axis=1)

        if len(bboxes[i]) > 0:
            # Foreground: for every ground-truth box the in-image
            # anchor(s) with the highest IoU, plus every in-image anchor
            # whose best IoU reaches fg_thresh.
            gt_label[xp.where(mask)[0]
                     [(iou[mask] == iou[mask].max(axis=0)).any(axis=1)]] = 1
            gt_label[xp.logical_and(mask, iou.max(axis=1) >= fg_thresh)] = 1

        # Cap the number of foreground anchors; the surplus is ignored.
        fg_index = xp.where(gt_label == 1)[0]
        n_fg = int(batchsize_per_image * fg_ratio)
        if len(fg_index) > n_fg:
            gt_label[_choice(fg_index, size=len(fg_index) - n_fg)] = -1

        if len(bboxes[i]) > 0:
            bg_index = xp.where(
                xp.logical_and(mask, iou.max(axis=1) < bg_thresh))[0]
        else:
            bg_index = xp.where(mask)[0]
        n_bg = batchsize_per_image - int((gt_label == 1).sum())
        # NOTE(review): background labels are only written when there
        # are more candidates than n_bg; randint draws with replacement,
        # and a drawn index that is already foreground is overwritten
        # with 0 -- confirm this is intended.
        if len(bg_index) > n_bg:
            gt_label[bg_index[
                xp.random.randint(len(bg_index), size=n_bg)]] = 0

        # Both losses are normalised by the number of sampled anchors.
        n_sample = (gt_label >= 0).sum()
        loc_loss += F.sum(smooth_l1(
            locs[i][gt_label == 1], gt_loc[gt_label == 1], 1 / 9)) / n_sample
        conf_loss += F.sum(F.sigmoid_cross_entropy(
            confs[i][gt_label >= 0], gt_label[gt_label >= 0], reduce='no')) \
            / n_sample

    # Average over the images of the batch.
    loc_loss /= len(sizes)
    conf_loss /= len(sizes)

    return loc_loss, conf_loss
def bbox_head_loss_pre(rois, roi_indices, std, bboxes, labels):
    """Loss function for Head (pre).

    This function processes RoIs for :func:`bbox_head_loss_post`.

    Args:
        rois (iterable of arrays): An iterable of arrays of
            shape :math:`(R_l, 4)`, where :math:`R_l` is the number
            of RoIs in the :math:`l`-th feature map.
        roi_indices (iterable of arrays): An iterable of arrays of
            shape :math:`(R_l,)`.
        std (tuple of floats): Two coefficients used for encoding
            bounding boxes.
        bboxes (list of arrays): A list of arrays whose shape is
            :math:`(R_n, 4)`, where :math:`R_n` is the number of
            ground truth bounding boxes.
        labels (list of arrays): A list of arrays whose shape is
            :math:`(R_n,)`.

    Returns:
        tuple of four lists:
        :obj:`rois`, :obj:`roi_indices`, :obj:`gt_locs`, and :obj:`gt_labels`.

        * **rois**: A list of arrays of shape :math:`(R'_l, 4)`, \
            where :math:`R'_l` is the number of RoIs in the :math:`l`-th \
            feature map.
        * **roi_indices**: A list of arrays of shape :math:`(R'_l,)`.
        * **gt_locs**: A list of arrays of shape :math:`(R'_l, 4)` \
            indicating the bounding boxes of ground truth.
        * **gt_labels**: A list of arrays of shape :math:`(R'_l,)` \
            indicating the classes of ground truth.
    """

    thresh = 0.5
    batchsize_per_image = 512
    fg_ratio = 0.25

    xp = cuda.get_array_module(*rois)

    n_level = len(rois)
    # Remember the feature level of every RoI. A list is passed because
    # handing hstack a generator is deprecated in NumPy and rejected by
    # newer releases.
    roi_levels = xp.hstack(
        [xp.array((level,) * len(rois[level])) for level in range(n_level)]
    ).astype(np.int32)
    rois = xp.vstack(rois).astype(np.float32)
    roi_indices = xp.hstack(roi_indices).astype(np.int32)

    rois_yx = (rois[:, 2:] + rois[:, :2]) / 2
    rois_hw = rois[:, 2:] - rois[:, :2]
    indices = np.unique(cuda.to_cpu(roi_indices))

    gt_locs = xp.empty_like(rois)
    gt_labels = xp.empty_like(roi_indices)
    for i in indices:
        mask = roi_indices == i

        if len(bboxes[i]) > 0:
            iou = utils.bbox_iou(rois[mask], bboxes[i])
            gt_index = iou.argmax(axis=1)

            gt_loc = bboxes[i][gt_index].copy()
        else:
            # No ground truth: the values are placeholders, and every
            # RoI of this image is labelled background below.
            gt_loc = xp.empty_like(rois[mask])
        # tlbr -> yxhw
        gt_loc[:, 2:] -= gt_loc[:, :2]
        gt_loc[:, :2] += gt_loc[:, 2:] / 2
        # offset
        gt_loc[:, :2] = (gt_loc[:, :2] - rois_yx[mask]) / \
            rois_hw[mask] / std[0]
        gt_loc[:, 2:] = xp.log(gt_loc[:, 2:] / rois_hw[mask]) / std[1]

        if len(bboxes[i]) > 0:
            # Shift classes by one so that 0 can mean background.
            gt_label = labels[i][gt_index] + 1
            gt_label[iou.max(axis=1) < thresh] = 0
        else:
            gt_label = xp.zeros(int(mask.sum()), dtype=np.int32)

        # Label -1 marks RoIs discarded by the sampling below.
        fg_index = xp.where(gt_label > 0)[0]
        n_fg = int(batchsize_per_image * fg_ratio)
        if len(fg_index) > n_fg:
            gt_label[choice(fg_index, size=len(fg_index) - n_fg)] = -1

        bg_index = xp.where(gt_label == 0)[0]
        n_bg = batchsize_per_image - int((gt_label > 0).sum())
        if len(bg_index) > n_bg:
            gt_label[choice(bg_index, size=len(bg_index) - n_bg)] = -1

        gt_locs[mask] = gt_loc
        gt_labels[mask] = gt_label

    # Drop the discarded RoIs, then split everything back per level.
    mask = gt_labels >= 0
    rois = rois[mask]
    roi_indices = roi_indices[mask]
    roi_levels = roi_levels[mask]
    gt_locs = gt_locs[mask]
    gt_labels = gt_labels[mask]

    masks = [roi_levels == level for level in range(n_level)]
    rois = [rois[m] for m in masks]
    roi_indices = [roi_indices[m] for m in masks]
    gt_locs = [gt_locs[m] for m in masks]
    gt_labels = [gt_labels[m] for m in masks]

    return rois, roi_indices, gt_locs, gt_labels
def iou_linear_assignment(bbox_a, bbox_b):
    """Match boxes of ``bbox_a`` to boxes of ``bbox_b`` using their IoU.

    The negated IoU matrix is handed to ``linear_assignment``
    (presumably a cost-minimising solver, so that total IoU is
    maximised -- verify against its definition).

    Returns:
        tuple: The first and second columns of the assignment result,
        i.e. matched indices into ``bbox_a`` and ``bbox_b``.
    """
    cost = -bbox_iou(bbox_a, bbox_b)
    pairs = linear_assignment(cost)
    idx_a = pairs[:, 0]
    idx_b = pairs[:, 1]
    return idx_a, idx_b