def test_box_conversions(self, num_boxes):
    """Checks that both box-format conversions keep a [num_boxes, 4] shape."""
    random_boxes = tf.convert_to_tensor(np.random.rand(num_boxes, 4))
    expected_shape = np.array([num_boxes, 4])

    # Run both conversions first, then verify each output shape.
    converted = (
        box_ops.yxyx_to_xcycwh(random_boxes),
        box_ops.xcycwh_to_yxyx(random_boxes),
    )
    for result in converted:
        self.assertAllEqual(tf.shape(result).numpy(), expected_shape)
def __call__(self, pred_boxes, pred_classes, boxes, classes, scale=None,
             yxyx=True, clip_thresh=0.0):
    """Runs the tiled search loop and returns gradient-free, masked results.

    Args:
        pred_boxes: predicted boxes; the running box buffer is created with
            `tf.zeros_like` of this tensor.
        pred_classes: predicted class scores. When `self._min_conf` is
            positive they are binarised against that threshold.
        boxes: ground-truth boxes. The box count is read from the
            second-to-last axis and tiles are sliced as
            `[batch, TILE_SIZE, 4]`.
        classes: ground-truth classes, passed through to
            `self._search_body`.
        scale: optional factor multiplied into `boxes` (with its gradient
            stopped).
        yxyx: when True, `boxes` go through `box_ops.yxyx_to_xcycwh` first.
        clip_thresh: results whose max IOU is not strictly above this value
            are zeroed out.

    Returns:
        A tuple `(running_boxes, running_classes, max_iou, mask)`, each
        wrapped in `tf.stop_gradient`. `max_iou` and `mask` have their
        trailing axis squeezed away.
    """
    num_boxes = tf.shape(boxes)[-2]
    # NOTE(review): with this bound the loop condition never admits the tile
    # at index `num_boxes // TILE_SIZE - 1` (assuming the body advances the
    # index by one) -- confirm that this is intended.
    num_tiles = (num_boxes // TILE_SIZE) - 1

    if yxyx:
        boxes = box_ops.yxyx_to_xcycwh(boxes)
    if scale is not None:
        boxes = boxes * tf.stop_gradient(scale)
    if self._min_conf > 0.0:
        pred_classes = tf.cast(pred_classes > self._min_conf,
                               pred_classes.dtype)

    def _loop_cond(unused_pred_box, unused_pred_class, gt_boxes,
                   unused_classes, unused_running_boxes,
                   unused_running_classes, unused_max_iou, tile_idx):
        # Continue while tiles remain and the current tile still has a
        # positive coordinate sum.
        batch_size = tf.shape(gt_boxes)[0]
        tile = tf.slice(gt_boxes,
                        [0, tile_idx * TILE_SIZE, 0],
                        [batch_size, TILE_SIZE, 4])
        tile_has_boxes = tf.math.greater(tf.reduce_sum(tile), 0)
        return tf.logical_and(tile_idx < num_tiles, tile_has_boxes)

    # Loop accumulators: best box, best class and best IOU per prediction.
    running_boxes = tf.zeros_like(pred_boxes)
    running_classes = tf.zeros_like(tf.reduce_sum(running_boxes, axis=-1))
    max_iou = tf.expand_dims(
        tf.zeros_like(tf.reduce_sum(running_boxes, axis=-1)), axis=-1)

    loop_vars = [
        pred_boxes, pred_classes, boxes, classes,
        running_boxes, running_classes, max_iou, tf.constant(0),
    ]
    (_, _, _, _, running_boxes, running_classes, max_iou, _) = tf.while_loop(
        cond=_loop_cond, body=self._search_body, loop_vars=loop_vars)

    # Zero every result whose best IOU does not clear the clip threshold.
    mask = tf.cast(max_iou > clip_thresh, running_boxes.dtype)
    flat_mask = tf.squeeze(mask, axis=-1)
    running_boxes = running_boxes * mask
    running_classes = running_classes * flat_mask
    max_iou = tf.squeeze(max_iou * mask, axis=-1)

    return (tf.stop_gradient(running_boxes),
            tf.stop_gradient(running_classes),
            tf.stop_gradient(max_iou),
            tf.stop_gradient(flat_mask))
def _parse_eval_data(self, data):
    """Builds the evaluation image and its label dictionary.

    Args:
        data: a dict of Tensors produced by the decoder. This method reads
            'image', 'groundtruth_boxes', 'groundtruth_classes',
            'groundtruth_area', 'groundtruth_is_crowd' and 'source_id'.

    Returns:
        image: the image divided by 255 and fitted to `self._image_w` by
            `fit_preserve_aspect_ratio`.
        labels: a dict of Tensors holding the padded labels plus the
            'grid_form' entry produced by `self._build_grid`.
    """

    def _pad(tensor):
        # Every label tensor is padded to the same instance count with 0.
        return yolo_preprocess_ops.pad_max_instances(
            tensor, self._max_num_instances, 0)

    raw_image = data['image']
    shape = tf.shape(raw_image)
    # NOTE(review): `width` is taken from axis 0 and `height` from axis 1 of
    # the image shape; for a height-major image these names would be
    # swapped -- confirm against the decoder's layout.
    width = shape[0]
    height = shape[1]

    image, boxes = yolo_preprocess_ops.fit_preserve_aspect_ratio(
        raw_image / 255,
        data['groundtruth_boxes'],
        width=width,
        height=height,
        target_dim=self._image_w)
    boxes = yolo_box_ops.yxyx_to_xcycwh(boxes)

    # Anchors are matched on the unpadded boxes, before any padding happens.
    best_anchors = yolo_preprocess_ops.get_best_anchor(
        boxes, self._anchors, width=self._image_w, height=self._image_h)

    boxes = _pad(boxes)
    classes = _pad(data['groundtruth_classes'])
    best_anchors = _pad(best_anchors)
    area = _pad(data['groundtruth_area'])
    is_crowd = _pad(tf.cast(data['groundtruth_is_crowd'], tf.int32))

    labels = {
        'source_id': data['source_id'],
        'bbox': tf.cast(boxes, self._dtype),
        'classes': tf.cast(classes, self._dtype),
        'area': tf.cast(area, self._dtype),
        'is_crowd': is_crowd,
        'best_anchors': tf.cast(best_anchors, self._dtype),
        'width': width,
        'height': height,
        'num_detections': tf.shape(data['groundtruth_classes'])[0],
    }

    labels['grid_form'] = self._build_grid(
        labels,
        self._image_w,
        batch=False,
        use_tie_breaker=self._use_tie_breaker)
    return image, labels
def fit_preserve_aspect_ratio(image, boxes, width=None, height=None,
                              target_dim=None):
    """Pads the image to a square, rescales the boxes, then resizes.

    The image is padded to a square whose side is `max(width, height)`.
    The boxes are rescaled and shifted to match the padded canvas, and the
    image is finally resized to `(target_dim, target_dim)`.

    Args:
        image: a `Tensor` representing the image.
        boxes: a `Tensor` of boxes; converted with
            `box_ops.yxyx_to_xcycwh` on entry and back with
            `box_ops.xcycwh_to_yxyx` on exit.
        width: first spatial extent of the image. When either `width` or
            `height` is None both are read from `tf.shape(image)`.
        height: second spatial extent of the image.
        target_dim: side length of the square output image. Defaults to the
            padded square's side.

    Returns:
        image: a `Tensor` representing the padded and resized image.
        boxes: a `Tensor` representing the adjusted boxes.
    """
    if width is None or height is None:
        dims = tf.shape(image)
        has_batch_axis = tf.shape(dims)[0] == 4
        if has_batch_axis:
            # Rank-4 input: skip the leading axis.
            width = dims[1]
            height = dims[2]
        else:
            width = dims[0]
            height = dims[1]

    side = tf.math.maximum(width, height)
    if target_dim is None:
        target_dim = side

    pad_width = side - width
    pad_height = side - height
    image = tf.image.pad_to_bounding_box(
        image, pad_width // 2, pad_height // 2, side, side)

    # Relative size of the original extents within the padded square, and
    # the relative offset introduced by centring the image.
    width_ratio = tf.cast(width / side, tf.float32)
    height_ratio = tf.cast(height / side, tf.float32)
    width_shift = tf.cast((pad_width / side) / 2, tf.float32)
    height_shift = tf.cast((pad_height / side) / 2, tf.float32)

    x, y, w, h = tf.split(box_ops.yxyx_to_xcycwh(boxes), 4, axis=-1)
    y = y * width_ratio + width_shift
    x = x * height_ratio + height_shift
    h = h * width_ratio
    w = w * height_ratio
    boxes = box_ops.xcycwh_to_yxyx(tf.concat([x, y, w, h], axis=-1))

    image = tf.image.resize(image, (target_dim, target_dim))
    return image, boxes
def __call__(self, boxes, classes, width, height):
    """Builds the labels for a single image, not functional in batch mode.

    Args:
        boxes: `Tensor` of shape [None, 4] indicating the object locations
            in an image.
        classes: `Tensor` of shape [None] indicating each object's class.
        width: `int` for the image width.
        height: `int` for the image height.

    Returns:
        indexes: dict keyed by the entries of `self.keys`, holding the
            first value returned by `self.build_label_per_path`.
        updates: dict keyed the same way, holding the second returned
            value.
        true_grids: dict keyed the same way, holding the third returned
            value.
    """
    boxes = box_ops.yxyx_to_xcycwh(boxes)

    iou_index = None
    search_all_levels = (not self.best_matches_only and
                         self.anchor_free_level_limits is None)
    if search_all_levels:
        # Stitch the anchors of every level into one flat list so boxes are
        # matched across all levels at once.
        stitched_anchors = [
            anchor for level in self.anchors for anchor in self.anchors[level]
        ]
        stride = tf.cast([width, height], boxes.dtype)

        # Find the matching anchors for each box.
        iou_index, _ = get_best_anchor(
            boxes,
            stitched_anchors,
            stride,
            width=1.0,
            height=1.0,
            best_match_only=False,
            use_tie_breaker=self.use_tie_breaker,
            iou_thresh=self.match_threshold)

    indexes, updates, true_grids = {}, {}, {}
    for key in self.keys:
        per_path = self.build_label_per_path(
            key, boxes, classes, width, height, iou_index=iou_index)
        indexes[key], updates[key], true_grids[key] = per_path
    return indexes, updates, true_grids
def _tiled_global_box_search(self, pred_boxes, pred_classes, boxes, classes,
                             true_conf, smoothed, scale=None):
    """Matches ground truths to predictions and derives the objectness maps.

    Args:
        pred_boxes: predicted boxes handed to `self._search_pairs`.
        pred_classes: predicted classes handed to `self._search_pairs`.
        boxes: ground-truth boxes, converted with `box_ops.yxyx_to_xcycwh`.
        classes: ground-truth classes.
        true_conf: the ground-truth confidence map.
        smoothed: when truthy, `true_conf` is replaced by a smoothed IOU
            wherever the max IOU exceeds `self._ignore_thresh`.
        scale: optional factor multiplied into the converted boxes (with
            its gradient stopped).

    Returns:
        A `(true_conf, obj_mask)` pair. Both are wrapped in
        `tf.stop_gradient`, unless the search yields no IOU, in which case
        `true_conf` is returned untouched alongside a mask of ones.
    """
    gt_boxes = box_ops.yxyx_to_xcycwh(boxes)
    if scale is not None:
        gt_boxes = gt_boxes * tf.cast(tf.stop_gradient(scale), gt_boxes.dtype)

    # Search all predictions against the ground truths; only the max IOU
    # per location is needed here.
    _, _, iou_max, _ = self._search_pairs(pred_boxes, pred_classes, gt_boxes,
                                          classes)
    if iou_max is None:
        return true_conf, tf.ones_like(true_conf)

    if smoothed:
        # Replace entries of the true confidence map with the (smoothed)
        # max IOU predicted within that cell.
        obj_mask = tf.ones_like(true_conf)
        smooth_iou = ((1 - self._objectness_smooth) +
                      self._objectness_smooth * iou_max)
        smooth_iou = tf.where(iou_max > 0, smooth_iou,
                              tf.zeros_like(smooth_iou))
        above_thresh = iou_max > self._ignore_thresh
        true_conf = tf.where(above_thresh, smooth_iou, true_conf)
    else:
        # Ignore locations where no box was expected but a box with a high
        # IOU was predicted anyway.
        ignore_mask = tf.cast(iou_max < self._ignore_thresh, pred_boxes.dtype)
        obj_mask = true_conf + (1 - true_conf) * ignore_mask

    # Stop gradient so the while loop is not tracked.
    return tf.stop_gradient(true_conf), tf.stop_gradient(obj_mask)
def _parse_eval_data(self, data):
    """Parses one decoded example for evaluation.

    All transformations operate on boxes in (ymin, xmin, ymax, xmax)
    format, which the box helpers used here require. Images are expected
    to be in RGB format.

    Args:
        data: dict providing 'image', 'boxes' and 'classes'.

    Returns:
        image: the resized, normalised image cast to `self._dtype`.
        targets: dict with the 'labels' and 'bboxes' produced by
            `yolo_ops.preprocess_true_boxes`.
    """
    image = data['image']
    boxes = data['boxes']
    target_size = self._input_size[:2]

    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        target_size,
        target_size,
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max,
        preserve_aspect_ratio=self._preserve_aspect_ratio)
    # Apply the same resize/crop to the boxes, using rows 2, 1 and 3 of
    # `image_info`.
    boxes = preprocess_ops.resize_and_crop_boxes(
        boxes, image_info[2, :], image_info[1, :], image_info[3, :])

    image = preprocess_ops.normalize_image(
        image, offset=MEAN_RGB, scale=STDDEV_RGB)
    image = tf.cast(image, dtype=self._dtype)

    upper_bound = self._input_size[0] - 1
    boxes = tf.clip_by_value(boxes, 0, upper_bound)

    # Append the class id as a fifth column next to the xcycwh boxes.
    class_column = data['classes'][:, tf.newaxis]
    bbox_labels = tf.concat(
        [yolo_box_ops.yxyx_to_xcycwh(boxes), class_column], axis=-1)

    labels, bbox_labels = yolo_ops.preprocess_true_boxes(
        bboxes=bbox_labels,
        train_output_sizes=self.train_output_sizes,
        anchor_per_scale=self.anchor_per_scale,
        num_classes=self.num_classes,
        max_bbox_per_scale=self.max_bbox_per_scale,
        strides=self.strides,
        anchors=self.anchors)

    return image, {'labels': labels, 'bboxes': bbox_labels}
def _parse_train_data(self, data):
    """Parses and augments one decoded example for training.

    All augmentations and transformations operate on boxes in
    (ymin, xmin, ymax, xmax) format, which the box helpers used here
    require. Images are expected to be in RGB format.

    Args:
        data: dict providing 'image', 'boxes' and 'classes'.

    Returns:
        image: the augmented, normalised image cast to `self._dtype`.
        targets: dict with the 'labels' and 'bboxes' produced by
            `yolo_ops.preprocess_true_boxes`, plus 'raw_bboxes': the
            clipped yxyx boxes truncated or zero-padded along the box axis
            to exactly `MAX_DISPLAY_BBOX` rows.
    """
    image, boxes = data['image'], data['boxes']

    # Execute RandAugment first as some ops require uint8 colors.
    if self._augmenter is not None:
        image = self._augmenter.distort(image)

    if self._aug_rand_hflip:
        image, boxes = yolo_ops.random_horizontal_flip(image, boxes)

    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._input_size[:2],
        self._input_size[:2],
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max,
        preserve_aspect_ratio=self._preserve_aspect_ratio)
    boxes = preprocess_ops.resize_and_crop_boxes(
        boxes, image_info[2, :], image_info[1, :], image_info[3, :])

    if self._aug_jitter_im != 0.0:
        image, boxes = yolo_ops.random_translate(image, boxes,
                                                 self._aug_jitter_im)

    if self._aug_jitter_boxes != 0.0:
        boxes = box_ops.jitter_boxes(boxes, self._aug_jitter_boxes)

    image = preprocess_ops.normalize_image(
        image, offset=MEAN_RGB, scale=STDDEV_RGB)
    image = tf.cast(image, dtype=self._dtype)

    boxes = tf.clip_by_value(boxes, 0, self._input_size[0] - 1)
    bbox_labels = yolo_box_ops.yxyx_to_xcycwh(boxes)
    # Append the class id as a fifth column next to the xcycwh boxes.
    bbox_labels = tf.concat(
        [bbox_labels, data['classes'][:, tf.newaxis]], axis=-1)

    labels, bbox_labels = yolo_ops.preprocess_true_boxes(
        bboxes=bbox_labels,
        train_output_sizes=self.train_output_sizes,
        anchor_per_scale=self.anchor_per_scale,
        num_classes=self.num_classes,
        max_bbox_per_scale=self.max_bbox_per_scale,
        strides=self.strides,
        anchors=self.anchors)

    # TODO: Figure out why we need to fix the num BBOX if not there will be
    # an error
    # https://github.com/whizzmobility/models/pull/61
    # pad / limit to MAX_DISPLAY_BBOX boxes for constant size
    raw_bboxes = boxes
    num_bboxes = tf.shape(raw_bboxes)[0]
    if num_bboxes > MAX_DISPLAY_BBOX:
        # Truncate along the box axis (axis 0), the same axis that is
        # counted above and padded below. Slicing the coordinate axis
        # instead would leave the number of boxes unchanged.
        raw_bboxes = raw_bboxes[:MAX_DISPLAY_BBOX]
    else:
        # Pad only at the end of axis 0; the coordinate axis is untouched.
        paddings = tf.stack([0, MAX_DISPLAY_BBOX - num_bboxes], axis=-1)
        paddings = tf.stack([paddings, [0, 0]], axis=0)
        raw_bboxes = tf.pad(raw_bboxes, paddings)

    targets = {
        'labels': labels,
        'bboxes': bbox_labels,
        'raw_bboxes': raw_bboxes
    }

    return image, targets
def _parse_train_data(self, data):
    """Builds the augmented training image and its label dictionary.

    Args:
        data: a dict of Tensors produced by the decoder. This method reads
            'image', 'groundtruth_boxes', 'groundtruth_classes',
            'groundtruth_area', 'groundtruth_is_crowd' and 'source_id'.

    Returns:
        image: the augmented image tensor, clipped to [0.0, 1.0].
        labels: a dict of Tensors holding the fixed-size labels; it also
            carries 'grid_form' when `self._fixed_size` is set.
    """
    raw_image = data['image']
    shape = tf.shape(raw_image)
    # NOTE(review): `width` is taken from axis 0 and `height` from axis 1 of
    # the image shape; for a height-major image these names would be
    # swapped -- confirm against the decoder's layout.
    width = shape[0]
    height = shape[1]

    image, boxes = yolo_preprocess_ops.fit_preserve_aspect_ratio(
        raw_image / 255,
        data['groundtruth_boxes'],
        width=width,
        height=height,
        target_dim=self._max_process_size)
    image_shape = tf.shape(image)[:2]

    if self._random_flip:
        image, boxes, _ = preprocess_ops.random_horizontal_flip(
            image, boxes, seed=self._seed)

    randscale = self._image_w // self._net_down_scale
    if not self._fixed_size:
        coin = tf.random.uniform([], minval=0, maxval=1, seed=self._seed)
        do_scale = tf.greater(coin, 0.5)
        if do_scale:
            # Pick a random multiple of net_down_scale between the minimum
            # and maximum process sizes (320 to 608 per the original note).
            min_steps = self._min_process_size // self._net_down_scale
            max_steps = self._max_process_size // self._net_down_scale
            randscale = tf.random.uniform(
                [],
                minval=min_steps,
                maxval=max_steps,
                seed=self._seed,
                dtype=tf.int32) * self._net_down_scale

    if self._jitter_boxes != 0.0:
        # NOTE(review): the jitter amount is the literal 0.025, not
        # `self._jitter_boxes` -- confirm this is intended.
        boxes = box_ops.denormalize_boxes(boxes, image_shape)
        boxes = box_ops.jitter_boxes(boxes, 0.025)
        boxes = box_ops.normalize_boxes(boxes, image_shape)

    # YOLO loss function uses x-center, y-center format.
    boxes = yolo_box_ops.yxyx_to_xcycwh(boxes)

    if self._jitter_im != 0.0:
        image, boxes = yolo_preprocess_ops.random_translate(
            image, boxes, self._jitter_im, seed=self._seed)

    if self._aug_rand_zoom:
        image, boxes = yolo_preprocess_ops.resize_crop_filter(
            image,
            boxes,
            default_width=self._image_w,
            default_height=self._image_h,
            target_width=randscale,
            target_height=randscale)

    # NOTE(review): the output size is the literal (416, 416) rather than
    # the configured image size -- confirm this is intended.
    image = tf.image.resize(image, (416, 416), preserve_aspect_ratio=False)

    # Colour augmentations: brightness, saturation, then hue.
    if self._aug_rand_brightness:
        image = tf.image.random_brightness(image=image, max_delta=.1)
    if self._aug_rand_saturation:
        image = tf.image.random_saturation(
            image=image, lower=0.75, upper=1.25)
    if self._aug_rand_hue:
        image = tf.image.random_hue(image=image, max_delta=.3)
    image = tf.clip_by_value(image, 0.0, 1.0)

    # Find the best anchor for the ground truth labels to maximize the iou.
    # This runs on the boxes before they are padded.
    best_anchors = yolo_preprocess_ops.get_best_anchor(
        boxes, self._anchors, width=self._image_w, height=self._image_h)

    # Bring every label tensor to `self._max_num_instances` entries.
    # Classes are padded with -1, everything else with 0.
    max_instances = self._max_num_instances
    boxes = preprocess_ops.clip_or_pad_to_fixed_size(boxes, max_instances, 0)
    classes = preprocess_ops.clip_or_pad_to_fixed_size(
        data['groundtruth_classes'], max_instances, -1)
    best_anchors = preprocess_ops.clip_or_pad_to_fixed_size(
        best_anchors, max_instances, 0)
    area = preprocess_ops.clip_or_pad_to_fixed_size(
        data['groundtruth_area'], max_instances, 0)
    is_crowd = preprocess_ops.clip_or_pad_to_fixed_size(
        tf.cast(data['groundtruth_is_crowd'], tf.int32), max_instances, 0)

    labels = {
        'source_id': data['source_id'],
        'bbox': tf.cast(boxes, self._dtype),
        'classes': tf.cast(classes, self._dtype),
        'area': tf.cast(area, self._dtype),
        'is_crowd': is_crowd,
        'best_anchors': tf.cast(best_anchors, self._dtype),
        'width': width,
        'height': height,
        'num_detections': tf.shape(data['groundtruth_classes'])[0],
    }

    if self._fixed_size:
        labels['grid_form'] = self._build_grid(
            labels, self._image_w, use_tie_breaker=self._use_tie_breaker)
    return image, labels
def testYoloPreprocessTrueBoxes(self):
    """Checks `preprocess_true_boxes` output on a fixed set of 19 boxes."""
    # Each row: x1, y1, x2, y2, class.
    annotations = tf.constant([
        [40, 79, 109, 144, 74],
        [174, 242, 187, 269, 24],
        [341, 265, 357, 291, 26],
        [261, 220, 300, 362, 0],
        [217, 228, 252, 338, 0],
        [202, 228, 219, 274, 0],
        [135, 232, 153, 278, 0],
        [94, 229, 124, 306, 0],
        [44, 232, 74, 321, 0],
        [191, 238, 196, 255, 24],
        [117, 86, 122, 157, 74],
        [180, 224, 193, 285, 0],
        [375, 226, 415, 326, 0],
        [245, 222, 274, 317, 0],
        [317, 228, 352, 334, 0],
        [369, 226, 389, 263, 0],
        [135, 225, 180, 355, 0],
        [171, 229, 185, 311, 0],
        [0, 216, 415, 363, 0],
    ])
    classes = annotations[:, 4]

    # Reorder the corner columns into yxyx before converting to xcycwh.
    yxyx = tf.stack(
        [
            annotations[:, 1],
            annotations[:, 0],
            annotations[:, 3],
            annotations[:, 2],
        ],
        axis=-1)
    xcycwh = box_ops.yxyx_to_xcycwh(tf.cast(yxyx, tf.float32))
    inputs = tf.concat(
        [xcycwh, tf.cast(classes[:, tf.newaxis], tf.float32)], axis=-1)

    # preprocess_true_boxes only takes xywh boxes.
    target_labels, target_bboxes = yolo_ops.preprocess_true_boxes(
        bboxes=inputs,
        train_output_sizes=tf.constant([52, 26, 13]),
        anchor_per_scale=3,
        num_classes=80,
        max_bbox_per_scale=150,
        strides=tf.constant([8, 16, 32]),
        anchors=tf.constant([
            [[12, 16], [19, 36], [40, 28]],
            [[36, 75], [76, 55], [72, 146]],
            [[142, 110], [192, 243], [459, 401]],
        ]))

    # Expected entries above 0.5 in the first-scale label grid, six values
    # per box, flattened to match the boolean-mask output.
    groundtruth_label_small_bbox = np.array([
        [74.5, 111.5, 69., 65., 1., 1.],
        [119.5, 121.5, 5., 71., 1., 1.],
        [193.5, 246.5, 5., 17., 1., 1.],
        [379., 244.5, 20., 37., 1., 1.],
        [144., 255., 18., 46., 1., 1.],
        [180.5, 255.5, 13., 27., 1., 1.],
        [186.5, 254.5, 13., 61., 1., 1.],
        [210.5, 251., 17., 46., 1., 1.],
        [109., 267.5, 30., 77., 1., 1.],
        [178., 270., 14., 82., 1., 1.],
        [259.5, 269.5, 29., 95., 1., 1.],
        [59., 276.5, 30., 89., 1., 1.],
        [349., 278., 16., 26., 1., 1.],
        [395., 276., 40., 100., 1., 1.],
        [234.5, 283., 35., 110., 1., 1.],
        [334.5, 281., 35., 106., 1., 1.],
        [157.5, 290., 45., 130., 1., 1.],
        [207.5, 289.5, 415., 147., 1., 1.],
        [280.5, 291., 39., 142., 1., 1.],
    ]).flatten()

    # Expected entries above 0.5 in the first-scale box list, four values
    # per box in input order, flattened.
    groundtruth_small_bbox = np.array([
        [74.5, 111.5, 69., 65.],
        [180.5, 255.5, 13., 27.],
        [349., 278., 16., 26.],
        [280.5, 291., 39., 142.],
        [234.5, 283., 35., 110.],
        [210.5, 251., 17., 46.],
        [144., 255., 18., 46.],
        [109., 267.5, 30., 77.],
        [59., 276.5, 30., 89.],
        [193.5, 246.5, 5., 17.],
        [119.5, 121.5, 5., 71.],
        [186.5, 254.5, 13., 61.],
        [395., 276., 40., 100.],
        [259.5, 269.5, 29., 95.],
        [334.5, 281., 35., 106.],
        [379., 244.5, 20., 37.],
        [157.5, 290., 45., 130.],
        [178., 270., 14., 82.],
        [207.5, 289.5, 415., 147.],
    ]).flatten()

    small_labels = target_labels[0]
    small_bboxes = target_bboxes[0]
    self.assertAllClose(
        tf.boolean_mask(small_labels, tf.greater(small_labels, 0.5)),
        groundtruth_label_small_bbox)
    self.assertAllClose(
        tf.boolean_mask(small_bboxes, tf.greater(small_bboxes, 0.5)),
        groundtruth_small_bbox)
    self.assertAllEqual(small_labels.shape, np.array([52, 52, 3, 85]))
    self.assertAllEqual(small_bboxes.shape, np.array([150, 4]))

    # The two coarser scales are expected to be entirely zero.
    for level, grid_size in ((1, 26), (2, 13)):
        self.assertAllEqual(target_labels[level],
                            tf.zeros([grid_size, grid_size, 3, 85]))
        self.assertAllEqual(target_bboxes[level], tf.zeros([150, 4]))