def testAffineWarpBoxes(self, affine, num_boxes):
  """Checks that `affine_warp_boxes` yields a `[num_boxes, 4]` tensor.

  Args:
    affine: indexable test parameter. `affine[0]` is handed to
      `denormalize_boxes`; `affine[1]` and `affine[2]` (cast to double) are
      handed to `affine_warp_boxes`.
    num_boxes: number of random boxes to generate.
  """
  # Only the output shape is asserted, so unseeded random boxes are enough.
  random_boxes = tf.convert_to_tensor(np.random.rand(num_boxes, 4))
  scaled_boxes = bbox_ops.denormalize_boxes(random_boxes, affine[0])
  affine_double = tf.cast(affine[2], tf.double)
  warped_boxes, _ = preprocessing_ops.affine_warp_boxes(
      affine_double, scaled_boxes, affine[1], box_history=scaled_boxes)
  self.assertAllEqual([num_boxes, 4], tf.shape(warped_boxes).numpy())
def reverse_input_box_transformation(boxes, image_info):
  """Reverses the Mask R-CNN model's input boxes transformation.

  This undoes the sequence applied in `Detection_module.serve` when
  `output_normalized_coordinates=true`: boxes are multiplied by the
  per-image [y_scale, x_scale] (tiled to four coordinates) and then
  denormalized by [original_height, original_width].

  Args:
    boxes: A [batch_size, num_boxes, 4] float tensor of boxes in normalized
      coordinates.
    image_info: a `Tensor` that encodes the information of the image and the
      applied preprocessing. It is indexed here with three axes
      (`image_info[:, k:k+1, :]`), i.e. one entry per batch element of the
      form [[original_height, original_width],
      [desired_height, desired_width], [y_scale, x_scale],
      [y_offset, x_offset]], where [desired_height, desired_width] is the
      actual scaled image size, and [y_scale, x_scale] is the scaling factor,
      which is the ratio of scaled dimension / original dimension.

  Returns:
    boxes: Same shape as input `boxes` but in the absolute coordinate space of
      the preprocessed image.
  """
  original_hw = image_info[:, 0:1, :]
  # Repeat [y_scale, x_scale] so it lines up with the four box coordinates.
  yx_scale = tf.tile(image_info[:, 2:3, :], [1, 1, 2])
  return box_ops.denormalize_boxes(boxes * yx_scale, original_hw)
def _mosaic_crop_image(self, image, boxes, classes, is_crowd, area):
  """Process a patched image in preparation for final output.

  Unless `self._mosaic_crop_mode` is 'crop', the stitched image is first
  randomly translated and the boxes are shifted, clipped and filtered to
  match. The image is then affine-warped, resized to `self._output_size`, and
  the boxes are transformed and clipped with the resulting affine.

  Args:
    image: the stitched image tensor.
    boxes: boxes for `image`; denormalized by the image shape below, so
      presumably given in normalized coordinates.
    classes: per-box class tensor, filtered alongside `boxes`.
    is_crowd: per-box crowd flags, filtered alongside `boxes`.
    area: per-box areas, filtered alongside `boxes`.

  Returns:
    A 6-tuple `(image, boxes, classes, is_crowd, area, area)`.
    NOTE(review): `area` appears twice in the returned tuple; confirm against
    the callers what the sixth element is expected to be.
  """
  if self._mosaic_crop_mode != 'crop':
    shape = tf.cast(preprocessing_ops.get_image_shape(image), tf.float32)
    center = shape * self._mosaic_center

    # Shift the center of the image by applying a translation to the whole
    # image. The shift is drawn in [-center, center] per axis and rounded.
    ch = tf.math.round(
        preprocessing_ops.random_uniform_strong(
            -center[0], center[0], seed=self._seed))
    cw = tf.math.round(
        preprocessing_ops.random_uniform_strong(
            -center[1], center[1], seed=self._seed))

    # Translate the image, move the boxes by the same shift (in pixels), and
    # clip the boxes to those within the image.
    image = tfa.image.translate(image, [cw, ch], fill_value=self._pad_value)
    boxes = box_ops.denormalize_boxes(boxes, shape[:2])
    boxes = boxes + tf.cast([ch, cw, ch, cw], boxes.dtype)
    boxes = box_ops.clip_boxes(boxes, shape[:2])
    inds = box_ops.get_non_empty_box_indices(boxes)

    # Back to normalized coordinates, then keep only the non-empty boxes and
    # their matching labels.
    boxes = box_ops.normalize_boxes(boxes, shape[:2])
    boxes, classes, is_crowd, area = self._select_ind(
        inds,
        boxes,
        classes,  # pylint:disable=unbalanced-tuple-unpacking
        is_crowd,
        area)

  # Warp and scale the fully stitched sample.
  image, _, affine = preprocessing_ops.affine_warp_image(
      image, [self._output_size[0], self._output_size[1]],
      scale_min=self._aug_scale_min,
      scale_max=self._aug_scale_max,
      translate=self._aug_rand_translate,
      degrees=self._aug_rand_angle,
      perspective=self._aug_rand_perspective,
      random_pad=self._random_pad,
      seed=self._seed)
  height, width = self._output_size[0], self._output_size[1]
  image = tf.image.resize(image, (height, width))

  # Clip and clean boxes using the affine produced by the warp above.
  boxes, inds = preprocessing_ops.transform_and_clip_boxes(
      boxes,
      None,
      affine=affine,
      area_thresh=self._area_thresh,
      seed=self._seed)
  classes, is_crowd, area = self._select_ind(inds, classes, is_crowd, area)  # pylint:disable=unbalanced-tuple-unpacking
  return image, boxes, classes, is_crowd, area, area
def _parse_eval_data(self, data):
  """Builds the evaluation image and label dictionary for one example.

  The image is normalized, resized with the last entry of
  `self._resize_scales` as the short side, and padded to
  `self._output_size`. Boxes follow the same resize, are re-normalized by
  `image_info[1, :]`, filtered to the non-empty ones and converted with
  `yxyx_to_cycxhw`.

  Args:
    data: dict providing 'image', 'groundtruth_boxes',
      'groundtruth_classes', 'groundtruth_is_crowd' and 'source_id'.

  Returns:
    A tuple `(image, labels)`. `labels` holds 'classes', 'boxes', 'id',
    'image_info', 'is_crowd' and 'gt_boxes'; every per-box entry is clipped
    or padded to `self._max_num_boxes`. 'gt_boxes' are the pixel-coordinate
    boxes computed before resizing and are not filtered.
  """
  max_boxes = self._max_num_boxes
  pad_to_fixed = preprocess_ops.clip_or_pad_to_fixed_size

  image = preprocess_ops.normalize_image(data['image'])

  # Pixel-coordinate boxes w.r.t. the input image; reported as `gt_boxes`.
  gt_boxes = box_ops.denormalize_boxes(
      data['groundtruth_boxes'], tf.shape(image)[:2])

  # Evaluation always uses the last configured scale as the short side.
  short_side = tf.constant([self._resize_scales[-1]], tf.float32)[0]
  image, image_info = preprocess_ops.resize_image(
      image, short_side, max(self._output_size))

  resized_boxes = preprocess_ops.resize_and_crop_boxes(
      gt_boxes, image_info[2, :], image_info[1, :], image_info[3, :])
  resized_boxes = box_ops.normalize_boxes(resized_boxes, image_info[1, :])

  # Drop ground truth boxes that are all zeros, together with their labels.
  keep = box_ops.get_non_empty_box_indices(resized_boxes)
  kept_boxes = box_ops.yxyx_to_cycxhw(tf.gather(resized_boxes, keep))
  kept_classes = tf.gather(data['groundtruth_classes'], keep)
  kept_is_crowd = tf.gather(data['groundtruth_is_crowd'], keep)

  image = tf.image.pad_to_bounding_box(
      image, 0, 0, self._output_size[0], self._output_size[1])

  labels = {
      'classes': pad_to_fixed(kept_classes, max_boxes),
      'boxes': pad_to_fixed(kept_boxes, max_boxes),
      'id': int(data['source_id']),
      'image_info': image_info,
      'is_crowd': pad_to_fixed(kept_is_crowd, max_boxes),
      'gt_boxes': pad_to_fixed(gt_boxes, max_boxes),
  }
  return image, labels
def _parse_eval_data(self, data):
  """Produces the image and labels consumed by model evaluation.

  No random augmentation is applied: the image is resized and cropped with
  both scale bounds fixed to 1.0, boxes are mapped into the resized frame,
  and empty boxes are dropped before the labels are built.

  Args:
    data: the decoded tensor dictionary from TfExampleDecoder.

  Returns:
    images: the image tensor, optionally reordered to BGR, normalized with
      `self._channel_means` / `self._channel_stds` and cast to `self._dtype`.
    labels: a dict of Tensors that contains labels, as returned by
      `self._build_label`.
  """
  image = tf.cast(data['image'], dtype=tf.float32)
  original_hw = tf.shape(input=image)[0:2]

  # Normalized -> pixel coordinates of the original image.
  pixel_boxes = box_ops.denormalize_boxes(
      data['groundtruth_boxes'], original_hw)

  image, image_info = preprocess_ops.resize_and_crop_image(
      image,
      [self._output_height, self._output_width],
      padded_size=[self._output_height, self._output_width],
      aug_scale_min=1.0,
      aug_scale_max=1.0)
  unpad_image_shape = tf.cast(tf.shape(image), tf.float32)

  # Apply the same scale / offset to the boxes.
  resized_boxes = preprocess_ops.resize_and_crop_boxes(
      pixel_boxes, image_info[2, :], image_info[1, :], image_info[3, :])

  # Drop ground truth boxes that are all zeros.
  keep = box_ops.get_non_empty_box_indices(resized_boxes)
  labels = self._build_label(
      unpad_image_shape=unpad_image_shape,
      boxes=tf.gather(resized_boxes, keep),
      classes=tf.gather(data['groundtruth_classes'], keep),
      image_info=image_info,
      data=data)

  if self._bgr_ordering:
    # Reverse the channel order: RGB -> BGR.
    channels = tf.unstack(image, num=3, axis=2)
    image = tf.stack(channels[::-1], axis=2)

  image = preprocess_ops.normalize_image(
      image=image, offset=self._channel_means, scale=self._channel_stds)
  return tf.cast(image, self._dtype), labels
def testBoxCandidates(self, output_size, boxes):
  """Checks that `boxes_candidates` keeps all three boxes.

  The same boxes are passed as both box arguments, with `ar_thr=1e32`,
  `wh_thr=0` and a zero `area_thr`, and the returned indices are expected to
  be exactly `[0, 1, 2]`.

  Args:
    output_size: size handed to `denormalize_boxes`.
    boxes: boxes to denormalize; the assertions require three of them.
  """
  pixel_boxes = tf.cast(
      bbox_ops.denormalize_boxes(boxes, output_size), tf.double)
  zero_area = tf.cast(0, tf.double)
  kept_indices = preprocessing_ops.boxes_candidates(
      pixel_boxes, pixel_boxes, ar_thr=1e32, wh_thr=0, area_thr=zero_area)

  self.assertAllEqual([3], tf.shape(kept_indices).numpy())
  self.assertAllEqual([0, 1, 2], kept_indices.numpy())
def _parse_single_example(self, example):
  """Parses a single serialized tf.Example proto.

  Args:
    example: a serialized tf.Example proto string.

  Returns:
    A dictionary of groundtruth with the following fields:
      source_id: a scalar tensor of int64 representing the image source_id.
        A string source_id from the decoder is converted with
        `tf.strings.to_number`.
      height: a scalar tensor of int64 representing the image height.
      width: a scalar tensor of int64 representing the image width.
      num_detections: the size of the first dimension of the decoded
        `groundtruth_classes`.
      boxes: a float tensor of shape [K, 4], representing the groundtruth
        boxes in absolute coordinates with respect to the original image size.
      classes: a int64 tensor of shape [K], representing the class labels of
        each instances.
      is_crowds: a bool tensor of shape [K], indicating whether the instance
        is crowd.
      areas: a float tensor of shape [K], indicating the area of each
        instance.
      masks: a string tensor of shape [K], containing the bytes of the png
        mask of each instance. Only present when `self._include_mask` is set.
  """
  decoder = tf_example_decoder.TfExampleDecoder(
      include_mask=self._include_mask,
      regenerate_source_id=self._regenerate_source_id)
  decoded_tensors = decoder.decode(example)

  # Boxes come out of the decoder normalized; scale them by the decoded image
  # size to get absolute coordinates.
  image = decoded_tensors['image']
  image_size = tf.shape(image)[0:2]
  boxes = box_ops.denormalize_boxes(decoded_tensors['groundtruth_boxes'],
                                    image_size)

  source_id = decoded_tensors['source_id']
  # Compare dtypes by value rather than by object identity, so the check does
  # not depend on `DType` instances being interned.
  if source_id.dtype == tf.string:
    source_id = tf.strings.to_number(source_id, out_type=tf.int64)

  groundtruths = {
      'source_id': source_id,
      'height': decoded_tensors['height'],
      'width': decoded_tensors['width'],
      'num_detections': tf.shape(decoded_tensors['groundtruth_classes'])[0],
      'boxes': boxes,
      'classes': decoded_tensors['groundtruth_classes'],
      'is_crowds': decoded_tensors['groundtruth_is_crowd'],
      'areas': decoded_tensors['groundtruth_area'],
  }
  if self._include_mask:
    groundtruths.update({
        'masks': decoded_tensors['groundtruth_instance_masks_png'],
    })
  return groundtruths
def scale_boxes(self, patch, ishape, boxes, classes, xs, ys):
  """Scale and translate the boxes for each image prior to patching.

  Boxes are denormalized by the patch height/width, shifted by
  `(ishape - patch_shape)` scaled by `ys` (rows) and `xs` (columns), and then
  normalized again by the first two entries of `ishape`.

  Args:
    patch: tensor whose `tf.shape` gives the patch shape.
    ishape: target shape; cast to the dtype of `boxes`.
    boxes: boxes to move; presumably normalized yxyx w.r.t. `patch` — confirm
      against the caller.
    classes: returned unchanged.
    xs: multiplier applied to the column translation.
    ys: multiplier applied to the row translation.

  Returns:
    A tuple `(boxes, classes)` with the re-normalized boxes.
  """
  box_dtype = boxes.dtype
  x_factor = tf.cast(xs, box_dtype)
  y_factor = tf.cast(ys, box_dtype)
  patch_shape = tf.cast(tf.shape(patch), box_dtype)
  target_shape = tf.cast(ishape, box_dtype)
  slack = tf.cast((target_shape - patch_shape), box_dtype)

  pixel_boxes = box_ops.denormalize_boxes(boxes, patch_shape[:2])
  shift_y = slack[0] * y_factor
  shift_x = slack[1] * x_factor
  pixel_boxes = pixel_boxes + tf.cast(
      [shift_y, shift_x, shift_y, shift_x], pixel_boxes.dtype)
  return box_ops.normalize_boxes(pixel_boxes, target_shape[:2]), classes
def _build_label(self, boxes, classes, image_info, unpad_image_shape, data):
  """Assembles the label dictionary, including evaluation groundtruths.

  Args:
    boxes: preprocessed boxes; clipped or padded (with -1) to
      `self._max_num_instances`.
    classes: preprocessed classes; clipped or padded (with -1) to
      `self._max_num_instances`.
    image_info: stored in the labels unchanged.
    unpad_image_shape: stored in the labels under 'unpad_image_shapes'.
    data: decoded example dict; the groundtruth entries are read from it and
      its boxes are denormalized by the shape of `data['image']`.

  Returns:
    A dict with 'boxes', 'classes', 'image_info', 'unpad_image_shapes' and
    'groundtruths'.
  """
  max_instances = self._max_num_instances
  original_hw = tf.shape(input=data['image'])[0:2]

  # Groundtruth data for evaluation, taken from the raw decoded example.
  groundtruths = utils.pad_groundtruths_to_fixed_size(
      {
          'source_id': utils.process_source_id(data['source_id']),
          'height': data['height'],
          'width': data['width'],
          'num_detections': tf.shape(data['groundtruth_classes'])[0],
          'boxes': box_ops.denormalize_boxes(
              data['groundtruth_boxes'], original_hw),
          'classes': data['groundtruth_classes'],
          'areas': data['groundtruth_area'],
          'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
      },
      max_instances)

  return {
      'boxes': preprocess_ops.clip_or_pad_to_fixed_size(
          boxes, max_instances, -1),
      'classes': preprocess_ops.clip_or_pad_to_fixed_size(
          classes, max_instances, -1),
      'image_info': image_info,
      'unpad_image_shapes': unpad_image_shape,
      'groundtruths': groundtruths,
  }
def _reorg_boxes(self, boxes, info, num_detections):
  """Scale and Clean boxes prior to Evaluation.

  Boxes are denormalized and clipped by `info[:, 1, :]`, shifted by
  `info[:, 3, :]`, divided by `info[:, 2, :]` and clipped by `info[:, 0, :]`.
  Entries past `num_detections` along axis 1 are set to -1.

  Args:
    boxes: boxes with the detection axis at position 1.
    info: per-image info tensor indexed as `info[:, k, :]` for k in 0..3.
    num_detections: number of valid detections per image.

  Returns:
    The rescaled boxes with invalid entries replaced by -1.
  """

  def _info_row(index):
    # One row of `info`, kept broadcastable against the boxes.
    return tf.expand_dims(info[:, index, :], axis=1)

  valid = tf.sequence_mask(num_detections, maxlen=tf.shape(boxes)[1])
  valid = tf.cast(tf.expand_dims(valid, axis=-1), boxes.dtype)

  input_hw = _info_row(1)
  original_hw = _info_row(0)
  scale = tf.tile(_info_row(2), [1, 1, 2])
  offset = tf.tile(_info_row(3), [1, 1, 2])

  # Denormalize the boxes by the shape of the image.
  pixel_boxes = box_ops.clip_boxes(
      box_ops.denormalize_boxes(boxes, input_hw), input_hw)
  restored = box_ops.clip_boxes((pixel_boxes + offset) / scale, original_hw)

  # valid == 1 keeps the box; valid == 0 gives 0 * box + (0 - 1) = -1.
  return restored * valid + (valid - 1)
def undo_info(boxes: tf.Tensor,
              num_detections: int,
              info: tf.Tensor,
              expand: bool = True) -> tf.Tensor:
  """Clip and normalize boxes for serving.

  Boxes past `num_detections` along axis 1 are zeroed. The remaining boxes
  are denormalized by `info[:, 1, :]`, shifted by `info[:, 3, :]`, divided by
  `info[:, 2, :]`, then clipped and normalized by `info[:, 0, :]`.

  Args:
    boxes: boxes with the detection axis at position 1.
    num_detections: number of valid detections.
    info: image info tensor.
    expand: when True, `info` gets a leading axis and is cast to the dtype of
      `boxes` before it is indexed.

  Returns:
    The boxes normalized by `info[:, 0, :]`.
  """
  valid = tf.sequence_mask(num_detections, maxlen=tf.shape(boxes)[1])
  masked_boxes = tf.cast(tf.expand_dims(valid, axis=-1), boxes.dtype) * boxes

  if expand:
    info = tf.cast(tf.expand_dims(info, axis=0), masked_boxes.dtype)

  def _info_row(index):
    # One row of `info`, kept broadcastable against the boxes.
    return tf.expand_dims(info[:, index, :], axis=1)

  input_hw = _info_row(1)
  original_hw = _info_row(0)
  scale = tf.tile(_info_row(2), [1, 1, 2])
  offset = tf.tile(_info_row(3), [1, 1, 2])

  pixel_boxes = box_ops.denormalize_boxes(masked_boxes, input_hw)
  restored = box_ops.clip_boxes((pixel_boxes + offset) / scale, original_hw)
  return box_ops.normalize_boxes(restored, original_hw)
def _parse_train_data(self, data):
  """Parses data for training.

  The image is normalized and randomly flipped. With probability 0.5 it is
  additionally resized to a random short side in {400, 500, 600} and randomly
  cropped. It is then resized to a short side drawn uniformly from
  `self._resize_scales` and padded to `self._output_size`. Boxes follow every
  step and are returned normalized in center/size form.

  Args:
    data: dict providing 'image', 'groundtruth_boxes', 'groundtruth_classes'
      and 'groundtruth_is_crowd'.

  Returns:
    A tuple `(image, labels)` where `labels` holds 'classes' and 'boxes',
    each clipped or padded to `self._max_num_boxes`.
  """
  classes = data['groundtruth_classes'] + self._class_offset
  boxes = data['groundtruth_boxes']
  is_crowd = data['groundtruth_is_crowd']

  # Gets original image.
  image = data['image']

  # Normalizes image with mean and std pixel values.
  image = preprocess_ops.normalize_image(image)
  image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)

  do_crop = tf.greater(tf.random.uniform([]), 0.5)
  if do_crop:
    # Rescale to a random short side before cropping.
    boxes = box_ops.denormalize_boxes(boxes, tf.shape(image)[:2])
    index = tf.random.categorical(tf.zeros([1, 3]), 1)[0]
    scales = tf.gather([400.0, 500.0, 600.0], index, axis=0)
    short_side = scales[0]
    image, image_info = preprocess_ops.resize_image(image, short_side)
    boxes = preprocess_ops.resize_and_crop_boxes(
        boxes, image_info[2, :], image_info[1, :], image_info[3, :])
    boxes = box_ops.normalize_boxes(boxes, image_info[1, :])

    # Crop a random h x w window whose top-left corner is (i, j).
    shape = tf.cast(image_info[1], dtype=tf.int32)
    h = tf.random.uniform([], 384, tf.math.minimum(shape[0], 600),
                          dtype=tf.int32)
    w = tf.random.uniform([], 384, tf.math.minimum(shape[1], 600),
                          dtype=tf.int32)
    i = tf.random.uniform([], 0, shape[0] - h + 1, dtype=tf.int32)
    j = tf.random.uniform([], 0, shape[1] - w + 1, dtype=tf.int32)
    image = tf.image.crop_to_bounding_box(image, i, j, h, w)
    # Re-express the normalized boxes relative to the crop window and clamp
    # them to [0, 1].
    boxes = tf.clip_by_value(
        (boxes[..., :] * tf.cast(
            tf.stack([shape[0], shape[1], shape[0], shape[1]]),
            dtype=tf.float32) -
         tf.cast(tf.stack([i, j, i, j]), dtype=tf.float32)) /
        tf.cast(tf.stack([h, w, h, w]), dtype=tf.float32), 0.0, 1.0)

  # Draw the final short side uniformly from every configured scale. The
  # number of categories follows the configuration instead of being fixed at
  # 11, so shorter or longer scale lists are sampled correctly.
  num_scales = len(self._resize_scales)
  scales = tf.constant(self._resize_scales, dtype=tf.float32)
  index = tf.random.categorical(tf.zeros([1, num_scales]), 1)[0]
  scales = tf.gather(scales, index, axis=0)

  image_shape = tf.shape(image)[:2]
  boxes = box_ops.denormalize_boxes(boxes, image_shape)
  short_side = scales[0]
  image, image_info = preprocess_ops.resize_image(
      image, short_side, max(self._output_size))
  boxes = preprocess_ops.resize_and_crop_boxes(
      boxes, image_info[2, :], image_info[1, :], image_info[3, :])
  boxes = box_ops.normalize_boxes(boxes, image_info[1, :])

  # Filters out ground truth boxes that are all zeros.
  indices = box_ops.get_non_empty_box_indices(boxes)
  boxes = tf.gather(boxes, indices)
  classes = tf.gather(classes, indices)
  is_crowd = tf.gather(is_crowd, indices)
  boxes = box_ops.yxyx_to_cycxhw(boxes)

  image = tf.image.pad_to_bounding_box(image, 0, 0, self._output_size[0],
                                       self._output_size[1])
  labels = {
      'classes':
          preprocess_ops.clip_or_pad_to_fixed_size(classes,
                                                   self._max_num_boxes),
      'boxes':
          preprocess_ops.clip_or_pad_to_fixed_size(boxes,
                                                   self._max_num_boxes)
  }

  return image, labels
def preprocess(self, inputs):
  """Preprocess COCO for DETR.

  In training mode the image is randomly flipped, cropped with probability
  0.5 after a resize to a random short side in {400, 500, 600}, and then
  resized to a short side drawn uniformly from `self._params.resize_scales`.
  In evaluation mode only the last configured scale is used. The image is
  finally padded to `self._params.output_size`.

  Args:
    inputs: dict providing 'image', an 'objects' dict with 'bbox', 'label'
      and 'is_crowd', and (for evaluation) 'image/id'.

  Returns:
    A tuple `(image, labels)`. `labels` holds 'classes' and 'boxes' clipped
    or padded to `self._params.max_num_boxes`; in evaluation mode it also
    holds 'id', 'image_info', 'is_crowd' and 'gt_boxes'.
  """
  image = inputs['image']
  boxes = inputs['objects']['bbox']
  classes = inputs['objects']['label'] + 1
  is_crowd = inputs['objects']['is_crowd']

  image = preprocess_ops.normalize_image(image)
  if self._params.is_training:
    image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)

    do_crop = tf.greater(tf.random.uniform([]), 0.5)
    if do_crop:
      # Rescale to a random short side before cropping.
      boxes = box_ops.denormalize_boxes(boxes, tf.shape(image)[:2])
      index = tf.random.categorical(tf.zeros([1, 3]), 1)[0]
      scales = tf.gather([400.0, 500.0, 600.0], index, axis=0)
      short_side = scales[0]
      image, image_info = preprocess_ops.resize_image(image, short_side)
      boxes = preprocess_ops.resize_and_crop_boxes(boxes,
                                                   image_info[2, :],
                                                   image_info[1, :],
                                                   image_info[3, :])
      boxes = box_ops.normalize_boxes(boxes, image_info[1, :])

      # Crop a random h x w window whose top-left corner is (i, j).
      shape = tf.cast(image_info[1], dtype=tf.int32)
      h = tf.random.uniform(
          [], 384, tf.math.minimum(shape[0], 600), dtype=tf.int32)
      w = tf.random.uniform(
          [], 384, tf.math.minimum(shape[1], 600), dtype=tf.int32)
      i = tf.random.uniform([], 0, shape[0] - h + 1, dtype=tf.int32)
      j = tf.random.uniform([], 0, shape[1] - w + 1, dtype=tf.int32)
      image = tf.image.crop_to_bounding_box(image, i, j, h, w)
      # Re-express the normalized boxes relative to the crop window and clamp
      # them to [0, 1].
      boxes = tf.clip_by_value(
          (boxes[..., :] * tf.cast(
              tf.stack([shape[0], shape[1], shape[0], shape[1]]),
              dtype=tf.float32) -
           tf.cast(tf.stack([i, j, i, j]), dtype=tf.float32)) /
          tf.cast(tf.stack([h, w, h, w]), dtype=tf.float32),
          0.0, 1.0)

    # Draw the final short side uniformly from every configured scale. The
    # number of categories follows the configuration instead of being fixed
    # at 11, so shorter or longer scale lists are sampled correctly.
    num_scales = len(self._params.resize_scales)
    scales = tf.constant(
        self._params.resize_scales, dtype=tf.float32)
    index = tf.random.categorical(tf.zeros([1, num_scales]), 1)[0]
    scales = tf.gather(scales, index, axis=0)
  else:
    # Evaluation always uses the last configured scale.
    scales = tf.constant([self._params.resize_scales[-1]], tf.float32)

  image_shape = tf.shape(image)[:2]
  boxes = box_ops.denormalize_boxes(boxes, image_shape)
  # Pixel-coordinate boxes before the final resize; reported for evaluation.
  gt_boxes = boxes
  short_side = scales[0]
  image, image_info = preprocess_ops.resize_image(
      image,
      short_side,
      max(self._params.output_size))
  boxes = preprocess_ops.resize_and_crop_boxes(boxes,
                                               image_info[2, :],
                                               image_info[1, :],
                                               image_info[3, :])
  boxes = box_ops.normalize_boxes(boxes, image_info[1, :])

  # Filters out ground truth boxes that are all zeros.
  indices = box_ops.get_non_empty_box_indices(boxes)
  boxes = tf.gather(boxes, indices)
  classes = tf.gather(classes, indices)
  is_crowd = tf.gather(is_crowd, indices)
  boxes = box_ops.yxyx_to_cycxhw(boxes)

  image = tf.image.pad_to_bounding_box(
      image, 0, 0, self._params.output_size[0], self._params.output_size[1])
  labels = {
      'classes':
          preprocess_ops.clip_or_pad_to_fixed_size(
              classes, self._params.max_num_boxes),
      'boxes':
          preprocess_ops.clip_or_pad_to_fixed_size(
              boxes, self._params.max_num_boxes)
  }
  if not self._params.is_training:
    labels.update({
        'id':
            inputs['image/id'],
        'image_info':
            image_info,
        'is_crowd':
            preprocess_ops.clip_or_pad_to_fixed_size(
                is_crowd, self._params.max_num_boxes),
        'gt_boxes':
            preprocess_ops.clip_or_pad_to_fixed_size(
                gt_boxes, self._params.max_num_boxes),
    })

  return image, labels
def _parse_train_data(self, data): """Parses data for training.""" # Initialize the shape constants. image = data['image'] boxes = data['groundtruth_boxes'] classes = data['groundtruth_classes'] if self._random_flip: # Randomly flip the image horizontally. image, boxes, _ = preprocess_ops.random_horizontal_flip( image, boxes, seed=self._seed) if not data['is_mosaic']: image, infos, affine = self._jitter_scale( image, [self._image_h, self._image_w], self._letter_box, self._jitter, self._random_pad, self._aug_scale_min, self._aug_scale_max, self._aug_rand_translate, self._aug_rand_angle, self._aug_rand_perspective) # Clip and clean boxes. boxes, inds = preprocessing_ops.transform_and_clip_boxes( boxes, infos, affine=affine, shuffle_boxes=False, area_thresh=self._area_thresh, filter_and_clip_boxes=True, seed=self._seed) classes = tf.gather(classes, inds) info = infos[-1] else: image = tf.image.resize(image, (self._image_h, self._image_w), method='nearest') output_size = tf.cast([self._image_h, self._image_w], tf.float32) boxes_ = bbox_ops.denormalize_boxes(boxes, output_size) inds = bbox_ops.get_non_empty_box_indices(boxes_) boxes = tf.gather(boxes, inds) classes = tf.gather(classes, inds) info = self._pad_infos_object(image) # Apply scaling to the hue saturation and brightness of an image. image = tf.cast(image, dtype=self._dtype) image = image / 255.0 image = preprocessing_ops.image_rand_hsv( image, self._aug_rand_hue, self._aug_rand_saturation, self._aug_rand_brightness, seed=self._seed, darknet=self._darknet or self._level_limits is not None) # Cast the image to the selcted datatype. image, labels = self._build_label(image, boxes, classes, info, inds, data, is_training=True) return image, labels
def _build_label(self, image, gt_boxes, gt_classes, info, inds, data,
                 is_training=True):
  """Label construction for both the train and eval data.

  Args:
    image: image tensor; its static channel dimension is set to 3.
    gt_boxes: groundtruth boxes passed to `self._label_builder` and padded
      with 0.
    gt_classes: groundtruth classes passed to `self._label_builder` and
      padded with -1.
    info: image info stored in the evaluation groundtruths.
    inds: indices used to gather `data['groundtruth_is_crowd']` for the
      evaluation groundtruths.
    data: decoded example dict.
    is_training: when False, a 'groundtruths' entry is added to the labels.

  Returns:
    A tuple `(image, labels)`.
  """
  # Set the image shape.
  static_shape = image.get_shape().as_list()
  static_shape[-1] = 3
  image.set_shape(static_shape)

  label_inds, label_upds, label_conf = self._label_builder(
      gt_boxes, gt_classes, self._image_w, self._image_h)

  # Set/fix the boxes shape.
  padded_boxes = self.set_shape(gt_boxes, pad_axis=0, pad_value=0)
  padded_classes = self.set_shape(gt_classes, pad_axis=0, pad_value=-1)

  labels = {
      'inds': label_inds,
      'upds': label_upds,
      'true_conf': label_conf,
      'source_id': utils.process_source_id(data['source_id']),
      'bbox': tf.cast(padded_boxes, dtype=self._dtype),
      'classes': tf.cast(padded_classes, dtype=self._dtype),
  }
  if is_training:
    return image, labels

  # Sets up groundtruth data for evaluation. The source id stored in the
  # labels is run through `process_source_id` once more here.
  original_hw = tf.cast([data['height'], data['width']], gt_boxes.dtype)
  groundtruths = {
      'source_id': utils.process_source_id(labels['source_id']),
      'height': data['height'],
      'width': data['width'],
      'num_detections': tf.shape(data['groundtruth_boxes'])[0],
      'image_info': info,
      'boxes': bbox_ops.denormalize_boxes(
          data['groundtruth_boxes'], original_hw),
      'classes': data['groundtruth_classes'],
      'areas': data['groundtruth_area'],
      'is_crowds': tf.cast(
          tf.gather(data['groundtruth_is_crowd'], inds), tf.int32),
  }
  labels['groundtruths'] = utils.pad_groundtruths_to_fixed_size(
      groundtruths, self._max_num_instances)
  return image, labels
def _parse_train_data(self, data):
  """Generates images and labels that are usable for model training.

  We use random flip, random scaling (bounded by `self._aug_scale_min` and
  `self._aug_scale_max`), cropping, and color jittering as data augmentation.
  Two augmentation pipelines exist, selected by `self._odapi_augmentation`.

  Args:
    data: the decoded tensor dictionary from TfExampleDecoder.

  Returns:
    images: the image tensor.
    labels: a dict of Tensors that contains labels.
  """

  image = tf.cast(data['image'], dtype=tf.float32)
  boxes = data['groundtruth_boxes']
  classes = data['groundtruth_classes']

  # Height and width of the input image, captured before any augmentation.
  image_shape = tf.shape(input=image)[0:2]

  if self._aug_rand_hflip:
    image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)

  # Image augmentation
  if not self._odapi_augmentation:
    # Color and lighting jittering
    if self._aug_rand_hue:
      image = tf.image.random_hue(image=image, max_delta=.02)
    if self._aug_rand_contrast:
      image = tf.image.random_contrast(image=image, lower=0.8, upper=1.25)
    if self._aug_rand_saturation:
      image = tf.image.random_saturation(image=image, lower=0.8, upper=1.25)
    if self._aug_rand_brightness:
      image = tf.image.random_brightness(image=image, max_delta=.2)
    # Keep the jittered pixel values inside [0, 255].
    image = tf.clip_by_value(image, clip_value_min=0.0, clip_value_max=255.0)
    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_ops.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        [self._output_height, self._output_width],
        padded_size=[self._output_height, self._output_width],
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    unpad_image_shape = tf.cast(tf.shape(image), tf.float32)

    # Resizes and crops boxes.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = preprocess_ops.resize_and_crop_boxes(
        boxes, image_scale, image_info[1, :], offset)
  else:
    # Color and lighting jittering
    if self._aug_rand_hue:
      image = cn_prep_ops.random_adjust_hue(image=image, max_delta=.02)
    if self._aug_rand_contrast:
      image = cn_prep_ops.random_adjust_contrast(
          image=image, min_delta=0.8, max_delta=1.25)
    if self._aug_rand_saturation:
      image = cn_prep_ops.random_adjust_saturation(
          image=image, min_delta=0.8, max_delta=1.25)
    if self._aug_rand_brightness:
      image = cn_prep_ops.random_adjust_brightness(image=image, max_delta=.2)

    # Random square crop; boxes and classes are cropped along with the image.
    sc_image, sc_boxes, classes = cn_prep_ops.random_square_crop_by_scale(
        image=image,
        boxes=boxes,
        labels=classes,
        scale_min=self._aug_scale_min,
        scale_max=self._aug_scale_max)

    # NOTE(review): both bounds use `self._output_width`; presumably the
    # output is expected to be square — confirm.
    image, unpad_image_shape = cn_prep_ops.resize_to_range(
        image=sc_image,
        min_dimension=self._output_width,
        max_dimension=self._output_width,
        pad_to_max_dimension=True)
    preprocessed_shape = tf.cast(tf.shape(image), tf.float32)
    unpad_image_shape = tf.cast(unpad_image_shape, tf.float32)

    # Window spanning the padded image, expressed in units of the unpadded
    # image; used to move the boxes into the padded image's frame.
    im_box = tf.stack([
        0.0, 0.0, preprocessed_shape[0] / unpad_image_shape[0],
        preprocessed_shape[1] / unpad_image_shape[1]
    ])
    realigned_bboxes = box_list_ops.change_coordinate_frame(
        boxlist=box_list.BoxList(sc_boxes), window=im_box)

    valid_boxes = box_list_ops.assert_or_prune_invalid_boxes(
        realigned_bboxes.get())

    boxes = box_list_ops.to_absolute_coordinates(
        boxlist=box_list.BoxList(valid_boxes),
        height=self._output_height,
        width=self._output_width).get()

    # Rows: input image size, output size, ratio of the cropped image size to
    # the input image size, and a zero offset.
    image_info = tf.stack([
        tf.cast(image_shape, dtype=tf.float32),
        tf.constant([self._output_height, self._output_width],
                    dtype=tf.float32),
        tf.cast(tf.shape(sc_image)[0:2] / image_shape, dtype=tf.float32),
        tf.constant([0., 0.])
    ])

  # Filters out ground truth boxes that are all zeros.
  indices = box_ops.get_non_empty_box_indices(boxes)
  boxes = tf.gather(boxes, indices)
  classes = tf.gather(classes, indices)

  labels = self._build_label(
      unpad_image_shape=unpad_image_shape,
      boxes=boxes,
      classes=classes,
      image_info=image_info,
      data=data)

  if self._bgr_ordering:
    # Swap the channel order from RGB to BGR.
    red, green, blue = tf.unstack(image, num=3, axis=2)
    image = tf.stack([blue, green, red], axis=2)

  image = preprocess_ops.normalize_image(
      image=image,
      offset=self._channel_means,
      scale=self._channel_stds)

  image = tf.cast(image, self._dtype)

  return image, labels
def _parse_train_data(self, data):
  """Parses data for training.

  Crowd annotations are optionally dropped, the image is optionally
  auto-augmented, normalized, randomly flipped, then resized and cropped.
  Boxes follow the same transformations, empty boxes are removed, and the
  remaining boxes are assigned to anchors.

  Args:
    data: the decoded tensor dictionary. Reads 'image', 'groundtruth_boxes',
      'groundtruth_classes', 'groundtruth_is_crowd' and, when present,
      'groundtruth_attributes'. The dictionary itself is not modified.

  Returns:
    A tuple `(image, labels)`. `labels` holds 'cls_targets', 'box_targets',
    'anchor_boxes', 'cls_weights', 'box_weights' and 'image_info', plus
    'attribute_targets' when the anchor labeler returns any.
  """
  classes = data['groundtruth_classes']
  boxes = data['groundtruth_boxes']
  # If not empty, `attributes` is a dict of (name, ground_truth) pairs.
  # `ground_truth` of attributes is assumed in shape [N, attribute_size].
  # A shallow copy is taken so that the filtering below rebinds entries of
  # the copy rather than mutating `data['groundtruth_attributes']` in place.
  # TODO(xianzhi): support parsing attributes weights.
  attributes = dict(data.get('groundtruth_attributes', {}))
  is_crowds = data['groundtruth_is_crowd']

  # Skips annotations with `is_crowd` = True.
  if self._skip_crowd_during_training:
    num_groundtruths = tf.shape(input=classes)[0]
    with tf.control_dependencies([num_groundtruths, is_crowds]):
      # With no crowd flags at all, keep every annotation.
      indices = tf.cond(
          pred=tf.greater(tf.size(input=is_crowds), 0),
          true_fn=lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
          false_fn=lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
    classes = tf.gather(classes, indices)
    boxes = tf.gather(boxes, indices)
    for k, v in attributes.items():
      attributes[k] = tf.gather(v, indices)

  # Gets original image.
  image = data['image']

  # Apply autoaug or randaug.
  if self._augmenter is not None:
    image, boxes = self._augmenter.distort_with_boxes(image, boxes)
  image_shape = tf.shape(input=image)[0:2]

  # Normalizes image with mean and std pixel values.
  image = preprocess_ops.normalize_image(image)

  # Flips image randomly during training.
  if self._aug_rand_hflip:
    image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)

  # Converts boxes from normalized coordinates to pixel coordinates.
  boxes = box_ops.denormalize_boxes(boxes, image_shape)

  # Resizes and crops image.
  image, image_info = preprocess_ops.resize_and_crop_image(
      image,
      self._output_size,
      padded_size=preprocess_ops.compute_padded_size(self._output_size,
                                                     2**self._max_level),
      aug_scale_min=self._aug_scale_min,
      aug_scale_max=self._aug_scale_max)
  image_height, image_width, _ = image.get_shape().as_list()

  # Resizes and crops boxes.
  image_scale = image_info[2, :]
  offset = image_info[3, :]
  boxes = preprocess_ops.resize_and_crop_boxes(boxes, image_scale,
                                               image_info[1, :], offset)
  # Filters out ground truth boxes that are all zeros.
  indices = box_ops.get_non_empty_box_indices(boxes)
  boxes = tf.gather(boxes, indices)
  classes = tf.gather(classes, indices)
  for k, v in attributes.items():
    attributes[k] = tf.gather(v, indices)

  # Assigns anchors.
  input_anchor = anchor.build_anchor_generator(
      min_level=self._min_level,
      max_level=self._max_level,
      num_scales=self._num_scales,
      aspect_ratios=self._aspect_ratios,
      anchor_size=self._anchor_size)
  anchor_boxes = input_anchor(image_size=(image_height, image_width))
  anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
                                        self._unmatched_threshold)
  (cls_targets, box_targets, att_targets, cls_weights,
   box_weights) = anchor_labeler.label_anchors(
       anchor_boxes, boxes, tf.expand_dims(classes, axis=1), attributes)

  # Casts input image to desired data type.
  image = tf.cast(image, dtype=self._dtype)

  # Packs labels for model_fn outputs.
  labels = {
      'cls_targets': cls_targets,
      'box_targets': box_targets,
      'anchor_boxes': anchor_boxes,
      'cls_weights': cls_weights,
      'box_weights': box_weights,
      'image_info': image_info,
  }
  if att_targets:
    labels['attribute_targets'] = att_targets
  return image, labels
def _parse_eval_data(self, data):
  """Parses data for evaluation.

  Args:
    data: the decoded tensor dictionary from TfExampleDecoder.

  Returns:
    A tuple `(image, labels)` where
      image: image tensor that is normalized, resized and cropped without
        random scaling (both scale bounds are 1.0), and cast to
        `self._dtype`.
      labels: a dictionary with the following {key: value} pairs.
        image_info: the image info returned by `resize_and_crop_image`;
          presumably [[original_height, original_width],
          [scaled_height, scaled_width], [y_scale, x_scale],
          [y_offset, x_offset]] — confirm against that function.
        anchor_boxes: anchor boxes generated for the preprocessed image size
          over levels `self._min_level` .. `self._max_level`.
        groundtruths: dictionary with 'source_id', 'height', 'width',
          'num_detections', 'boxes' (in pixel coordinates of the original
          image), 'classes', 'areas' and 'is_crowds', padded to
          `self._max_num_instances`.
  """
  raw_image = data['image']
  original_hw = tf.shape(raw_image)[0:2]

  # Normalize, then resize and crop with augmentation disabled.
  padded_size = preprocess_ops.compute_padded_size(
      self._output_size, 2 ** self._max_level)
  image, image_info = preprocess_ops.resize_and_crop_image(
      preprocess_ops.normalize_image(raw_image),
      self._output_size,
      padded_size=padded_size,
      aug_scale_min=1.0,
      aug_scale_max=1.0)
  image_height, image_width, _ = image.get_shape().as_list()
  image = tf.cast(image, dtype=self._dtype)

  # Anchors are generated for the static size of the preprocessed image.
  anchor_generator = anchor.build_anchor_generator(
      min_level=self._min_level,
      max_level=self._max_level,
      num_scales=self._num_scales,
      aspect_ratios=self._aspect_ratios,
      anchor_size=self._anchor_size)
  anchor_boxes = anchor_generator(image_size=(image_height, image_width))

  groundtruths = utils.pad_groundtruths_to_fixed_size(
      {
          'source_id': utils.process_source_id(data['source_id']),
          'height': data['height'],
          'width': data['width'],
          'num_detections': tf.shape(data['groundtruth_classes'])[0],
          # Normalized -> pixel coordinates of the original image.
          'boxes': box_ops.denormalize_boxes(
              data['groundtruth_boxes'], original_hw),
          'classes': data['groundtruth_classes'],
          'areas': data['groundtruth_area'],
          'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
      },
      self._max_num_instances)

  labels = {
      'image_info': image_info,
      'anchor_boxes': anchor_boxes,
      'groundtruths': groundtruths,
  }
  return image, labels
def _parse_eval_data(self, data):
  """Parses data for evaluation.

  The image is normalized, then resized and cropped without random scaling.
  Boxes follow the same transformation, empty boxes are removed, and the
  remaining boxes are assigned to anchors. The unfiltered annotations are
  reported separately under 'groundtruths'.

  Args:
    data: the decoded tensor dictionary. Reads 'image', 'groundtruth_boxes',
      'groundtruth_classes', 'groundtruth_is_crowd', 'groundtruth_area',
      'source_id', 'height', 'width' and, when present,
      'groundtruth_attributes'. The dictionary itself is not modified.

  Returns:
    A tuple `(image, labels)`. `labels` holds 'cls_targets', 'box_targets',
    'anchor_boxes', 'cls_weights', 'box_weights', 'image_info' and
    'groundtruths', plus 'attribute_targets' when the anchor labeler returns
    any.
  """
  classes = data['groundtruth_classes']
  boxes = data['groundtruth_boxes']
  # If not empty, `attributes` is a dict of (name, ground_truth) pairs.
  # `ground_truth` of attributes is assumed in shape [N, attribute_size].
  # A shallow copy is taken so that the filtering below does not overwrite
  # `data['groundtruth_attributes']`, which is reported unfiltered in
  # `groundtruths` next to the unfiltered boxes and classes.
  # TODO(xianzhi): support parsing attributes weights.
  attributes = dict(data.get('groundtruth_attributes', {}))

  # Gets original image and its size.
  image = data['image']
  image_shape = tf.shape(input=image)[0:2]

  # Normalizes image with mean and std pixel values.
  image = preprocess_ops.normalize_image(image)

  # Converts boxes from normalized coordinates to pixel coordinates.
  boxes = box_ops.denormalize_boxes(boxes, image_shape)

  # Resizes and crops image.
  image, image_info = preprocess_ops.resize_and_crop_image(
      image,
      self._output_size,
      padded_size=preprocess_ops.compute_padded_size(self._output_size,
                                                     2**self._max_level),
      aug_scale_min=1.0,
      aug_scale_max=1.0)
  image_height, image_width, _ = image.get_shape().as_list()

  # Resizes and crops boxes.
  image_scale = image_info[2, :]
  offset = image_info[3, :]
  boxes = preprocess_ops.resize_and_crop_boxes(boxes, image_scale,
                                               image_info[1, :], offset)
  # Filters out ground truth boxes that are all zeros.
  indices = box_ops.get_non_empty_box_indices(boxes)
  boxes = tf.gather(boxes, indices)
  classes = tf.gather(classes, indices)
  for k, v in attributes.items():
    attributes[k] = tf.gather(v, indices)

  # Assigns anchors.
  input_anchor = anchor.build_anchor_generator(
      min_level=self._min_level,
      max_level=self._max_level,
      num_scales=self._num_scales,
      aspect_ratios=self._aspect_ratios,
      anchor_size=self._anchor_size)
  anchor_boxes = input_anchor(image_size=(image_height, image_width))
  anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
                                        self._unmatched_threshold)
  (cls_targets, box_targets, att_targets, cls_weights,
   box_weights) = anchor_labeler.label_anchors(
       anchor_boxes, boxes, tf.expand_dims(classes, axis=1), attributes)

  # Casts input image to desired data type.
  image = tf.cast(image, dtype=self._dtype)

  # Sets up groundtruth data for evaluation.
  # NOTE(review): 'num_detections' is the full shape tensor here (no `[0]`);
  # left as is because consumers may depend on it — confirm.
  groundtruths = {
      'source_id': data['source_id'],
      'height': data['height'],
      'width': data['width'],
      'num_detections': tf.shape(data['groundtruth_classes']),
      'image_info': image_info,
      'boxes': box_ops.denormalize_boxes(data['groundtruth_boxes'],
                                         image_shape),
      'classes': data['groundtruth_classes'],
      'areas': data['groundtruth_area'],
      'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
  }
  if 'groundtruth_attributes' in data:
    groundtruths['attributes'] = data['groundtruth_attributes']
  groundtruths['source_id'] = utils.process_source_id(
      groundtruths['source_id'])
  groundtruths = utils.pad_groundtruths_to_fixed_size(
      groundtruths, self._max_num_instances)

  # Packs labels for model_fn outputs.
  labels = {
      'cls_targets': cls_targets,
      'box_targets': box_targets,
      'anchor_boxes': anchor_boxes,
      'cls_weights': cls_weights,
      'box_weights': box_weights,
      'image_info': image_info,
      'groundtruths': groundtruths,
  }
  if att_targets:
    labels['attribute_targets'] = att_targets
  return image, labels
def _parse_train_data(self, data):
  """Turns one decoded example into a training image and its labels.

  Args:
    data: the decoded tensor dictionary from TfExampleDecoder.

  Returns:
    image: image tensor that has been normalized, optionally augmented and
      flipped, resized/padded by `preprocess_ops.resize_and_crop_image`, and
      cast to `self._dtype`.
    labels: a dictionary of tensors used for training, with these
      {key: value} pairs:
      anchor_boxes: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors
        with shape [height_l, width_l, 4] representing anchor boxes at each
        level.
      image_info: a 2D `Tensor` that encodes the information of the image
        and the applied preprocessing. Row 1 is passed to
        `resize_and_crop_boxes` as the image size, row 2 is used as the
        scale and row 3 as the offset. Presumably the layout is
        [[original_height, original_width], [scaled_height, scaled_width],
        [y_scale, x_scale], [y_offset, x_offset]] - confirm against
        `preprocess_ops.resize_and_crop_image`.
      rpn_score_targets: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors
        with shape [height_l, width_l, anchors_per_location]. The height_l
        and width_l represent the dimension of class logits at l-th level.
      rpn_box_targets: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors
        with shape [height_l, width_l, anchors_per_location * 4]. The
        height_l and width_l represent the dimension of bounding box
        regression output at l-th level.
      gt_boxes: groundtruth bounding box annotations in [y1, x1, y2, x2]
        format, w.r.t. the scaled image that is fed to the network. The
        tensor is padded with -1 to the fixed dimension
        [self._max_num_instances, 4].
      gt_classes: groundtruth class annotations. The tensor is padded with
        -1 to the fixed dimension [self._max_num_instances].
      gt_masks: only present when `self._include_mask` is set. Groundtruth
        masks cropped by the bounding box and resized to a fixed size
        determined by `self._mask_crop_size`.
  """
  with_masks = self._include_mask

  gt_classes = data['groundtruth_classes']
  gt_boxes = data['groundtruth_boxes']
  if with_masks:
    gt_masks = data['groundtruth_instance_masks']
  crowd_flags = data['groundtruth_is_crowd']

  # Drops the annotations whose `is_crowd` flag is set.
  if self._skip_crowd_during_training:
    num_gt = tf.shape(gt_classes)[0]

    def _non_crowd_rows():
      return tf.where(tf.logical_not(crowd_flags))[:, 0]

    def _every_row():
      return tf.cast(tf.range(num_gt), tf.int64)

    with tf.control_dependencies([num_gt, crowd_flags]):
      kept = tf.cond(
          tf.greater(tf.size(crowd_flags), 0), _non_crowd_rows, _every_row)
    gt_classes = tf.gather(gt_classes, kept)
    gt_boxes = tf.gather(gt_boxes, kept)
    if with_masks:
      gt_masks = tf.gather(gt_masks, kept)

  # Reads the image, applies the optional augmenter and records the size the
  # normalized box coordinates refer to.
  image = data['image']
  if self._augmenter is not None:
    image = self._augmenter.distort(image)
  original_size = tf.shape(image)[0:2]

  # Mean/std pixel normalization.
  image = preprocess_ops.normalize_image(image)

  # Random horizontal flip; the masks take part only when they are in use.
  if self._aug_rand_hflip:
    if with_masks:
      image, gt_boxes, gt_masks = preprocess_ops.random_horizontal_flip(
          image, gt_boxes, gt_masks)
    else:
      image, gt_boxes, _ = preprocess_ops.random_horizontal_flip(
          image, gt_boxes)

  # Normalized -> pixel coordinates, still w.r.t. the original image.
  gt_boxes = box_ops.denormalize_boxes(gt_boxes, original_size)

  # Resizes and crops the image.
  padded_size = preprocess_ops.compute_padded_size(
      self._output_size, 2 ** self._max_level)
  image, image_info = preprocess_ops.resize_and_crop_image(
      image,
      self._output_size,
      padded_size=padded_size,
      aug_scale_min=self._aug_scale_min,
      aug_scale_max=self._aug_scale_max)
  image_height, image_width, _ = image.get_shape().as_list()

  # Applies the same resize/crop to the boxes, which from here on are w.r.t.
  # the scaled image.
  scale = image_info[2, :]
  shift = image_info[3, :]
  gt_boxes = preprocess_ops.resize_and_crop_boxes(
      gt_boxes, scale, image_info[1, :], shift)

  # Keeps only the boxes that are not all zeros.
  kept = box_ops.get_non_empty_box_indices(gt_boxes)
  gt_boxes = tf.gather(gt_boxes, kept)
  gt_classes = tf.gather(gt_classes, kept)
  if with_masks:
    gt_masks = tf.gather(gt_masks, kept)
    # Maps the boxes back to the original image space and normalizes them so
    # that they can be used as crop windows on the masks.
    shift_per_corner = tf.tile(tf.expand_dims(shift, axis=0), [1, 2])
    scale_per_corner = tf.tile(tf.expand_dims(scale, axis=0), [1, 2])
    crop_windows = (gt_boxes + shift_per_corner) / scale_per_corner
    crop_windows = box_ops.normalize_boxes(crop_windows, original_size)
    mask_count = tf.shape(gt_masks)[0]
    gt_masks = tf.image.crop_and_resize(
        tf.expand_dims(gt_masks, axis=-1),
        crop_windows,
        box_indices=tf.range(mask_count, dtype=tf.int32),
        crop_size=[self._mask_crop_size, self._mask_crop_size],
        method='bilinear')
    gt_masks = tf.squeeze(gt_masks, axis=-1)

  # Assigns anchor targets.
  # Note that after the target assignment, box targets are absolute pixel
  # offsets w.r.t. the scaled image.
  anchor_generator = anchor.build_anchor_generator(
      min_level=self._min_level,
      max_level=self._max_level,
      num_scales=self._num_scales,
      aspect_ratios=self._aspect_ratios,
      anchor_size=self._anchor_size)
  anchor_boxes = anchor_generator(image_size=(image_height, image_width))
  rpn_labeler = anchor.RpnAnchorLabeler(
      self._rpn_match_threshold,
      self._rpn_unmatched_threshold,
      self._rpn_batch_size_per_im,
      self._rpn_fg_fraction)
  classes_column = tf.cast(
      tf.expand_dims(gt_classes, axis=-1), dtype=tf.float32)
  rpn_score_targets, rpn_box_targets = rpn_labeler.label_anchors(
      anchor_boxes, gt_boxes, classes_column)

  # Casts input image to self._dtype.
  image = tf.cast(image, dtype=self._dtype)

  # Packs labels for model_fn outputs.
  max_instances = self._max_num_instances
  labels = {
      'anchor_boxes': anchor_boxes,
      'image_info': image_info,
      'rpn_score_targets': rpn_score_targets,
      'rpn_box_targets': rpn_box_targets,
      'gt_boxes': preprocess_ops.clip_or_pad_to_fixed_size(
          gt_boxes, max_instances, -1),
      'gt_classes': preprocess_ops.clip_or_pad_to_fixed_size(
          gt_classes, max_instances, -1),
  }
  if with_masks:
    labels['gt_masks'] = preprocess_ops.clip_or_pad_to_fixed_size(
        gt_masks, max_instances, -1)

  return image, labels
def transform_and_clip_boxes(boxes,
                             infos,
                             affine=None,
                             shuffle_boxes=False,
                             area_thresh=0.1,
                             seed=None,
                             filter_and_clip_boxes=True):
  """Applies the recorded image transforms to the boxes and prunes them.

  Args:
    boxes: A `Tensor` of boxes. Columns 0..3 are read as
      (y_min, x_min, y_max, x_max): height is column 2 minus column 0 and
      width is column 3 minus column 1.
    infos: A `list` of image infos, or None for no infos. For each entry,
      `info[0]` is used to denormalize the boxes, `info[2, :]`, `info[1, :]`
      and `info[3, :]` are forwarded to `resize_and_crop_boxes`, and
      `info[1]` is used to normalize the result.
    affine: A `list` with the parameters of an affine warp, or None to skip
      it. `affine[0]` is used to denormalize the boxes, `affine[2]` and
      `affine[1]` are forwarded to `affine_warp_boxes`, and `affine[1]` is
      used to normalize the result.
    shuffle_boxes: A `bool` for shuffling the selected indices. Only has an
      effect when `filter_and_clip_boxes` is True.
    area_thresh: A `float` threshold forwarded to `boxes_candidates` as
      `area_thr`. Only used when `filter_and_clip_boxes` is True.
    seed: seed for the random shuffle.
    filter_and_clip_boxes: A `bool`. When True the input boxes are clipped to
      [0, 1] and the surviving indices come from `boxes_candidates`; when
      False the indices come from `bbox_ops.get_non_empty_box_indices`.

  Returns:
    boxes: A `Tensor` with the transformed boxes gathered at `ind`.
    ind: A `Tensor` with the indices of the boxes that were kept.
  """

  def _has_positive_extent(candidates):
    """Returns a boolean mask of boxes whose height and width are both > 0."""
    heights = candidates[:, 2] - candidates[:, 0]
    widths = candidates[:, 3] - candidates[:, 1]
    return tf.logical_and(tf.greater(heights, 0), tf.greater(widths, 0))

  # The history goes through the same transforms as the boxes but is taken
  # before the initial [0, 1] clip; it is what the boxes are compared against
  # when thresholding at the end.
  history = boxes

  # Size used by the most recent normalization, None if no transform ran.
  final_size = None

  if filter_and_clip_boxes:
    boxes = tf.math.maximum(tf.math.minimum(boxes, 1.0), 0.0)
  still_valid = _has_positive_extent(boxes)

  for info in ([] if infos is None else infos):
    # Moves to absolute coordinates of the pre-resize image.
    source_size = info[0]
    boxes = bbox_ops.denormalize_boxes(boxes, source_size)
    history = bbox_ops.denormalize_boxes(history, source_size)

    # Shifts and scales the boxes together with their history.
    boxes, history = resize_and_crop_boxes(
        boxes, info[2, :], info[1, :], info[3, :], box_history=history)

    # A box stays valid only if it was valid before and still is.
    still_valid = tf.logical_and(_has_positive_extent(boxes), still_valid)

    # Back to normalized coordinates, now w.r.t. the resized image.
    final_size = info[1]
    boxes = bbox_ops.normalize_boxes(boxes, final_size)
    history = bbox_ops.normalize_boxes(history, final_size)

  if affine is not None:
    # Moves to absolute coordinates of the pre-warp image.
    warp_input_size = affine[0]
    boxes = bbox_ops.denormalize_boxes(boxes, warp_input_size)
    history = bbox_ops.denormalize_boxes(history, warp_input_size)

    # Warps the boxes together with their history.
    boxes, history = affine_warp_boxes(
        affine[2], boxes, affine[1], box_history=history)

    # A box stays valid only if it was valid before and still is.
    still_valid = tf.logical_and(_has_positive_extent(boxes), still_valid)

    # Back to normalized coordinates, now w.r.t. the warped image.
    final_size = affine[1]
    boxes = bbox_ops.normalize_boxes(boxes, final_size)
    history = bbox_ops.normalize_boxes(history, final_size)

  # Zeroes out every box that was flagged invalid along the way.
  validity_column = tf.expand_dims(still_valid, axis=-1)
  boxes *= tf.cast(validity_column, boxes.dtype)

  if not filter_and_clip_boxes:
    inds = bbox_ops.get_non_empty_box_indices(boxes)
  else:
    # Thresholds the boxes against their history.
    if final_size is None:
      inds = boxes_candidates(
          boxes, history, wh_thr=0.0, area_thr=area_thresh)
    else:
      abs_boxes = bbox_ops.denormalize_boxes(boxes, final_size)
      abs_history = bbox_ops.denormalize_boxes(history, final_size)
      inds = boxes_candidates(abs_boxes, abs_history, area_thr=area_thresh)
    if shuffle_boxes:
      inds = tf.random.shuffle(inds, seed=seed)

  # Keeps only the selected boxes.
  boxes = tf.gather(boxes, inds)
  return boxes, inds