def _parse_eval_data(self, data):
    """Builds the evaluation image and label dict from one decoded example.

    The image is resized to the parser's output size without scale jitter,
    the groundtruth boxes are mapped into the resized frame, and boxes that
    end up empty are dropped before the labels are assembled.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      images: the preprocessed image tensor, cast to `self._dtype`.
      labels: a dict of Tensors produced by `self._build_label`.
    """
    pixels = tf.cast(data['image'], dtype=tf.float32)
    gt_boxes = data['groundtruth_boxes']
    gt_classes = data['groundtruth_classes']

    source_hw = tf.shape(input=pixels)[0:2]

    # Normalized box coordinates -> pixel coordinates of the source image.
    gt_boxes = box_ops.denormalize_boxes(gt_boxes, source_hw)

    # Scale range is pinned to [1.0, 1.0]: no random scaling for evaluation.
    pixels, image_info = preprocess_ops.resize_and_crop_image(
        pixels,
        [self._output_height, self._output_width],
        padded_size=[self._output_height, self._output_width],
        aug_scale_min=1.0,
        aug_scale_max=1.0)
    resized_shape = tf.cast(tf.shape(pixels), tf.float32)

    # Rows 2 and 3 of `image_info` are used as the scale and the offset
    # that bring the boxes into the resized image frame.
    scale = image_info[2, :]
    crop_offset = image_info[3, :]
    gt_boxes = preprocess_ops.resize_and_crop_boxes(
        gt_boxes, scale, image_info[1, :], crop_offset)

    # Drops groundtruth boxes that are all zeros, together with their classes.
    keep = box_ops.get_non_empty_box_indices(gt_boxes)
    gt_boxes = tf.gather(gt_boxes, keep)
    gt_classes = tf.gather(gt_classes, keep)

    labels = self._build_label(
        unpad_image_shape=resized_shape,
        boxes=gt_boxes,
        classes=gt_classes,
        image_info=image_info,
        data=data)

    if self._bgr_ordering:
        # Swap the first and last channel (RGB -> BGR).
        channel_r, channel_g, channel_b = tf.unstack(pixels, num=3, axis=2)
        pixels = tf.stack([channel_b, channel_g, channel_r], axis=2)

    pixels = preprocess_ops.normalize_image(
        image=pixels,
        offset=self._channel_means,
        scale=self._channel_stds)
    pixels = tf.cast(pixels, self._dtype)

    return pixels, labels
def _parse_eval_data(self, data):
    """Parses data for evaluation.

    !!! All augmentations and transformations are on bboxes with format
    (ymin, xmin, ymax, xmax). Required to do the appropriate transformations.
    !!! Images are supposed to be in RGB format

    The image is resized at a fixed scale of 1.0 (no random scale jitter) so
    that evaluation inputs are deterministic.

    Args:
      data: decoded example dict; this method reads the 'image', 'boxes' and
        'classes' entries.

    Returns:
      image: the resized, normalized image cast to `self._dtype`.
      targets: dict with 'labels' and 'bboxes', both as returned by
        `yolo_ops.preprocess_true_boxes`.
    """
    image, boxes = data['image'], data['boxes']

    # Random scale jitter is a training-time augmentation; pin the scale
    # range to [1.0, 1.0] so repeated evaluations see identical inputs.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._input_size[:2],
        self._input_size[:2],
        aug_scale_min=1.0,
        aug_scale_max=1.0,
        preserve_aspect_ratio=self._preserve_aspect_ratio)
    boxes = preprocess_ops.resize_and_crop_boxes(
        boxes, image_info[2, :], image_info[1, :], image_info[3, :])

    image = preprocess_ops.normalize_image(
        image, offset=MEAN_RGB, scale=STDDEV_RGB)
    image = tf.cast(image, dtype=self._dtype)

    # NOTE(review): both the y and x coordinates are clipped against
    # `_input_size[0]`; this is only exact for square inputs - confirm.
    boxes = tf.clip_by_value(boxes, 0, self._input_size[0] - 1)

    # Convert to (xc, yc, w, h) and append the class id as a last column.
    bbox_labels = yolo_box_ops.yxyx_to_xcycwh(boxes)
    bbox_labels = tf.concat(
        [bbox_labels, data['classes'][:, tf.newaxis]], axis=-1)

    labels, bbox_labels = yolo_ops.preprocess_true_boxes(
        bboxes=bbox_labels,
        train_output_sizes=self.train_output_sizes,
        anchor_per_scale=self.anchor_per_scale,
        num_classes=self.num_classes,
        max_bbox_per_scale=self.max_bbox_per_scale,
        strides=self.strides,
        anchors=self.anchors)

    targets = {'labels': labels, 'bboxes': bbox_labels}
    return image, targets
def _parse_train_data(self, data):
    """Parses data for training.

    !!! All augmentations and transformations are on bboxes with format
    (ymin, xmin, ymax, xmax). Required to do the appropriate transformations.
    !!! Images are supposed to be in RGB format

    Args:
      data: decoded example dict; this method reads the 'image', 'boxes' and
        'classes' entries.

    Returns:
      image: the augmented, resized and normalized image cast to
        `self._dtype`.
      targets: dict with 'labels' and 'bboxes' (both as returned by
        `yolo_ops.preprocess_true_boxes`) and 'raw_bboxes', the pixel-space
        boxes truncated / zero-padded to exactly MAX_DISPLAY_BBOX rows.
    """
    image, boxes = data['image'], data['boxes']

    # Execute RandAugment first as some ops require uint8 colors
    if self._augmenter is not None:
        image = self._augmenter.distort(image)

    if self._aug_rand_hflip:
        image, boxes = yolo_ops.random_horizontal_flip(image, boxes)

    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._input_size[:2],
        self._input_size[:2],
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max,
        preserve_aspect_ratio=self._preserve_aspect_ratio)
    boxes = preprocess_ops.resize_and_crop_boxes(
        boxes, image_info[2, :], image_info[1, :], image_info[3, :])

    if self._aug_jitter_im != 0.0:
        image, boxes = yolo_ops.random_translate(
            image, boxes, self._aug_jitter_im)

    if self._aug_jitter_boxes != 0.0:
        boxes = box_ops.jitter_boxes(boxes, self._aug_jitter_boxes)

    image = preprocess_ops.normalize_image(
        image, offset=MEAN_RGB, scale=STDDEV_RGB)
    image = tf.cast(image, dtype=self._dtype)

    # NOTE(review): both the y and x coordinates are clipped against
    # `_input_size[0]`; this is only exact for square inputs - confirm.
    boxes = tf.clip_by_value(boxes, 0, self._input_size[0] - 1)

    # Convert to (xc, yc, w, h) and append the class id as a last column.
    bbox_labels = yolo_box_ops.yxyx_to_xcycwh(boxes)
    bbox_labels = tf.concat(
        [bbox_labels, data['classes'][:, tf.newaxis]], axis=-1)

    labels, bbox_labels = yolo_ops.preprocess_true_boxes(
        bboxes=bbox_labels,
        train_output_sizes=self.train_output_sizes,
        anchor_per_scale=self.anchor_per_scale,
        num_classes=self.num_classes,
        max_bbox_per_scale=self.max_bbox_per_scale,
        strides=self.strides,
        anchors=self.anchors)

    # TODO: Figure out why we need to fix the num BBOX if not there will be
    # an error
    # https://github.com/whizzmobility/models/pull/61
    # Pad / limit to MAX_DISPLAY_BBOX boxes for constant size. Both the
    # truncation and the padding act on axis 0 (one row per box); slicing
    # first guarantees the pad amount is never negative, and no
    # data-dependent Python branch is needed.
    raw_bboxes = boxes[:MAX_DISPLAY_BBOX]
    num_missing = MAX_DISPLAY_BBOX - tf.shape(raw_bboxes)[0]
    paddings = tf.stack([0, num_missing], axis=-1)
    paddings = tf.stack([paddings, [0, 0]], axis=0)
    raw_bboxes = tf.pad(raw_bboxes, paddings)

    targets = {
        'labels': labels,
        'bboxes': bbox_labels,
        'raw_bboxes': raw_bboxes
    }
    return image, targets
def _parse_train_data(self, data):
    """Parses data for training.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image: image tensor that is preprocessed to have normalized value and
        dimension [output_size[0], output_size[1], 3]
      labels: a dictionary of tensors used for training. The following
        describes {key: value} pairs in the dictionary.
        image_info: a 2D `Tensor` that encodes the information of the image
          and the applied preprocessing. It is in the format of
          [[original_height, original_width], [scaled_height, scaled_width],
          ...]; rows 2 and 3 are read in this method as the image scale and
          the crop offset.
        anchor_boxes: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor
          with shape [height_l, width_l, 4] representing anchor boxes at
          each level.
        rpn_score_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor
          with shape [height_l, width_l, anchors_per_location]. The height_l
          and width_l represent the dimension of class logits at l-th level.
        rpn_box_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor
          with shape [height_l, width_l, anchors_per_location * 4]. The
          height_l and width_l represent the dimension of bounding box
          regression output at l-th level.
        gt_boxes: Groundtruth bounding box annotations. The box is
          represented in [y1, x1, y2, x2] format. The coordinates are w.r.t
          the scaled image that is fed to the network. The tensor is padded
          with -1 to the fixed dimension [self._max_num_instances, 4].
        gt_classes: Groundtruth classes annotations. The tensor is padded
          with -1 to the fixed dimension [self._max_num_instances].
        gt_masks: groundtruth masks cropped by the bounding box and resized
          to a fixed size determined by mask_crop_size. Only present when
          `self._include_mask` is set.
    """
    gt_classes = data['groundtruth_classes']
    gt_boxes = data['groundtruth_boxes']
    if self._include_mask:
        gt_masks = data['groundtruth_instance_masks']

    crowd_flags = data['groundtruth_is_crowd']
    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training:
        gt_count = tf.shape(gt_classes)[0]
        with tf.control_dependencies([gt_count, crowd_flags]):
            # When no crowd flags are present, every annotation is kept.
            keep = tf.cond(
                pred=tf.greater(tf.size(crowd_flags), 0),
                true_fn=lambda: tf.where(tf.logical_not(crowd_flags))[:, 0],
                false_fn=lambda: tf.cast(tf.range(gt_count), tf.int64))
        gt_classes = tf.gather(gt_classes, keep)
        gt_boxes = tf.gather(gt_boxes, keep)
        if self._include_mask:
            gt_masks = tf.gather(gt_masks, keep)

    # Gets original image and its size.
    pixels = data['image']
    source_hw = tf.shape(pixels)[0:2]

    # Normalizes image with mean and std pixel values.
    pixels = preprocess_ops.normalize_image(pixels)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
        if self._include_mask:
            pixels, gt_boxes, gt_masks = (
                preprocess_ops.random_horizontal_flip(
                    pixels, gt_boxes, gt_masks))
        else:
            pixels, gt_boxes, _ = preprocess_ops.random_horizontal_flip(
                pixels, gt_boxes)

    # Converts boxes from normalized coordinates to pixel coordinates.
    # Now the coordinates of boxes are w.r.t. the original image.
    gt_boxes = box_ops.denormalize_boxes(gt_boxes, source_hw)

    # Resizes and crops image.
    pixels, image_info = preprocess_ops.resize_and_crop_image(
        pixels,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(
            self._output_size, 2**self._max_level),
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    padded_height, padded_width, _ = pixels.get_shape().as_list()

    # Resizes and crops boxes.
    # Now the coordinates of boxes are w.r.t the scaled image.
    scale = image_info[2, :]
    crop_offset = image_info[3, :]
    gt_boxes = preprocess_ops.resize_and_crop_boxes(
        gt_boxes, scale, image_info[1, :], crop_offset)

    # Filters out ground truth boxes that are all zeros.
    keep = box_ops.get_non_empty_box_indices(gt_boxes)
    gt_boxes = tf.gather(gt_boxes, keep)
    gt_classes = tf.gather(gt_classes, keep)
    if self._include_mask:
        gt_masks = tf.gather(gt_masks, keep)

        # Transfer boxes to the original image space and do normalization,
        # because the masks are still in the original image frame.
        source_boxes = gt_boxes + tf.tile(
            tf.expand_dims(crop_offset, axis=0), [1, 2])
        source_boxes /= tf.tile(tf.expand_dims(scale, axis=0), [1, 2])
        source_boxes = box_ops.normalize_boxes(source_boxes, source_hw)

        # Crop each mask with its own box (box i <-> mask i).
        mask_count = tf.shape(gt_masks)[0]
        gt_masks = tf.image.crop_and_resize(
            tf.expand_dims(gt_masks, axis=-1),
            source_boxes,
            box_indices=tf.range(mask_count, dtype=tf.int32),
            crop_size=[self._mask_crop_size, self._mask_crop_size],
            method='bilinear')
        gt_masks = tf.squeeze(gt_masks, axis=-1)

    # Assigns anchor targets.
    # Note that after the target assignment, box targets are absolute pixel
    # offsets w.r.t. the scaled image.
    anchor_generator = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = anchor_generator(
        image_size=(padded_height, padded_width))
    rpn_labeler = anchor.RpnAnchorLabeler(
        self._rpn_match_threshold,
        self._rpn_unmatched_threshold,
        self._rpn_batch_size_per_im,
        self._rpn_fg_fraction)
    rpn_score_targets, rpn_box_targets = rpn_labeler.label_anchors(
        anchor_boxes,
        gt_boxes,
        tf.cast(tf.expand_dims(gt_classes, axis=-1), dtype=tf.float32))

    # Casts input image to self._dtype
    pixels = tf.cast(pixels, dtype=self._dtype)

    # Packs labels for model_fn outputs.
    labels = {
        'anchor_boxes': anchor_boxes,
        'image_info': image_info,
        'rpn_score_targets': rpn_score_targets,
        'rpn_box_targets': rpn_box_targets,
        'gt_boxes': preprocess_ops.clip_or_pad_to_fixed_size(
            gt_boxes, self._max_num_instances, -1),
        'gt_classes': preprocess_ops.clip_or_pad_to_fixed_size(
            gt_classes, self._max_num_instances, -1),
    }
    if self._include_mask:
        labels['gt_masks'] = preprocess_ops.clip_or_pad_to_fixed_size(
            gt_masks, self._max_num_instances, -1)

    return pixels, labels
def _parse_eval_data(self, data):
    """Parses data for evaluation.

    Args:
      data: decoded tensor dictionary. This method reads 'image',
        'groundtruth_classes', 'groundtruth_boxes', 'groundtruth_area',
        'groundtruth_is_crowd', 'source_id', 'height', 'width' and,
        optionally, 'groundtruth_attributes'. `data` is not modified.

    Returns:
      image: the normalized, resized image cast to `self._dtype`.
      labels: dict with the anchor targets ('cls_targets', 'box_targets',
        'cls_weights', 'box_weights'), 'anchor_boxes', 'image_info', the
        padded 'groundtruths' dict built from the unfiltered annotations,
        and 'attribute_targets' when attribute targets are produced.
    """
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    # If not empty, `attributes` is a dict of (name, ground_truth) pairs.
    # `ground_truth` of attributes is assumed in shape [N, attribute_size].
    # TODO(xianzhi): support parsing attributes weights.
    attributes = data.get('groundtruth_attributes', {})

    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(input=image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)

    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_ops.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(
            self._output_size, 2**self._max_level),
        aug_scale_min=1.0,
        aug_scale_max=1.0)
    image_height, image_width, _ = image.get_shape().as_list()

    # Resizes and crops boxes.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = preprocess_ops.resize_and_crop_boxes(
        boxes, image_scale, image_info[1, :], offset)

    # Filters out ground truth boxes that are all zeros.
    indices = box_ops.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    # Build a new dict rather than assigning into `attributes`: that object
    # is `data['groundtruth_attributes']` itself, and the unfiltered values
    # are still needed for `groundtruths['attributes']` below so that they
    # stay aligned with the unfiltered groundtruth boxes and classes.
    attributes = {k: tf.gather(v, indices) for k, v in attributes.items()}

    # Assigns anchors.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))
    anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
                                          self._unmatched_threshold)
    (cls_targets, box_targets, att_targets, cls_weights,
     box_weights) = anchor_labeler.label_anchors(
         anchor_boxes, boxes, tf.expand_dims(classes, axis=1), attributes)

    # Casts input image to desired data type.
    image = tf.cast(image, dtype=self._dtype)

    # Sets up groundtruth data for evaluation. These come straight from
    # `data`, i.e. before resizing and before the empty-box filter.
    groundtruths = {
        'source_id': data['source_id'],
        'height': data['height'],
        'width': data['width'],
        'num_detections': tf.shape(data['groundtruth_classes']),
        'image_info': image_info,
        'boxes': box_ops.denormalize_boxes(
            data['groundtruth_boxes'], image_shape),
        'classes': data['groundtruth_classes'],
        'areas': data['groundtruth_area'],
        'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
    }
    if 'groundtruth_attributes' in data:
        groundtruths['attributes'] = data['groundtruth_attributes']
    groundtruths['source_id'] = utils.process_source_id(
        groundtruths['source_id'])
    groundtruths = utils.pad_groundtruths_to_fixed_size(
        groundtruths, self._max_num_instances)

    # Packs labels for model_fn outputs.
    labels = {
        'cls_targets': cls_targets,
        'box_targets': box_targets,
        'anchor_boxes': anchor_boxes,
        'cls_weights': cls_weights,
        'box_weights': box_weights,
        'image_info': image_info,
        'groundtruths': groundtruths,
    }
    if att_targets:
        labels['attribute_targets'] = att_targets
    return image, labels
def _parse_train_data(self, data):
    """Parses data for training.

    Args:
      data: decoded tensor dictionary. This method reads 'image',
        'groundtruth_classes', 'groundtruth_boxes', 'groundtruth_is_crowd'
        and, optionally, 'groundtruth_attributes'. `data` is not modified.

    Returns:
      image: the augmented, normalized and resized image cast to
        `self._dtype`.
      labels: dict with the anchor targets ('cls_targets', 'box_targets',
        'cls_weights', 'box_weights'), 'anchor_boxes', 'image_info', and
        'attribute_targets' when attribute targets are produced.
    """
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    # If not empty, `attributes` is a dict of (name, ground_truth) pairs.
    # `ground_truth` of attributes is assumed in shape [N, attribute_size].
    # TODO(xianzhi): support parsing attributes weights.
    attributes = data.get('groundtruth_attributes', {})
    is_crowds = data['groundtruth_is_crowd']

    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training:
        num_groundtruths = tf.shape(input=classes)[0]
        with tf.control_dependencies([num_groundtruths, is_crowds]):
            # When no crowd flags are present, every annotation is kept.
            indices = tf.cond(
                pred=tf.greater(tf.size(input=is_crowds), 0),
                true_fn=lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
                false_fn=lambda: tf.cast(
                    tf.range(num_groundtruths), tf.int64))
        classes = tf.gather(classes, indices)
        boxes = tf.gather(boxes, indices)
        # Build a new dict: `attributes` may be the very dict stored in
        # `data`, which must not be modified in place.
        attributes = {
            k: tf.gather(v, indices) for k, v in attributes.items()
        }

    # Gets original image.
    image = data['image']

    # Apply autoaug or randaug.
    if self._augmenter is not None:
        image, boxes = self._augmenter.distort_with_boxes(image, boxes)
    image_shape = tf.shape(input=image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
        image, boxes, _ = preprocess_ops.random_horizontal_flip(
            image, boxes)

    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_ops.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(
            self._output_size, 2**self._max_level),
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    image_height, image_width, _ = image.get_shape().as_list()

    # Resizes and crops boxes.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = preprocess_ops.resize_and_crop_boxes(
        boxes, image_scale, image_info[1, :], offset)

    # Filters out ground truth boxes that are all zeros.
    indices = box_ops.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    # Again a fresh dict, so the caller's `data` is left untouched.
    attributes = {k: tf.gather(v, indices) for k, v in attributes.items()}

    # Assigns anchors.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))
    anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
                                          self._unmatched_threshold)
    (cls_targets, box_targets, att_targets, cls_weights,
     box_weights) = anchor_labeler.label_anchors(
         anchor_boxes, boxes, tf.expand_dims(classes, axis=1), attributes)

    # Casts input image to desired data type.
    image = tf.cast(image, dtype=self._dtype)

    # Packs labels for model_fn outputs.
    labels = {
        'cls_targets': cls_targets,
        'box_targets': box_targets,
        'anchor_boxes': anchor_boxes,
        'cls_weights': cls_weights,
        'box_weights': box_weights,
        'image_info': image_info,
    }
    if att_targets:
        labels['attribute_targets'] = att_targets
    return image, labels
def _parse_train_data(self, data):
    """Turns one decoded example into a training image and anchor labels.

    Args:
      data: decoded tensor dictionary; this method reads 'image',
        'groundtruth_classes', 'groundtruth_boxes' and
        'groundtruth_is_crowd'.

    Returns:
      image: the normalized, optionally flipped, resized image; cast to
        tf.bfloat16 when `self._use_bfloat16` is set.
      labels: dict with 'cls_targets', 'box_targets', 'anchor_boxes',
        'cls_weights', 'box_weights' and 'image_info'.
    """
    gt_classes = data['groundtruth_classes']
    gt_boxes = data['groundtruth_boxes']
    crowd_flags = data['groundtruth_is_crowd']

    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training:
        gt_count = tf.shape(input=gt_classes)[0]
        with tf.control_dependencies([gt_count, crowd_flags]):
            # When no crowd flags are present, every annotation is kept.
            keep = tf.cond(
                pred=tf.greater(tf.size(input=crowd_flags), 0),
                true_fn=lambda: tf.where(tf.logical_not(crowd_flags))[:, 0],
                false_fn=lambda: tf.cast(tf.range(gt_count), tf.int64))
        gt_classes = tf.gather(gt_classes, keep)
        gt_boxes = tf.gather(gt_boxes, keep)

    # Gets original image and its size.
    pixels = data['image']
    source_hw = tf.shape(input=pixels)[0:2]

    # Normalizes image with mean and std pixel values.
    pixels = preprocess_ops.normalize_image(pixels)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
        pixels, gt_boxes, _ = preprocess_ops.random_horizontal_flip(
            pixels, gt_boxes)

    # Converts boxes from normalized coordinates to pixel coordinates.
    gt_boxes = box_ops.denormalize_boxes(gt_boxes, source_hw)

    # Resizes and crops image.
    pixels, image_info = preprocess_ops.resize_and_crop_image(
        pixels,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(
            self._output_size, 2**self._max_level),
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    padded_height, padded_width, _ = pixels.get_shape().as_list()

    # Resizes and crops boxes with the scale / offset rows of `image_info`.
    scale = image_info[2, :]
    crop_offset = image_info[3, :]
    gt_boxes = preprocess_ops.resize_and_crop_boxes(
        gt_boxes, scale, image_info[1, :], crop_offset)

    # Filters out ground truth boxes that are all zeros.
    keep = box_ops.get_non_empty_box_indices(gt_boxes)
    gt_boxes = tf.gather(gt_boxes, keep)
    gt_classes = tf.gather(gt_classes, keep)

    # Assigns anchors.
    anchor_generator = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = anchor_generator(
        image_size=(padded_height, padded_width))
    labeler = anchor.AnchorLabeler(
        self._match_threshold, self._unmatched_threshold)
    cls_targets, box_targets, cls_weights, box_weights = (
        labeler.label_anchors(
            anchor_boxes, gt_boxes, tf.expand_dims(gt_classes, axis=1)))

    # If bfloat16 is used, casts input image to tf.bfloat16.
    if self._use_bfloat16:
        pixels = tf.cast(pixels, dtype=tf.bfloat16)

    # Packs labels for model_fn outputs.
    labels = {
        'cls_targets': cls_targets,
        'box_targets': box_targets,
        'anchor_boxes': anchor_boxes,
        'cls_weights': cls_weights,
        'box_weights': box_weights,
        'image_info': image_info,
    }
    return pixels, labels
def _parse_train_data(self, data):
    """Generates images and labels that are usable for model training.

    Augmentation consists of an optional random horizontal flip, optional
    hue / contrast / saturation / brightness jitter, and random scaling and
    cropping with the scale drawn from
    [`self._aug_scale_min`, `self._aug_scale_max`]. Two pipelines exist,
    selected by `self._odapi_augmentation`: one built on `preprocess_ops`
    and `tf.image`, the other on `cn_prep_ops` and `box_list_ops`.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      images: the image tensor, normalized and cast to `self._dtype`.
      labels: a dict of Tensors produced by `self._build_label`.
    """
    pixels = tf.cast(data['image'], dtype=tf.float32)
    gt_boxes = data['groundtruth_boxes']
    gt_classes = data['groundtruth_classes']

    source_hw = tf.shape(input=pixels)[0:2]

    if self._aug_rand_hflip:
        pixels, gt_boxes, _ = preprocess_ops.random_horizontal_flip(
            pixels, gt_boxes)

    # Image augmentation
    if not self._odapi_augmentation:
        # Color and lighting jittering
        if self._aug_rand_hue:
            pixels = tf.image.random_hue(image=pixels, max_delta=.02)
        if self._aug_rand_contrast:
            pixels = tf.image.random_contrast(
                image=pixels, lower=0.8, upper=1.25)
        if self._aug_rand_saturation:
            pixels = tf.image.random_saturation(
                image=pixels, lower=0.8, upper=1.25)
        if self._aug_rand_brightness:
            pixels = tf.image.random_brightness(image=pixels, max_delta=.2)
        # Jitter may push values outside [0, 255]; bring them back.
        pixels = tf.clip_by_value(
            pixels, clip_value_min=0.0, clip_value_max=255.0)

        # Converts boxes from normalized coordinates to pixel coordinates.
        gt_boxes = box_ops.denormalize_boxes(gt_boxes, source_hw)

        # Resizes and crops image.
        pixels, image_info = preprocess_ops.resize_and_crop_image(
            pixels,
            [self._output_height, self._output_width],
            padded_size=[self._output_height, self._output_width],
            aug_scale_min=self._aug_scale_min,
            aug_scale_max=self._aug_scale_max)
        unpad_image_shape = tf.cast(tf.shape(pixels), tf.float32)

        # Resizes and crops boxes.
        scale = image_info[2, :]
        crop_offset = image_info[3, :]
        gt_boxes = preprocess_ops.resize_and_crop_boxes(
            gt_boxes, scale, image_info[1, :], crop_offset)
    else:
        # Color and lighting jittering
        if self._aug_rand_hue:
            pixels = cn_prep_ops.random_adjust_hue(
                image=pixels, max_delta=.02)
        if self._aug_rand_contrast:
            pixels = cn_prep_ops.random_adjust_contrast(
                image=pixels, min_delta=0.8, max_delta=1.25)
        if self._aug_rand_saturation:
            pixels = cn_prep_ops.random_adjust_saturation(
                image=pixels, min_delta=0.8, max_delta=1.25)
        if self._aug_rand_brightness:
            pixels = cn_prep_ops.random_adjust_brightness(
                image=pixels, max_delta=.2)

        # Random square crop; boxes and classes are returned alongside.
        cropped_pixels, cropped_boxes, gt_classes = (
            cn_prep_ops.random_square_crop_by_scale(
                image=pixels,
                boxes=gt_boxes,
                labels=gt_classes,
                scale_min=self._aug_scale_min,
                scale_max=self._aug_scale_max))

        pixels, unpad_image_shape = cn_prep_ops.resize_to_range(
            image=cropped_pixels,
            min_dimension=self._output_width,
            max_dimension=self._output_width,
            pad_to_max_dimension=True)
        padded_shape = tf.cast(tf.shape(pixels), tf.float32)
        unpad_image_shape = tf.cast(unpad_image_shape, tf.float32)

        # Window covering the padded output, expressed in units of the
        # unpadded image, used to re-express the boxes in the padded frame.
        padded_window = tf.stack([
            0.0,
            0.0,
            padded_shape[0] / unpad_image_shape[0],
            padded_shape[1] / unpad_image_shape[1]
        ])
        boxes_in_padded = box_list_ops.change_coordinate_frame(
            boxlist=box_list.BoxList(cropped_boxes), window=padded_window)

        checked_boxes = box_list_ops.assert_or_prune_invalid_boxes(
            boxes_in_padded.get())

        gt_boxes = box_list_ops.to_absolute_coordinates(
            boxlist=box_list.BoxList(checked_boxes),
            height=self._output_height,
            width=self._output_width).get()

        # Same four-row layout as the `image_info` of the other branch:
        # source size, output size, scale, offset.
        image_info = tf.stack([
            tf.cast(source_hw, dtype=tf.float32),
            tf.constant([self._output_height, self._output_width],
                        dtype=tf.float32),
            tf.cast(tf.shape(cropped_pixels)[0:2] / source_hw,
                    dtype=tf.float32),
            tf.constant([0., 0.])
        ])

    # Filters out ground truth boxes that are all zeros.
    keep = box_ops.get_non_empty_box_indices(gt_boxes)
    gt_boxes = tf.gather(gt_boxes, keep)
    gt_classes = tf.gather(gt_classes, keep)

    labels = self._build_label(
        unpad_image_shape=unpad_image_shape,
        boxes=gt_boxes,
        classes=gt_classes,
        image_info=image_info,
        data=data)

    if self._bgr_ordering:
        # Swap the first and last channel (RGB -> BGR).
        channel_r, channel_g, channel_b = tf.unstack(pixels, num=3, axis=2)
        pixels = tf.stack([channel_b, channel_g, channel_r], axis=2)

    pixels = preprocess_ops.normalize_image(
        image=pixels,
        offset=self._channel_means,
        scale=self._channel_stds)
    pixels = tf.cast(pixels, self._dtype)

    return pixels, labels
def preprocess(self, inputs):
    """Preprocess COCO for DETR.

    Args:
      inputs: example dict; this method reads 'image',
        'objects' -> 'bbox' / 'label' / 'is_crowd' and, when not training,
        'image/id'.

    Returns:
      image: the normalized, resized image padded to
        `self._params.output_size`.
      labels: dict with 'classes' and 'boxes' (normalized, converted with
        `box_ops.yxyx_to_cycxhw`), both clipped / padded to
        `self._params.max_num_boxes`. When not training it also holds 'id',
        'image_info', 'is_crowd' and 'gt_boxes'.
    """
    image = inputs['image']
    boxes = inputs['objects']['bbox']
    # Shift class ids by one.
    classes = inputs['objects']['label'] + 1
    is_crowd = inputs['objects']['is_crowd']

    image = preprocess_ops.normalize_image(image)
    if self._params.is_training:
        image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)

        # With probability 0.5, rescale and take a random crop first.
        do_crop = tf.greater(tf.random.uniform([]), 0.5)
        if do_crop:
            # Rescale
            boxes = box_ops.denormalize_boxes(boxes, tf.shape(image)[:2])
            index = tf.random.categorical(tf.zeros([1, 3]), 1)[0]
            scales = tf.gather([400.0, 500.0, 600.0], index, axis=0)
            short_side = scales[0]
            image, image_info = preprocess_ops.resize_image(
                image, short_side)
            boxes = preprocess_ops.resize_and_crop_boxes(
                boxes, image_info[2, :], image_info[1, :], image_info[3, :])
            boxes = box_ops.normalize_boxes(boxes, image_info[1, :])

            # Do cropping
            shape = tf.cast(image_info[1], dtype=tf.int32)
            h = tf.random.uniform(
                [], 384, tf.math.minimum(shape[0], 600), dtype=tf.int32)
            w = tf.random.uniform(
                [], 384, tf.math.minimum(shape[1], 600), dtype=tf.int32)
            i = tf.random.uniform([], 0, shape[0] - h + 1, dtype=tf.int32)
            j = tf.random.uniform([], 0, shape[1] - w + 1, dtype=tf.int32)
            image = tf.image.crop_to_bounding_box(image, i, j, h, w)
            # Re-express the normalized boxes relative to the crop window
            # and clip them to it.
            boxes = tf.clip_by_value(
                (boxes[..., :] * tf.cast(
                    tf.stack([shape[0], shape[1], shape[0], shape[1]]),
                    dtype=tf.float32) -
                 tf.cast(tf.stack([i, j, i, j]), dtype=tf.float32)) /
                tf.cast(tf.stack([h, w, h, w]), dtype=tf.float32),
                0.0, 1.0)

        # Pick one of the configured resize scales uniformly at random. The
        # number of categories follows the configuration instead of being
        # hard-coded, so every scale can be drawn and the index can never
        # fall outside `resize_scales`.
        scales = tf.constant(self._params.resize_scales, dtype=tf.float32)
        index = tf.random.categorical(
            tf.zeros([1, len(self._params.resize_scales)]), 1)[0]
        scales = tf.gather(scales, index, axis=0)
    else:
        # Evaluation always uses the last configured scale.
        scales = tf.constant([self._params.resize_scales[-1]], tf.float32)

    image_shape = tf.shape(image)[:2]
    boxes = box_ops.denormalize_boxes(boxes, image_shape)
    # Pixel-space boxes before the final resize, reported as 'gt_boxes'.
    gt_boxes = boxes
    short_side = scales[0]
    image, image_info = preprocess_ops.resize_image(
        image, short_side, max(self._params.output_size))
    boxes = preprocess_ops.resize_and_crop_boxes(
        boxes, image_info[2, :], image_info[1, :], image_info[3, :])
    boxes = box_ops.normalize_boxes(boxes, image_info[1, :])

    # Filters out ground truth boxes that are all zeros.
    indices = box_ops.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    is_crowd = tf.gather(is_crowd, indices)
    boxes = box_ops.yxyx_to_cycxhw(boxes)

    image = tf.image.pad_to_bounding_box(
        image, 0, 0, self._params.output_size[0],
        self._params.output_size[1])
    labels = {
        'classes':
            preprocess_ops.clip_or_pad_to_fixed_size(
                classes, self._params.max_num_boxes),
        'boxes':
            preprocess_ops.clip_or_pad_to_fixed_size(
                boxes, self._params.max_num_boxes)
    }
    if not self._params.is_training:
        labels.update({
            'id':
                inputs['image/id'],
            'image_info':
                image_info,
            'is_crowd':
                preprocess_ops.clip_or_pad_to_fixed_size(
                    is_crowd, self._params.max_num_boxes),
            'gt_boxes':
                preprocess_ops.clip_or_pad_to_fixed_size(
                    gt_boxes, self._params.max_num_boxes),
        })

    return image, labels